diff --git "a/sft/665K/Full_smoe_plus_plus/checkpoint-16632/trainer_state.json" "b/sft/665K/Full_smoe_plus_plus/checkpoint-16632/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/665K/Full_smoe_plus_plus/checkpoint-16632/trainer_state.json" @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.07383558, + "auxiliary_loss_mlp": 1.48236787, + "balance_loss_clip": 3.0267601, + "balance_loss_mlp": 2.87545991, + "epoch": 6.012325266796934e-05, + "flos": 24461755832880.0, + "grad_norm": 81.29780881384477, + "language_loss": 3.27742982, + "learning_rate": 0.0, + "loss": 3.19807053, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 43.515625, + "router_z_loss_mlp": 1452.5, + "step": 1, + "time_per_iteration": 18.905760049819946 + }, + { + "auxiliary_loss_clip": 0.04888222, + "auxiliary_loss_mlp": 0.92343092, + "balance_loss_clip": 2.02775002, + "balance_loss_mlp": 1.86457253, + "epoch": 0.00012024650533593868, + "flos": 20230245731160.0, + "grad_norm": 71.45192004168315, + "language_loss": 2.04273796, + "learning_rate": 4.4628432569317594e-07, + "loss": 3.01505089, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 28.609375, + "router_z_loss_mlp": 904.5, + "step": 2, + "time_per_iteration": 2.6701111793518066 + }, + { + "auxiliary_loss_clip": 0.04870947, + "auxiliary_loss_mlp": 0.95361519, + "balance_loss_clip": 2.03673625, + "balance_loss_mlp": 1.90448701, + "epoch": 0.000180369758003908, + "flos": 22315085605800.0, + "grad_norm": 70.85510990266013, + "language_loss": 1.81512225, + "learning_rate": 7.073439208833112e-07, + "loss": 2.81744671, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 28.359375, + "router_z_loss_mlp": 934.5, + "step": 3, + "time_per_iteration": 2.610840320587158 + }, + { + "auxiliary_loss_clip": 0.04831625, + "auxiliary_loss_mlp": 0.82981712, + "balance_loss_clip": 1.99493444, + "balance_loss_mlp": 1.82936764, + "epoch": 0.00024049301067187735, + "flos": 22419195105600.0, + "grad_norm": 71.62891617232775, + "language_loss": 1.95933294, + "learning_rate": 8.925686513863519e-07, + "loss": 2.837466, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 28.375, + "router_z_loss_mlp": 810.5, + "step": 4, + "time_per_iteration": 2.832792043685913 + }, + { + "auxiliary_loss_clip": 0.04875773, + "auxiliary_loss_mlp": 0.89363742, + "balance_loss_clip": 2.02962399, + "balance_loss_mlp": 1.8637408, + "epoch": 0.0003006162633398467, + "flos": 21401903252520.0, + "grad_norm": 77.83451050814793, + "language_loss": 2.23753643, + "learning_rate": 1.0362401141348472e-06, + "loss": 3.17993164, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 28.4375, + "router_z_loss_mlp": 875.5, + "step": 5, + "time_per_iteration": 2.9504010677337646 + }, + { + "auxiliary_loss_clip": 0.04844721, + "auxiliary_loss_mlp": 0.93838829, + "balance_loss_clip": 2.00692821, + "balance_loss_mlp": 1.89546347, + "epoch": 0.000360739516007816, + "flos": 21657332793600.0, + "grad_norm": 71.8094343165896, + "language_loss": 1.87692428, + "learning_rate": 1.153628246576487e-06, + "loss": 2.86375952, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 28.390625, + "router_z_loss_mlp": 919.0, + "step": 6, + "time_per_iteration": 2.8917224407196045 + }, + { + "auxiliary_loss_clip": 0.04864228, + "auxiliary_loss_mlp": 0.94501972, + "balance_loss_clip": 2.01934671, + "balance_loss_mlp": 1.87501645, + "epoch": 0.0004208627686757854, + "flos": 27165160650960.0, + "grad_norm": 67.36990161396687, + "language_loss": 1.77520537, + "learning_rate": 1.2528784983718962e-06, + "loss": 2.76886725, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 28.4375, + "router_z_loss_mlp": 926.5, + "step": 7, + "time_per_iteration": 3.0379741191864014 + }, + { + "auxiliary_loss_clip": 0.04678736, + "auxiliary_loss_mlp": 0.64406514, + "balance_loss_clip": 1.97293615, + "balance_loss_mlp": 1.71120524, + "epoch": 0.0004809860213437547, + "flos": 31325053785480.0, + "grad_norm": 59.92423655795522, + "language_loss": 1.72372746, + "learning_rate": 1.338852977079528e-06, + "loss": 2.41458011, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 27.046875, + "router_z_loss_mlp": 626.5, + "step": 8, + "time_per_iteration": 2.9637036323547363 + }, + { + "auxiliary_loss_clip": 0.04702004, + "auxiliary_loss_mlp": 0.67788607, + "balance_loss_clip": 1.97607064, + "balance_loss_mlp": 1.72415805, + "epoch": 0.000541109274011724, + "flos": 32166758779200.0, + "grad_norm": 57.69977978916644, + "language_loss": 1.85848331, + "learning_rate": 1.4146878417666224e-06, + "loss": 2.58338928, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 27.25, + "router_z_loss_mlp": 660.5, + "step": 9, + "time_per_iteration": 3.057691812515259 + }, + { + "auxiliary_loss_clip": 0.04664749, + "auxiliary_loss_mlp": 0.63390499, + "balance_loss_clip": 1.96913517, + "balance_loss_mlp": 1.72057962, + "epoch": 0.0006012325266796934, + "flos": 18921521702880.0, + "grad_norm": 51.3325315191088, + "language_loss": 1.74184561, + "learning_rate": 1.4825244398280232e-06, + "loss": 2.42239809, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 26.96875, + "router_z_loss_mlp": 615.5, + "step": 10, + "time_per_iteration": 2.9195775985717773 + }, + { + "auxiliary_loss_clip": 0.04647355, + "auxiliary_loss_mlp": 0.57253242, + "balance_loss_clip": 1.97282863, + "balance_loss_mlp": 1.61359429, + "epoch": 0.0006613557793476627, + "flos": 20779462732320.0, + "grad_norm": 48.83548301742367, + "language_loss": 1.74650574, + "learning_rate": 1.5438901072051983e-06, + "loss": 2.36551189, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 26.71875, + "router_z_loss_mlp": 556.5, + "step": 11, + "time_per_iteration": 2.9609224796295166 + }, + { + "auxiliary_loss_clip": 0.04609998, + "auxiliary_loss_mlp": 0.57498235, + "balance_loss_clip": 1.9653821, + "balance_loss_mlp": 1.68768728, + "epoch": 0.000721479032015632, + "flos": 16586084682000.0, + "grad_norm": 48.46388062636147, + "language_loss": 1.69402766, + "learning_rate": 1.5999125722696629e-06, + "loss": 2.31511021, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 26.421875, + "router_z_loss_mlp": 558.0, + "step": 12, + "time_per_iteration": 2.97611665725708 + }, + { + "auxiliary_loss_clip": 0.04476679, + "auxiliary_loss_mlp": 0.35114872, + "balance_loss_clip": 1.91909802, + "balance_loss_mlp": 1.47229278, + "epoch": 0.0007816022846836014, + "flos": 23810766834240.0, + "grad_norm": 30.714837231961422, + "language_loss": 1.57158792, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.96750331, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 25.609375, + "router_z_loss_mlp": 336.25, + "step": 13, + "time_per_iteration": 2.9836511611938477 + }, + { + "auxiliary_loss_clip": 0.04390551, + "auxiliary_loss_mlp": 0.2087833, + "balance_loss_clip": 1.96163797, + "balance_loss_mlp": 1.38370192, + "epoch": 0.0008417255373515708, + "flos": 19176951243960.0, + "grad_norm": 16.36436335216372, + "language_loss": 1.42724121, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.67992997, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 24.28125, + "router_z_loss_mlp": 195.125, + "step": 14, + "time_per_iteration": 3.0297322273254395 + }, + { + "auxiliary_loss_clip": 0.04438049, + "auxiliary_loss_mlp": 0.18033329, + "balance_loss_clip": 1.97856855, + "balance_loss_mlp": 1.34631777, + "epoch": 0.00090184879001954, + "flos": 26401146095880.0, + "grad_norm": 15.204864338158941, + "language_loss": 1.32710385, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.55181766, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 24.59375, + "router_z_loss_mlp": 166.9375, + "step": 15, + "time_per_iteration": 3.1217479705810547 + }, + { + "auxiliary_loss_clip": 0.04430978, + "auxiliary_loss_mlp": 0.15433693, + "balance_loss_clip": 1.98607111, + "balance_loss_mlp": 1.30405366, + "epoch": 0.0009619720426875094, + "flos": 24684697884600.0, + "grad_norm": 10.735324575263716, + "language_loss": 1.2689302, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.46757698, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 24.4375, + "router_z_loss_mlp": 141.25, + "step": 16, + "time_per_iteration": 3.089539051055908 + }, + { + "auxiliary_loss_clip": 0.04403584, + "auxiliary_loss_mlp": 0.1704134, + "balance_loss_clip": 1.97534919, + "balance_loss_mlp": 1.31868339, + "epoch": 0.0010220952953554788, + "flos": 18629155535400.0, + "grad_norm": 13.70507991695278, + "language_loss": 1.36980462, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.58425379, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 24.28125, + "router_z_loss_mlp": 157.125, + "step": 17, + "time_per_iteration": 6.97815728187561 + }, + { + "auxiliary_loss_clip": 0.04436743, + "auxiliary_loss_mlp": 0.12451539, + "balance_loss_clip": 2.00057173, + "balance_loss_mlp": 1.20275986, + "epoch": 0.001082218548023448, + "flos": 26148396706560.0, + "grad_norm": 9.65177442751895, + "language_loss": 1.21844351, + "learning_rate": 1.860972167459798e-06, + "loss": 1.38732624, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 24.390625, + "router_z_loss_mlp": 112.5, + "step": 18, + "time_per_iteration": 5.421044111251831 + }, + { + "auxiliary_loss_clip": 0.04445352, + "auxiliary_loss_mlp": 0.11560911, + "balance_loss_clip": 2.01082587, + "balance_loss_mlp": 1.20324492, + "epoch": 0.0011423418006914173, + "flos": 19614546198720.0, + "grad_norm": 10.606259158742807, + "language_loss": 1.20858109, + "learning_rate": 1.89578346593066e-06, + "loss": 1.36864376, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 24.328125, + "router_z_loss_mlp": 103.5625, + "step": 19, + "time_per_iteration": 2.931133508682251 + }, + { + "auxiliary_loss_clip": 0.04426527, + "auxiliary_loss_mlp": 0.10936055, + "balance_loss_clip": 2.01653242, + "balance_loss_mlp": 1.18873978, + "epoch": 0.0012024650533593868, + "flos": 17899965972000.0, + "grad_norm": 8.289933937839864, + "language_loss": 1.30099177, + "learning_rate": 1.928808765521199e-06, + "loss": 1.45461774, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 24.078125, + "router_z_loss_mlp": 97.5, + "step": 20, + "time_per_iteration": 2.997614622116089 + }, + { + "auxiliary_loss_clip": 0.04434618, + "auxiliary_loss_mlp": 0.10129222, + "balance_loss_clip": 2.01476765, + "balance_loss_mlp": 1.12653589, + "epoch": 0.001262588306027356, + "flos": 21257080548840.0, + "grad_norm": 8.112038291802216, + "language_loss": 1.26797032, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.41360855, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 24.203125, + "router_z_loss_mlp": 90.0625, + "step": 21, + "time_per_iteration": 2.987572193145752 + }, + { + "auxiliary_loss_clip": 0.04611077, + "auxiliary_loss_mlp": 0.04036671, + "balance_loss_clip": 2.09797883, + "balance_loss_mlp": 1.19090676, + "epoch": 0.0013227115586953253, + "flos": 26109673312320.0, + "grad_norm": 7.085182738320366, + "language_loss": 1.26343799, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.3499155, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 25.125, + "router_z_loss_mlp": 28.484375, + "step": 22, + "time_per_iteration": 3.0308990478515625 + }, + { + "auxiliary_loss_clip": 0.04750575, + "auxiliary_loss_mlp": 0.03371604, + "balance_loss_clip": 2.14496374, + "balance_loss_mlp": 1.34905124, + "epoch": 0.0013828348113632948, + "flos": 23956685963640.0, + "grad_norm": 3.1461257575748376, + "language_loss": 1.0665127, + "learning_rate": 2.018794797290208e-06, + "loss": 1.1477344, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 26.078125, + "router_z_loss_mlp": 20.2109375, + "step": 23, + "time_per_iteration": 2.9992730617523193 + }, + { + "auxiliary_loss_clip": 0.04775387, + "auxiliary_loss_mlp": 0.02840055, + "balance_loss_clip": 2.17508507, + "balance_loss_mlp": 1.43319416, + "epoch": 0.001442958064031264, + "flos": 15963969028680.0, + "grad_norm": 2.7747811810712832, + "language_loss": 1.17699075, + "learning_rate": 2.046196897962839e-06, + "loss": 1.2531451, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 25.984375, + "router_z_loss_mlp": 14.0546875, + "step": 24, + "time_per_iteration": 2.9469337463378906 + }, + { + "auxiliary_loss_clip": 0.04640371, + "auxiliary_loss_mlp": 0.02835918, + "balance_loss_clip": 2.15806603, + "balance_loss_mlp": 1.3993026, + "epoch": 0.0015030813166992333, + "flos": 18112205199240.0, + "grad_norm": 4.043941632885009, + "language_loss": 1.20046806, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.27523088, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 24.8125, + "router_z_loss_mlp": 14.3671875, + "step": 25, + "time_per_iteration": 3.008183002471924 + }, + { + "auxiliary_loss_clip": 0.04491832, + "auxiliary_loss_mlp": 0.02747259, + "balance_loss_clip": 2.15734053, + "balance_loss_mlp": 1.38617492, + "epoch": 0.0015632045693672028, + "flos": 22239384976800.0, + "grad_norm": 2.965638085403528, + "language_loss": 1.16435003, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.23674107, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 23.359375, + "router_z_loss_mlp": 13.6015625, + "step": 26, + "time_per_iteration": 2.9603188037872314 + }, + { + "auxiliary_loss_clip": 0.04406982, + "auxiliary_loss_mlp": 0.02507789, + "balance_loss_clip": 2.16046834, + "balance_loss_mlp": 1.37291646, + "epoch": 0.001623327822035172, + "flos": 23997805251120.0, + "grad_norm": 2.7276295900927345, + "language_loss": 1.03467929, + "learning_rate": 2.122031762649933e-06, + "loss": 1.103827, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 22.46875, + "router_z_loss_mlp": 11.34765625, + "step": 27, + "time_per_iteration": 3.059278726577759 + }, + { + "auxiliary_loss_clip": 0.04291555, + "auxiliary_loss_mlp": 0.02165693, + "balance_loss_clip": 2.17333126, + "balance_loss_mlp": 1.34095585, + "epoch": 0.0016834510747031415, + "flos": 19681759680480.0, + "grad_norm": 2.6864416132236175, + "language_loss": 1.15891516, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.22348762, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 21.171875, + "router_z_loss_mlp": 8.23828125, + "step": 28, + "time_per_iteration": 2.958341121673584 + }, + { + "auxiliary_loss_clip": 0.04149532, + "auxiliary_loss_mlp": 0.02344714, + "balance_loss_clip": 2.13617373, + "balance_loss_mlp": 1.30673432, + "epoch": 0.0017435743273711108, + "flos": 20928833572320.0, + "grad_norm": 5.201050008330381, + "language_loss": 1.13644123, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.20138383, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 20.125, + "router_z_loss_mlp": 10.3828125, + "step": 29, + "time_per_iteration": 2.967421293258667 + }, + { + "auxiliary_loss_clip": 0.04036696, + "auxiliary_loss_mlp": 0.02385071, + "balance_loss_clip": 2.11529684, + "balance_loss_mlp": 1.28987145, + "epoch": 0.00180369758003908, + "flos": 19532713707360.0, + "grad_norm": 4.194859559167218, + "language_loss": 1.3521297, + "learning_rate": 2.189868360711334e-06, + "loss": 1.41634738, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 19.21875, + "router_z_loss_mlp": 10.94140625, + "step": 30, + "time_per_iteration": 2.9772400856018066 + }, + { + "auxiliary_loss_clip": 0.03928063, + "auxiliary_loss_mlp": 0.02068966, + "balance_loss_clip": 2.11912298, + "balance_loss_mlp": 1.30945921, + "epoch": 0.0018638208327070496, + "flos": 27458826285960.0, + "grad_norm": 2.956538476705738, + "language_loss": 1.15525877, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.21522903, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 18.09375, + "router_z_loss_mlp": 7.59960938, + "step": 31, + "time_per_iteration": 2.9981369972229004 + }, + { + "auxiliary_loss_clip": 0.03812653, + "auxiliary_loss_mlp": 0.02015005, + "balance_loss_clip": 2.12182379, + "balance_loss_mlp": 1.3167243, + "epoch": 0.0019239440853750188, + "flos": 13593422757600.0, + "grad_norm": 2.8406397690253833, + "language_loss": 1.05704892, + "learning_rate": 2.2314216284658796e-06, + "loss": 1.11532545, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 6.99023438, + "step": 32, + "time_per_iteration": 2.908867835998535 + }, + { + "auxiliary_loss_clip": 0.03733581, + "auxiliary_loss_mlp": 0.0203245, + "balance_loss_clip": 2.10238528, + "balance_loss_mlp": 1.2964046, + "epoch": 0.001984067338042988, + "flos": 11257457828040.0, + "grad_norm": 4.240266118004514, + "language_loss": 1.10639715, + "learning_rate": 2.2512340280885094e-06, + "loss": 1.16405749, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 16.3125, + "router_z_loss_mlp": 7.36132812, + "step": 33, + "time_per_iteration": 2.9208106994628906 + }, + { + "auxiliary_loss_clip": 0.03645969, + "auxiliary_loss_mlp": 0.01865258, + "balance_loss_clip": 2.10797167, + "balance_loss_mlp": 1.32967448, + "epoch": 0.0020441905907109576, + "flos": 22391842052160.0, + "grad_norm": 2.3490223960200236, + "language_loss": 0.98359466, + "learning_rate": 2.270454923596497e-06, + "loss": 1.03870678, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 15.3671875, + "router_z_loss_mlp": 5.35546875, + "step": 34, + "time_per_iteration": 3.032965660095215 + }, + { + "auxiliary_loss_clip": 0.03511994, + "auxiliary_loss_mlp": 0.02016281, + "balance_loss_clip": 2.06428504, + "balance_loss_mlp": 1.29282391, + "epoch": 0.0021043138433789266, + "flos": 49786645935600.0, + "grad_norm": 2.5453484144630223, + "language_loss": 0.87847376, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.93375641, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 14.4765625, + "router_z_loss_mlp": 7.234375, + "step": 35, + "time_per_iteration": 3.1981112957000732 + }, + { + "auxiliary_loss_clip": 0.03359491, + "auxiliary_loss_mlp": 0.01925085, + "balance_loss_clip": 2.06112671, + "balance_loss_mlp": 1.32045603, + "epoch": 0.002164437096046896, + "flos": 20562797193840.0, + "grad_norm": 3.4271665634825075, + "language_loss": 0.98187953, + "learning_rate": 2.307256493152974e-06, + "loss": 1.03472519, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 12.9921875, + "router_z_loss_mlp": 6.04296875, + "step": 36, + "time_per_iteration": 2.963249683380127 + }, + { + "auxiliary_loss_clip": 0.032328, + "auxiliary_loss_mlp": 0.01904388, + "balance_loss_clip": 2.04118395, + "balance_loss_mlp": 1.29842293, + "epoch": 0.0022245603487148656, + "flos": 26547999217560.0, + "grad_norm": 4.391810080444991, + "language_loss": 1.03121722, + "learning_rate": 2.3248973825097614e-06, + "loss": 1.08258915, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 11.9296875, + "router_z_loss_mlp": 6.06054688, + "step": 37, + "time_per_iteration": 3.0789928436279297 + }, + { + "auxiliary_loss_clip": 0.03110283, + "auxiliary_loss_mlp": 0.01734359, + "balance_loss_clip": 2.03768516, + "balance_loss_mlp": 1.31931973, + "epoch": 0.0022846836013828346, + "flos": 20342882986560.0, + "grad_norm": 2.603280198922284, + "language_loss": 1.09486628, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.14331269, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 10.7421875, + "router_z_loss_mlp": 4.15429688, + "step": 38, + "time_per_iteration": 3.036226987838745 + }, + { + "auxiliary_loss_clip": 0.03015135, + "auxiliary_loss_mlp": 0.01643613, + "balance_loss_clip": 2.02095747, + "balance_loss_mlp": 1.30448639, + "epoch": 0.002344806854050804, + "flos": 26252871681600.0, + "grad_norm": 2.6704309870793295, + "language_loss": 0.92428911, + "learning_rate": 2.358792165262154e-06, + "loss": 0.97087657, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 9.953125, + "router_z_loss_mlp": 3.38671875, + "step": 39, + "time_per_iteration": 2.9474117755889893 + }, + { + "auxiliary_loss_clip": 0.02963057, + "auxiliary_loss_mlp": 0.01671123, + "balance_loss_clip": 1.98754668, + "balance_loss_mlp": 1.3066287, + "epoch": 0.0024049301067187736, + "flos": 11805253536600.0, + "grad_norm": 6.470990765375315, + "language_loss": 1.00903106, + "learning_rate": 2.3750930912143747e-06, + "loss": 1.05537271, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 9.7421875, + "router_z_loss_mlp": 3.64550781, + "step": 40, + "time_per_iteration": 2.9353060722351074 + }, + { + "auxiliary_loss_clip": 0.02878208, + "auxiliary_loss_mlp": 0.01732716, + "balance_loss_clip": 1.9791975, + "balance_loss_mlp": 1.30985641, + "epoch": 0.0024650533593867426, + "flos": 20636426796480.0, + "grad_norm": 2.8185758752682, + "language_loss": 1.01517522, + "learning_rate": 2.3909914837471044e-06, + "loss": 1.06128454, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 8.984375, + "router_z_loss_mlp": 4.23046875, + "step": 41, + "time_per_iteration": 2.9920215606689453 + }, + { + "auxiliary_loss_clip": 0.02827896, + "auxiliary_loss_mlp": 0.0167429, + "balance_loss_clip": 1.96830368, + "balance_loss_mlp": 1.31961823, + "epoch": 0.002525176612054712, + "flos": 18410784445800.0, + "grad_norm": 2.8051391255934255, + "language_loss": 1.02975869, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.07478046, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 8.5859375, + "router_z_loss_mlp": 3.54785156, + "step": 42, + "time_per_iteration": 2.8946666717529297 + }, + { + "auxiliary_loss_clip": 0.02774787, + "auxiliary_loss_mlp": 0.01526836, + "balance_loss_clip": 1.95231342, + "balance_loss_mlp": 1.27430296, + "epoch": 0.0025852998647226816, + "flos": 28189924442280.0, + "grad_norm": 2.669204732431518, + "language_loss": 1.04925299, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.0922693, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 8.22265625, + "router_z_loss_mlp": 2.52441406, + "step": 43, + "time_per_iteration": 2.9822280406951904 + }, + { + "auxiliary_loss_clip": 0.02749417, + "auxiliary_loss_mlp": 0.01593239, + "balance_loss_clip": 1.9340713, + "balance_loss_mlp": 1.27089715, + "epoch": 0.0026454231173906506, + "flos": 14287218812280.0, + "grad_norm": 2.8926008702153743, + "language_loss": 1.0221622, + "learning_rate": 2.4364587585915504e-06, + "loss": 1.06558871, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 8.15234375, + "router_z_loss_mlp": 3.22851562, + "step": 44, + "time_per_iteration": 2.90075421333313 + }, + { + "auxiliary_loss_clip": 0.02698755, + "auxiliary_loss_mlp": 0.01512856, + "balance_loss_clip": 1.92891395, + "balance_loss_mlp": 1.26804733, + "epoch": 0.00270554637005862, + "flos": 22424352367320.0, + "grad_norm": 2.7562680321418824, + "language_loss": 1.05797076, + "learning_rate": 2.450927955901469e-06, + "loss": 1.10008681, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 7.703125, + "router_z_loss_mlp": 2.44824219, + "step": 45, + "time_per_iteration": 2.99454927444458 + }, + { + "auxiliary_loss_clip": 0.0269388, + "auxiliary_loss_mlp": 0.0157886, + "balance_loss_clip": 1.92497444, + "balance_loss_mlp": 1.24602818, + "epoch": 0.0027656696227265896, + "flos": 23990820613200.0, + "grad_norm": 2.2007020634137944, + "language_loss": 1.06858373, + "learning_rate": 2.465079122983384e-06, + "loss": 1.11131108, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 7.6875, + "router_z_loss_mlp": 3.32910156, + "step": 46, + "time_per_iteration": 3.0135529041290283 + }, + { + "auxiliary_loss_clip": 0.0266468, + "auxiliary_loss_mlp": 0.01482602, + "balance_loss_clip": 1.91811204, + "balance_loss_mlp": 1.2570585, + "epoch": 0.0028257928753945586, + "flos": 37676535837840.0, + "grad_norm": 2.656915818211797, + "language_loss": 0.95579678, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.99726963, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 7.4609375, + "router_z_loss_mlp": 2.25585938, + "step": 47, + "time_per_iteration": 3.076782464981079 + }, + { + "auxiliary_loss_clip": 0.02614724, + "auxiliary_loss_mlp": 0.01460839, + "balance_loss_clip": 1.90813208, + "balance_loss_mlp": 1.25370121, + "epoch": 0.002885916128062528, + "flos": 22459542834240.0, + "grad_norm": 2.7646667648241148, + "language_loss": 0.92303824, + "learning_rate": 2.492481223656015e-06, + "loss": 0.96379387, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 7.0625, + "router_z_loss_mlp": 2.07128906, + "step": 48, + "time_per_iteration": 3.0412168502807617 + }, + { + "auxiliary_loss_clip": 0.02631606, + "auxiliary_loss_mlp": 0.01501379, + "balance_loss_clip": 1.89858031, + "balance_loss_mlp": 1.24884605, + "epoch": 0.0029460393807304976, + "flos": 27018063879120.0, + "grad_norm": 2.3953205897810186, + "language_loss": 0.94396341, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.98529321, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 7.3359375, + "router_z_loss_mlp": 2.52539062, + "step": 49, + "time_per_iteration": 3.172506809234619 + }, + { + "auxiliary_loss_clip": 0.02602164, + "auxiliary_loss_mlp": 0.01468769, + "balance_loss_clip": 1.88459074, + "balance_loss_mlp": 1.22462821, + "epoch": 0.0030061626333984666, + "flos": 15856123559760.0, + "grad_norm": 4.288105330941103, + "language_loss": 0.95095778, + "learning_rate": 2.51876455396287e-06, + "loss": 0.99166709, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 7.1796875, + "router_z_loss_mlp": 2.43847656, + "step": 50, + "time_per_iteration": 3.0960779190063477 + }, + { + "auxiliary_loss_clip": 0.02609633, + "auxiliary_loss_mlp": 0.01421078, + "balance_loss_clip": 1.87995255, + "balance_loss_mlp": 1.23139191, + "epoch": 0.003066285886066436, + "flos": 31832704807200.0, + "grad_norm": 3.6479769092374874, + "language_loss": 0.94503093, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.98533809, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 7.30078125, + "router_z_loss_mlp": 1.8984375, + "step": 51, + "time_per_iteration": 3.0078628063201904 + }, + { + "auxiliary_loss_clip": 0.02569248, + "auxiliary_loss_mlp": 0.01397428, + "balance_loss_clip": 1.86833239, + "balance_loss_mlp": 1.22652984, + "epoch": 0.0031264091387344056, + "flos": 41434999085160.0, + "grad_norm": 2.2712793783421263, + "language_loss": 0.99170339, + "learning_rate": 2.5440168957651953e-06, + "loss": 1.03137016, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 7.01171875, + "router_z_loss_mlp": 1.7109375, + "step": 52, + "time_per_iteration": 3.1168057918548584 + }, + { + "auxiliary_loss_clip": 0.0257421, + "auxiliary_loss_mlp": 0.01395112, + "balance_loss_clip": 1.862921, + "balance_loss_mlp": 1.22297382, + "epoch": 0.0031865323914023747, + "flos": 23446232965080.0, + "grad_norm": 2.5764659355375024, + "language_loss": 0.96726453, + "learning_rate": 2.5562811176888872e-06, + "loss": 1.00695777, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 7.10546875, + "router_z_loss_mlp": 1.72167969, + "step": 53, + "time_per_iteration": 3.0449509620666504 + }, + { + "auxiliary_loss_clip": 0.02578982, + "auxiliary_loss_mlp": 0.01425442, + "balance_loss_clip": 1.86494446, + "balance_loss_mlp": 1.23089218, + "epoch": 0.003246655644070344, + "flos": 14433340983480.0, + "grad_norm": 2.4172346606425674, + "language_loss": 0.87199432, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.91203856, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 7.14453125, + "router_z_loss_mlp": 1.94628906, + "step": 54, + "time_per_iteration": 2.998241424560547 + }, + { + "auxiliary_loss_clip": 0.02565141, + "auxiliary_loss_mlp": 0.01456917, + "balance_loss_clip": 1.85157359, + "balance_loss_mlp": 1.25025558, + "epoch": 0.0033067788967383136, + "flos": 35925059593080.0, + "grad_norm": 3.4253574306142283, + "language_loss": 0.87017465, + "learning_rate": 2.580130221340046e-06, + "loss": 0.91039526, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 7.1328125, + "router_z_loss_mlp": 2.06640625, + "step": 55, + "time_per_iteration": 4.733135461807251 + }, + { + "auxiliary_loss_clip": 0.02585571, + "auxiliary_loss_mlp": 0.01553812, + "balance_loss_clip": 1.84491968, + "balance_loss_mlp": 1.25540757, + "epoch": 0.003366902149406283, + "flos": 22962929978160.0, + "grad_norm": 2.8354162392883775, + "language_loss": 0.94420063, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.98559451, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 7.40234375, + "router_z_loss_mlp": 2.98535156, + "step": 56, + "time_per_iteration": 5.982388973236084 + }, + { + "auxiliary_loss_clip": 0.02603261, + "auxiliary_loss_mlp": 0.01451862, + "balance_loss_clip": 1.86075878, + "balance_loss_mlp": 1.27915192, + "epoch": 0.003427025402074252, + "flos": 26589077896680.0, + "grad_norm": 2.3625273378314926, + "language_loss": 0.96819842, + "learning_rate": 2.6031273868139713e-06, + "loss": 1.00874972, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 7.43359375, + "router_z_loss_mlp": 1.72851562, + "step": 57, + "time_per_iteration": 3.1607131958007812 + }, + { + "auxiliary_loss_clip": 0.02535258, + "auxiliary_loss_mlp": 0.01438421, + "balance_loss_clip": 1.83960903, + "balance_loss_mlp": 1.24406183, + "epoch": 0.0034871486547422216, + "flos": 23956482921840.0, + "grad_norm": 2.4977966395986573, + "language_loss": 1.03788364, + "learning_rate": 2.614325098333948e-06, + "loss": 1.07762051, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 6.95703125, + "router_z_loss_mlp": 1.94140625, + "step": 58, + "time_per_iteration": 3.095659017562866 + }, + { + "auxiliary_loss_clip": 0.0252574, + "auxiliary_loss_mlp": 0.01437119, + "balance_loss_clip": 1.83206224, + "balance_loss_mlp": 1.27737808, + "epoch": 0.003547271907410191, + "flos": 21219940880640.0, + "grad_norm": 2.7103883230045183, + "language_loss": 0.91555429, + "learning_rate": 2.625331386578098e-06, + "loss": 0.95518285, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 6.93359375, + "router_z_loss_mlp": 1.59765625, + "step": 59, + "time_per_iteration": 2.992643117904663 + }, + { + "auxiliary_loss_clip": 0.02516474, + "auxiliary_loss_mlp": 0.0145831, + "balance_loss_clip": 1.82803416, + "balance_loss_mlp": 1.28435969, + "epoch": 0.00360739516007816, + "flos": 16508800326960.0, + "grad_norm": 2.409485320908312, + "language_loss": 0.98184025, + "learning_rate": 2.63615268640451e-06, + "loss": 1.02158809, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 1.74121094, + "step": 60, + "time_per_iteration": 2.9902100563049316 + }, + { + "auxiliary_loss_clip": 0.02529059, + "auxiliary_loss_mlp": 0.01409652, + "balance_loss_clip": 1.81943297, + "balance_loss_mlp": 1.26383471, + "epoch": 0.0036675184127461296, + "flos": 19469682886680.0, + "grad_norm": 3.0604801911476653, + "language_loss": 0.93901527, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.97840238, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 7.09765625, + "router_z_loss_mlp": 1.45800781, + "step": 61, + "time_per_iteration": 2.9729530811309814 + }, + { + "auxiliary_loss_clip": 0.02529139, + "auxiliary_loss_mlp": 0.01433813, + "balance_loss_clip": 1.82193685, + "balance_loss_mlp": 1.25890899, + "epoch": 0.003727641665414099, + "flos": 20961953012880.0, + "grad_norm": 2.229528545104095, + "language_loss": 0.92404711, + "learning_rate": 2.657264485425803e-06, + "loss": 0.96367669, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 7.06640625, + "router_z_loss_mlp": 1.75, + "step": 62, + "time_per_iteration": 2.988952875137329 + }, + { + "auxiliary_loss_clip": 0.02493615, + "auxiliary_loss_mlp": 0.01415762, + "balance_loss_clip": 1.80717802, + "balance_loss_mlp": 1.24691379, + "epoch": 0.003787764918082068, + "flos": 18410703229080.0, + "grad_norm": 2.207546801829675, + "language_loss": 0.98159599, + "learning_rate": 2.6675663401385186e-06, + "loss": 1.02068985, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 6.87109375, + "router_z_loss_mlp": 1.68603516, + "step": 63, + "time_per_iteration": 2.952565908432007 + }, + { + "auxiliary_loss_clip": 0.02541361, + "auxiliary_loss_mlp": 0.01430803, + "balance_loss_clip": 1.83873248, + "balance_loss_mlp": 1.26638997, + "epoch": 0.0038478881707500376, + "flos": 12463168782240.0, + "grad_norm": 2.6831062177784313, + "language_loss": 1.04152572, + "learning_rate": 2.677705954159056e-06, + "loss": 1.08124733, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 7.0234375, + "router_z_loss_mlp": 1.64355469, + "step": 64, + "time_per_iteration": 3.003755807876587 + }, + { + "auxiliary_loss_clip": 0.02522797, + "auxiliary_loss_mlp": 0.0139574, + "balance_loss_clip": 1.81860042, + "balance_loss_mlp": 1.24677646, + "epoch": 0.003908011423418007, + "flos": 13557338906760.0, + "grad_norm": 2.824921476888571, + "language_loss": 0.90516341, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.94434875, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 7.03515625, + "router_z_loss_mlp": 1.48876953, + "step": 65, + "time_per_iteration": 3.076458215713501 + }, + { + "auxiliary_loss_clip": 0.02487938, + "auxiliary_loss_mlp": 0.01481029, + "balance_loss_clip": 1.81186759, + "balance_loss_mlp": 1.26187491, + "epoch": 0.003968134676085976, + "flos": 18338170052160.0, + "grad_norm": 2.5712088027918707, + "language_loss": 0.89940202, + "learning_rate": 2.697518353781685e-06, + "loss": 0.93909168, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 6.75390625, + "router_z_loss_mlp": 2.19335938, + "step": 66, + "time_per_iteration": 3.1230530738830566 + }, + { + "auxiliary_loss_clip": 0.02479517, + "auxiliary_loss_mlp": 0.01397113, + "balance_loss_clip": 1.80295515, + "balance_loss_mlp": 1.24395299, + "epoch": 0.004028257928753946, + "flos": 20490101583480.0, + "grad_norm": 3.8124112132804933, + "language_loss": 1.00022233, + "learning_rate": 2.7072005239581103e-06, + "loss": 1.03898859, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 6.7734375, + "router_z_loss_mlp": 1.53125, + "step": 67, + "time_per_iteration": 3.0326719284057617 + }, + { + "auxiliary_loss_clip": 0.02444385, + "auxiliary_loss_mlp": 0.01398622, + "balance_loss_clip": 1.80639851, + "balance_loss_mlp": 1.25061178, + "epoch": 0.004088381181421915, + "flos": 18848541834000.0, + "grad_norm": 2.186205391407914, + "language_loss": 0.97169638, + "learning_rate": 2.7167392492896727e-06, + "loss": 1.01012635, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 6.3828125, + "router_z_loss_mlp": 1.48046875, + "step": 68, + "time_per_iteration": 3.0320968627929688 + }, + { + "auxiliary_loss_clip": 0.024364, + "auxiliary_loss_mlp": 0.01403187, + "balance_loss_clip": 1.79650974, + "balance_loss_mlp": 1.23238444, + "epoch": 0.004148504434089885, + "flos": 19432624435200.0, + "grad_norm": 2.3244648454070824, + "language_loss": 0.99314088, + "learning_rate": 2.7261387181735195e-06, + "loss": 1.03153682, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 6.41015625, + "router_z_loss_mlp": 1.7109375, + "step": 69, + "time_per_iteration": 2.997891426086426 + }, + { + "auxiliary_loss_clip": 0.02422151, + "auxiliary_loss_mlp": 0.01349646, + "balance_loss_clip": 1.78156066, + "balance_loss_mlp": 1.20154047, + "epoch": 0.004208627686757853, + "flos": 20815587191520.0, + "grad_norm": 2.4537610837940327, + "language_loss": 1.01748824, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.05520618, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 6.40625, + "router_z_loss_mlp": 1.47949219, + "step": 70, + "time_per_iteration": 2.977381944656372 + }, + { + "auxiliary_loss_clip": 0.0243884, + "auxiliary_loss_mlp": 0.01412148, + "balance_loss_clip": 1.77287519, + "balance_loss_mlp": 1.20977795, + "epoch": 0.004268750939425823, + "flos": 19103281032960.0, + "grad_norm": 4.754916067404747, + "language_loss": 1.02051091, + "learning_rate": 2.7445357464116983e-06, + "loss": 1.05902076, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 6.65234375, + "router_z_loss_mlp": 2.0234375, + "step": 71, + "time_per_iteration": 2.9782018661499023 + }, + { + "auxiliary_loss_clip": 0.02439494, + "auxiliary_loss_mlp": 0.01312951, + "balance_loss_clip": 1.94722867, + "balance_loss_mlp": 1.24581265, + "epoch": 0.004328874192093792, + "flos": 52452499157280.0, + "grad_norm": 2.657000020447302, + "language_loss": 0.66704619, + "learning_rate": 2.75354081884615e-06, + "loss": 0.70457065, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.671875, + "step": 72, + "time_per_iteration": 3.379183053970337 + }, + { + "auxiliary_loss_clip": 0.02426425, + "auxiliary_loss_mlp": 0.01290413, + "balance_loss_clip": 1.93760729, + "balance_loss_mlp": 1.22708941, + "epoch": 0.004388997444761762, + "flos": 66490948199520.0, + "grad_norm": 2.5078491019529126, + "language_loss": 0.64547467, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.68264306, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 4.875, + "router_z_loss_mlp": 0.6328125, + "step": 73, + "time_per_iteration": 3.4308037757873535 + }, + { + "auxiliary_loss_clip": 0.02373609, + "auxiliary_loss_mlp": 0.01341753, + "balance_loss_clip": 1.76919854, + "balance_loss_mlp": 1.18620849, + "epoch": 0.004449120697429731, + "flos": 18957646162080.0, + "grad_norm": 2.8681072774166028, + "language_loss": 0.90241253, + "learning_rate": 2.771181708202938e-06, + "loss": 0.93956614, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 1.55371094, + "step": 74, + "time_per_iteration": 2.9129228591918945 + }, + { + "auxiliary_loss_clip": 0.02378416, + "auxiliary_loss_mlp": 0.01332122, + "balance_loss_clip": 1.75298607, + "balance_loss_mlp": 1.16966414, + "epoch": 0.004509243950097701, + "flos": 21110430468960.0, + "grad_norm": 2.8884890488615573, + "language_loss": 1.01158595, + "learning_rate": 2.779824149153005e-06, + "loss": 1.04869139, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 1.62353516, + "step": 75, + "time_per_iteration": 2.918656826019287 + }, + { + "auxiliary_loss_clip": 0.02363513, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_clip": 1.75372338, + "balance_loss_mlp": 1.17101777, + "epoch": 0.004569367202765669, + "flos": 20702909327760.0, + "grad_norm": 2.5461909283697994, + "language_loss": 0.90990841, + "learning_rate": 2.788352117317012e-06, + "loss": 0.94685209, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 6.1015625, + "router_z_loss_mlp": 1.59960938, + "step": 76, + "time_per_iteration": 2.9543182849884033 + }, + { + "auxiliary_loss_clip": 0.02364219, + "auxiliary_loss_mlp": 0.01389759, + "balance_loss_clip": 1.75351739, + "balance_loss_mlp": 1.16297483, + "epoch": 0.004629490455433639, + "flos": 28664456023440.0, + "grad_norm": 2.3485605607973583, + "language_loss": 0.95213616, + "learning_rate": 2.796768605577095e-06, + "loss": 0.98967588, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 6.11328125, + "router_z_loss_mlp": 2.26855469, + "step": 77, + "time_per_iteration": 2.9655039310455322 + }, + { + "auxiliary_loss_clip": 0.02370039, + "auxiliary_loss_mlp": 0.0141005, + "balance_loss_clip": 1.7671653, + "balance_loss_mlp": 1.20653558, + "epoch": 0.004689613708101608, + "flos": 11076104581560.0, + "grad_norm": 12.178421781712144, + "language_loss": 0.95849013, + "learning_rate": 2.80507649095533e-06, + "loss": 0.9962911, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 6.02734375, + "router_z_loss_mlp": 2.03515625, + "step": 78, + "time_per_iteration": 2.9279592037200928 + }, + { + "auxiliary_loss_clip": 0.0235299, + "auxiliary_loss_mlp": 0.01295503, + "balance_loss_clip": 1.74838495, + "balance_loss_mlp": 1.16093993, + "epoch": 0.004749736960769578, + "flos": 21804185915280.0, + "grad_norm": 5.4730651423463215, + "language_loss": 0.88804299, + "learning_rate": 2.813278540517843e-06, + "loss": 0.92452794, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 6.04296875, + "router_z_loss_mlp": 1.34667969, + "step": 79, + "time_per_iteration": 3.1428544521331787 + }, + { + "auxiliary_loss_clip": 0.02376195, + "auxiliary_loss_mlp": 0.01356233, + "balance_loss_clip": 1.76037967, + "balance_loss_mlp": 1.16988528, + "epoch": 0.004809860213437547, + "flos": 19797442562880.0, + "grad_norm": 2.3170633221240307, + "language_loss": 0.93912554, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.97644985, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 6.1640625, + "router_z_loss_mlp": 1.86425781, + "step": 80, + "time_per_iteration": 3.031053304672241 + }, + { + "auxiliary_loss_clip": 0.02355308, + "auxiliary_loss_mlp": 0.01305895, + "balance_loss_clip": 1.75490332, + "balance_loss_mlp": 1.15473795, + "epoch": 0.004869983466105517, + "flos": 26579453715360.0, + "grad_norm": 2.5473947078374755, + "language_loss": 0.98814142, + "learning_rate": 2.829375683533245e-06, + "loss": 1.02475345, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 6.01171875, + "router_z_loss_mlp": 1.51074219, + "step": 81, + "time_per_iteration": 2.9599668979644775 + }, + { + "auxiliary_loss_clip": 0.02361395, + "auxiliary_loss_mlp": 0.01381657, + "balance_loss_clip": 1.74825644, + "balance_loss_mlp": 1.14276147, + "epoch": 0.004930106718773485, + "flos": 12827255959440.0, + "grad_norm": 4.280025643039845, + "language_loss": 1.01081133, + "learning_rate": 2.8372758094402803e-06, + "loss": 1.04824173, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 6.1328125, + "router_z_loss_mlp": 2.38964844, + "step": 82, + "time_per_iteration": 2.901214599609375 + }, + { + "auxiliary_loss_clip": 0.02332247, + "auxiliary_loss_mlp": 0.01322075, + "balance_loss_clip": 1.73330438, + "balance_loss_mlp": 1.14965045, + "epoch": 0.004990229971441455, + "flos": 25780370518440.0, + "grad_norm": 11.355664120332234, + "language_loss": 0.89362657, + "learning_rate": 2.84508017388607e-06, + "loss": 0.93016982, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 5.98828125, + "router_z_loss_mlp": 1.72460938, + "step": 83, + "time_per_iteration": 3.0435421466827393 + }, + { + "auxiliary_loss_clip": 0.02341092, + "auxiliary_loss_mlp": 0.01364863, + "balance_loss_clip": 1.72735667, + "balance_loss_mlp": 1.1498096, + "epoch": 0.005050353224109424, + "flos": 17461761891840.0, + "grad_norm": 3.34411162147988, + "language_loss": 0.951882, + "learning_rate": 2.852791070641559e-06, + "loss": 0.98894155, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 6.13671875, + "router_z_loss_mlp": 2.15039062, + "step": 84, + "time_per_iteration": 2.979510545730591 + }, + { + "auxiliary_loss_clip": 0.02290769, + "auxiliary_loss_mlp": 0.01130765, + "balance_loss_clip": 1.85917091, + "balance_loss_mlp": 1.02853143, + "epoch": 0.005110476476777394, + "flos": 69820408483200.0, + "grad_norm": 1.3527418384676728, + "language_loss": 0.62420768, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65842301, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 1.0234375, + "step": 85, + "time_per_iteration": 3.4835236072540283 + }, + { + "auxiliary_loss_clip": 0.02333175, + "auxiliary_loss_mlp": 0.01301268, + "balance_loss_clip": 1.73824072, + "balance_loss_mlp": 1.12951112, + "epoch": 0.005170599729445363, + "flos": 24795467155440.0, + "grad_norm": 2.401958793782406, + "language_loss": 0.92242289, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.95876735, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 5.94921875, + "router_z_loss_mlp": 1.71777344, + "step": 86, + "time_per_iteration": 3.2073185443878174 + }, + { + "auxiliary_loss_clip": 0.02334995, + "auxiliary_loss_mlp": 0.01283021, + "balance_loss_clip": 1.72868085, + "balance_loss_mlp": 1.13825321, + "epoch": 0.005230722982113333, + "flos": 23263255384200.0, + "grad_norm": 4.086424442214271, + "language_loss": 0.85898036, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.89516056, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 6.05859375, + "router_z_loss_mlp": 1.44726562, + "step": 87, + "time_per_iteration": 3.0452423095703125 + }, + { + "auxiliary_loss_clip": 0.02311902, + "auxiliary_loss_mlp": 0.01275908, + "balance_loss_clip": 1.73440623, + "balance_loss_mlp": 1.12274814, + "epoch": 0.005290846234781301, + "flos": 16732369286640.0, + "grad_norm": 2.4660637112929775, + "language_loss": 0.97635376, + "learning_rate": 2.8827430842847267e-06, + "loss": 1.01223183, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 5.78125, + "router_z_loss_mlp": 1.53222656, + "step": 88, + "time_per_iteration": 3.063822031021118 + }, + { + "auxiliary_loss_clip": 0.02329001, + "auxiliary_loss_mlp": 0.01275212, + "balance_loss_clip": 1.72207928, + "balance_loss_mlp": 1.124722, + "epoch": 0.005350969487449271, + "flos": 20890922345280.0, + "grad_norm": 2.5906852775287317, + "language_loss": 0.89361882, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.92966092, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 6.0625, + "router_z_loss_mlp": 1.50585938, + "step": 89, + "time_per_iteration": 3.0137367248535156 + }, + { + "auxiliary_loss_clip": 0.02323288, + "auxiliary_loss_mlp": 0.01263055, + "balance_loss_clip": 1.71963704, + "balance_loss_mlp": 1.1151402, + "epoch": 0.00541109274011724, + "flos": 26215122888000.0, + "grad_norm": 2.466411160447402, + "language_loss": 0.94289619, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.97875965, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 1.48046875, + "step": 90, + "time_per_iteration": 2.989194393157959 + }, + { + "auxiliary_loss_clip": 0.02308549, + "auxiliary_loss_mlp": 0.01292107, + "balance_loss_clip": 1.71230459, + "balance_loss_mlp": 1.12015986, + "epoch": 0.00547121599278521, + "flos": 21183532162920.0, + "grad_norm": 2.420292121200716, + "language_loss": 0.89123726, + "learning_rate": 2.90432674275074e-06, + "loss": 0.92724383, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 1.71875, + "step": 91, + "time_per_iteration": 3.1537468433380127 + }, + { + "auxiliary_loss_clip": 0.02310853, + "auxiliary_loss_mlp": 0.01287323, + "balance_loss_clip": 1.71655464, + "balance_loss_mlp": 1.12805915, + "epoch": 0.005531339245453179, + "flos": 19723528701720.0, + "grad_norm": 2.5489818640636788, + "language_loss": 0.89880675, + "learning_rate": 2.91136344867656e-06, + "loss": 0.93478847, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 5.9453125, + "router_z_loss_mlp": 1.59375, + "step": 92, + "time_per_iteration": 3.1503703594207764 + }, + { + "auxiliary_loss_clip": 0.02303322, + "auxiliary_loss_mlp": 0.01262327, + "balance_loss_clip": 1.69518888, + "balance_loss_mlp": 1.11002493, + "epoch": 0.005591462498121149, + "flos": 17640150728040.0, + "grad_norm": 4.473280288465613, + "language_loss": 0.96619523, + "learning_rate": 2.918324080615938e-06, + "loss": 1.00185168, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 6.078125, + "router_z_loss_mlp": 1.52246094, + "step": 93, + "time_per_iteration": 3.1014907360076904 + }, + { + "auxiliary_loss_clip": 0.02324942, + "auxiliary_loss_mlp": 0.01310443, + "balance_loss_clip": 1.69721258, + "balance_loss_mlp": 1.13611102, + "epoch": 0.005651585750789117, + "flos": 20016057302640.0, + "grad_norm": 2.8565107619424372, + "language_loss": 0.90932733, + "learning_rate": 2.925210265866963e-06, + "loss": 0.94568121, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 6.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 94, + "time_per_iteration": 6.096091270446777 + }, + { + "auxiliary_loss_clip": 0.02140645, + "auxiliary_loss_mlp": 0.010938, + "balance_loss_clip": 1.73721921, + "balance_loss_mlp": 1.01331031, + "epoch": 0.005711709003457087, + "flos": 59827201883280.0, + "grad_norm": 1.346630432397944, + "language_loss": 0.67955017, + "learning_rate": 2.932023580065507e-06, + "loss": 0.71189463, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 4.03125, + "router_z_loss_mlp": 0.8046875, + "step": 95, + "time_per_iteration": 6.457536935806274 + }, + { + "auxiliary_loss_clip": 0.02293566, + "auxiliary_loss_mlp": 0.01289287, + "balance_loss_clip": 1.688555, + "balance_loss_mlp": 1.14409029, + "epoch": 0.005771832256125056, + "flos": 15563798000640.0, + "grad_norm": 5.038240511346431, + "language_loss": 0.93507719, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.97090572, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 6.0546875, + "router_z_loss_mlp": 1.45263672, + "step": 96, + "time_per_iteration": 2.944394111633301 + }, + { + "auxiliary_loss_clip": 0.02308163, + "auxiliary_loss_mlp": 0.0125125, + "balance_loss_clip": 1.68956363, + "balance_loss_mlp": 1.11373007, + "epoch": 0.005831955508793026, + "flos": 22533497303760.0, + "grad_norm": 3.1418939036856397, + "language_loss": 0.94996965, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.98556376, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 1.37597656, + "step": 97, + "time_per_iteration": 2.98871111869812 + }, + { + "auxiliary_loss_clip": 0.02295875, + "auxiliary_loss_mlp": 0.01271741, + "balance_loss_clip": 1.68650579, + "balance_loss_mlp": 1.1194396, + "epoch": 0.005892078761460995, + "flos": 22054336369560.0, + "grad_norm": 2.31619933921609, + "language_loss": 0.78745663, + "learning_rate": 2.952041322436969e-06, + "loss": 0.82313281, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 6.09375, + "router_z_loss_mlp": 1.52441406, + "step": 98, + "time_per_iteration": 3.1150062084198 + }, + { + "auxiliary_loss_clip": 0.02088052, + "auxiliary_loss_mlp": 0.0109936, + "balance_loss_clip": 1.7023015, + "balance_loss_mlp": 1.04290295, + "epoch": 0.005952202014128965, + "flos": 68555321460720.0, + "grad_norm": 1.0410350932576165, + "language_loss": 0.65373659, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68561077, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.56640625, + "step": 99, + "time_per_iteration": 3.647531509399414 + }, + { + "auxiliary_loss_clip": 0.02249245, + "auxiliary_loss_mlp": 0.01313034, + "balance_loss_clip": 1.66796279, + "balance_loss_mlp": 1.1272583, + "epoch": 0.006012325266796933, + "flos": 22965366479760.0, + "grad_norm": 2.6758481040790594, + "language_loss": 0.94651711, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.98213989, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 5.81640625, + "router_z_loss_mlp": 1.85791016, + "step": 100, + "time_per_iteration": 3.0169825553894043 + }, + { + "auxiliary_loss_clip": 0.02286175, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_clip": 1.6750741, + "balance_loss_mlp": 1.09716201, + "epoch": 0.006072448519464903, + "flos": 17352454521960.0, + "grad_norm": 3.4653922874997902, + "language_loss": 0.94280982, + "learning_rate": 2.971455421902446e-06, + "loss": 0.97782528, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 6.109375, + "router_z_loss_mlp": 1.18212891, + "step": 101, + "time_per_iteration": 2.990457057952881 + }, + { + "auxiliary_loss_clip": 0.0226419, + "auxiliary_loss_mlp": 0.01236716, + "balance_loss_clip": 1.67383361, + "balance_loss_mlp": 1.11469316, + "epoch": 0.006132571772132872, + "flos": 24686525260800.0, + "grad_norm": 3.928889579291564, + "language_loss": 0.94373554, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.97874463, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 5.90234375, + "router_z_loss_mlp": 1.22021484, + "step": 102, + "time_per_iteration": 3.04077410697937 + }, + { + "auxiliary_loss_clip": 0.02273258, + "auxiliary_loss_mlp": 0.01214479, + "balance_loss_clip": 1.67040873, + "balance_loss_mlp": 1.10013306, + "epoch": 0.006192695024800842, + "flos": 21470050726560.0, + "grad_norm": 2.3591841843950108, + "language_loss": 0.9058314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.94070876, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 6.01953125, + "router_z_loss_mlp": 1.14404297, + "step": 103, + "time_per_iteration": 2.9731009006500244 + }, + { + "auxiliary_loss_clip": 0.02262161, + "auxiliary_loss_mlp": 0.01232108, + "balance_loss_clip": 1.66806114, + "balance_loss_mlp": 1.11323261, + "epoch": 0.006252818277468811, + "flos": 17424865873800.0, + "grad_norm": 3.062426097636632, + "language_loss": 0.960509, + "learning_rate": 2.990301221458371e-06, + "loss": 0.99545169, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 5.94140625, + "router_z_loss_mlp": 1.18994141, + "step": 104, + "time_per_iteration": 3.0385799407958984 + }, + { + "auxiliary_loss_clip": 0.02256158, + "auxiliary_loss_mlp": 0.01228837, + "balance_loss_clip": 1.66957736, + "balance_loss_mlp": 1.1141572, + "epoch": 0.006312941530136781, + "flos": 19104336850320.0, + "grad_norm": 4.528416581398636, + "language_loss": 0.98616183, + "learning_rate": 2.9964625333900544e-06, + "loss": 1.02101171, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 5.86328125, + "router_z_loss_mlp": 1.14697266, + "step": 105, + "time_per_iteration": 3.081022262573242 + }, + { + "auxiliary_loss_clip": 0.02260626, + "auxiliary_loss_mlp": 0.01216119, + "balance_loss_clip": 1.66320872, + "balance_loss_mlp": 1.10892606, + "epoch": 0.006373064782804749, + "flos": 24066155766960.0, + "grad_norm": 3.616143642563739, + "language_loss": 0.92128044, + "learning_rate": 3.002565443382063e-06, + "loss": 0.95604789, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 5.98046875, + "router_z_loss_mlp": 1.07177734, + "step": 106, + "time_per_iteration": 3.0420944690704346 + }, + { + "auxiliary_loss_clip": 0.02265153, + "auxiliary_loss_mlp": 0.01255955, + "balance_loss_clip": 1.65614676, + "balance_loss_mlp": 1.12515855, + "epoch": 0.006433188035472719, + "flos": 18336992409720.0, + "grad_norm": 3.0632531186678205, + "language_loss": 0.87613881, + "learning_rate": 3.008611048208843e-06, + "loss": 0.91134989, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 6.08984375, + "router_z_loss_mlp": 1.30957031, + "step": 107, + "time_per_iteration": 2.9755961894989014 + }, + { + "auxiliary_loss_clip": 0.02000169, + "auxiliary_loss_mlp": 0.01056802, + "balance_loss_clip": 1.62692177, + "balance_loss_mlp": 1.00797415, + "epoch": 0.006493311288140688, + "flos": 62578687800960.0, + "grad_norm": 1.0008650974766762, + "language_loss": 0.64780974, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67837948, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.48828125, + "step": 108, + "time_per_iteration": 3.42583966255188 + }, + { + "auxiliary_loss_clip": 0.0223127, + "auxiliary_loss_mlp": 0.01228672, + "balance_loss_clip": 1.65179646, + "balance_loss_mlp": 1.11637735, + "epoch": 0.006553434540808658, + "flos": 19504873353600.0, + "grad_norm": 2.390377455940002, + "language_loss": 1.00559962, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.04019904, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 1.12353516, + "step": 109, + "time_per_iteration": 3.008054256439209 + }, + { + "auxiliary_loss_clip": 0.02221263, + "auxiliary_loss_mlp": 0.01182531, + "balance_loss_clip": 1.64636731, + "balance_loss_mlp": 1.09546018, + "epoch": 0.006613557793476627, + "flos": 21110024385360.0, + "grad_norm": 2.2203626782352868, + "language_loss": 0.87260604, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.90664399, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 5.75390625, + "router_z_loss_mlp": 0.87060547, + "step": 110, + "time_per_iteration": 3.064440965652466 + }, + { + "auxiliary_loss_clip": 0.02209156, + "auxiliary_loss_mlp": 0.0120445, + "balance_loss_clip": 1.63861513, + "balance_loss_mlp": 1.10159612, + "epoch": 0.006673681046144597, + "flos": 26036287359840.0, + "grad_norm": 2.4184601645426973, + "language_loss": 0.84918296, + "learning_rate": 3.032241303393073e-06, + "loss": 0.88331902, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 5.703125, + "router_z_loss_mlp": 1.02929688, + "step": 111, + "time_per_iteration": 3.075230836868286 + }, + { + "auxiliary_loss_clip": 0.02221137, + "auxiliary_loss_mlp": 0.01235089, + "balance_loss_clip": 1.64024282, + "balance_loss_mlp": 1.10305309, + "epoch": 0.006733804298812566, + "flos": 23152851588600.0, + "grad_norm": 7.2296796569872255, + "language_loss": 0.96263599, + "learning_rate": 3.0380158011446e-06, + "loss": 0.99719828, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 5.8125, + "router_z_loss_mlp": 1.31884766, + "step": 112, + "time_per_iteration": 3.015571117401123 + }, + { + "auxiliary_loss_clip": 0.02201288, + "auxiliary_loss_mlp": 0.01210961, + "balance_loss_clip": 1.63753188, + "balance_loss_mlp": 1.0903213, + "epoch": 0.006793927551480535, + "flos": 11768073260040.0, + "grad_norm": 2.8623551003949617, + "language_loss": 0.83837211, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.87249458, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 5.64453125, + "router_z_loss_mlp": 1.20703125, + "step": 113, + "time_per_iteration": 2.9556546211242676 + }, + { + "auxiliary_loss_clip": 0.02197304, + "auxiliary_loss_mlp": 0.01206695, + "balance_loss_clip": 1.63204706, + "balance_loss_mlp": 1.0873425, + "epoch": 0.006854050804148504, + "flos": 19176666985440.0, + "grad_norm": 4.955850084023601, + "language_loss": 0.96332502, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.99736494, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 5.65234375, + "router_z_loss_mlp": 1.19384766, + "step": 114, + "time_per_iteration": 3.03183650970459 + }, + { + "auxiliary_loss_clip": 0.02194377, + "auxiliary_loss_mlp": 0.01191176, + "balance_loss_clip": 1.6344111, + "balance_loss_mlp": 1.08517504, + "epoch": 0.006914174056816474, + "flos": 21987041671080.0, + "grad_norm": 2.474767585213077, + "language_loss": 0.97342253, + "learning_rate": 3.055034911425055e-06, + "loss": 1.00727808, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 5.58984375, + "router_z_loss_mlp": 1.06005859, + "step": 115, + "time_per_iteration": 2.9447216987609863 + }, + { + "auxiliary_loss_clip": 0.02204338, + "auxiliary_loss_mlp": 0.01185814, + "balance_loss_clip": 1.62992954, + "balance_loss_mlp": 1.09006524, + "epoch": 0.006974297309484443, + "flos": 16293434256000.0, + "grad_norm": 2.748251633446233, + "language_loss": 0.85907042, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.89297193, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 5.7421875, + "router_z_loss_mlp": 0.95751953, + "step": 116, + "time_per_iteration": 2.906254529953003 + }, + { + "auxiliary_loss_clip": 0.02181793, + "auxiliary_loss_mlp": 0.01193626, + "balance_loss_clip": 1.63119054, + "balance_loss_mlp": 1.08948445, + "epoch": 0.007034420562152413, + "flos": 26109592095600.0, + "grad_norm": 2.745295052249728, + "language_loss": 0.90950119, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.94325536, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 1.04296875, + "step": 117, + "time_per_iteration": 3.009208917617798 + }, + { + "auxiliary_loss_clip": 0.02163252, + "auxiliary_loss_mlp": 0.01190917, + "balance_loss_clip": 1.61932611, + "balance_loss_mlp": 1.08749127, + "epoch": 0.007094543814820382, + "flos": 14207985255960.0, + "grad_norm": 4.248099591895315, + "language_loss": 0.89292085, + "learning_rate": 3.071615712271274e-06, + "loss": 0.92646253, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 1.03466797, + "step": 118, + "time_per_iteration": 2.9557502269744873 + }, + { + "auxiliary_loss_clip": 0.02185456, + "auxiliary_loss_mlp": 0.01229869, + "balance_loss_clip": 1.62930799, + "balance_loss_mlp": 1.1130445, + "epoch": 0.007154667067488351, + "flos": 14980162091400.0, + "grad_norm": 5.401573864164735, + "language_loss": 1.01076055, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.04491377, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 1.16845703, + "step": 119, + "time_per_iteration": 2.896955728530884 + }, + { + "auxiliary_loss_clip": 0.02210404, + "auxiliary_loss_mlp": 0.01194329, + "balance_loss_clip": 1.62709785, + "balance_loss_mlp": 1.08608747, + "epoch": 0.00721479032015632, + "flos": 20198100891240.0, + "grad_norm": 3.5724264410301405, + "language_loss": 0.95161933, + "learning_rate": 3.082437012097686e-06, + "loss": 0.98566669, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 5.83203125, + "router_z_loss_mlp": 1.08203125, + "step": 120, + "time_per_iteration": 2.945824146270752 + }, + { + "auxiliary_loss_clip": 0.02159895, + "auxiliary_loss_mlp": 0.01173796, + "balance_loss_clip": 1.62488365, + "balance_loss_mlp": 1.07609189, + "epoch": 0.00727491357282429, + "flos": 23152242463200.0, + "grad_norm": 1.8861643552617042, + "language_loss": 0.9511283, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.98446524, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 0.97705078, + "step": 121, + "time_per_iteration": 2.936647653579712 + }, + { + "auxiliary_loss_clip": 0.02206326, + "auxiliary_loss_mlp": 0.01218208, + "balance_loss_clip": 1.64002395, + "balance_loss_mlp": 1.11034799, + "epoch": 0.007335036825492259, + "flos": 15525683731800.0, + "grad_norm": 4.948564121832953, + "language_loss": 0.93340486, + "learning_rate": 3.09307943925077e-06, + "loss": 0.96765018, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 1.07861328, + "step": 122, + "time_per_iteration": 2.8920748233795166 + }, + { + "auxiliary_loss_clip": 0.02190107, + "auxiliary_loss_mlp": 0.01203729, + "balance_loss_clip": 1.63952661, + "balance_loss_mlp": 1.08876395, + "epoch": 0.007395160078160229, + "flos": 24248727264240.0, + "grad_norm": 2.37415693700739, + "language_loss": 0.95656645, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.99050486, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 5.50390625, + "router_z_loss_mlp": 1.14941406, + "step": 123, + "time_per_iteration": 2.974151849746704 + }, + { + "auxiliary_loss_clip": 0.02210392, + "auxiliary_loss_mlp": 0.0119092, + "balance_loss_clip": 1.6255641, + "balance_loss_mlp": 1.10160875, + "epoch": 0.007455283330828198, + "flos": 31766181667560.0, + "grad_norm": 3.7474382185316073, + "language_loss": 0.74118376, + "learning_rate": 3.103548811118979e-06, + "loss": 0.77519691, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 5.8515625, + "router_z_loss_mlp": 0.89306641, + "step": 124, + "time_per_iteration": 2.9988999366760254 + }, + { + "auxiliary_loss_clip": 0.02160802, + "auxiliary_loss_mlp": 0.01174728, + "balance_loss_clip": 1.62436032, + "balance_loss_mlp": 1.08021891, + "epoch": 0.007515406583496167, + "flos": 26620857261360.0, + "grad_norm": 2.7617735996482855, + "language_loss": 0.915254, + "learning_rate": 3.108720342404542e-06, + "loss": 0.94860935, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.9453125, + "step": 125, + "time_per_iteration": 3.037155866622925 + }, + { + "auxiliary_loss_clip": 0.02205313, + "auxiliary_loss_mlp": 0.01200844, + "balance_loss_clip": 1.62760568, + "balance_loss_mlp": 1.10838568, + "epoch": 0.007575529836164136, + "flos": 18228334773600.0, + "grad_norm": 6.408535640983813, + "language_loss": 0.86197567, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.89603722, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 5.77734375, + "router_z_loss_mlp": 0.92480469, + "step": 126, + "time_per_iteration": 2.991724967956543 + }, + { + "auxiliary_loss_clip": 0.02189226, + "auxiliary_loss_mlp": 0.0120425, + "balance_loss_clip": 1.61883008, + "balance_loss_mlp": 1.11093307, + "epoch": 0.007635653088832106, + "flos": 21585489958800.0, + "grad_norm": 2.5162383136938664, + "language_loss": 0.71192753, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.74586225, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 5.70703125, + "router_z_loss_mlp": 0.93359375, + "step": 127, + "time_per_iteration": 3.088700294494629 + }, + { + "auxiliary_loss_clip": 0.0216752, + "auxiliary_loss_mlp": 0.01179194, + "balance_loss_clip": 1.62921321, + "balance_loss_mlp": 1.08869088, + "epoch": 0.007695776341500075, + "flos": 25380646182360.0, + "grad_norm": 2.2535545689489904, + "language_loss": 0.89933896, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.93280602, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.90478516, + "step": 128, + "time_per_iteration": 3.1247403621673584 + }, + { + "auxiliary_loss_clip": 0.02166234, + "auxiliary_loss_mlp": 0.01190495, + "balance_loss_clip": 1.62064075, + "balance_loss_mlp": 1.10199392, + "epoch": 0.007755899594168045, + "flos": 22348732955040.0, + "grad_norm": 1.8164133122031065, + "language_loss": 0.86406451, + "learning_rate": 3.129000827968184e-06, + "loss": 0.89763176, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 5.45703125, + "router_z_loss_mlp": 0.88427734, + "step": 129, + "time_per_iteration": 3.086168050765991 + }, + { + "auxiliary_loss_clip": 0.02167163, + "auxiliary_loss_mlp": 0.01172532, + "balance_loss_clip": 1.62558234, + "balance_loss_mlp": 1.07397008, + "epoch": 0.007816022846836013, + "flos": 22643413799040.0, + "grad_norm": 2.502169404355333, + "language_loss": 0.99651957, + "learning_rate": 3.133972684206866e-06, + "loss": 1.02991652, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 5.41015625, + "router_z_loss_mlp": 0.98583984, + "step": 130, + "time_per_iteration": 3.1404731273651123 + }, + { + "auxiliary_loss_clip": 0.02155338, + "auxiliary_loss_mlp": 0.01184717, + "balance_loss_clip": 1.6238656, + "balance_loss_mlp": 1.09402251, + "epoch": 0.007876146099503984, + "flos": 18186971835960.0, + "grad_norm": 2.0686570774576576, + "language_loss": 0.84657174, + "learning_rate": 3.138906441556014e-06, + "loss": 0.87997234, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.90673828, + "step": 131, + "time_per_iteration": 2.9964020252227783 + }, + { + "auxiliary_loss_clip": 0.02171797, + "auxiliary_loss_mlp": 0.01197322, + "balance_loss_clip": 1.61758995, + "balance_loss_mlp": 1.09341955, + "epoch": 0.007936269352171952, + "flos": 27124406838720.0, + "grad_norm": 2.428133057764951, + "language_loss": 0.84343505, + "learning_rate": 3.143802679474861e-06, + "loss": 0.87712622, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 5.53125, + "router_z_loss_mlp": 1.03955078, + "step": 132, + "time_per_iteration": 2.9970405101776123 + }, + { + "auxiliary_loss_clip": 0.02160766, + "auxiliary_loss_mlp": 0.01176693, + "balance_loss_clip": 1.61745727, + "balance_loss_mlp": 1.08394837, + "epoch": 0.007996392604839923, + "flos": 19031072722920.0, + "grad_norm": 2.4436209759432943, + "language_loss": 0.97996569, + "learning_rate": 3.1486619643025565e-06, + "loss": 1.01334035, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 5.4375, + "router_z_loss_mlp": 0.92724609, + "step": 133, + "time_per_iteration": 5.9974894523620605 + }, + { + "auxiliary_loss_clip": 0.02131947, + "auxiliary_loss_mlp": 0.01154461, + "balance_loss_clip": 1.61606193, + "balance_loss_mlp": 1.07230151, + "epoch": 0.008056515857507891, + "flos": 25489709902080.0, + "grad_norm": 1.6875766153686849, + "language_loss": 0.75471008, + "learning_rate": 3.153484849651286e-06, + "loss": 0.78757417, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 5.15234375, + "router_z_loss_mlp": 0.82128906, + "step": 134, + "time_per_iteration": 4.542304277420044 + }, + { + "auxiliary_loss_clip": 0.02154032, + "auxiliary_loss_mlp": 0.01170142, + "balance_loss_clip": 1.61359727, + "balance_loss_mlp": 1.08378637, + "epoch": 0.00811663911017586, + "flos": 20562350501880.0, + "grad_norm": 3.901688015890559, + "language_loss": 0.91881722, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.95205903, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 5.40625, + "router_z_loss_mlp": 0.86376953, + "step": 135, + "time_per_iteration": 2.9057955741882324 + }, + { + "auxiliary_loss_clip": 0.02153363, + "auxiliary_loss_mlp": 0.01207751, + "balance_loss_clip": 1.61958468, + "balance_loss_mlp": 1.10666144, + "epoch": 0.00817676236284383, + "flos": 18803442927240.0, + "grad_norm": 2.347415188999892, + "language_loss": 0.91751552, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.95112664, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 5.34375, + "router_z_loss_mlp": 1.01123047, + "step": 136, + "time_per_iteration": 2.8962440490722656 + }, + { + "auxiliary_loss_clip": 0.02158534, + "auxiliary_loss_mlp": 0.0115815, + "balance_loss_clip": 1.62806702, + "balance_loss_mlp": 1.08214259, + "epoch": 0.008236885615511799, + "flos": 23878264574520.0, + "grad_norm": 2.4030007440941006, + "language_loss": 0.87321836, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.9063853, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 5.3046875, + "router_z_loss_mlp": 0.76025391, + "step": 137, + "time_per_iteration": 2.93778657913208 + }, + { + "auxiliary_loss_clip": 0.02135672, + "auxiliary_loss_mlp": 0.01169445, + "balance_loss_clip": 1.6121918, + "balance_loss_mlp": 1.09062386, + "epoch": 0.00829700886817977, + "flos": 24648898292280.0, + "grad_norm": 2.0004718741020038, + "language_loss": 0.92682797, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.95987916, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 5.23046875, + "router_z_loss_mlp": 0.78857422, + "step": 138, + "time_per_iteration": 2.9759292602539062 + }, + { + "auxiliary_loss_clip": 0.02135105, + "auxiliary_loss_mlp": 0.01176409, + "balance_loss_clip": 1.61519694, + "balance_loss_mlp": 1.08666873, + "epoch": 0.008357132120847738, + "flos": 25267115543040.0, + "grad_norm": 2.743356606117833, + "language_loss": 0.94829977, + "learning_rate": 3.177071816289865e-06, + "loss": 0.98141491, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.89648438, + "step": 139, + "time_per_iteration": 2.947166919708252 + }, + { + "auxiliary_loss_clip": 0.02160627, + "auxiliary_loss_mlp": 0.01183945, + "balance_loss_clip": 1.63016152, + "balance_loss_mlp": 1.09234476, + "epoch": 0.008417255373515706, + "flos": 27350696558520.0, + "grad_norm": 3.0662361947009806, + "language_loss": 0.89116085, + "learning_rate": 3.181687263893095e-06, + "loss": 0.92460662, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.91601562, + "step": 140, + "time_per_iteration": 3.086230754852295 + }, + { + "auxiliary_loss_clip": 0.02129261, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_clip": 1.61270189, + "balance_loss_mlp": 1.07664323, + "epoch": 0.008477378626183677, + "flos": 17643521221920.0, + "grad_norm": 2.734356353841718, + "language_loss": 0.87777543, + "learning_rate": 3.186269861057098e-06, + "loss": 0.91063321, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 5.1640625, + "router_z_loss_mlp": 0.79882812, + "step": 141, + "time_per_iteration": 3.0262677669525146 + }, + { + "auxiliary_loss_clip": 0.02146236, + "auxiliary_loss_mlp": 0.01149393, + "balance_loss_clip": 1.61317253, + "balance_loss_mlp": 1.06871223, + "epoch": 0.008537501878851645, + "flos": 13885342233120.0, + "grad_norm": 3.361743975592642, + "language_loss": 0.84780228, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.88075858, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.80712891, + "step": 142, + "time_per_iteration": 2.894798755645752 + }, + { + "auxiliary_loss_clip": 0.01897224, + "auxiliary_loss_mlp": 0.01053522, + "balance_loss_clip": 1.58457446, + "balance_loss_mlp": 1.02395821, + "epoch": 0.008597625131519616, + "flos": 71265809916000.0, + "grad_norm": 1.0639101965199569, + "language_loss": 0.66802061, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69752806, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.29492188, + "step": 143, + "time_per_iteration": 3.511657476425171 + }, + { + "auxiliary_loss_clip": 0.02125463, + "auxiliary_loss_mlp": 0.01163576, + "balance_loss_clip": 1.61431146, + "balance_loss_mlp": 1.0785557, + "epoch": 0.008657748384187584, + "flos": 17607315546000.0, + "grad_norm": 5.51662990136268, + "language_loss": 0.85958815, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.89247859, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.85009766, + "step": 144, + "time_per_iteration": 2.9729671478271484 + }, + { + "auxiliary_loss_clip": 0.02110011, + "auxiliary_loss_mlp": 0.01140541, + "balance_loss_clip": 1.60445428, + "balance_loss_mlp": 1.05671275, + "epoch": 0.008717871636855555, + "flos": 19719589690800.0, + "grad_norm": 2.3888136294586433, + "language_loss": 0.90994596, + "learning_rate": 3.204280886775619e-06, + "loss": 0.94245148, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.83886719, + "step": 145, + "time_per_iteration": 3.032445192337036 + }, + { + "auxiliary_loss_clip": 0.02137524, + "auxiliary_loss_mlp": 0.01167893, + "balance_loss_clip": 1.61110497, + "balance_loss_mlp": 1.06618381, + "epoch": 0.008777994889523523, + "flos": 24722812153440.0, + "grad_norm": 2.0204071782673036, + "language_loss": 0.89241266, + "learning_rate": 3.208706005112005e-06, + "loss": 0.9254669, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 1.01660156, + "step": 146, + "time_per_iteration": 3.019010543823242 + }, + { + "auxiliary_loss_clip": 0.01881201, + "auxiliary_loss_mlp": 0.01026257, + "balance_loss_clip": 1.56621981, + "balance_loss_mlp": 1.00069809, + "epoch": 0.008838118142191492, + "flos": 70146828083520.0, + "grad_norm": 0.8547950703750005, + "language_loss": 0.60009474, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62916929, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.25585938, + "step": 147, + "time_per_iteration": 3.4616634845733643 + }, + { + "auxiliary_loss_clip": 0.0209322, + "auxiliary_loss_mlp": 0.0114476, + "balance_loss_clip": 1.59765017, + "balance_loss_mlp": 1.06565297, + "epoch": 0.008898241394859462, + "flos": 20049257959920.0, + "grad_norm": 8.45436957660344, + "language_loss": 0.8564806, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.88886034, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.79101562, + "step": 148, + "time_per_iteration": 2.964738607406616 + }, + { + "auxiliary_loss_clip": 0.02085862, + "auxiliary_loss_mlp": 0.01171168, + "balance_loss_clip": 1.60580504, + "balance_loss_mlp": 1.07265306, + "epoch": 0.008958364647527431, + "flos": 10747207871280.0, + "grad_norm": 5.832297874251113, + "language_loss": 0.90924293, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.94181323, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 4.8046875, + "router_z_loss_mlp": 0.98486328, + "step": 149, + "time_per_iteration": 3.0088367462158203 + }, + { + "auxiliary_loss_clip": 0.02094994, + "auxiliary_loss_mlp": 0.01145964, + "balance_loss_clip": 1.59160829, + "balance_loss_mlp": 1.06308925, + "epoch": 0.009018487900195401, + "flos": 29132815133880.0, + "grad_norm": 3.959114502185728, + "language_loss": 0.9504289, + "learning_rate": 3.226108474846181e-06, + "loss": 0.98283851, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 5.03125, + "router_z_loss_mlp": 0.82861328, + "step": 150, + "time_per_iteration": 3.061580181121826 + }, + { + "auxiliary_loss_clip": 0.02091877, + "auxiliary_loss_mlp": 0.01138456, + "balance_loss_clip": 1.59152639, + "balance_loss_mlp": 1.06154263, + "epoch": 0.00907861115286337, + "flos": 32970187070640.0, + "grad_norm": 2.376225956390476, + "language_loss": 0.76387215, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.79617548, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.76953125, + "step": 151, + "time_per_iteration": 3.117372989654541 + }, + { + "auxiliary_loss_clip": 0.02093505, + "auxiliary_loss_mlp": 0.0116755, + "balance_loss_clip": 1.58893216, + "balance_loss_mlp": 1.08481836, + "epoch": 0.009138734405531338, + "flos": 21767492939040.0, + "grad_norm": 2.519838034684616, + "language_loss": 0.913068, + "learning_rate": 3.234636443010188e-06, + "loss": 0.94567859, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.82714844, + "step": 152, + "time_per_iteration": 3.0236384868621826 + }, + { + "auxiliary_loss_clip": 0.02087448, + "auxiliary_loss_mlp": 0.01130886, + "balance_loss_clip": 1.59252822, + "balance_loss_mlp": 1.05711961, + "epoch": 0.009198857658199309, + "flos": 20846432563920.0, + "grad_norm": 3.656330537475008, + "language_loss": 0.86697316, + "learning_rate": 3.238858439669943e-06, + "loss": 0.89915651, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.73828125, + "step": 153, + "time_per_iteration": 3.1497182846069336 + }, + { + "auxiliary_loss_clip": 0.02074846, + "auxiliary_loss_mlp": 0.01144574, + "balance_loss_clip": 1.58853352, + "balance_loss_mlp": 1.07018733, + "epoch": 0.009258980910867277, + "flos": 24832850473800.0, + "grad_norm": 1.8775414061798659, + "language_loss": 0.91662788, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.94882202, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 4.86328125, + "router_z_loss_mlp": 0.74414062, + "step": 154, + "time_per_iteration": 2.9667487144470215 + }, + { + "auxiliary_loss_clip": 0.02069843, + "auxiliary_loss_mlp": 0.01162874, + "balance_loss_clip": 1.58863711, + "balance_loss_mlp": 1.07976151, + "epoch": 0.009319104163535248, + "flos": 28773763393320.0, + "grad_norm": 2.156240929989502, + "language_loss": 0.90857279, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.94089997, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 4.81640625, + "router_z_loss_mlp": 0.83154297, + "step": 155, + "time_per_iteration": 2.942723512649536 + }, + { + "auxiliary_loss_clip": 0.02089064, + "auxiliary_loss_mlp": 0.0115023, + "balance_loss_clip": 1.5850631, + "balance_loss_mlp": 1.07298279, + "epoch": 0.009379227416203216, + "flos": 16586450157240.0, + "grad_norm": 5.1842870345662435, + "language_loss": 0.893242, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.92563498, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 5.03515625, + "router_z_loss_mlp": 0.77197266, + "step": 156, + "time_per_iteration": 2.866316795349121 + }, + { + "auxiliary_loss_clip": 0.02074919, + "auxiliary_loss_mlp": 0.0117685, + "balance_loss_clip": 1.58506763, + "balance_loss_mlp": 1.07628477, + "epoch": 0.009439350668871187, + "flos": 18334149824520.0, + "grad_norm": 2.491471702679428, + "language_loss": 1.02513504, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.05765283, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 1.00634766, + "step": 157, + "time_per_iteration": 2.8778202533721924 + }, + { + "auxiliary_loss_clip": 0.02035231, + "auxiliary_loss_mlp": 0.01130729, + "balance_loss_clip": 1.56619763, + "balance_loss_mlp": 1.06115842, + "epoch": 0.009499473921539155, + "flos": 24354907790400.0, + "grad_norm": 2.958113692233074, + "language_loss": 0.90136433, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.93302393, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.69628906, + "step": 158, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.02055015, + "auxiliary_loss_mlp": 0.01146873, + "balance_loss_clip": 1.57153893, + "balance_loss_mlp": 1.06905341, + "epoch": 0.009559597174207124, + "flos": 16403716226520.0, + "grad_norm": 2.565646535460671, + "language_loss": 0.8886779, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.92069668, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.77832031, + "step": 159, + "time_per_iteration": 2.881338119506836 + }, + { + "auxiliary_loss_clip": 0.02033801, + "auxiliary_loss_mlp": 0.01145447, + "balance_loss_clip": 1.56535399, + "balance_loss_mlp": 1.0666256, + "epoch": 0.009619720426875094, + "flos": 22862069147160.0, + "grad_norm": 1.929815766029379, + "language_loss": 0.87988853, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.91168106, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.78808594, + "step": 160, + "time_per_iteration": 3.041569471359253 + }, + { + "auxiliary_loss_clip": 0.02049328, + "auxiliary_loss_mlp": 0.01153916, + "balance_loss_clip": 1.56791496, + "balance_loss_mlp": 1.06655931, + "epoch": 0.009679843679543063, + "flos": 19139649142320.0, + "grad_norm": 2.6279087563176047, + "language_loss": 0.9414714, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.97350383, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.87402344, + "step": 161, + "time_per_iteration": 2.9295337200164795 + }, + { + "auxiliary_loss_clip": 0.02059525, + "auxiliary_loss_mlp": 0.01142621, + "balance_loss_clip": 1.56399465, + "balance_loss_mlp": 1.07223988, + "epoch": 0.009739966932211033, + "flos": 20307854953080.0, + "grad_norm": 2.1916414302985414, + "language_loss": 0.94324553, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.97526705, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 4.95703125, + "router_z_loss_mlp": 0.703125, + "step": 162, + "time_per_iteration": 2.9542980194091797 + }, + { + "auxiliary_loss_clip": 0.01841141, + "auxiliary_loss_mlp": 0.01058199, + "balance_loss_clip": 1.5194062, + "balance_loss_mlp": 1.03092372, + "epoch": 0.009800090184879002, + "flos": 67049041450320.0, + "grad_norm": 1.2026354518475204, + "language_loss": 0.72373962, + "learning_rate": 3.279622189013474e-06, + "loss": 0.75273299, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.2734375, + "step": 163, + "time_per_iteration": 3.3203065395355225 + }, + { + "auxiliary_loss_clip": 0.02018313, + "auxiliary_loss_mlp": 0.01129636, + "balance_loss_clip": 1.5606612, + "balance_loss_mlp": 1.06321311, + "epoch": 0.00986021343754697, + "flos": 17169070857480.0, + "grad_norm": 2.339684975525139, + "language_loss": 0.87523746, + "learning_rate": 3.283560135133457e-06, + "loss": 0.90671694, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.6640625, + "step": 164, + "time_per_iteration": 3.2252819538116455 + }, + { + "auxiliary_loss_clip": 0.02021094, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_clip": 1.55238223, + "balance_loss_mlp": 1.05191779, + "epoch": 0.00992033669021494, + "flos": 17754006234240.0, + "grad_norm": 4.237796580255477, + "language_loss": 0.91562867, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.94709122, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.73193359, + "step": 165, + "time_per_iteration": 3.028468132019043 + }, + { + "auxiliary_loss_clip": 0.02027621, + "auxiliary_loss_mlp": 0.01124747, + "balance_loss_clip": 1.55797672, + "balance_loss_mlp": 1.05799031, + "epoch": 0.00998045994288291, + "flos": 25302102968160.0, + "grad_norm": 2.159665204299299, + "language_loss": 0.8299644, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.86148804, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.66796875, + "step": 166, + "time_per_iteration": 3.1406500339508057 + }, + { + "auxiliary_loss_clip": 0.02021144, + "auxiliary_loss_mlp": 0.01118479, + "balance_loss_clip": 1.55587602, + "balance_loss_mlp": 1.05296147, + "epoch": 0.01004058319555088, + "flos": 32304353194800.0, + "grad_norm": 2.6187613739539537, + "language_loss": 0.92924881, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.96064508, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 4.6484375, + "router_z_loss_mlp": 0.65478516, + "step": 167, + "time_per_iteration": 2.900569438934326 + }, + { + "auxiliary_loss_clip": 0.02011916, + "auxiliary_loss_mlp": 0.01121117, + "balance_loss_clip": 1.55258965, + "balance_loss_mlp": 1.05102158, + "epoch": 0.010100706448218848, + "flos": 11322640891800.0, + "grad_norm": 2.697844655764055, + "language_loss": 0.93759394, + "learning_rate": 3.299075396334735e-06, + "loss": 0.96892416, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.70166016, + "step": 168, + "time_per_iteration": 2.8483285903930664 + }, + { + "auxiliary_loss_clip": 0.020078, + "auxiliary_loss_mlp": 0.01136441, + "balance_loss_clip": 1.54876781, + "balance_loss_mlp": 1.05027652, + "epoch": 0.010160829700886819, + "flos": 29725994007720.0, + "grad_norm": 2.746371364518744, + "language_loss": 0.88445222, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.91589463, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.86083984, + "step": 169, + "time_per_iteration": 3.1160659790039062 + }, + { + "auxiliary_loss_clip": 0.02011888, + "auxiliary_loss_mlp": 0.01125676, + "balance_loss_clip": 1.55111182, + "balance_loss_mlp": 1.05357862, + "epoch": 0.010220952953554787, + "flos": 20417202931320.0, + "grad_norm": 1.8458764318974672, + "language_loss": 0.86025095, + "learning_rate": 3.306695037731344e-06, + "loss": 0.8916266, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.72070312, + "step": 170, + "time_per_iteration": 3.0028200149536133 + }, + { + "auxiliary_loss_clip": 0.02039552, + "auxiliary_loss_mlp": 0.01162517, + "balance_loss_clip": 1.56091738, + "balance_loss_mlp": 1.08412492, + "epoch": 0.010281076206222756, + "flos": 31291406436240.0, + "grad_norm": 2.05323497452994, + "language_loss": 0.91998076, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.95200145, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 4.78515625, + "router_z_loss_mlp": 0.78369141, + "step": 171, + "time_per_iteration": 4.358152389526367 + }, + { + "auxiliary_loss_clip": 0.02011495, + "auxiliary_loss_mlp": 0.01122359, + "balance_loss_clip": 1.55000877, + "balance_loss_mlp": 1.04782987, + "epoch": 0.010341199458890726, + "flos": 21987488363040.0, + "grad_norm": 2.5634230214925555, + "language_loss": 0.905195, + "learning_rate": 3.314225558471224e-06, + "loss": 0.93653357, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.74511719, + "step": 172, + "time_per_iteration": 6.20492148399353 + }, + { + "auxiliary_loss_clip": 0.01995448, + "auxiliary_loss_mlp": 0.01116313, + "balance_loss_clip": 1.55017805, + "balance_loss_mlp": 1.04745781, + "epoch": 0.010401322711558695, + "flos": 30816712421640.0, + "grad_norm": 1.9603081509927667, + "language_loss": 0.82323033, + "learning_rate": 3.317958045350308e-06, + "loss": 0.85434794, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 4.453125, + "router_z_loss_mlp": 0.68896484, + "step": 173, + "time_per_iteration": 3.0078775882720947 + }, + { + "auxiliary_loss_clip": 0.02016913, + "auxiliary_loss_mlp": 0.01126548, + "balance_loss_clip": 1.55181301, + "balance_loss_mlp": 1.05015826, + "epoch": 0.010461445964226665, + "flos": 24720213218400.0, + "grad_norm": 1.9237357486503404, + "language_loss": 0.83904153, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.87047613, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 4.65234375, + "router_z_loss_mlp": 0.76416016, + "step": 174, + "time_per_iteration": 2.9721245765686035 + }, + { + "auxiliary_loss_clip": 0.02002784, + "auxiliary_loss_mlp": 0.0113677, + "balance_loss_clip": 1.54544246, + "balance_loss_mlp": 1.04431176, + "epoch": 0.010521569216894634, + "flos": 27716286245040.0, + "grad_norm": 2.254873663288948, + "language_loss": 0.74364752, + "learning_rate": 3.325358726641591e-06, + "loss": 0.77504307, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 4.5703125, + "router_z_loss_mlp": 0.92480469, + "step": 175, + "time_per_iteration": 2.9938299655914307 + }, + { + "auxiliary_loss_clip": 0.02005292, + "auxiliary_loss_mlp": 0.0116475, + "balance_loss_clip": 1.55016303, + "balance_loss_mlp": 1.06003666, + "epoch": 0.010581692469562603, + "flos": 12462153573240.0, + "grad_norm": 2.6347747024663333, + "language_loss": 1.00211811, + "learning_rate": 3.329027409977902e-06, + "loss": 1.03381848, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 4.55078125, + "router_z_loss_mlp": 1.04833984, + "step": 176, + "time_per_iteration": 2.9813284873962402 + }, + { + "auxiliary_loss_clip": 0.01994286, + "auxiliary_loss_mlp": 0.01139956, + "balance_loss_clip": 1.53823495, + "balance_loss_mlp": 1.05913162, + "epoch": 0.010641815722230573, + "flos": 19432380785040.0, + "grad_norm": 2.764016149240593, + "language_loss": 0.78902107, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.82036346, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.80810547, + "step": 177, + "time_per_iteration": 2.91410493850708 + }, + { + "auxiliary_loss_clip": 0.02014325, + "auxiliary_loss_mlp": 0.01121563, + "balance_loss_clip": 1.55482674, + "balance_loss_mlp": 1.04989457, + "epoch": 0.010701938974898541, + "flos": 18337154843160.0, + "grad_norm": 2.930961711601036, + "language_loss": 0.80261332, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.83397222, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.71679688, + "step": 178, + "time_per_iteration": 2.946094036102295 + }, + { + "auxiliary_loss_clip": 0.01995983, + "auxiliary_loss_mlp": 0.01118089, + "balance_loss_clip": 1.53996778, + "balance_loss_mlp": 1.04832768, + "epoch": 0.010762062227566512, + "flos": 19208121483240.0, + "grad_norm": 2.391102197827376, + "language_loss": 0.85858744, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.88972819, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 4.5546875, + "router_z_loss_mlp": 0.69775391, + "step": 179, + "time_per_iteration": 2.911485195159912 + }, + { + "auxiliary_loss_clip": 0.02009402, + "auxiliary_loss_mlp": 0.011249, + "balance_loss_clip": 1.54177666, + "balance_loss_mlp": 1.05227733, + "epoch": 0.01082218548023448, + "flos": 31430990661480.0, + "grad_norm": 2.3232628240927977, + "language_loss": 0.84867036, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.88001341, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.72558594, + "step": 180, + "time_per_iteration": 3.0077545642852783 + }, + { + "auxiliary_loss_clip": 0.02002143, + "auxiliary_loss_mlp": 0.01128817, + "balance_loss_clip": 1.54515123, + "balance_loss_mlp": 1.06406212, + "epoch": 0.01088230873290245, + "flos": 25051952513880.0, + "grad_norm": 2.563842551370628, + "language_loss": 0.79473805, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.82604766, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 4.56640625, + "router_z_loss_mlp": 0.64746094, + "step": 181, + "time_per_iteration": 2.9158613681793213 + }, + { + "auxiliary_loss_clip": 0.020095, + "auxiliary_loss_mlp": 0.01151569, + "balance_loss_clip": 1.54029608, + "balance_loss_mlp": 1.07847023, + "epoch": 0.01094243198557042, + "flos": 22898599689960.0, + "grad_norm": 4.252731219253597, + "language_loss": 0.80906487, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.84067559, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 4.6875, + "router_z_loss_mlp": 0.73095703, + "step": 182, + "time_per_iteration": 2.9411656856536865 + }, + { + "auxiliary_loss_clip": 0.02010304, + "auxiliary_loss_mlp": 0.01140819, + "balance_loss_clip": 1.54140222, + "balance_loss_mlp": 1.06972277, + "epoch": 0.011002555238238388, + "flos": 17169476941080.0, + "grad_norm": 2.339100387776502, + "language_loss": 0.90137619, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.93288743, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.7109375, + "step": 183, + "time_per_iteration": 3.059999465942383 + }, + { + "auxiliary_loss_clip": 0.02004309, + "auxiliary_loss_mlp": 0.01123078, + "balance_loss_clip": 1.54151702, + "balance_loss_mlp": 1.05427074, + "epoch": 0.011062678490906358, + "flos": 22315329255960.0, + "grad_norm": 3.0011049878726945, + "language_loss": 0.88349116, + "learning_rate": 3.357647774369736e-06, + "loss": 0.914765, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 4.62890625, + "router_z_loss_mlp": 0.68701172, + "step": 184, + "time_per_iteration": 2.8539113998413086 + }, + { + "auxiliary_loss_clip": 0.01991982, + "auxiliary_loss_mlp": 0.01136583, + "balance_loss_clip": 1.53870821, + "balance_loss_mlp": 1.06534398, + "epoch": 0.011122801743574327, + "flos": 24393631184640.0, + "grad_norm": 1.8661750936240407, + "language_loss": 0.85767919, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.88896483, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.71289062, + "step": 185, + "time_per_iteration": 2.9075307846069336 + }, + { + "auxiliary_loss_clip": 0.01997671, + "auxiliary_loss_mlp": 0.01129949, + "balance_loss_clip": 1.52966475, + "balance_loss_mlp": 1.05890083, + "epoch": 0.011182924996242297, + "flos": 18154826996040.0, + "grad_norm": 4.100006070570332, + "language_loss": 0.73922527, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.77050149, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 4.68359375, + "router_z_loss_mlp": 0.71044922, + "step": 186, + "time_per_iteration": 2.8630313873291016 + }, + { + "auxiliary_loss_clip": 0.01986124, + "auxiliary_loss_mlp": 0.01110439, + "balance_loss_clip": 1.52944279, + "balance_loss_mlp": 1.04234648, + "epoch": 0.011243048248910266, + "flos": 15491021173560.0, + "grad_norm": 2.697728255099167, + "language_loss": 1.03642082, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.06738639, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 4.5703125, + "router_z_loss_mlp": 0.68164062, + "step": 187, + "time_per_iteration": 2.8281943798065186 + }, + { + "auxiliary_loss_clip": 0.01980555, + "auxiliary_loss_mlp": 0.0111975, + "balance_loss_clip": 1.53453016, + "balance_loss_mlp": 1.04784346, + "epoch": 0.011303171501578235, + "flos": 40924343044800.0, + "grad_norm": 1.822829604472294, + "language_loss": 0.77117634, + "learning_rate": 3.371494591560139e-06, + "loss": 0.80217934, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 4.4609375, + "router_z_loss_mlp": 0.71875, + "step": 188, + "time_per_iteration": 3.1027402877807617 + }, + { + "auxiliary_loss_clip": 0.01793974, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.49310708, + "balance_loss_mlp": 1.00387466, + "epoch": 0.011363294754246205, + "flos": 66317171735160.0, + "grad_norm": 0.7418645120552031, + "language_loss": 0.56097716, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58925509, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29882812, + "step": 189, + "time_per_iteration": 3.398355007171631 + }, + { + "auxiliary_loss_clip": 0.01978748, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_clip": 1.53181922, + "balance_loss_mlp": 1.06912684, + "epoch": 0.011423418006914174, + "flos": 24905749125960.0, + "grad_norm": 5.886032389777664, + "language_loss": 0.97435546, + "learning_rate": 3.3783079057586833e-06, + "loss": 1.00555325, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 4.46875, + "router_z_loss_mlp": 0.71972656, + "step": 190, + "time_per_iteration": 2.9293582439422607 + }, + { + "auxiliary_loss_clip": 0.0198412, + "auxiliary_loss_mlp": 0.01127446, + "balance_loss_clip": 1.53115165, + "balance_loss_mlp": 1.05677843, + "epoch": 0.011483541259582144, + "flos": 19796630395680.0, + "grad_norm": 3.2082353601857485, + "language_loss": 0.88208103, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.91319668, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 4.53515625, + "router_z_loss_mlp": 0.70605469, + "step": 191, + "time_per_iteration": 2.8770864009857178 + }, + { + "auxiliary_loss_clip": 0.01993188, + "auxiliary_loss_mlp": 0.01160331, + "balance_loss_clip": 1.53303814, + "balance_loss_mlp": 1.0688256, + "epoch": 0.011543664512250112, + "flos": 26182531356120.0, + "grad_norm": 3.740299818589274, + "language_loss": 0.92621291, + "learning_rate": 3.385049875042367e-06, + "loss": 0.95774812, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.91552734, + "step": 192, + "time_per_iteration": 2.898691415786743 + }, + { + "auxiliary_loss_clip": 0.01985872, + "auxiliary_loss_mlp": 0.01134662, + "balance_loss_clip": 1.52871585, + "balance_loss_mlp": 1.06718922, + "epoch": 0.011603787764918083, + "flos": 23774276899800.0, + "grad_norm": 6.678860244317198, + "language_loss": 0.88923717, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.92044246, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 4.5703125, + "router_z_loss_mlp": 0.67480469, + "step": 193, + "time_per_iteration": 2.9711101055145264 + }, + { + "auxiliary_loss_clip": 0.01976534, + "auxiliary_loss_mlp": 0.01143367, + "balance_loss_clip": 1.52489853, + "balance_loss_mlp": 1.06182778, + "epoch": 0.011663911017586051, + "flos": 25959815172000.0, + "grad_norm": 3.162912063026831, + "language_loss": 0.93966585, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.97086477, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.81591797, + "step": 194, + "time_per_iteration": 2.9657206535339355 + }, + { + "auxiliary_loss_clip": 0.0198966, + "auxiliary_loss_mlp": 0.01131289, + "balance_loss_clip": 1.52932143, + "balance_loss_mlp": 1.05351663, + "epoch": 0.01172403427025402, + "flos": 17899722321840.0, + "grad_norm": 2.92193140008891, + "language_loss": 0.92721415, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.95842361, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.77832031, + "step": 195, + "time_per_iteration": 2.874267578125 + }, + { + "auxiliary_loss_clip": 0.01980774, + "auxiliary_loss_mlp": 0.01149591, + "balance_loss_clip": 1.53046536, + "balance_loss_mlp": 1.075014, + "epoch": 0.01178415752292199, + "flos": 17899031979720.0, + "grad_norm": 3.038366120980423, + "language_loss": 0.87578154, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.9070853, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.74560547, + "step": 196, + "time_per_iteration": 2.8640503883361816 + }, + { + "auxiliary_loss_clip": 0.0197013, + "auxiliary_loss_mlp": 0.01132215, + "balance_loss_clip": 1.52450097, + "balance_loss_mlp": 1.04724276, + "epoch": 0.011844280775589959, + "flos": 22898762123400.0, + "grad_norm": 2.7576114406559338, + "language_loss": 0.95690131, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.98792475, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 4.4609375, + "router_z_loss_mlp": 0.84960938, + "step": 197, + "time_per_iteration": 2.9459903240203857 + }, + { + "auxiliary_loss_clip": 0.01963943, + "auxiliary_loss_mlp": 0.01134882, + "balance_loss_clip": 1.51519668, + "balance_loss_mlp": 1.0625459, + "epoch": 0.01190440402825793, + "flos": 26986122081000.0, + "grad_norm": 2.4039392330122635, + "language_loss": 0.801494, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.83248216, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.72314453, + "step": 198, + "time_per_iteration": 3.0039563179016113 + }, + { + "auxiliary_loss_clip": 0.01957451, + "auxiliary_loss_mlp": 0.01139376, + "balance_loss_clip": 1.52670646, + "balance_loss_mlp": 1.06298673, + "epoch": 0.011964527280925898, + "flos": 20526429084480.0, + "grad_norm": 2.040486156470013, + "language_loss": 0.89505661, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.92602491, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 0.76367188, + "step": 199, + "time_per_iteration": 2.895477533340454 + }, + { + "auxiliary_loss_clip": 0.01971524, + "auxiliary_loss_mlp": 0.0113046, + "balance_loss_clip": 1.52247715, + "balance_loss_mlp": 1.05974507, + "epoch": 0.012024650533593867, + "flos": 27751801578840.0, + "grad_norm": 1.8628364496034786, + "language_loss": 0.8292973, + "learning_rate": 3.411333205349222e-06, + "loss": 0.86031717, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.70654297, + "step": 200, + "time_per_iteration": 2.946681499481201 + }, + { + "auxiliary_loss_clip": 0.01966291, + "auxiliary_loss_mlp": 0.01115146, + "balance_loss_clip": 1.51408005, + "balance_loss_mlp": 1.05315745, + "epoch": 0.012084773786261837, + "flos": 10455613262640.0, + "grad_norm": 2.349755598262711, + "language_loss": 0.8953771, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.92619145, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.62011719, + "step": 201, + "time_per_iteration": 2.86869478225708 + }, + { + "auxiliary_loss_clip": 0.01961914, + "auxiliary_loss_mlp": 0.01111719, + "balance_loss_clip": 1.51999366, + "balance_loss_mlp": 1.05156636, + "epoch": 0.012144897038929806, + "flos": 23110107966720.0, + "grad_norm": 1.8325207602551634, + "language_loss": 0.85902059, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.88975686, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.60131836, + "step": 202, + "time_per_iteration": 2.859752655029297 + }, + { + "auxiliary_loss_clip": 0.01959275, + "auxiliary_loss_mlp": 0.01125872, + "balance_loss_clip": 1.51899076, + "balance_loss_mlp": 1.05208111, + "epoch": 0.012205020291597776, + "flos": 21038222158920.0, + "grad_norm": 1.962986220354828, + "language_loss": 0.92159379, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.95244527, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 4.40234375, + "router_z_loss_mlp": 0.73803711, + "step": 203, + "time_per_iteration": 2.837930202484131 + }, + { + "auxiliary_loss_clip": 0.01762242, + "auxiliary_loss_mlp": 0.01049855, + "balance_loss_clip": 1.47724915, + "balance_loss_mlp": 1.02334332, + "epoch": 0.012265143544265745, + "flos": 68461550111520.0, + "grad_norm": 1.0490487989168369, + "language_loss": 0.61211109, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.64023209, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.265625, + "step": 204, + "time_per_iteration": 3.259650945663452 + }, + { + "auxiliary_loss_clip": 0.01953477, + "auxiliary_loss_mlp": 0.01122302, + "balance_loss_clip": 1.50488853, + "balance_loss_mlp": 1.05425751, + "epoch": 0.012325266796933715, + "flos": 17023679636760.0, + "grad_norm": 2.511729079587798, + "language_loss": 0.93292177, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.96367955, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 4.484375, + "router_z_loss_mlp": 0.6796875, + "step": 205, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0197925, + "auxiliary_loss_mlp": 0.01122507, + "balance_loss_clip": 1.52260983, + "balance_loss_mlp": 1.05574989, + "epoch": 0.012385390049601683, + "flos": 20194446138840.0, + "grad_norm": 2.139392259368788, + "language_loss": 0.9138242, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.94484174, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.66748047, + "step": 206, + "time_per_iteration": 3.0152950286865234 + }, + { + "auxiliary_loss_clip": 0.01956618, + "auxiliary_loss_mlp": 0.01120994, + "balance_loss_clip": 1.50923896, + "balance_loss_mlp": 1.05352199, + "epoch": 0.012445513302269652, + "flos": 16257472230240.0, + "grad_norm": 3.1040141108205925, + "language_loss": 0.97187042, + "learning_rate": 3.43348263905683e-06, + "loss": 1.00264657, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 4.47265625, + "router_z_loss_mlp": 0.67480469, + "step": 207, + "time_per_iteration": 3.002028465270996 + }, + { + "auxiliary_loss_clip": 0.01946943, + "auxiliary_loss_mlp": 0.01115273, + "balance_loss_clip": 1.51250494, + "balance_loss_mlp": 1.05333245, + "epoch": 0.012505636554937622, + "flos": 23774723591760.0, + "grad_norm": 1.940203221199135, + "language_loss": 0.77609336, + "learning_rate": 3.436585547151547e-06, + "loss": 0.80671555, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.61914062, + "step": 208, + "time_per_iteration": 2.9896368980407715 + }, + { + "auxiliary_loss_clip": 0.01924373, + "auxiliary_loss_mlp": 0.01122018, + "balance_loss_clip": 1.49970734, + "balance_loss_mlp": 1.0588851, + "epoch": 0.012565759807605591, + "flos": 30597691598280.0, + "grad_norm": 3.008363896188805, + "language_loss": 0.99905485, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.02951884, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.63134766, + "step": 209, + "time_per_iteration": 2.9683644771575928 + }, + { + "auxiliary_loss_clip": 0.0193716, + "auxiliary_loss_mlp": 0.01119568, + "balance_loss_clip": 1.50571382, + "balance_loss_mlp": 1.05431271, + "epoch": 0.012625883060273561, + "flos": 40120792928280.0, + "grad_norm": 3.1343916409771118, + "language_loss": 0.88292331, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.91349059, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 0.65258789, + "step": 210, + "time_per_iteration": 3.012847423553467 + }, + { + "auxiliary_loss_clip": 0.01936267, + "auxiliary_loss_mlp": 0.01127695, + "balance_loss_clip": 1.50197601, + "balance_loss_mlp": 1.06599212, + "epoch": 0.01268600631294153, + "flos": 27094901542200.0, + "grad_norm": 2.217289579345974, + "language_loss": 0.98101377, + "learning_rate": 3.445805545042314e-06, + "loss": 1.01165342, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.6171875, + "step": 211, + "time_per_iteration": 5.984842300415039 + }, + { + "auxiliary_loss_clip": 0.0193661, + "auxiliary_loss_mlp": 0.01118655, + "balance_loss_clip": 1.50214219, + "balance_loss_mlp": 1.05504465, + "epoch": 0.012746129565609499, + "flos": 16987189702320.0, + "grad_norm": 2.9339729012608964, + "language_loss": 0.97502911, + "learning_rate": 3.448849769075239e-06, + "loss": 1.00558174, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.63623047, + "step": 212, + "time_per_iteration": 2.8800768852233887 + }, + { + "auxiliary_loss_clip": 0.01927423, + "auxiliary_loss_mlp": 0.01120262, + "balance_loss_clip": 1.50510216, + "balance_loss_mlp": 1.06153965, + "epoch": 0.012806252818277469, + "flos": 46541397055320.0, + "grad_norm": 2.0078190414330614, + "language_loss": 0.78622252, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.81669933, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.5871582, + "step": 213, + "time_per_iteration": 3.0994296073913574 + }, + { + "auxiliary_loss_clip": 0.01943023, + "auxiliary_loss_mlp": 0.0110656, + "balance_loss_clip": 1.50451076, + "balance_loss_mlp": 1.05003142, + "epoch": 0.012866376070945438, + "flos": 14392221696000.0, + "grad_norm": 2.693800811343849, + "language_loss": 0.89349037, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.92398626, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.56567383, + "step": 214, + "time_per_iteration": 2.8470609188079834 + }, + { + "auxiliary_loss_clip": 0.01930206, + "auxiliary_loss_mlp": 0.01120743, + "balance_loss_clip": 1.51072931, + "balance_loss_mlp": 1.06063795, + "epoch": 0.012926499323613408, + "flos": 26146447505280.0, + "grad_norm": 2.652585994764431, + "language_loss": 0.80937994, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.83988941, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 4.19140625, + "router_z_loss_mlp": 0.60131836, + "step": 215, + "time_per_iteration": 2.8791022300720215 + }, + { + "auxiliary_loss_clip": 0.01937369, + "auxiliary_loss_mlp": 0.01122628, + "balance_loss_clip": 1.50553811, + "balance_loss_mlp": 1.05141211, + "epoch": 0.012986622576281377, + "flos": 30123281842200.0, + "grad_norm": 2.3240333539302918, + "language_loss": 0.93132031, + "learning_rate": 3.460884739729461e-06, + "loss": 0.96192026, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 4.3203125, + "router_z_loss_mlp": 0.71191406, + "step": 216, + "time_per_iteration": 2.929049015045166 + }, + { + "auxiliary_loss_clip": 0.01947752, + "auxiliary_loss_mlp": 0.0111687, + "balance_loss_clip": 1.50903869, + "balance_loss_mlp": 1.06065059, + "epoch": 0.013046745828949347, + "flos": 13957997235120.0, + "grad_norm": 4.107884379868282, + "language_loss": 0.96018064, + "learning_rate": 3.463858658104523e-06, + "loss": 0.99082685, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.56201172, + "step": 217, + "time_per_iteration": 2.812250852584839 + }, + { + "auxiliary_loss_clip": 0.01927339, + "auxiliary_loss_mlp": 0.01114737, + "balance_loss_clip": 1.504637, + "balance_loss_mlp": 1.05041218, + "epoch": 0.013106869081617315, + "flos": 17352332696880.0, + "grad_norm": 2.170912271942004, + "language_loss": 0.95326316, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.98368388, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.64257812, + "step": 218, + "time_per_iteration": 2.845515489578247 + }, + { + "auxiliary_loss_clip": 0.01933271, + "auxiliary_loss_mlp": 0.01101458, + "balance_loss_clip": 1.50241733, + "balance_loss_mlp": 1.04809928, + "epoch": 0.013166992334285284, + "flos": 25890449447160.0, + "grad_norm": 2.212831177643327, + "language_loss": 0.88532424, + "learning_rate": 3.46976560030214e-06, + "loss": 0.91567147, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 4.30859375, + "router_z_loss_mlp": 0.53320312, + "step": 219, + "time_per_iteration": 2.913290500640869 + }, + { + "auxiliary_loss_clip": 0.01934906, + "auxiliary_loss_mlp": 0.0111, + "balance_loss_clip": 1.50647616, + "balance_loss_mlp": 1.04686725, + "epoch": 0.013227115586953254, + "flos": 31182261499800.0, + "grad_norm": 1.9205446964167825, + "language_loss": 0.89384645, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.92429554, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 4.28125, + "router_z_loss_mlp": 0.63183594, + "step": 220, + "time_per_iteration": 2.88338041305542 + }, + { + "auxiliary_loss_clip": 0.01921973, + "auxiliary_loss_mlp": 0.01108355, + "balance_loss_clip": 1.49506068, + "balance_loss_mlp": 1.05072939, + "epoch": 0.013287238839621223, + "flos": 20413954262520.0, + "grad_norm": 2.0751493603104105, + "language_loss": 0.88588953, + "learning_rate": 3.475618842282164e-06, + "loss": 0.91619289, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.57617188, + "step": 221, + "time_per_iteration": 2.8237500190734863 + }, + { + "auxiliary_loss_clip": 0.01931718, + "auxiliary_loss_mlp": 0.01130298, + "balance_loss_clip": 1.50145566, + "balance_loss_mlp": 1.04983163, + "epoch": 0.013347362092289193, + "flos": 14141218466160.0, + "grad_norm": 2.7854163786390496, + "language_loss": 0.95138812, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.98200834, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.80395508, + "step": 222, + "time_per_iteration": 2.765223979949951 + }, + { + "auxiliary_loss_clip": 0.01920176, + "auxiliary_loss_mlp": 0.01103809, + "balance_loss_clip": 1.5011878, + "balance_loss_mlp": 1.04115224, + "epoch": 0.013407485344957162, + "flos": 21802602189240.0, + "grad_norm": 2.8470531856459926, + "language_loss": 0.9707135, + "learning_rate": 3.481419351635897e-06, + "loss": 1.00095332, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.62646484, + "step": 223, + "time_per_iteration": 2.8313674926757812 + }, + { + "auxiliary_loss_clip": 0.01944956, + "auxiliary_loss_mlp": 0.01118284, + "balance_loss_clip": 1.50918126, + "balance_loss_mlp": 1.06242216, + "epoch": 0.013467608597625132, + "flos": 18625866258240.0, + "grad_norm": 2.7683071009103353, + "language_loss": 0.90263617, + "learning_rate": 3.484300126837776e-06, + "loss": 0.93326861, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.55932617, + "step": 224, + "time_per_iteration": 2.8750007152557373 + }, + { + "auxiliary_loss_clip": 0.01943416, + "auxiliary_loss_mlp": 0.01103878, + "balance_loss_clip": 1.50840473, + "balance_loss_mlp": 1.04551291, + "epoch": 0.013527731850293101, + "flos": 18557028442080.0, + "grad_norm": 5.832801900776133, + "language_loss": 0.91416866, + "learning_rate": 3.487168070036317e-06, + "loss": 0.94464159, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 4.3515625, + "router_z_loss_mlp": 0.58496094, + "step": 225, + "time_per_iteration": 2.814959764480591 + }, + { + "auxiliary_loss_clip": 0.01915729, + "auxiliary_loss_mlp": 0.01111302, + "balance_loss_clip": 1.50559974, + "balance_loss_mlp": 1.05355716, + "epoch": 0.01358785510296107, + "flos": 19169235655560.0, + "grad_norm": 5.029169231494438, + "language_loss": 1.00128007, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.03155041, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 4.109375, + "router_z_loss_mlp": 0.57739258, + "step": 226, + "time_per_iteration": 2.8295271396636963 + }, + { + "auxiliary_loss_clip": 0.01931969, + "auxiliary_loss_mlp": 0.01118108, + "balance_loss_clip": 1.50200272, + "balance_loss_mlp": 1.06012428, + "epoch": 0.01364797835562904, + "flos": 23335017002280.0, + "grad_norm": 2.898184881290043, + "language_loss": 0.93092382, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.96142465, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 4.30273438, + "router_z_loss_mlp": 0.5793457, + "step": 227, + "time_per_iteration": 2.970775842666626 + }, + { + "auxiliary_loss_clip": 0.01764832, + "auxiliary_loss_mlp": 0.01084681, + "balance_loss_clip": 1.49737895, + "balance_loss_mlp": 1.05740607, + "epoch": 0.013708101608297009, + "flos": 71011558017360.0, + "grad_norm": 0.9349343887770805, + "language_loss": 0.57531035, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.60380548, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.2734375, + "step": 228, + "time_per_iteration": 3.396146535873413 + }, + { + "auxiliary_loss_clip": 0.01911394, + "auxiliary_loss_mlp": 0.011056, + "balance_loss_clip": 1.49730372, + "balance_loss_mlp": 1.04806972, + "epoch": 0.013768224860964979, + "flos": 16329396281760.0, + "grad_norm": 4.485806165900927, + "language_loss": 0.8972398, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.92740971, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.57495117, + "step": 229, + "time_per_iteration": 2.789334297180176 + }, + { + "auxiliary_loss_clip": 0.01937054, + "auxiliary_loss_mlp": 0.0111136, + "balance_loss_clip": 1.50530803, + "balance_loss_mlp": 1.05485463, + "epoch": 0.013828348113632948, + "flos": 20197816632720.0, + "grad_norm": 2.8621313255263705, + "language_loss": 0.86207747, + "learning_rate": 3.501319237118231e-06, + "loss": 0.89256155, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.56542969, + "step": 230, + "time_per_iteration": 2.8749637603759766 + }, + { + "auxiliary_loss_clip": 0.0192937, + "auxiliary_loss_mlp": 0.01115436, + "balance_loss_clip": 1.50679302, + "balance_loss_mlp": 1.06026626, + "epoch": 0.013888471366300916, + "flos": 20746018424880.0, + "grad_norm": 2.2583488976503405, + "language_loss": 0.91842198, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.94887006, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 4.234375, + "router_z_loss_mlp": 0.55175781, + "step": 231, + "time_per_iteration": 2.845820426940918 + }, + { + "auxiliary_loss_clip": 0.01942761, + "auxiliary_loss_mlp": 0.01120807, + "balance_loss_clip": 1.50974488, + "balance_loss_mlp": 1.06406355, + "epoch": 0.013948594618968886, + "flos": 22095699307200.0, + "grad_norm": 2.4714389012053632, + "language_loss": 0.8654263, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.89606202, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.56787109, + "step": 232, + "time_per_iteration": 2.845452308654785 + }, + { + "auxiliary_loss_clip": 0.01959846, + "auxiliary_loss_mlp": 0.01123524, + "balance_loss_clip": 1.51476645, + "balance_loss_mlp": 1.05943727, + "epoch": 0.014008717871636855, + "flos": 19067887524240.0, + "grad_norm": 3.0680703124646582, + "language_loss": 0.77149618, + "learning_rate": 3.509663010692652e-06, + "loss": 0.8023299, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 4.44921875, + "router_z_loss_mlp": 0.64160156, + "step": 233, + "time_per_iteration": 2.8691351413726807 + }, + { + "auxiliary_loss_clip": 0.01930973, + "auxiliary_loss_mlp": 0.01119917, + "balance_loss_clip": 1.50750113, + "balance_loss_mlp": 1.05945373, + "epoch": 0.014068841124304825, + "flos": 14533958164320.0, + "grad_norm": 2.3811113273948608, + "language_loss": 0.87964541, + "learning_rate": 3.512420411838642e-06, + "loss": 0.91015434, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.60400391, + "step": 234, + "time_per_iteration": 2.904994249343872 + }, + { + "auxiliary_loss_clip": 0.01932175, + "auxiliary_loss_mlp": 0.01136782, + "balance_loss_clip": 1.51088476, + "balance_loss_mlp": 1.07069278, + "epoch": 0.014128964376972794, + "flos": 18081968952240.0, + "grad_norm": 2.2825814467091794, + "language_loss": 0.92428285, + "learning_rate": 3.515166054308634e-06, + "loss": 0.95497245, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.66064453, + "step": 235, + "time_per_iteration": 2.8986027240753174 + }, + { + "auxiliary_loss_clip": 0.01935846, + "auxiliary_loss_mlp": 0.01150578, + "balance_loss_clip": 1.51428628, + "balance_loss_mlp": 1.07251978, + "epoch": 0.014189087629640764, + "flos": 25339364461440.0, + "grad_norm": 2.9844847754709316, + "language_loss": 0.87306732, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.9039315, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.78027344, + "step": 236, + "time_per_iteration": 2.9448375701904297 + }, + { + "auxiliary_loss_clip": 0.01935138, + "auxiliary_loss_mlp": 0.01119297, + "balance_loss_clip": 1.50978518, + "balance_loss_mlp": 1.05921566, + "epoch": 0.014249210882308733, + "flos": 36146923001640.0, + "grad_norm": 3.4030892962300157, + "language_loss": 0.85544777, + "learning_rate": 3.520622461401154e-06, + "loss": 0.88599211, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 4.25390625, + "router_z_loss_mlp": 0.60058594, + "step": 237, + "time_per_iteration": 2.9991860389709473 + }, + { + "auxiliary_loss_clip": 0.0193634, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_clip": 1.50710523, + "balance_loss_mlp": 1.07100034, + "epoch": 0.014309334134976702, + "flos": 12936888196200.0, + "grad_norm": 2.213938202149095, + "language_loss": 0.79221368, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.82295561, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 4.29101562, + "router_z_loss_mlp": 0.66845703, + "step": 238, + "time_per_iteration": 2.8323984146118164 + }, + { + "auxiliary_loss_clip": 0.0192767, + "auxiliary_loss_mlp": 0.01122667, + "balance_loss_clip": 1.51443768, + "balance_loss_mlp": 1.07126439, + "epoch": 0.014369457387644672, + "flos": 20782467750960.0, + "grad_norm": 1.9031522151440061, + "language_loss": 0.88876384, + "learning_rate": 3.526033015791284e-06, + "loss": 0.91926724, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.51489258, + "step": 239, + "time_per_iteration": 2.8064520359039307 + }, + { + "auxiliary_loss_clip": 0.01901027, + "auxiliary_loss_mlp": 0.01117531, + "balance_loss_clip": 1.49915206, + "balance_loss_mlp": 1.06021547, + "epoch": 0.01442958064031264, + "flos": 25854081337800.0, + "grad_norm": 2.483096837522917, + "language_loss": 0.95210594, + "learning_rate": 3.528721337790862e-06, + "loss": 0.98229146, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.57275391, + "step": 240, + "time_per_iteration": 2.8449289798736572 + }, + { + "auxiliary_loss_clip": 0.0191938, + "auxiliary_loss_mlp": 0.01134152, + "balance_loss_clip": 1.50566232, + "balance_loss_mlp": 1.06839609, + "epoch": 0.014489703892980611, + "flos": 28225520992800.0, + "grad_norm": 2.5039751952574876, + "language_loss": 0.86557871, + "learning_rate": 3.531398481704111e-06, + "loss": 0.89611399, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 4.13867188, + "router_z_loss_mlp": 0.65771484, + "step": 241, + "time_per_iteration": 2.8571133613586426 + }, + { + "auxiliary_loss_clip": 0.01901854, + "auxiliary_loss_mlp": 0.01120307, + "balance_loss_clip": 1.50761724, + "balance_loss_mlp": 1.06012988, + "epoch": 0.01454982714564858, + "flos": 22495910943600.0, + "grad_norm": 2.098001848185389, + "language_loss": 0.90007377, + "learning_rate": 3.534064540103573e-06, + "loss": 0.93029535, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.60205078, + "step": 242, + "time_per_iteration": 2.826002597808838 + }, + { + "auxiliary_loss_clip": 0.01921566, + "auxiliary_loss_mlp": 0.01129931, + "balance_loss_clip": 1.50659919, + "balance_loss_mlp": 1.06722724, + "epoch": 0.014609950398316548, + "flos": 21658266785880.0, + "grad_norm": 2.2623017907431526, + "language_loss": 0.87364256, + "learning_rate": 3.536719604416555e-06, + "loss": 0.90415752, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 4.15429688, + "router_z_loss_mlp": 0.62744141, + "step": 243, + "time_per_iteration": 2.8352391719818115 + }, + { + "auxiliary_loss_clip": 0.01912802, + "auxiliary_loss_mlp": 0.01114507, + "balance_loss_clip": 1.50086117, + "balance_loss_mlp": 1.05540347, + "epoch": 0.014670073650984519, + "flos": 21874810499280.0, + "grad_norm": 3.7257360207408126, + "language_loss": 0.85571831, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.88599145, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.59155273, + "step": 244, + "time_per_iteration": 2.8336904048919678 + }, + { + "auxiliary_loss_clip": 0.01933625, + "auxiliary_loss_mlp": 0.01141274, + "balance_loss_clip": 1.50372076, + "balance_loss_mlp": 1.07866526, + "epoch": 0.014730196903652487, + "flos": 23188732397640.0, + "grad_norm": 2.9536810786171537, + "language_loss": 0.81562513, + "learning_rate": 3.54199711087864e-06, + "loss": 0.84637415, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.62597656, + "step": 245, + "time_per_iteration": 2.8497021198272705 + }, + { + "auxiliary_loss_clip": 0.01930082, + "auxiliary_loss_mlp": 0.01131906, + "balance_loss_clip": 1.5059706, + "balance_loss_mlp": 1.0637182, + "epoch": 0.014790320156320457, + "flos": 23227943092200.0, + "grad_norm": 2.3207624372657336, + "language_loss": 0.85198104, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.8826009, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.68164062, + "step": 246, + "time_per_iteration": 2.8406827449798584 + }, + { + "auxiliary_loss_clip": 0.01928505, + "auxiliary_loss_mlp": 0.01123804, + "balance_loss_clip": 1.50662291, + "balance_loss_mlp": 1.06346059, + "epoch": 0.014850443408988426, + "flos": 15819796058760.0, + "grad_norm": 3.310619098208306, + "language_loss": 0.91404527, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.9445684, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 4.22070312, + "router_z_loss_mlp": 0.60327148, + "step": 247, + "time_per_iteration": 2.806055784225464 + }, + { + "auxiliary_loss_clip": 0.01927036, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_clip": 1.50230145, + "balance_loss_mlp": 1.067644, + "epoch": 0.014910566661656396, + "flos": 22786530951600.0, + "grad_norm": 3.1836813307250655, + "language_loss": 0.78957844, + "learning_rate": 3.549833136812155e-06, + "loss": 0.82020783, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.68188477, + "step": 248, + "time_per_iteration": 2.8725523948669434 + }, + { + "auxiliary_loss_clip": 0.01924301, + "auxiliary_loss_mlp": 0.01124322, + "balance_loss_clip": 1.50800776, + "balance_loss_mlp": 1.05937648, + "epoch": 0.014970689914324365, + "flos": 26870033115000.0, + "grad_norm": 2.1886142786021154, + "language_loss": 0.84725511, + "learning_rate": 3.552424094769381e-06, + "loss": 0.87774134, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 4.16210938, + "router_z_loss_mlp": 0.64990234, + "step": 249, + "time_per_iteration": 5.922483682632446 + }, + { + "auxiliary_loss_clip": 0.01912854, + "auxiliary_loss_mlp": 0.01109806, + "balance_loss_clip": 1.50113332, + "balance_loss_mlp": 1.05275285, + "epoch": 0.015030813166992334, + "flos": 13989005040960.0, + "grad_norm": 2.5492700070431975, + "language_loss": 0.94768959, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.97791624, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.57104492, + "step": 250, + "time_per_iteration": 5.80391263961792 + }, + { + "auxiliary_loss_clip": 0.01910884, + "auxiliary_loss_mlp": 0.01137058, + "balance_loss_clip": 1.50009465, + "balance_loss_mlp": 1.0585947, + "epoch": 0.015090936419660304, + "flos": 24723258845400.0, + "grad_norm": 2.436022407465941, + "language_loss": 0.99708676, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.02756619, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.78369141, + "step": 251, + "time_per_iteration": 2.8553948402404785 + }, + { + "auxiliary_loss_clip": 0.01918397, + "auxiliary_loss_mlp": 0.01115004, + "balance_loss_clip": 1.50325155, + "balance_loss_mlp": 1.05284774, + "epoch": 0.015151059672328273, + "flos": 25744286667600.0, + "grad_norm": 1.9924824993719872, + "language_loss": 0.86664569, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.89697969, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 4.15820312, + "router_z_loss_mlp": 0.62060547, + "step": 252, + "time_per_iteration": 2.918600559234619 + }, + { + "auxiliary_loss_clip": 0.01895293, + "auxiliary_loss_mlp": 0.01106037, + "balance_loss_clip": 1.49887919, + "balance_loss_mlp": 1.05000865, + "epoch": 0.015211182924996243, + "flos": 21876191183520.0, + "grad_norm": 2.2485578494177823, + "language_loss": 1.01563573, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.04564905, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.56103516, + "step": 253, + "time_per_iteration": 2.87422513961792 + }, + { + "auxiliary_loss_clip": 0.01729527, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.44972563, + "balance_loss_mlp": 1.01552284, + "epoch": 0.015271306177664212, + "flos": 66910756692600.0, + "grad_norm": 0.858893342613563, + "language_loss": 0.55582416, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.58350158, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.2265625, + "step": 254, + "time_per_iteration": 3.4016706943511963 + }, + { + "auxiliary_loss_clip": 0.01910805, + "auxiliary_loss_mlp": 0.01115506, + "balance_loss_clip": 1.49951744, + "balance_loss_mlp": 1.0570215, + "epoch": 0.01533142943033218, + "flos": 26839796868000.0, + "grad_norm": 2.6124820235245285, + "language_loss": 0.92772305, + "learning_rate": 3.567754632921479e-06, + "loss": 0.95798624, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.5847168, + "step": 255, + "time_per_iteration": 2.901061773300171 + }, + { + "auxiliary_loss_clip": 0.01907986, + "auxiliary_loss_mlp": 0.01125062, + "balance_loss_clip": 1.49814582, + "balance_loss_mlp": 1.06839013, + "epoch": 0.01539155268300015, + "flos": 20818673426880.0, + "grad_norm": 2.717370487319634, + "language_loss": 0.87599152, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.90632194, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.56665039, + "step": 256, + "time_per_iteration": 2.7780768871307373 + }, + { + "auxiliary_loss_clip": 0.01918521, + "auxiliary_loss_mlp": 0.01116681, + "balance_loss_clip": 1.49952769, + "balance_loss_mlp": 1.04587078, + "epoch": 0.01545167593566812, + "flos": 15966243096840.0, + "grad_norm": 5.328817592005003, + "language_loss": 0.75836468, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.78871667, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 4.18554688, + "router_z_loss_mlp": 0.70800781, + "step": 257, + "time_per_iteration": 2.7912707328796387 + }, + { + "auxiliary_loss_clip": 0.01893706, + "auxiliary_loss_mlp": 0.01102749, + "balance_loss_clip": 1.49620771, + "balance_loss_mlp": 1.04304922, + "epoch": 0.01551179918833609, + "flos": 22606923864600.0, + "grad_norm": 2.396931799826911, + "language_loss": 0.9683516, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.99831617, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.59643555, + "step": 258, + "time_per_iteration": 2.8256702423095703 + }, + { + "auxiliary_loss_clip": 0.01895137, + "auxiliary_loss_mlp": 0.01127131, + "balance_loss_clip": 1.49812984, + "balance_loss_mlp": 1.05953884, + "epoch": 0.015571922441004058, + "flos": 22821396551640.0, + "grad_norm": 2.085445079044377, + "language_loss": 0.94406271, + "learning_rate": 3.577775880881658e-06, + "loss": 0.97428536, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.67578125, + "step": 259, + "time_per_iteration": 2.845794677734375 + }, + { + "auxiliary_loss_clip": 0.01882309, + "auxiliary_loss_mlp": 0.01108441, + "balance_loss_clip": 1.49955356, + "balance_loss_mlp": 1.04947996, + "epoch": 0.015632045693672027, + "flos": 18951392474640.0, + "grad_norm": 2.30762686689122, + "language_loss": 0.97825783, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.00816536, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.58984375, + "step": 260, + "time_per_iteration": 2.7745325565338135 + }, + { + "auxiliary_loss_clip": 0.01912051, + "auxiliary_loss_mlp": 0.01099133, + "balance_loss_clip": 1.50105846, + "balance_loss_mlp": 1.04134035, + "epoch": 0.015692168946339995, + "flos": 29977850013120.0, + "grad_norm": 4.333885890286412, + "language_loss": 0.90775371, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.9378655, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 4.1015625, + "router_z_loss_mlp": 0.57885742, + "step": 261, + "time_per_iteration": 2.8712658882141113 + }, + { + "auxiliary_loss_clip": 0.01926801, + "auxiliary_loss_mlp": 0.01104456, + "balance_loss_clip": 1.50951684, + "balance_loss_mlp": 1.04823732, + "epoch": 0.015752292199007967, + "flos": 19396987276320.0, + "grad_norm": 2.082789423283173, + "language_loss": 0.69242102, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.72273362, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 4.16992188, + "router_z_loss_mlp": 0.5625, + "step": 262, + "time_per_iteration": 2.807140350341797 + }, + { + "auxiliary_loss_clip": 0.01913477, + "auxiliary_loss_mlp": 0.01113657, + "balance_loss_clip": 1.50373459, + "balance_loss_mlp": 1.05467188, + "epoch": 0.015812415451675936, + "flos": 20344710362760.0, + "grad_norm": 4.289368048786222, + "language_loss": 0.70104855, + "learning_rate": 3.587643540438383e-06, + "loss": 0.7313199, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.58911133, + "step": 263, + "time_per_iteration": 2.8133041858673096 + }, + { + "auxiliary_loss_clip": 0.01908733, + "auxiliary_loss_mlp": 0.01118131, + "balance_loss_clip": 1.49952579, + "balance_loss_mlp": 1.05149317, + "epoch": 0.015872538704343905, + "flos": 17529300240480.0, + "grad_norm": 2.4939309145771227, + "language_loss": 0.87344158, + "learning_rate": 3.590087005168037e-06, + "loss": 0.90371025, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.66625977, + "step": 264, + "time_per_iteration": 2.8222663402557373 + }, + { + "auxiliary_loss_clip": 0.01906273, + "auxiliary_loss_mlp": 0.0110195, + "balance_loss_clip": 1.50043011, + "balance_loss_mlp": 1.04861557, + "epoch": 0.015932661957011873, + "flos": 15263716244760.0, + "grad_norm": 4.243809640362907, + "language_loss": 1.04429841, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.07438064, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 4.06054688, + "router_z_loss_mlp": 0.53295898, + "step": 265, + "time_per_iteration": 2.7740957736968994 + }, + { + "auxiliary_loss_clip": 0.01905251, + "auxiliary_loss_mlp": 0.01135424, + "balance_loss_clip": 1.50776505, + "balance_loss_mlp": 1.05808103, + "epoch": 0.015992785209679845, + "flos": 20307327044400.0, + "grad_norm": 2.506434961561057, + "language_loss": 0.79270577, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.82311261, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.7734375, + "step": 266, + "time_per_iteration": 2.8303136825561523 + }, + { + "auxiliary_loss_clip": 0.01899439, + "auxiliary_loss_mlp": 0.01103767, + "balance_loss_clip": 1.49971509, + "balance_loss_mlp": 1.04516327, + "epoch": 0.016052908462347814, + "flos": 23366918192040.0, + "grad_norm": 3.0535563951307236, + "language_loss": 0.91259754, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.94262958, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 4.00195312, + "router_z_loss_mlp": 0.5859375, + "step": 267, + "time_per_iteration": 2.846980094909668 + }, + { + "auxiliary_loss_clip": 0.01897373, + "auxiliary_loss_mlp": 0.01138558, + "balance_loss_clip": 1.49103856, + "balance_loss_mlp": 1.05620813, + "epoch": 0.016113031715015783, + "flos": 21291377631840.0, + "grad_norm": 2.733291751204773, + "language_loss": 0.87626243, + "learning_rate": 3.599769175344462e-06, + "loss": 0.90662169, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.82348633, + "step": 268, + "time_per_iteration": 2.843695878982544 + }, + { + "auxiliary_loss_clip": 0.01891344, + "auxiliary_loss_mlp": 0.01097757, + "balance_loss_clip": 1.49767089, + "balance_loss_mlp": 1.04308724, + "epoch": 0.01617315496768375, + "flos": 18919125809640.0, + "grad_norm": 2.381419693641931, + "language_loss": 0.89594388, + "learning_rate": 3.602167137831432e-06, + "loss": 0.92583489, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.54711914, + "step": 269, + "time_per_iteration": 2.8174378871917725 + }, + { + "auxiliary_loss_clip": 0.01889611, + "auxiliary_loss_mlp": 0.01124957, + "balance_loss_clip": 1.49291968, + "balance_loss_mlp": 1.04148698, + "epoch": 0.01623327822035172, + "flos": 16551056648520.0, + "grad_norm": 2.3491191992536624, + "language_loss": 0.98823828, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.01838398, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.83496094, + "step": 270, + "time_per_iteration": 2.868804931640625 + }, + { + "auxiliary_loss_clip": 0.01895839, + "auxiliary_loss_mlp": 0.01119277, + "balance_loss_clip": 1.49973166, + "balance_loss_mlp": 1.05561888, + "epoch": 0.016293401473019692, + "flos": 23518725533640.0, + "grad_norm": 7.4952988413762895, + "language_loss": 0.88065577, + "learning_rate": 3.606936435072361e-06, + "loss": 0.91080701, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.63647461, + "step": 271, + "time_per_iteration": 2.9517738819122314 + }, + { + "auxiliary_loss_clip": 0.01906675, + "auxiliary_loss_mlp": 0.01103271, + "balance_loss_clip": 1.49886489, + "balance_loss_mlp": 1.03932691, + "epoch": 0.01635352472568766, + "flos": 29021152479120.0, + "grad_norm": 2.607322413230582, + "language_loss": 0.83502257, + "learning_rate": 3.609307900676025e-06, + "loss": 0.86512202, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.63964844, + "step": 272, + "time_per_iteration": 2.9003498554229736 + }, + { + "auxiliary_loss_clip": 0.01883625, + "auxiliary_loss_mlp": 0.01115461, + "balance_loss_clip": 1.49039686, + "balance_loss_mlp": 1.05621386, + "epoch": 0.01641364797835563, + "flos": 13374442542600.0, + "grad_norm": 2.698683941637631, + "language_loss": 0.82246786, + "learning_rate": 3.611670663634051e-06, + "loss": 0.85245872, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.59204102, + "step": 273, + "time_per_iteration": 2.8074638843536377 + }, + { + "auxiliary_loss_clip": 0.01894905, + "auxiliary_loss_mlp": 0.01116385, + "balance_loss_clip": 1.48285413, + "balance_loss_mlp": 1.05339456, + "epoch": 0.016473771231023598, + "flos": 18882676483560.0, + "grad_norm": 2.307212945580152, + "language_loss": 0.93467271, + "learning_rate": 3.614024787585744e-06, + "loss": 0.96478558, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.63037109, + "step": 274, + "time_per_iteration": 2.835801124572754 + }, + { + "auxiliary_loss_clip": 0.01882791, + "auxiliary_loss_mlp": 0.0109927, + "balance_loss_clip": 1.49163949, + "balance_loss_mlp": 1.04176378, + "epoch": 0.016533894483691566, + "flos": 22606802039520.0, + "grad_norm": 2.5901664938960414, + "language_loss": 0.89467478, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.9244954, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.57568359, + "step": 275, + "time_per_iteration": 2.874114751815796 + }, + { + "auxiliary_loss_clip": 0.0189764, + "auxiliary_loss_mlp": 0.01104008, + "balance_loss_clip": 1.49363899, + "balance_loss_mlp": 1.04788458, + "epoch": 0.01659401773635954, + "flos": 21512225831400.0, + "grad_norm": 1.7969879288614465, + "language_loss": 0.81714064, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.84715712, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 4.04101562, + "router_z_loss_mlp": 0.56103516, + "step": 276, + "time_per_iteration": 2.818136215209961 + }, + { + "auxiliary_loss_clip": 0.01870812, + "auxiliary_loss_mlp": 0.01091634, + "balance_loss_clip": 1.48497677, + "balance_loss_mlp": 1.03756118, + "epoch": 0.016654140989027507, + "flos": 32857062514920.0, + "grad_norm": 2.533819056100973, + "language_loss": 0.82476544, + "learning_rate": 3.621035951423551e-06, + "loss": 0.85438985, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.54174805, + "step": 277, + "time_per_iteration": 2.8837602138519287 + }, + { + "auxiliary_loss_clip": 0.01875041, + "auxiliary_loss_mlp": 0.01121852, + "balance_loss_clip": 1.48907304, + "balance_loss_mlp": 1.05347323, + "epoch": 0.016714264241695476, + "flos": 12309209197560.0, + "grad_norm": 3.3171977883071238, + "language_loss": 0.82134706, + "learning_rate": 3.623356141983041e-06, + "loss": 0.85131598, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.68432617, + "step": 278, + "time_per_iteration": 2.75716233253479 + }, + { + "auxiliary_loss_clip": 0.01885012, + "auxiliary_loss_mlp": 0.01114943, + "balance_loss_clip": 1.48789668, + "balance_loss_mlp": 1.0537647, + "epoch": 0.016774387494363444, + "flos": 27129279841920.0, + "grad_norm": 2.019714378268216, + "language_loss": 0.91684639, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.94684589, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.61132812, + "step": 279, + "time_per_iteration": 2.8212625980377197 + }, + { + "auxiliary_loss_clip": 0.01888695, + "auxiliary_loss_mlp": 0.01116128, + "balance_loss_clip": 1.48585224, + "balance_loss_mlp": 1.05924153, + "epoch": 0.016834510747031413, + "flos": 20196070473240.0, + "grad_norm": 2.30312366905585, + "language_loss": 0.9555378, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.98558599, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.56933594, + "step": 280, + "time_per_iteration": 2.810877799987793 + }, + { + "auxiliary_loss_clip": 0.01885413, + "auxiliary_loss_mlp": 0.01127259, + "balance_loss_clip": 1.48500192, + "balance_loss_mlp": 1.06074023, + "epoch": 0.016894633999699385, + "flos": 27280356233040.0, + "grad_norm": 2.07441756900557, + "language_loss": 0.75702381, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.7871505, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.66552734, + "step": 281, + "time_per_iteration": 2.8626832962036133 + }, + { + "auxiliary_loss_clip": 0.01866243, + "auxiliary_loss_mlp": 0.01115367, + "balance_loss_clip": 1.47681022, + "balance_loss_mlp": 1.05700207, + "epoch": 0.016954757252367354, + "flos": 14907101005800.0, + "grad_norm": 2.804397006536754, + "language_loss": 0.82391512, + "learning_rate": 3.632554186750274e-06, + "loss": 0.85373127, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 3.89648438, + "router_z_loss_mlp": 0.58422852, + "step": 282, + "time_per_iteration": 2.801447629928589 + }, + { + "auxiliary_loss_clip": 0.01875309, + "auxiliary_loss_mlp": 0.01113005, + "balance_loss_clip": 1.48514831, + "balance_loss_mlp": 1.05902684, + "epoch": 0.017014880505035322, + "flos": 21363504725160.0, + "grad_norm": 21.49905683359905, + "language_loss": 0.79183608, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.82171923, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 3.90820312, + "router_z_loss_mlp": 0.53979492, + "step": 283, + "time_per_iteration": 2.9052045345306396 + }, + { + "auxiliary_loss_clip": 0.01883218, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_clip": 1.49069476, + "balance_loss_mlp": 1.05242634, + "epoch": 0.01707500375770329, + "flos": 35339718132720.0, + "grad_norm": 2.864821104464612, + "language_loss": 0.85692155, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.88685572, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 3.92578125, + "router_z_loss_mlp": 0.57714844, + "step": 284, + "time_per_iteration": 2.943439245223999 + }, + { + "auxiliary_loss_clip": 0.01866521, + "auxiliary_loss_mlp": 0.01107388, + "balance_loss_clip": 1.48363507, + "balance_loss_mlp": 1.05176532, + "epoch": 0.01713512701037126, + "flos": 23586751182600.0, + "grad_norm": 2.618273100580275, + "language_loss": 0.99359345, + "learning_rate": 3.639367500948819e-06, + "loss": 1.0233326, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.55615234, + "step": 285, + "time_per_iteration": 2.8458471298217773 + }, + { + "auxiliary_loss_clip": 0.01864812, + "auxiliary_loss_mlp": 0.01102317, + "balance_loss_clip": 1.4767108, + "balance_loss_mlp": 1.04244995, + "epoch": 0.01719525026303923, + "flos": 27640220140800.0, + "grad_norm": 2.7758799349522065, + "language_loss": 0.94849491, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.97816622, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.59838867, + "step": 286, + "time_per_iteration": 2.9144585132598877 + }, + { + "auxiliary_loss_clip": 0.01870437, + "auxiliary_loss_mlp": 0.0109445, + "balance_loss_clip": 1.48777187, + "balance_loss_mlp": 1.04760098, + "epoch": 0.0172553735157072, + "flos": 26985431738880.0, + "grad_norm": 2.296738885906495, + "language_loss": 0.92964053, + "learning_rate": 3.643869982119001e-06, + "loss": 0.95928943, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.46875, + "step": 287, + "time_per_iteration": 2.8542208671569824 + }, + { + "auxiliary_loss_clip": 0.01869423, + "auxiliary_loss_mlp": 0.01104963, + "balance_loss_clip": 1.47861278, + "balance_loss_mlp": 1.04910183, + "epoch": 0.01731549676837517, + "flos": 14059710841680.0, + "grad_norm": 2.7672856522102256, + "language_loss": 1.04139352, + "learning_rate": 3.646109470232502e-06, + "loss": 1.07113743, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.55932617, + "step": 288, + "time_per_iteration": 4.365466833114624 + }, + { + "auxiliary_loss_clip": 0.01703784, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.45826578, + "balance_loss_mlp": 1.02579749, + "epoch": 0.017375620021043137, + "flos": 66528534559680.0, + "grad_norm": 0.9193488467062623, + "language_loss": 0.63915205, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66668916, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.24121094, + "step": 289, + "time_per_iteration": 4.942915916442871 + }, + { + "auxiliary_loss_clip": 0.01865876, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_clip": 1.48143435, + "balance_loss_mlp": 1.06896293, + "epoch": 0.01743574327371111, + "flos": 15228525777840.0, + "grad_norm": 2.853651326048057, + "language_loss": 0.90415597, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.93406868, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.56420898, + "step": 290, + "time_per_iteration": 2.792006731033325 + }, + { + "auxiliary_loss_clip": 0.01870999, + "auxiliary_loss_mlp": 0.01120765, + "balance_loss_clip": 1.4857502, + "balance_loss_mlp": 1.05493784, + "epoch": 0.017495866526379078, + "flos": 25379306106480.0, + "grad_norm": 1.9454991902066454, + "language_loss": 0.86582077, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.89573842, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.65869141, + "step": 291, + "time_per_iteration": 2.881653308868408 + }, + { + "auxiliary_loss_clip": 0.01869078, + "auxiliary_loss_mlp": 0.01128509, + "balance_loss_clip": 1.49185753, + "balance_loss_mlp": 1.05426598, + "epoch": 0.017555989779047047, + "flos": 26365102853400.0, + "grad_norm": 1.698995224628583, + "language_loss": 0.73644382, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.76641971, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 0.74169922, + "step": 292, + "time_per_iteration": 2.8552443981170654 + }, + { + "auxiliary_loss_clip": 0.01858035, + "auxiliary_loss_mlp": 0.01107762, + "balance_loss_clip": 1.47789681, + "balance_loss_mlp": 1.05833817, + "epoch": 0.017616113031715015, + "flos": 22342966567920.0, + "grad_norm": 2.250129013361217, + "language_loss": 0.88815212, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.91781008, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 0.49462891, + "step": 293, + "time_per_iteration": 2.8297958374023438 + }, + { + "auxiliary_loss_clip": 0.01878309, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_clip": 1.49095058, + "balance_loss_mlp": 1.07120156, + "epoch": 0.017676236284382984, + "flos": 20161732781880.0, + "grad_norm": 1.78959282359719, + "language_loss": 0.81643426, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.84653282, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.60327148, + "step": 294, + "time_per_iteration": 2.9446277618408203 + }, + { + "auxiliary_loss_clip": 0.01878798, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_clip": 1.48605072, + "balance_loss_mlp": 1.07271254, + "epoch": 0.017736359537050956, + "flos": 25228148498640.0, + "grad_norm": 2.7983910114389197, + "language_loss": 0.85748655, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.8876282, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 3.92382812, + "router_z_loss_mlp": 0.6262207, + "step": 295, + "time_per_iteration": 2.9195024967193604 + }, + { + "auxiliary_loss_clip": 0.01872311, + "auxiliary_loss_mlp": 0.01118645, + "balance_loss_clip": 1.49267936, + "balance_loss_mlp": 1.06335533, + "epoch": 0.017796482789718925, + "flos": 20343573328680.0, + "grad_norm": 2.0335393778035233, + "language_loss": 0.8583535, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.88826311, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.55322266, + "step": 296, + "time_per_iteration": 2.776371479034424 + }, + { + "auxiliary_loss_clip": 0.01874048, + "auxiliary_loss_mlp": 0.01108887, + "balance_loss_clip": 1.49149358, + "balance_loss_mlp": 1.0537169, + "epoch": 0.017856606042386893, + "flos": 22383882813600.0, + "grad_norm": 2.5187658271643896, + "language_loss": 0.88502562, + "learning_rate": 3.665921869855132e-06, + "loss": 0.91485494, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.55126953, + "step": 297, + "time_per_iteration": 2.7702605724334717 + }, + { + "auxiliary_loss_clip": 0.01873874, + "auxiliary_loss_mlp": 0.01097699, + "balance_loss_clip": 1.48703921, + "balance_loss_mlp": 1.04429364, + "epoch": 0.017916729295054862, + "flos": 20234915692560.0, + "grad_norm": 2.2913294916252065, + "language_loss": 0.907525, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.93724072, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.53417969, + "step": 298, + "time_per_iteration": 2.8189706802368164 + }, + { + "auxiliary_loss_clip": 0.0185586, + "auxiliary_loss_mlp": 0.01126444, + "balance_loss_clip": 1.48691845, + "balance_loss_mlp": 1.07120275, + "epoch": 0.01797685254772283, + "flos": 19395728417160.0, + "grad_norm": 1.978404938613744, + "language_loss": 0.89766788, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.92749095, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.55200195, + "step": 299, + "time_per_iteration": 2.8080039024353027 + }, + { + "auxiliary_loss_clip": 0.01886397, + "auxiliary_loss_mlp": 0.01116861, + "balance_loss_clip": 1.49723446, + "balance_loss_mlp": 1.06059456, + "epoch": 0.018036975800390802, + "flos": 24431420586600.0, + "grad_norm": 3.3987982258054514, + "language_loss": 0.68920624, + "learning_rate": 3.672392800539357e-06, + "loss": 0.71923888, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 3.89257812, + "router_z_loss_mlp": 0.56274414, + "step": 300, + "time_per_iteration": 2.818830728530884 + }, + { + "auxiliary_loss_clip": 0.01869746, + "auxiliary_loss_mlp": 0.0110856, + "balance_loss_clip": 1.49027359, + "balance_loss_mlp": 1.05722892, + "epoch": 0.01809709905305877, + "flos": 15783224907600.0, + "grad_norm": 2.870573111611887, + "language_loss": 0.90092933, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.9307124, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.51367188, + "step": 301, + "time_per_iteration": 2.794621467590332 + }, + { + "auxiliary_loss_clip": 0.01711871, + "auxiliary_loss_mlp": 0.01062098, + "balance_loss_clip": 1.4663322, + "balance_loss_mlp": 1.04016387, + "epoch": 0.01815722230572674, + "flos": 67366138109040.0, + "grad_norm": 0.8373898059956523, + "language_loss": 0.62154555, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64928526, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.21972656, + "step": 302, + "time_per_iteration": 3.4075193405151367 + }, + { + "auxiliary_loss_clip": 0.01859643, + "auxiliary_loss_mlp": 0.01104566, + "balance_loss_clip": 1.48493218, + "balance_loss_mlp": 1.05423641, + "epoch": 0.01821734555839471, + "flos": 15489681097680.0, + "grad_norm": 2.0882561061295823, + "language_loss": 0.90906799, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.93871003, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 0.50390625, + "step": 303, + "time_per_iteration": 2.823737382888794 + }, + { + "auxiliary_loss_clip": 0.01881905, + "auxiliary_loss_mlp": 0.01113093, + "balance_loss_clip": 1.49734473, + "balance_loss_mlp": 1.05990136, + "epoch": 0.018277468811062677, + "flos": 24102767526480.0, + "grad_norm": 2.143842977569879, + "language_loss": 0.82259548, + "learning_rate": 3.680920768703364e-06, + "loss": 0.8525455, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.53198242, + "step": 304, + "time_per_iteration": 2.8251426219940186 + }, + { + "auxiliary_loss_clip": 0.01852044, + "auxiliary_loss_mlp": 0.01100294, + "balance_loss_clip": 1.48937857, + "balance_loss_mlp": 1.05265784, + "epoch": 0.01833759206373065, + "flos": 20964227081040.0, + "grad_norm": 1.7586275553024027, + "language_loss": 0.83707321, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.86659664, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.47607422, + "step": 305, + "time_per_iteration": 2.8343915939331055 + }, + { + "auxiliary_loss_clip": 0.01858537, + "auxiliary_loss_mlp": 0.01105094, + "balance_loss_clip": 1.4860456, + "balance_loss_mlp": 1.05149758, + "epoch": 0.018397715316398618, + "flos": 19395484767000.0, + "grad_norm": 2.4940842510198133, + "language_loss": 0.92511332, + "learning_rate": 3.685142765363119e-06, + "loss": 0.95474958, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 0.53588867, + "step": 306, + "time_per_iteration": 2.973663568496704 + }, + { + "auxiliary_loss_clip": 0.01869502, + "auxiliary_loss_mlp": 0.01109806, + "balance_loss_clip": 1.48089445, + "balance_loss_mlp": 1.05589938, + "epoch": 0.018457838569066586, + "flos": 29138540912640.0, + "grad_norm": 3.5649223252938174, + "language_loss": 0.88247472, + "learning_rate": 3.687243426879095e-06, + "loss": 0.9122678, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.53955078, + "step": 307, + "time_per_iteration": 2.9355552196502686 + }, + { + "auxiliary_loss_clip": 0.01871119, + "auxiliary_loss_mlp": 0.01122717, + "balance_loss_clip": 1.49324763, + "balance_loss_mlp": 1.07071781, + "epoch": 0.018517961821734555, + "flos": 19213400570040.0, + "grad_norm": 2.230465057404492, + "language_loss": 0.73472404, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.76466244, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.52001953, + "step": 308, + "time_per_iteration": 2.8276989459991455 + }, + { + "auxiliary_loss_clip": 0.01874383, + "auxiliary_loss_mlp": 0.01110275, + "balance_loss_clip": 1.48391676, + "balance_loss_mlp": 1.06011224, + "epoch": 0.018578085074402523, + "flos": 19867539238200.0, + "grad_norm": 2.3102766176543503, + "language_loss": 0.92585146, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.95569807, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.50219727, + "step": 309, + "time_per_iteration": 2.8153743743896484 + }, + { + "auxiliary_loss_clip": 0.01887605, + "auxiliary_loss_mlp": 0.01112409, + "balance_loss_clip": 1.48940909, + "balance_loss_mlp": 1.06045806, + "epoch": 0.018638208327070496, + "flos": 29613031885440.0, + "grad_norm": 2.34378742973937, + "language_loss": 0.7488749, + "learning_rate": 3.69350459956065e-06, + "loss": 0.77887499, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 3.9765625, + "router_z_loss_mlp": 0.51904297, + "step": 310, + "time_per_iteration": 2.889331102371216 + }, + { + "auxiliary_loss_clip": 0.01860788, + "auxiliary_loss_mlp": 0.01129889, + "balance_loss_clip": 1.48642004, + "balance_loss_mlp": 1.06959271, + "epoch": 0.018698331579738464, + "flos": 45739471273200.0, + "grad_norm": 1.8713802702572675, + "language_loss": 0.75096774, + "learning_rate": 3.695578199367497e-06, + "loss": 0.78087449, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.6027832, + "step": 311, + "time_per_iteration": 3.0146074295043945 + }, + { + "auxiliary_loss_clip": 0.0186427, + "auxiliary_loss_mlp": 0.01107088, + "balance_loss_clip": 1.48304057, + "balance_loss_mlp": 1.05215645, + "epoch": 0.018758454832406433, + "flos": 20488314815640.0, + "grad_norm": 3.06954914946101, + "language_loss": 0.92691863, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.95663226, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 0.54980469, + "step": 312, + "time_per_iteration": 2.854996681213379 + }, + { + "auxiliary_loss_clip": 0.01869581, + "auxiliary_loss_mlp": 0.01110649, + "balance_loss_clip": 1.48635459, + "balance_loss_mlp": 1.0563606, + "epoch": 0.0188185780850744, + "flos": 15781925440080.0, + "grad_norm": 2.3961057761417415, + "language_loss": 0.91652548, + "learning_rate": 3.699705471087043e-06, + "loss": 0.94632769, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.54272461, + "step": 313, + "time_per_iteration": 2.806993007659912 + }, + { + "auxiliary_loss_clip": 0.01902573, + "auxiliary_loss_mlp": 0.01139034, + "balance_loss_clip": 1.49282169, + "balance_loss_mlp": 1.06073761, + "epoch": 0.018878701337742373, + "flos": 22460882910120.0, + "grad_norm": 2.3151042152038284, + "language_loss": 0.75877231, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.78918839, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.78320312, + "step": 314, + "time_per_iteration": 2.8034632205963135 + }, + { + "auxiliary_loss_clip": 0.01854911, + "auxiliary_loss_mlp": 0.01115488, + "balance_loss_clip": 1.47501028, + "balance_loss_mlp": 1.06634998, + "epoch": 0.018938824590410342, + "flos": 30999811827600.0, + "grad_norm": 3.1032856743416746, + "language_loss": 0.93725514, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.966959, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 0.49121094, + "step": 315, + "time_per_iteration": 2.948942184448242 + }, + { + "auxiliary_loss_clip": 0.01859603, + "auxiliary_loss_mlp": 0.01096172, + "balance_loss_clip": 1.47647715, + "balance_loss_mlp": 1.04340959, + "epoch": 0.01899894784307831, + "flos": 23264229984840.0, + "grad_norm": 1.8879692725722532, + "language_loss": 0.8181963, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.847754, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.52758789, + "step": 316, + "time_per_iteration": 2.8049325942993164 + }, + { + "auxiliary_loss_clip": 0.01853924, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_clip": 1.48278475, + "balance_loss_mlp": 1.03961825, + "epoch": 0.01905907109574628, + "flos": 17462492842320.0, + "grad_norm": 3.2216736047886725, + "language_loss": 0.91298008, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.94240117, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.4855957, + "step": 317, + "time_per_iteration": 2.8051629066467285 + }, + { + "auxiliary_loss_clip": 0.01849127, + "auxiliary_loss_mlp": 0.01108174, + "balance_loss_clip": 1.48117328, + "balance_loss_mlp": 1.05135846, + "epoch": 0.019119194348414248, + "flos": 14973624145440.0, + "grad_norm": 2.572123741751907, + "language_loss": 0.93870264, + "learning_rate": 3.709909364265374e-06, + "loss": 0.96827561, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.56762695, + "step": 318, + "time_per_iteration": 2.7519922256469727 + }, + { + "auxiliary_loss_clip": 0.01852604, + "auxiliary_loss_mlp": 0.01092077, + "balance_loss_clip": 1.48107982, + "balance_loss_mlp": 1.03938639, + "epoch": 0.01917931760108222, + "flos": 25488207392760.0, + "grad_norm": 2.566895748373094, + "language_loss": 0.95084172, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.98028851, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.52734375, + "step": 319, + "time_per_iteration": 2.862416982650757 + }, + { + "auxiliary_loss_clip": 0.01667462, + "auxiliary_loss_mlp": 0.0105077, + "balance_loss_clip": 1.43423939, + "balance_loss_mlp": 1.02950311, + "epoch": 0.01923944085375019, + "flos": 71572168986480.0, + "grad_norm": 0.9682492024718983, + "language_loss": 0.59725189, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62443423, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.21289062, + "step": 320, + "time_per_iteration": 3.197497844696045 + }, + { + "auxiliary_loss_clip": 0.01867181, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_clip": 1.48368251, + "balance_loss_mlp": 1.05895138, + "epoch": 0.019299564106418157, + "flos": 19687241809080.0, + "grad_norm": 2.9851536696658747, + "language_loss": 0.9215157, + "learning_rate": 3.715954969092154e-06, + "loss": 0.95128417, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.50683594, + "step": 321, + "time_per_iteration": 2.7733371257781982 + }, + { + "auxiliary_loss_clip": 0.01862224, + "auxiliary_loss_mlp": 0.01115807, + "balance_loss_clip": 1.47864842, + "balance_loss_mlp": 1.06042266, + "epoch": 0.019359687359086126, + "flos": 24392250500400.0, + "grad_norm": 3.4984982374697418, + "language_loss": 0.84738308, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.87716341, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.55419922, + "step": 322, + "time_per_iteration": 2.8329226970672607 + }, + { + "auxiliary_loss_clip": 0.01873238, + "auxiliary_loss_mlp": 0.0109897, + "balance_loss_clip": 1.49003422, + "balance_loss_mlp": 1.04344237, + "epoch": 0.019419810611754094, + "flos": 23956685963640.0, + "grad_norm": 2.26262433719635, + "language_loss": 0.7400102, + "learning_rate": 3.719954063833981e-06, + "loss": 0.76973224, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.55566406, + "step": 323, + "time_per_iteration": 2.821077346801758 + }, + { + "auxiliary_loss_clip": 0.01865573, + "auxiliary_loss_mlp": 0.01103337, + "balance_loss_clip": 1.48170328, + "balance_loss_mlp": 1.0500983, + "epoch": 0.019479933864422067, + "flos": 22164861990240.0, + "grad_norm": 5.342327503608609, + "language_loss": 0.93275797, + "learning_rate": 3.721944334919596e-06, + "loss": 0.96244705, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.53173828, + "step": 324, + "time_per_iteration": 2.8288345336914062 + }, + { + "auxiliary_loss_clip": 0.01876027, + "auxiliary_loss_mlp": 0.01109438, + "balance_loss_clip": 1.49215055, + "balance_loss_mlp": 1.05746293, + "epoch": 0.019540057117090035, + "flos": 22241943303480.0, + "grad_norm": 5.133893906435715, + "language_loss": 0.66311032, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.69296503, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.52001953, + "step": 325, + "time_per_iteration": 2.7818002700805664 + }, + { + "auxiliary_loss_clip": 0.01847496, + "auxiliary_loss_mlp": 0.01110019, + "balance_loss_clip": 1.48312306, + "balance_loss_mlp": 1.05687523, + "epoch": 0.019600180369758004, + "flos": 23082105179520.0, + "grad_norm": 3.8730620723289912, + "language_loss": 0.77335846, + "learning_rate": 3.72590651470665e-06, + "loss": 0.80293357, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.53149414, + "step": 326, + "time_per_iteration": 4.345036268234253 + }, + { + "auxiliary_loss_clip": 0.0185488, + "auxiliary_loss_mlp": 0.01130837, + "balance_loss_clip": 1.4915632, + "balance_loss_mlp": 1.08320069, + "epoch": 0.019660303622425972, + "flos": 25416364557960.0, + "grad_norm": 2.2320342358891616, + "language_loss": 0.81032473, + "learning_rate": 3.727878498433505e-06, + "loss": 0.84018189, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.47607422, + "step": 327, + "time_per_iteration": 2.8881783485412598 + }, + { + "auxiliary_loss_clip": 0.01855376, + "auxiliary_loss_mlp": 0.01117366, + "balance_loss_clip": 1.48767138, + "balance_loss_mlp": 1.06694055, + "epoch": 0.01972042687509394, + "flos": 23662533028320.0, + "grad_norm": 2.551101631897701, + "language_loss": 0.81847137, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.84819877, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.50439453, + "step": 328, + "time_per_iteration": 5.773611307144165 + }, + { + "auxiliary_loss_clip": 0.01871681, + "auxiliary_loss_mlp": 0.01111304, + "balance_loss_clip": 1.49048901, + "balance_loss_mlp": 1.0583992, + "epoch": 0.019780550127761913, + "flos": 18228091123440.0, + "grad_norm": 3.0293386515611567, + "language_loss": 0.97027361, + "learning_rate": 3.731804438545683e-06, + "loss": 1.00010347, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 0.52905273, + "step": 329, + "time_per_iteration": 2.838628053665161 + }, + { + "auxiliary_loss_clip": 0.01878252, + "auxiliary_loss_mlp": 0.01118156, + "balance_loss_clip": 1.49752355, + "balance_loss_mlp": 1.0668726, + "epoch": 0.01984067338042988, + "flos": 22423865067000.0, + "grad_norm": 3.0172651729354727, + "language_loss": 0.76648664, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.79645073, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.51342773, + "step": 330, + "time_per_iteration": 2.8827953338623047 + }, + { + "auxiliary_loss_clip": 0.01874588, + "auxiliary_loss_mlp": 0.01124481, + "balance_loss_clip": 1.49582779, + "balance_loss_mlp": 1.07109892, + "epoch": 0.01990079663309785, + "flos": 17059560445800.0, + "grad_norm": 2.7320308390673174, + "language_loss": 0.95390844, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.98389912, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.53393555, + "step": 331, + "time_per_iteration": 2.855118751525879 + }, + { + "auxiliary_loss_clip": 0.01840458, + "auxiliary_loss_mlp": 0.01093394, + "balance_loss_clip": 1.48504817, + "balance_loss_mlp": 1.04165685, + "epoch": 0.01996091988576582, + "flos": 15966161880120.0, + "grad_norm": 2.678706592467145, + "language_loss": 0.94495761, + "learning_rate": 3.737648825272422e-06, + "loss": 0.97429621, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.51782227, + "step": 332, + "time_per_iteration": 2.7950727939605713 + }, + { + "auxiliary_loss_clip": 0.0187647, + "auxiliary_loss_mlp": 0.01107051, + "balance_loss_clip": 1.50259233, + "balance_loss_mlp": 1.05190468, + "epoch": 0.02002104313843379, + "flos": 23591705402520.0, + "grad_norm": 3.3382815295825097, + "language_loss": 0.77451801, + "learning_rate": 3.739585224276384e-06, + "loss": 0.80435324, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.55102539, + "step": 333, + "time_per_iteration": 2.8356566429138184 + }, + { + "auxiliary_loss_clip": 0.0188584, + "auxiliary_loss_mlp": 0.01119059, + "balance_loss_clip": 1.4998138, + "balance_loss_mlp": 1.06562936, + "epoch": 0.02008116639110176, + "flos": 34101578080080.0, + "grad_norm": 2.5333259514761104, + "language_loss": 0.80828369, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.83833265, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.53369141, + "step": 334, + "time_per_iteration": 2.8923330307006836 + }, + { + "auxiliary_loss_clip": 0.01879655, + "auxiliary_loss_mlp": 0.01131954, + "balance_loss_clip": 1.502738, + "balance_loss_mlp": 1.07559156, + "epoch": 0.020141289643769728, + "flos": 19688622493320.0, + "grad_norm": 2.0132813306306785, + "language_loss": 0.85063112, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.8807472, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.5637207, + "step": 335, + "time_per_iteration": 2.770155429840088 + }, + { + "auxiliary_loss_clip": 0.01862711, + "auxiliary_loss_mlp": 0.01102722, + "balance_loss_clip": 1.49139488, + "balance_loss_mlp": 1.04903054, + "epoch": 0.020201412896437697, + "flos": 20745449907840.0, + "grad_norm": 2.23125294765471, + "language_loss": 0.93376154, + "learning_rate": 3.745359722027911e-06, + "loss": 0.96341586, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.53759766, + "step": 336, + "time_per_iteration": 2.8189971446990967 + }, + { + "auxiliary_loss_clip": 0.0186194, + "auxiliary_loss_mlp": 0.01102324, + "balance_loss_clip": 1.4940567, + "balance_loss_mlp": 1.05361521, + "epoch": 0.020261536149105665, + "flos": 20271365018640.0, + "grad_norm": 1.795078935020784, + "language_loss": 0.89966458, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.92930722, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.48681641, + "step": 337, + "time_per_iteration": 2.7993617057800293 + }, + { + "auxiliary_loss_clip": 0.01840782, + "auxiliary_loss_mlp": 0.01105903, + "balance_loss_clip": 1.48434246, + "balance_loss_mlp": 1.05693209, + "epoch": 0.020321659401773638, + "flos": 25854324987960.0, + "grad_norm": 1.519387049454275, + "language_loss": 0.90603518, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.93550205, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.48999023, + "step": 338, + "time_per_iteration": 2.8358821868896484 + }, + { + "auxiliary_loss_clip": 0.01867438, + "auxiliary_loss_mlp": 0.01109218, + "balance_loss_clip": 1.4980514, + "balance_loss_mlp": 1.05583596, + "epoch": 0.020381782654441606, + "flos": 17499713727240.0, + "grad_norm": 5.335342580408303, + "language_loss": 0.86540163, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.89516807, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.53393555, + "step": 339, + "time_per_iteration": 2.760751724243164 + }, + { + "auxiliary_loss_clip": 0.01873371, + "auxiliary_loss_mlp": 0.01104843, + "balance_loss_clip": 1.493752, + "balance_loss_mlp": 1.05267692, + "epoch": 0.020441905907109575, + "flos": 24249376998000.0, + "grad_norm": 1.7724166958125485, + "language_loss": 0.89581287, + "learning_rate": 3.75297936342452e-06, + "loss": 0.92559493, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.52124023, + "step": 340, + "time_per_iteration": 2.83474063873291 + }, + { + "auxiliary_loss_clip": 0.01859709, + "auxiliary_loss_mlp": 0.01109001, + "balance_loss_clip": 1.49185133, + "balance_loss_mlp": 1.05254352, + "epoch": 0.020502029159777543, + "flos": 22237963684200.0, + "grad_norm": 2.104590637433499, + "language_loss": 0.89371055, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.92339766, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.56445312, + "step": 341, + "time_per_iteration": 2.7890748977661133 + }, + { + "auxiliary_loss_clip": 0.01884091, + "auxiliary_loss_mlp": 0.01113992, + "balance_loss_clip": 1.50051486, + "balance_loss_mlp": 1.05982304, + "epoch": 0.020562152412445512, + "flos": 23993175898080.0, + "grad_norm": 2.481939598045133, + "language_loss": 0.82908535, + "learning_rate": 3.756755633390458e-06, + "loss": 0.85906613, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.54125977, + "step": 342, + "time_per_iteration": 2.847593069076538 + }, + { + "auxiliary_loss_clip": 0.01871608, + "auxiliary_loss_mlp": 0.01117208, + "balance_loss_clip": 1.49888635, + "balance_loss_mlp": 1.05839014, + "epoch": 0.020622275665113484, + "flos": 26980436910600.0, + "grad_norm": 1.9133406122430465, + "language_loss": 0.90482634, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.9347145, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.58789062, + "step": 343, + "time_per_iteration": 2.9695870876312256 + }, + { + "auxiliary_loss_clip": 0.0185507, + "auxiliary_loss_mlp": 0.01102969, + "balance_loss_clip": 1.49561501, + "balance_loss_mlp": 1.05359244, + "epoch": 0.020682398917781453, + "flos": 22605746222160.0, + "grad_norm": 2.147705612071627, + "language_loss": 0.79180115, + "learning_rate": 3.7605098841644e-06, + "loss": 0.82138157, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.49414062, + "step": 344, + "time_per_iteration": 2.850959300994873 + }, + { + "auxiliary_loss_clip": 0.01853986, + "auxiliary_loss_mlp": 0.01099794, + "balance_loss_clip": 1.49277806, + "balance_loss_mlp": 1.04500532, + "epoch": 0.02074252217044942, + "flos": 15017910885000.0, + "grad_norm": 2.03357582688864, + "language_loss": 0.76985526, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.79939306, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.54858398, + "step": 345, + "time_per_iteration": 2.8163516521453857 + }, + { + "auxiliary_loss_clip": 0.01847403, + "auxiliary_loss_mlp": 0.01099761, + "balance_loss_clip": 1.49218512, + "balance_loss_mlp": 1.04792881, + "epoch": 0.02080264542311739, + "flos": 25343465905800.0, + "grad_norm": 1.9461289988479697, + "language_loss": 0.91159016, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.94106179, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.51782227, + "step": 346, + "time_per_iteration": 2.865760564804077 + }, + { + "auxiliary_loss_clip": 0.0185744, + "auxiliary_loss_mlp": 0.01091112, + "balance_loss_clip": 1.49310887, + "balance_loss_mlp": 1.04364324, + "epoch": 0.02086276867578536, + "flos": 24394118484960.0, + "grad_norm": 2.4403195068992165, + "language_loss": 0.80730748, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.83679301, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.47485352, + "step": 347, + "time_per_iteration": 2.9614646434783936 + }, + { + "auxiliary_loss_clip": 0.01848787, + "auxiliary_loss_mlp": 0.01099472, + "balance_loss_clip": 1.49256551, + "balance_loss_mlp": 1.04737759, + "epoch": 0.02092289192845333, + "flos": 24467585654160.0, + "grad_norm": 1.8769848678708647, + "language_loss": 0.7269817, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7564643, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.52099609, + "step": 348, + "time_per_iteration": 2.865936040878296 + }, + { + "auxiliary_loss_clip": 0.018668, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_clip": 1.49577641, + "balance_loss_mlp": 1.03846145, + "epoch": 0.0209830151811213, + "flos": 17454899079000.0, + "grad_norm": 2.626471734958818, + "language_loss": 0.77968431, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.80922973, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 0.49291992, + "step": 349, + "time_per_iteration": 2.8254897594451904 + }, + { + "auxiliary_loss_clip": 0.01846752, + "auxiliary_loss_mlp": 0.01110296, + "balance_loss_clip": 1.49436748, + "balance_loss_mlp": 1.05538774, + "epoch": 0.021043138433789268, + "flos": 24579695000880.0, + "grad_norm": 1.814742286352125, + "language_loss": 0.8592605, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.88883096, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 0.54907227, + "step": 350, + "time_per_iteration": 2.891737699508667 + }, + { + "auxiliary_loss_clip": 0.01854532, + "auxiliary_loss_mlp": 0.01095803, + "balance_loss_clip": 1.49891889, + "balance_loss_mlp": 1.04203928, + "epoch": 0.021103261686457236, + "flos": 24458367556440.0, + "grad_norm": 2.0123692154898656, + "language_loss": 0.80477494, + "learning_rate": 3.773480007028776e-06, + "loss": 0.83427823, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.53759766, + "step": 351, + "time_per_iteration": 2.837585210800171 + }, + { + "auxiliary_loss_clip": 0.01852525, + "auxiliary_loss_mlp": 0.0109203, + "balance_loss_clip": 1.49478889, + "balance_loss_mlp": 1.03922033, + "epoch": 0.021163384939125205, + "flos": 14687064973440.0, + "grad_norm": 2.0547763392828995, + "language_loss": 0.89186502, + "learning_rate": 3.775311735671078e-06, + "loss": 0.9213106, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.52807617, + "step": 352, + "time_per_iteration": 2.8063244819641113 + }, + { + "auxiliary_loss_clip": 0.01872896, + "auxiliary_loss_mlp": 0.01109095, + "balance_loss_clip": 1.50681651, + "balance_loss_mlp": 1.05111158, + "epoch": 0.021223508191793177, + "flos": 24497862509520.0, + "grad_norm": 2.8247000168046923, + "language_loss": 0.83290184, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.8627218, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.58007812, + "step": 353, + "time_per_iteration": 2.851614475250244 + }, + { + "auxiliary_loss_clip": 0.01844205, + "auxiliary_loss_mlp": 0.01102611, + "balance_loss_clip": 1.49187493, + "balance_loss_mlp": 1.05003929, + "epoch": 0.021283631444461146, + "flos": 24131420047440.0, + "grad_norm": 1.957105356116701, + "language_loss": 0.81733668, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.84680486, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 0.52563477, + "step": 354, + "time_per_iteration": 2.90939998626709 + }, + { + "auxiliary_loss_clip": 0.0187175, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_clip": 1.5083133, + "balance_loss_mlp": 1.03691578, + "epoch": 0.021343754697129114, + "flos": 25197749818200.0, + "grad_norm": 2.065876913128362, + "language_loss": 0.82169342, + "learning_rate": 3.780775860546545e-06, + "loss": 0.85128248, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.50244141, + "step": 355, + "time_per_iteration": 2.8999602794647217 + }, + { + "auxiliary_loss_clip": 0.01876017, + "auxiliary_loss_mlp": 0.01101556, + "balance_loss_clip": 1.5131433, + "balance_loss_mlp": 1.05048704, + "epoch": 0.021403877949797083, + "flos": 17278581269160.0, + "grad_norm": 3.0237166790732815, + "language_loss": 0.91160023, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.94137591, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.51123047, + "step": 356, + "time_per_iteration": 2.8523576259613037 + }, + { + "auxiliary_loss_clip": 0.01853999, + "auxiliary_loss_mlp": 0.01088321, + "balance_loss_clip": 1.49615276, + "balance_loss_mlp": 1.04073262, + "epoch": 0.021464001202465055, + "flos": 30923299031400.0, + "grad_norm": 2.1752521490922994, + "language_loss": 0.81335473, + "learning_rate": 3.784393017158528e-06, + "loss": 0.84277791, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.47607422, + "step": 357, + "time_per_iteration": 2.9413886070251465 + }, + { + "auxiliary_loss_clip": 0.01860679, + "auxiliary_loss_mlp": 0.01089513, + "balance_loss_clip": 1.49906206, + "balance_loss_mlp": 1.04247332, + "epoch": 0.021524124455133024, + "flos": 18190910846880.0, + "grad_norm": 2.472632085310012, + "language_loss": 0.77891308, + "learning_rate": 3.786194003461506e-06, + "loss": 0.808415, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.47021484, + "step": 358, + "time_per_iteration": 2.8190219402313232 + }, + { + "auxiliary_loss_clip": 0.0186526, + "auxiliary_loss_mlp": 0.01095255, + "balance_loss_clip": 1.50424814, + "balance_loss_mlp": 1.04306519, + "epoch": 0.021584247707800992, + "flos": 13809438562320.0, + "grad_norm": 2.6294064137872426, + "language_loss": 0.90840322, + "learning_rate": 3.787989966086264e-06, + "loss": 0.93800837, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 0.5222168, + "step": 359, + "time_per_iteration": 2.864478588104248 + }, + { + "auxiliary_loss_clip": 0.01876484, + "auxiliary_loss_mlp": 0.01106715, + "balance_loss_clip": 1.50912178, + "balance_loss_mlp": 1.05597985, + "epoch": 0.02164437096046896, + "flos": 23299948360440.0, + "grad_norm": 2.5405746665075437, + "language_loss": 0.78262913, + "learning_rate": 3.789780932980997e-06, + "loss": 0.81246114, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 3.67382812, + "router_z_loss_mlp": 0.50683594, + "step": 360, + "time_per_iteration": 2.842863082885742 + }, + { + "auxiliary_loss_clip": 0.01674504, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.43842411, + "balance_loss_mlp": 1.01176059, + "epoch": 0.02170449421313693, + "flos": 68915225976840.0, + "grad_norm": 0.84245149795129, + "language_loss": 0.64960945, + "learning_rate": 3.79156693186132e-06, + "loss": 0.6766485, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.17675781, + "step": 361, + "time_per_iteration": 3.4030632972717285 + }, + { + "auxiliary_loss_clip": 0.01854505, + "auxiliary_loss_mlp": 0.01085488, + "balance_loss_clip": 1.49598145, + "balance_loss_mlp": 1.03696954, + "epoch": 0.0217646174658049, + "flos": 25234036710840.0, + "grad_norm": 3.9318900545161073, + "language_loss": 0.81211734, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.84151727, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.4855957, + "step": 362, + "time_per_iteration": 2.843902111053467 + }, + { + "auxiliary_loss_clip": 0.01861463, + "auxiliary_loss_mlp": 0.0108482, + "balance_loss_clip": 1.49900305, + "balance_loss_mlp": 1.03735065, + "epoch": 0.02182474071847287, + "flos": 22898274823080.0, + "grad_norm": 2.344794815283622, + "language_loss": 0.93549263, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.96495551, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.47460938, + "step": 363, + "time_per_iteration": 2.7851405143737793 + }, + { + "auxiliary_loss_clip": 0.01854949, + "auxiliary_loss_mlp": 0.01096496, + "balance_loss_clip": 1.50149751, + "balance_loss_mlp": 1.04740572, + "epoch": 0.02188486397114084, + "flos": 23664197971080.0, + "grad_norm": 2.231719705995846, + "language_loss": 0.91490668, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.94442117, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.49072266, + "step": 364, + "time_per_iteration": 2.8147404193878174 + }, + { + "auxiliary_loss_clip": 0.01865596, + "auxiliary_loss_mlp": 0.01100451, + "balance_loss_clip": 1.50751054, + "balance_loss_mlp": 1.04880941, + "epoch": 0.021944987223808807, + "flos": 21548675157480.0, + "grad_norm": 2.4099780763435605, + "language_loss": 0.80589449, + "learning_rate": 3.798661793553676e-06, + "loss": 0.83555502, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.51635742, + "step": 365, + "time_per_iteration": 2.807121753692627 + }, + { + "auxiliary_loss_clip": 0.01844545, + "auxiliary_loss_mlp": 0.01102783, + "balance_loss_clip": 1.49899662, + "balance_loss_mlp": 1.0537883, + "epoch": 0.022005110476476776, + "flos": 16075225599840.0, + "grad_norm": 6.4572306290638455, + "language_loss": 0.85896027, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.88843358, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.49047852, + "step": 366, + "time_per_iteration": 7.461624383926392 + }, + { + "auxiliary_loss_clip": 0.01865689, + "auxiliary_loss_mlp": 0.01089785, + "balance_loss_clip": 1.50386405, + "balance_loss_mlp": 1.04295945, + "epoch": 0.022065233729144748, + "flos": 21438677445480.0, + "grad_norm": 2.0197938488172227, + "language_loss": 0.89213371, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.92168844, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.46875, + "step": 367, + "time_per_iteration": 2.8768980503082275 + }, + { + "auxiliary_loss_clip": 0.01866164, + "auxiliary_loss_mlp": 0.01114098, + "balance_loss_clip": 1.50454831, + "balance_loss_mlp": 1.05890441, + "epoch": 0.022125356981812717, + "flos": 21548756374200.0, + "grad_norm": 2.878533690240504, + "language_loss": 0.8673439, + "learning_rate": 3.803932100062912e-06, + "loss": 0.89714652, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.55224609, + "step": 368, + "time_per_iteration": 2.8413188457489014 + }, + { + "auxiliary_loss_clip": 0.01884205, + "auxiliary_loss_mlp": 0.01103844, + "balance_loss_clip": 1.50358617, + "balance_loss_mlp": 1.0567807, + "epoch": 0.022185480234480685, + "flos": 20709041190120.0, + "grad_norm": 3.634824186648675, + "language_loss": 0.78539956, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.81528002, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.47021484, + "step": 369, + "time_per_iteration": 2.8124821186065674 + }, + { + "auxiliary_loss_clip": 0.01853683, + "auxiliary_loss_mlp": 0.01097097, + "balance_loss_clip": 1.4943316, + "balance_loss_mlp": 1.04640889, + "epoch": 0.022245603487148654, + "flos": 25198846243920.0, + "grad_norm": 2.273574321158616, + "language_loss": 0.84165204, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.87115979, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.50683594, + "step": 370, + "time_per_iteration": 2.8635780811309814 + }, + { + "auxiliary_loss_clip": 0.01853472, + "auxiliary_loss_mlp": 0.010897, + "balance_loss_clip": 1.4969486, + "balance_loss_mlp": 1.04299426, + "epoch": 0.022305726739816623, + "flos": 21400725610080.0, + "grad_norm": 1.6597131773067588, + "language_loss": 0.83164024, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.86107194, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.46728516, + "step": 371, + "time_per_iteration": 2.840500593185425 + }, + { + "auxiliary_loss_clip": 0.01860008, + "auxiliary_loss_mlp": 0.01093798, + "balance_loss_clip": 1.50405419, + "balance_loss_mlp": 1.04408801, + "epoch": 0.022365849992484595, + "flos": 22497291627840.0, + "grad_norm": 2.1498103692835913, + "language_loss": 0.84769487, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.87723291, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.49804688, + "step": 372, + "time_per_iteration": 2.8250980377197266 + }, + { + "auxiliary_loss_clip": 0.01849669, + "auxiliary_loss_mlp": 0.01095519, + "balance_loss_clip": 1.49696219, + "balance_loss_mlp": 1.04814577, + "epoch": 0.022425973245152563, + "flos": 17860877102520.0, + "grad_norm": 2.496783164289982, + "language_loss": 0.80964988, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.83910179, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.47387695, + "step": 373, + "time_per_iteration": 2.8229820728302 + }, + { + "auxiliary_loss_clip": 0.01859888, + "auxiliary_loss_mlp": 0.01101648, + "balance_loss_clip": 1.50001502, + "balance_loss_mlp": 1.05227196, + "epoch": 0.022486096497820532, + "flos": 15486757295760.0, + "grad_norm": 2.721699328711289, + "language_loss": 0.79815787, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.82777327, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.49389648, + "step": 374, + "time_per_iteration": 2.7755370140075684 + }, + { + "auxiliary_loss_clip": 0.0184167, + "auxiliary_loss_mlp": 0.01080638, + "balance_loss_clip": 1.48509419, + "balance_loss_mlp": 1.02909184, + "epoch": 0.0225462197504885, + "flos": 27790200106200.0, + "grad_norm": 1.828389925802028, + "language_loss": 0.87281305, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.90203613, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.51513672, + "step": 375, + "time_per_iteration": 2.8964481353759766 + }, + { + "auxiliary_loss_clip": 0.01856096, + "auxiliary_loss_mlp": 0.01092281, + "balance_loss_clip": 1.49910772, + "balance_loss_mlp": 1.04249871, + "epoch": 0.02260634300315647, + "flos": 19980420143760.0, + "grad_norm": 2.237893033501127, + "language_loss": 0.8947854, + "learning_rate": 3.817778917253314e-06, + "loss": 0.9242692, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.49780273, + "step": 376, + "time_per_iteration": 2.8110220432281494 + }, + { + "auxiliary_loss_clip": 0.01855398, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.48920429, + "balance_loss_mlp": 1.04792738, + "epoch": 0.02266646625582444, + "flos": 16031710419120.0, + "grad_norm": 2.2466635784801303, + "language_loss": 0.7688868, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.79842687, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.50683594, + "step": 377, + "time_per_iteration": 2.7773852348327637 + }, + { + "auxiliary_loss_clip": 0.01835289, + "auxiliary_loss_mlp": 0.01082247, + "balance_loss_clip": 1.49225426, + "balance_loss_mlp": 1.03472972, + "epoch": 0.02272658950849241, + "flos": 20408472133920.0, + "grad_norm": 2.247646670298807, + "language_loss": 1.00279224, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.03196764, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.47558594, + "step": 378, + "time_per_iteration": 2.900880813598633 + }, + { + "auxiliary_loss_clip": 0.01652155, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.43027806, + "balance_loss_mlp": 1.011657, + "epoch": 0.02278671276116038, + "flos": 69862989671640.0, + "grad_norm": 1.0206239181367716, + "language_loss": 0.7527678, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77962053, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.21484375, + "step": 379, + "time_per_iteration": 3.349172353744507 + }, + { + "auxiliary_loss_clip": 0.01850406, + "auxiliary_loss_mlp": 0.01083915, + "balance_loss_clip": 1.4855262, + "balance_loss_mlp": 1.03885412, + "epoch": 0.022846836013828347, + "flos": 38515926155040.0, + "grad_norm": 2.6426716762115734, + "language_loss": 0.79879832, + "learning_rate": 3.824592231451859e-06, + "loss": 0.82814157, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.45092773, + "step": 380, + "time_per_iteration": 2.9773244857788086 + }, + { + "auxiliary_loss_clip": 0.0183668, + "auxiliary_loss_mlp": 0.01092404, + "balance_loss_clip": 1.48876798, + "balance_loss_mlp": 1.04584038, + "epoch": 0.02290695926649632, + "flos": 20964389514480.0, + "grad_norm": 2.7605817693416586, + "language_loss": 0.98483682, + "learning_rate": 3.826284353801652e-06, + "loss": 1.01412773, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.46533203, + "step": 381, + "time_per_iteration": 2.799781322479248 + }, + { + "auxiliary_loss_clip": 0.01839726, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_clip": 1.48638773, + "balance_loss_mlp": 1.04256749, + "epoch": 0.022967082519164288, + "flos": 24027472981080.0, + "grad_norm": 2.5237395385062604, + "language_loss": 0.89068043, + "learning_rate": 3.827972040701142e-06, + "loss": 0.92005205, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.54833984, + "step": 382, + "time_per_iteration": 2.8496499061584473 + }, + { + "auxiliary_loss_clip": 0.01822237, + "auxiliary_loss_mlp": 0.01093608, + "balance_loss_clip": 1.47646165, + "balance_loss_mlp": 1.05081224, + "epoch": 0.023027205771832256, + "flos": 21002219524800.0, + "grad_norm": 6.968696958098711, + "language_loss": 0.86065048, + "learning_rate": 3.829655315342268e-06, + "loss": 0.88980889, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.42773438, + "step": 383, + "time_per_iteration": 2.79717755317688 + }, + { + "auxiliary_loss_clip": 0.01824857, + "auxiliary_loss_mlp": 0.01097166, + "balance_loss_clip": 1.47860765, + "balance_loss_mlp": 1.05031669, + "epoch": 0.023087329024500225, + "flos": 21365778793320.0, + "grad_norm": 3.0908919985841607, + "language_loss": 0.83997905, + "learning_rate": 3.831334200735543e-06, + "loss": 0.86919928, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.46875, + "step": 384, + "time_per_iteration": 2.803321123123169 + }, + { + "auxiliary_loss_clip": 0.01815573, + "auxiliary_loss_mlp": 0.01084948, + "balance_loss_clip": 1.48206067, + "balance_loss_mlp": 1.04310536, + "epoch": 0.023147452277168194, + "flos": 21877571867760.0, + "grad_norm": 1.8501336151429921, + "language_loss": 0.89179862, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.92080379, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.41845703, + "step": 385, + "time_per_iteration": 2.8687210083007812 + }, + { + "auxiliary_loss_clip": 0.01820232, + "auxiliary_loss_mlp": 0.0112162, + "balance_loss_clip": 1.47667992, + "balance_loss_mlp": 1.07894266, + "epoch": 0.023207575529836166, + "flos": 18921399877800.0, + "grad_norm": 1.7775715844480402, + "language_loss": 0.70194608, + "learning_rate": 3.83467889492477e-06, + "loss": 0.73136461, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.42675781, + "step": 386, + "time_per_iteration": 2.8209969997406006 + }, + { + "auxiliary_loss_clip": 0.0182306, + "auxiliary_loss_mlp": 0.01087359, + "balance_loss_clip": 1.47285652, + "balance_loss_mlp": 1.04062867, + "epoch": 0.023267698782504134, + "flos": 25051465213560.0, + "grad_norm": 3.5239723608675613, + "language_loss": 0.89182538, + "learning_rate": 3.836344748851495e-06, + "loss": 0.92092955, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.4675293, + "step": 387, + "time_per_iteration": 2.901883840560913 + }, + { + "auxiliary_loss_clip": 0.0183074, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_clip": 1.48258018, + "balance_loss_mlp": 1.04133844, + "epoch": 0.023327822035172103, + "flos": 28885466656440.0, + "grad_norm": 2.2772929962662616, + "language_loss": 0.85104793, + "learning_rate": 3.838006303795566e-06, + "loss": 0.88024342, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.47460938, + "step": 388, + "time_per_iteration": 2.926150321960449 + }, + { + "auxiliary_loss_clip": 0.01826658, + "auxiliary_loss_mlp": 0.01092291, + "balance_loss_clip": 1.47191048, + "balance_loss_mlp": 1.04904199, + "epoch": 0.02338794528784007, + "flos": 27126721515240.0, + "grad_norm": 2.829430854732123, + "language_loss": 0.95849407, + "learning_rate": 3.839663581888206e-06, + "loss": 0.98768353, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 0.43261719, + "step": 389, + "time_per_iteration": 2.868727445602417 + }, + { + "auxiliary_loss_clip": 0.0181565, + "auxiliary_loss_mlp": 0.0108912, + "balance_loss_clip": 1.48068213, + "balance_loss_mlp": 1.0489223, + "epoch": 0.02344806854050804, + "flos": 21327055399080.0, + "grad_norm": 2.1278036987223206, + "language_loss": 0.88206977, + "learning_rate": 3.841316605090178e-06, + "loss": 0.91111743, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.40234375, + "step": 390, + "time_per_iteration": 2.897918224334717 + }, + { + "auxiliary_loss_clip": 0.01814404, + "auxiliary_loss_mlp": 0.01090625, + "balance_loss_clip": 1.47138762, + "balance_loss_mlp": 1.04751945, + "epoch": 0.023508191793176012, + "flos": 24795345330360.0, + "grad_norm": 3.084355559793433, + "language_loss": 0.91123748, + "learning_rate": 3.842965395193529e-06, + "loss": 0.94028777, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.4309082, + "step": 391, + "time_per_iteration": 2.84867262840271 + }, + { + "auxiliary_loss_clip": 0.01822767, + "auxiliary_loss_mlp": 0.01100904, + "balance_loss_clip": 1.47458816, + "balance_loss_mlp": 1.05741632, + "epoch": 0.02356831504584398, + "flos": 26001056284560.0, + "grad_norm": 2.078677329953683, + "language_loss": 0.87924623, + "learning_rate": 3.84460997382332e-06, + "loss": 0.90848291, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.43505859, + "step": 392, + "time_per_iteration": 2.8435795307159424 + }, + { + "auxiliary_loss_clip": 0.01808786, + "auxiliary_loss_mlp": 0.01095715, + "balance_loss_clip": 1.47894752, + "balance_loss_mlp": 1.05260921, + "epoch": 0.02362843829851195, + "flos": 19067156573760.0, + "grad_norm": 2.251204349488416, + "language_loss": 0.9016822, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.93072718, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.4309082, + "step": 393, + "time_per_iteration": 2.8599202632904053 + }, + { + "auxiliary_loss_clip": 0.01814982, + "auxiliary_loss_mlp": 0.01105718, + "balance_loss_clip": 1.47657418, + "balance_loss_mlp": 1.06387544, + "epoch": 0.023688561551179918, + "flos": 16075388033280.0, + "grad_norm": 1.8215904976516688, + "language_loss": 0.82039511, + "learning_rate": 3.84788658233771e-06, + "loss": 0.8496021, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.41870117, + "step": 394, + "time_per_iteration": 2.8208329677581787 + }, + { + "auxiliary_loss_clip": 0.01802773, + "auxiliary_loss_mlp": 0.01089901, + "balance_loss_clip": 1.46464324, + "balance_loss_mlp": 1.04195452, + "epoch": 0.023748684803847887, + "flos": 21729216236760.0, + "grad_norm": 4.263053192690436, + "language_loss": 0.8667357, + "learning_rate": 3.84951865465269e-06, + "loss": 0.89566243, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.47924805, + "step": 395, + "time_per_iteration": 2.794889211654663 + }, + { + "auxiliary_loss_clip": 0.0166156, + "auxiliary_loss_mlp": 0.01079592, + "balance_loss_clip": 1.44692373, + "balance_loss_mlp": 1.05699027, + "epoch": 0.02380880805651586, + "flos": 61940328803640.0, + "grad_norm": 0.95266663334423, + "language_loss": 0.63833612, + "learning_rate": 3.851146600358172e-06, + "loss": 0.6657477, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.22558594, + "step": 396, + "time_per_iteration": 3.119494676589966 + }, + { + "auxiliary_loss_clip": 0.01796127, + "auxiliary_loss_mlp": 0.01075846, + "balance_loss_clip": 1.46476102, + "balance_loss_mlp": 1.03607821, + "epoch": 0.023868931309183827, + "flos": 20271161976840.0, + "grad_norm": 2.7314960012953398, + "language_loss": 0.86471272, + "learning_rate": 3.852770440269372e-06, + "loss": 0.8934325, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.39794922, + "step": 397, + "time_per_iteration": 2.79596209526062 + }, + { + "auxiliary_loss_clip": 0.01815962, + "auxiliary_loss_mlp": 0.01084471, + "balance_loss_clip": 1.4713037, + "balance_loss_mlp": 1.04289079, + "epoch": 0.023929054561851796, + "flos": 21143671734600.0, + "grad_norm": 1.91853837008087, + "language_loss": 0.85595083, + "learning_rate": 3.854390195044404e-06, + "loss": 0.88495511, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.41625977, + "step": 398, + "time_per_iteration": 2.7807557582855225 + }, + { + "auxiliary_loss_clip": 0.01805069, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_clip": 1.46598387, + "balance_loss_mlp": 1.03111196, + "epoch": 0.023989177814519765, + "flos": 13702445868960.0, + "grad_norm": 2.753852384780746, + "language_loss": 0.88516992, + "learning_rate": 3.856005885185868e-06, + "loss": 0.91395301, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.42114258, + "step": 399, + "time_per_iteration": 2.81380295753479 + }, + { + "auxiliary_loss_clip": 0.01790118, + "auxiliary_loss_mlp": 0.01095633, + "balance_loss_clip": 1.46540499, + "balance_loss_mlp": 1.05593634, + "epoch": 0.024049301067187733, + "flos": 26327516493240.0, + "grad_norm": 2.1149753836747833, + "language_loss": 0.86994994, + "learning_rate": 3.857617531042398e-06, + "loss": 0.89880753, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.3972168, + "step": 400, + "time_per_iteration": 2.841519594192505 + }, + { + "auxiliary_loss_clip": 0.01794689, + "auxiliary_loss_mlp": 0.01092823, + "balance_loss_clip": 1.46555948, + "balance_loss_mlp": 1.05276871, + "epoch": 0.024109424319855705, + "flos": 24430770852840.0, + "grad_norm": 2.046307884279544, + "language_loss": 0.80314076, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.83201587, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.40087891, + "step": 401, + "time_per_iteration": 2.8401451110839844 + }, + { + "auxiliary_loss_clip": 0.01794123, + "auxiliary_loss_mlp": 0.0109309, + "balance_loss_clip": 1.46740711, + "balance_loss_mlp": 1.05005574, + "epoch": 0.024169547572523674, + "flos": 29610148691880.0, + "grad_norm": 1.9825046048373107, + "language_loss": 0.79867548, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.82754755, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.43041992, + "step": 402, + "time_per_iteration": 2.8777852058410645 + }, + { + "auxiliary_loss_clip": 0.01800844, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_clip": 1.45900655, + "balance_loss_mlp": 1.04923153, + "epoch": 0.024229670825191642, + "flos": 22606598997720.0, + "grad_norm": 2.5101204175984337, + "language_loss": 0.96471083, + "learning_rate": 3.86242840411147e-06, + "loss": 0.99363744, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.42553711, + "step": 403, + "time_per_iteration": 2.818066120147705 + }, + { + "auxiliary_loss_clip": 0.01801511, + "auxiliary_loss_mlp": 0.01090417, + "balance_loss_clip": 1.46238673, + "balance_loss_mlp": 1.04044497, + "epoch": 0.02428979407785961, + "flos": 18154826996040.0, + "grad_norm": 2.408397264056825, + "language_loss": 1.00560975, + "learning_rate": 3.864024073288798e-06, + "loss": 1.03452909, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.49951172, + "step": 404, + "time_per_iteration": 4.465649366378784 + }, + { + "auxiliary_loss_clip": 0.01797492, + "auxiliary_loss_mlp": 0.01102859, + "balance_loss_clip": 1.46280789, + "balance_loss_mlp": 1.06216133, + "epoch": 0.024349917330527583, + "flos": 15309261843480.0, + "grad_norm": 2.487534077776594, + "language_loss": 0.89083993, + "learning_rate": 3.865615797668091e-06, + "loss": 0.91984344, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.40698242, + "step": 405, + "time_per_iteration": 4.290499210357666 + }, + { + "auxiliary_loss_clip": 0.01812943, + "auxiliary_loss_mlp": 0.01112675, + "balance_loss_clip": 1.47029424, + "balance_loss_mlp": 1.06363201, + "epoch": 0.024410040583195552, + "flos": 20778366306600.0, + "grad_norm": 2.155653863052072, + "language_loss": 0.9508214, + "learning_rate": 3.867203596705844e-06, + "loss": 0.98007768, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.49047852, + "step": 406, + "time_per_iteration": 4.377735376358032 + }, + { + "auxiliary_loss_clip": 0.01793007, + "auxiliary_loss_mlp": 0.01090582, + "balance_loss_clip": 1.45827866, + "balance_loss_mlp": 1.04814291, + "epoch": 0.02447016383586352, + "flos": 21803901656760.0, + "grad_norm": 2.1115887800486064, + "language_loss": 0.88837355, + "learning_rate": 3.86878748971496e-06, + "loss": 0.91720945, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.42480469, + "step": 407, + "time_per_iteration": 2.830195903778076 + }, + { + "auxiliary_loss_clip": 0.01784722, + "auxiliary_loss_mlp": 0.01079038, + "balance_loss_clip": 1.45948803, + "balance_loss_mlp": 1.04201174, + "epoch": 0.02453028708853149, + "flos": 33954684350040.0, + "grad_norm": 1.9902130061920191, + "language_loss": 0.74346101, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.7720986, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.37036133, + "step": 408, + "time_per_iteration": 3.0513176918029785 + }, + { + "auxiliary_loss_clip": 0.01801597, + "auxiliary_loss_mlp": 0.01088893, + "balance_loss_clip": 1.46067405, + "balance_loss_mlp": 1.04516709, + "epoch": 0.024590410341199458, + "flos": 21797282494080.0, + "grad_norm": 2.5851500172248296, + "language_loss": 0.9368487, + "learning_rate": 3.871943634189376e-06, + "loss": 0.96575367, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.43725586, + "step": 409, + "time_per_iteration": 2.862905502319336 + }, + { + "auxiliary_loss_clip": 0.01809979, + "auxiliary_loss_mlp": 0.01076415, + "balance_loss_clip": 1.46678805, + "balance_loss_mlp": 1.03924537, + "epoch": 0.02465053359386743, + "flos": 35121387651480.0, + "grad_norm": 3.2764817656894136, + "language_loss": 0.8406136, + "learning_rate": 3.873515923575128e-06, + "loss": 0.86947751, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37182617, + "step": 410, + "time_per_iteration": 2.992888927459717 + }, + { + "auxiliary_loss_clip": 0.01808357, + "auxiliary_loss_mlp": 0.01095542, + "balance_loss_clip": 1.47107029, + "balance_loss_mlp": 1.05217361, + "epoch": 0.0247106568465354, + "flos": 27456836476320.0, + "grad_norm": 3.504693413556288, + "language_loss": 0.7920385, + "learning_rate": 3.875084382775879e-06, + "loss": 0.82107747, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.43359375, + "step": 411, + "time_per_iteration": 2.923435926437378 + }, + { + "auxiliary_loss_clip": 0.01804976, + "auxiliary_loss_mlp": 0.01099534, + "balance_loss_clip": 1.46677589, + "balance_loss_mlp": 1.05175447, + "epoch": 0.024770780099203367, + "flos": 20708635106520.0, + "grad_norm": 2.4706241635894015, + "language_loss": 0.87667751, + "learning_rate": 3.87664903040738e-06, + "loss": 0.90572262, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.47802734, + "step": 412, + "time_per_iteration": 2.858708143234253 + }, + { + "auxiliary_loss_clip": 0.01639395, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.43227077, + "balance_loss_mlp": 1.01194942, + "epoch": 0.024830903351871336, + "flos": 69567009360120.0, + "grad_norm": 0.8491462684296677, + "language_loss": 0.58467263, + "learning_rate": 3.878209884949994e-06, + "loss": 0.61137301, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18652344, + "step": 413, + "time_per_iteration": 3.450713634490967 + }, + { + "auxiliary_loss_clip": 0.01799937, + "auxiliary_loss_mlp": 0.0108808, + "balance_loss_clip": 1.46289563, + "balance_loss_mlp": 1.04759693, + "epoch": 0.024891026604539304, + "flos": 32276918924640.0, + "grad_norm": 1.892468416863276, + "language_loss": 0.8216964, + "learning_rate": 3.879766964750006e-06, + "loss": 0.85057664, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.40478516, + "step": 414, + "time_per_iteration": 2.9093406200408936 + }, + { + "auxiliary_loss_clip": 0.01798855, + "auxiliary_loss_mlp": 0.01102423, + "balance_loss_clip": 1.47409558, + "balance_loss_mlp": 1.06170106, + "epoch": 0.024951149857207276, + "flos": 18844521606360.0, + "grad_norm": 1.9278868762453791, + "language_loss": 0.81890666, + "learning_rate": 3.881320288020917e-06, + "loss": 0.84791946, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.40698242, + "step": 415, + "time_per_iteration": 2.8072478771209717 + }, + { + "auxiliary_loss_clip": 0.01833879, + "auxiliary_loss_mlp": 0.01097816, + "balance_loss_clip": 1.47694099, + "balance_loss_mlp": 1.05656946, + "epoch": 0.025011273109875245, + "flos": 15381388936800.0, + "grad_norm": 2.639102802211356, + "language_loss": 0.98384321, + "learning_rate": 3.882869872844723e-06, + "loss": 1.01316011, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.41259766, + "step": 416, + "time_per_iteration": 2.841568946838379 + }, + { + "auxiliary_loss_clip": 0.01817677, + "auxiliary_loss_mlp": 0.01107944, + "balance_loss_clip": 1.47447813, + "balance_loss_mlp": 1.05513442, + "epoch": 0.025071396362543213, + "flos": 18920059801920.0, + "grad_norm": 1.7625602796501918, + "language_loss": 0.7791822, + "learning_rate": 3.884415737173176e-06, + "loss": 0.80843842, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.52783203, + "step": 417, + "time_per_iteration": 2.86099910736084 + }, + { + "auxiliary_loss_clip": 0.01803852, + "auxiliary_loss_mlp": 0.01101495, + "balance_loss_clip": 1.47568107, + "balance_loss_mlp": 1.06427824, + "epoch": 0.025131519615211182, + "flos": 25343465905800.0, + "grad_norm": 1.650571264062608, + "language_loss": 0.78368509, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.81273854, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.37255859, + "step": 418, + "time_per_iteration": 2.905170202255249 + }, + { + "auxiliary_loss_clip": 0.01822751, + "auxiliary_loss_mlp": 0.01117033, + "balance_loss_clip": 1.47693443, + "balance_loss_mlp": 1.07566738, + "epoch": 0.02519164286787915, + "flos": 18957849203880.0, + "grad_norm": 2.77477780670549, + "language_loss": 0.83907652, + "learning_rate": 3.887496375507294e-06, + "loss": 0.86847436, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.41357422, + "step": 419, + "time_per_iteration": 2.7807719707489014 + }, + { + "auxiliary_loss_clip": 0.01806787, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_clip": 1.47120833, + "balance_loss_mlp": 1.05510211, + "epoch": 0.025251766120547123, + "flos": 17425637432640.0, + "grad_norm": 2.0537303451140776, + "language_loss": 0.74558401, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.77461195, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.40917969, + "step": 420, + "time_per_iteration": 2.816929817199707 + }, + { + "auxiliary_loss_clip": 0.01813187, + "auxiliary_loss_mlp": 0.01104213, + "balance_loss_clip": 1.47328651, + "balance_loss_mlp": 1.06427836, + "epoch": 0.02531188937321509, + "flos": 25050815479800.0, + "grad_norm": 1.8098564024178274, + "language_loss": 0.79834342, + "learning_rate": 3.890562344079484e-06, + "loss": 0.82751739, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.39916992, + "step": 421, + "time_per_iteration": 2.797628402709961 + }, + { + "auxiliary_loss_clip": 0.01816584, + "auxiliary_loss_mlp": 0.01101657, + "balance_loss_clip": 1.47819972, + "balance_loss_mlp": 1.05919504, + "epoch": 0.02537201262588306, + "flos": 30598463157120.0, + "grad_norm": 2.2176870829185193, + "language_loss": 0.83312941, + "learning_rate": 3.89208987073549e-06, + "loss": 0.86231178, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.42480469, + "step": 422, + "time_per_iteration": 2.89094877243042 + }, + { + "auxiliary_loss_clip": 0.01815836, + "auxiliary_loss_mlp": 0.01080764, + "balance_loss_clip": 1.47284317, + "balance_loss_mlp": 1.04221201, + "epoch": 0.02543213587855103, + "flos": 26070665659560.0, + "grad_norm": 1.6972911410231266, + "language_loss": 0.84536141, + "learning_rate": 3.893613781940409e-06, + "loss": 0.87432742, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.38549805, + "step": 423, + "time_per_iteration": 2.929533004760742 + }, + { + "auxiliary_loss_clip": 0.01811354, + "auxiliary_loss_mlp": 0.01090301, + "balance_loss_clip": 1.47662449, + "balance_loss_mlp": 1.05029464, + "epoch": 0.025492259131218997, + "flos": 36029006659440.0, + "grad_norm": 1.876265313783841, + "language_loss": 0.74723828, + "learning_rate": 3.895134094768415e-06, + "loss": 0.77625477, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.40014648, + "step": 424, + "time_per_iteration": 3.054236650466919 + }, + { + "auxiliary_loss_clip": 0.01820866, + "auxiliary_loss_mlp": 0.01091617, + "balance_loss_clip": 1.4788506, + "balance_loss_mlp": 1.05287373, + "epoch": 0.02555238238388697, + "flos": 18592665600960.0, + "grad_norm": 2.525202363886116, + "language_loss": 0.84600937, + "learning_rate": 3.896650826173015e-06, + "loss": 0.87513423, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.38745117, + "step": 425, + "time_per_iteration": 2.899630546569824 + }, + { + "auxiliary_loss_clip": 0.01814504, + "auxiliary_loss_mlp": 0.01088063, + "balance_loss_clip": 1.47393656, + "balance_loss_mlp": 1.04083228, + "epoch": 0.025612505636554938, + "flos": 24248524222440.0, + "grad_norm": 2.258227667925071, + "language_loss": 0.86373818, + "learning_rate": 3.898163992988186e-06, + "loss": 0.89276385, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.47241211, + "step": 426, + "time_per_iteration": 2.9902398586273193 + }, + { + "auxiliary_loss_clip": 0.01608438, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.39991891, + "balance_loss_mlp": 1.0060662, + "epoch": 0.025672628889222907, + "flos": 60602302520640.0, + "grad_norm": 0.9118068946700559, + "language_loss": 0.57229799, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59863567, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.19238281, + "step": 427, + "time_per_iteration": 3.446822166442871 + }, + { + "auxiliary_loss_clip": 0.01815641, + "auxiliary_loss_mlp": 0.01083248, + "balance_loss_clip": 1.48157215, + "balance_loss_mlp": 1.04240704, + "epoch": 0.025732752141890875, + "flos": 19578015655920.0, + "grad_norm": 2.6070483485016767, + "language_loss": 0.90715277, + "learning_rate": 3.901179699595194e-06, + "loss": 0.93614161, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.40844727, + "step": 428, + "time_per_iteration": 2.916853427886963 + }, + { + "auxiliary_loss_clip": 0.01804627, + "auxiliary_loss_mlp": 0.01078732, + "balance_loss_clip": 1.47711754, + "balance_loss_mlp": 1.03679454, + "epoch": 0.025792875394558847, + "flos": 31290431835600.0, + "grad_norm": 1.6728993530353284, + "language_loss": 0.86641717, + "learning_rate": 3.902682272467353e-06, + "loss": 0.8952508, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.41918945, + "step": 429, + "time_per_iteration": 2.9507346153259277 + }, + { + "auxiliary_loss_clip": 0.01828862, + "auxiliary_loss_mlp": 0.01084013, + "balance_loss_clip": 1.48266721, + "balance_loss_mlp": 1.03840399, + "epoch": 0.025852998647226816, + "flos": 32386023252720.0, + "grad_norm": 2.2324349301713085, + "language_loss": 0.89089203, + "learning_rate": 3.904181346912895e-06, + "loss": 0.92002076, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.45605469, + "step": 430, + "time_per_iteration": 3.08522891998291 + }, + { + "auxiliary_loss_clip": 0.01809442, + "auxiliary_loss_mlp": 0.01086476, + "balance_loss_clip": 1.48079824, + "balance_loss_mlp": 1.04523015, + "epoch": 0.025913121899894784, + "flos": 20198100891240.0, + "grad_norm": 4.363795668214834, + "language_loss": 0.84630275, + "learning_rate": 3.905676939184698e-06, + "loss": 0.87526202, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.41259766, + "step": 431, + "time_per_iteration": 2.9260036945343018 + }, + { + "auxiliary_loss_clip": 0.018066, + "auxiliary_loss_mlp": 0.01080948, + "balance_loss_clip": 1.47115254, + "balance_loss_mlp": 1.03974938, + "epoch": 0.025973245152562753, + "flos": 14724529508520.0, + "grad_norm": 4.286470849758868, + "language_loss": 0.89346933, + "learning_rate": 3.907169065422638e-06, + "loss": 0.9223448, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.41235352, + "step": 432, + "time_per_iteration": 2.8352627754211426 + }, + { + "auxiliary_loss_clip": 0.01819005, + "auxiliary_loss_mlp": 0.01075233, + "balance_loss_clip": 1.48340178, + "balance_loss_mlp": 1.03618038, + "epoch": 0.02603336840523072, + "flos": 30999202702200.0, + "grad_norm": 2.236795271174903, + "language_loss": 0.77026266, + "learning_rate": 3.908657741654636e-06, + "loss": 0.79920506, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.39038086, + "step": 433, + "time_per_iteration": 2.964000701904297 + }, + { + "auxiliary_loss_clip": 0.01816272, + "auxiliary_loss_mlp": 0.01085757, + "balance_loss_clip": 1.47799516, + "balance_loss_mlp": 1.04060018, + "epoch": 0.026093491657898694, + "flos": 17678549255400.0, + "grad_norm": 2.1729206898245197, + "language_loss": 0.90716499, + "learning_rate": 3.910142983797699e-06, + "loss": 0.93618524, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.4519043, + "step": 434, + "time_per_iteration": 2.8189680576324463 + }, + { + "auxiliary_loss_clip": 0.01817993, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_clip": 1.48390448, + "balance_loss_mlp": 1.05634904, + "epoch": 0.026153614910566662, + "flos": 17862339003480.0, + "grad_norm": 2.260140064542477, + "language_loss": 0.80774689, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.8369503, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.45996094, + "step": 435, + "time_per_iteration": 2.767894983291626 + }, + { + "auxiliary_loss_clip": 0.01818886, + "auxiliary_loss_mlp": 0.01091873, + "balance_loss_clip": 1.47588694, + "balance_loss_mlp": 1.04876733, + "epoch": 0.02621373816323463, + "flos": 20015935477560.0, + "grad_norm": 2.633814940118623, + "language_loss": 0.88015902, + "learning_rate": 3.913103228936546e-06, + "loss": 0.90926659, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.43115234, + "step": 436, + "time_per_iteration": 2.804853677749634 + }, + { + "auxiliary_loss_clip": 0.0181456, + "auxiliary_loss_mlp": 0.01091758, + "balance_loss_clip": 1.48119307, + "balance_loss_mlp": 1.05134583, + "epoch": 0.0262738614159026, + "flos": 19285893138600.0, + "grad_norm": 2.895578367494196, + "language_loss": 0.76367778, + "learning_rate": 3.914578263220868e-06, + "loss": 0.79274094, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.40429688, + "step": 437, + "time_per_iteration": 2.9521443843841553 + }, + { + "auxiliary_loss_clip": 0.01820154, + "auxiliary_loss_mlp": 0.01089677, + "balance_loss_clip": 1.48506832, + "balance_loss_mlp": 1.04540241, + "epoch": 0.026333984668570568, + "flos": 18811889466120.0, + "grad_norm": 3.019069930839196, + "language_loss": 0.9292506, + "learning_rate": 3.916049925995316e-06, + "loss": 0.95834893, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.44287109, + "step": 438, + "time_per_iteration": 2.833728313446045 + }, + { + "auxiliary_loss_clip": 0.01612395, + "auxiliary_loss_mlp": 0.01017301, + "balance_loss_clip": 1.40213418, + "balance_loss_mlp": 1.00061131, + "epoch": 0.02639410792123854, + "flos": 64588964080680.0, + "grad_norm": 0.8705073033207715, + "language_loss": 0.62535703, + "learning_rate": 3.917518232637377e-06, + "loss": 0.65165401, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.16699219, + "step": 439, + "time_per_iteration": 3.412036895751953 + }, + { + "auxiliary_loss_clip": 0.01833637, + "auxiliary_loss_mlp": 0.01097722, + "balance_loss_clip": 1.48591042, + "balance_loss_mlp": 1.05204093, + "epoch": 0.02645423117390651, + "flos": 28479082549320.0, + "grad_norm": 1.8010053111910962, + "language_loss": 0.76694477, + "learning_rate": 3.918983198419573e-06, + "loss": 0.79625839, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.45703125, + "step": 440, + "time_per_iteration": 2.875169515609741 + }, + { + "auxiliary_loss_clip": 0.01822684, + "auxiliary_loss_mlp": 0.01095205, + "balance_loss_clip": 1.48508, + "balance_loss_mlp": 1.0504775, + "epoch": 0.026514354426574478, + "flos": 18555810191280.0, + "grad_norm": 2.11093333685131, + "language_loss": 0.84346229, + "learning_rate": 3.920444838510415e-06, + "loss": 0.87264121, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.44726562, + "step": 441, + "time_per_iteration": 2.7605502605438232 + }, + { + "auxiliary_loss_clip": 0.01820309, + "auxiliary_loss_mlp": 0.0109074, + "balance_loss_clip": 1.47619176, + "balance_loss_mlp": 1.04682291, + "epoch": 0.026574477679242446, + "flos": 20672713689120.0, + "grad_norm": 2.462045759266024, + "language_loss": 0.794402, + "learning_rate": 3.92190316797534e-06, + "loss": 0.82351249, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.43920898, + "step": 442, + "time_per_iteration": 5.885108709335327 + }, + { + "auxiliary_loss_clip": 0.01606397, + "auxiliary_loss_mlp": 0.0102539, + "balance_loss_clip": 1.39777851, + "balance_loss_mlp": 1.00927246, + "epoch": 0.026634600931910415, + "flos": 57970641538080.0, + "grad_norm": 0.9553359490230798, + "language_loss": 0.64412546, + "learning_rate": 3.92335820177765e-06, + "loss": 0.67044342, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.16113281, + "step": 443, + "time_per_iteration": 3.185600996017456 + }, + { + "auxiliary_loss_clip": 0.01822723, + "auxiliary_loss_mlp": 0.01100001, + "balance_loss_clip": 1.481565, + "balance_loss_mlp": 1.05665624, + "epoch": 0.026694724184578387, + "flos": 15819389975160.0, + "grad_norm": 2.822718348453661, + "language_loss": 0.84068775, + "learning_rate": 3.924809954779425e-06, + "loss": 0.86991501, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.43383789, + "step": 444, + "time_per_iteration": 5.943375825881958 + }, + { + "auxiliary_loss_clip": 0.01828654, + "auxiliary_loss_mlp": 0.01099624, + "balance_loss_clip": 1.48276937, + "balance_loss_mlp": 1.05298913, + "epoch": 0.026754847437246355, + "flos": 23445461406240.0, + "grad_norm": 2.0565013290246634, + "language_loss": 0.96452701, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.99380982, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.46679688, + "step": 445, + "time_per_iteration": 3.1827898025512695 + }, + { + "auxiliary_loss_clip": 0.01818829, + "auxiliary_loss_mlp": 0.01115921, + "balance_loss_clip": 1.48130703, + "balance_loss_mlp": 1.06251478, + "epoch": 0.026814970689914324, + "flos": 17345510492400.0, + "grad_norm": 3.4453975920078226, + "language_loss": 0.93743408, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.96678156, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.53417969, + "step": 446, + "time_per_iteration": 2.971579074859619 + }, + { + "auxiliary_loss_clip": 0.01809752, + "auxiliary_loss_mlp": 0.01093397, + "balance_loss_clip": 1.47389758, + "balance_loss_mlp": 1.0494566, + "epoch": 0.026875093942582293, + "flos": 17899235021520.0, + "grad_norm": 2.0434368651332333, + "language_loss": 0.80882764, + "learning_rate": 3.92914567610317e-06, + "loss": 0.83785915, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.43920898, + "step": 447, + "time_per_iteration": 3.0134549140930176 + }, + { + "auxiliary_loss_clip": 0.01834359, + "auxiliary_loss_mlp": 0.01099223, + "balance_loss_clip": 1.48607337, + "balance_loss_mlp": 1.05740452, + "epoch": 0.026935217195250265, + "flos": 21728769544800.0, + "grad_norm": 2.031778779673967, + "language_loss": 0.87986326, + "learning_rate": 3.930584452530952e-06, + "loss": 0.90919912, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.41796875, + "step": 448, + "time_per_iteration": 3.1242733001708984 + }, + { + "auxiliary_loss_clip": 0.01797923, + "auxiliary_loss_mlp": 0.01118397, + "balance_loss_clip": 1.46830809, + "balance_loss_mlp": 1.07447994, + "epoch": 0.026995340447918233, + "flos": 23627911078440.0, + "grad_norm": 2.0321006247219824, + "language_loss": 0.89773357, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.92689681, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.43945312, + "step": 449, + "time_per_iteration": 2.9982526302337646 + }, + { + "auxiliary_loss_clip": 0.01824613, + "auxiliary_loss_mlp": 0.0109841, + "balance_loss_clip": 1.47587001, + "balance_loss_mlp": 1.05256224, + "epoch": 0.027055463700586202, + "flos": 17935806172680.0, + "grad_norm": 2.6842780945245575, + "language_loss": 0.82188046, + "learning_rate": 3.933452395729493e-06, + "loss": 0.8511107, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.45874023, + "step": 450, + "time_per_iteration": 2.941917896270752 + }, + { + "auxiliary_loss_clip": 0.01806751, + "auxiliary_loss_mlp": 0.01106144, + "balance_loss_clip": 1.47580099, + "balance_loss_mlp": 1.0588181, + "epoch": 0.02711558695325417, + "flos": 25124363865720.0, + "grad_norm": 1.6114180749619753, + "language_loss": 0.82088298, + "learning_rate": 3.934881590952304e-06, + "loss": 0.85001194, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.47338867, + "step": 451, + "time_per_iteration": 2.947930097579956 + }, + { + "auxiliary_loss_clip": 0.01796857, + "auxiliary_loss_mlp": 0.01097488, + "balance_loss_clip": 1.47403717, + "balance_loss_mlp": 1.05302286, + "epoch": 0.02717571020592214, + "flos": 24244422778080.0, + "grad_norm": 1.8100078678979667, + "language_loss": 0.77691907, + "learning_rate": 3.936307620734599e-06, + "loss": 0.80586255, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.44458008, + "step": 452, + "time_per_iteration": 3.0087733268737793 + }, + { + "auxiliary_loss_clip": 0.01802891, + "auxiliary_loss_mlp": 0.0110342, + "balance_loss_clip": 1.47608912, + "balance_loss_mlp": 1.05883598, + "epoch": 0.02723583345859011, + "flos": 25124404474080.0, + "grad_norm": 1.7555415500604628, + "language_loss": 0.73455566, + "learning_rate": 3.937730499067294e-06, + "loss": 0.76361877, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.44580078, + "step": 453, + "time_per_iteration": 2.932187557220459 + }, + { + "auxiliary_loss_clip": 0.01798044, + "auxiliary_loss_mlp": 0.01098067, + "balance_loss_clip": 1.46735144, + "balance_loss_mlp": 1.04237294, + "epoch": 0.02729595671125808, + "flos": 42749367675480.0, + "grad_norm": 1.8861311019950737, + "language_loss": 0.83385432, + "learning_rate": 3.939150239848748e-06, + "loss": 0.86281538, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.55761719, + "step": 454, + "time_per_iteration": 3.040224313735962 + }, + { + "auxiliary_loss_clip": 0.01792363, + "auxiliary_loss_mlp": 0.01084019, + "balance_loss_clip": 1.46264327, + "balance_loss_mlp": 1.04265392, + "epoch": 0.02735607996392605, + "flos": 21435591210120.0, + "grad_norm": 1.8401325515996454, + "language_loss": 0.75905204, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78781587, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.41357422, + "step": 455, + "time_per_iteration": 2.897864818572998 + }, + { + "auxiliary_loss_clip": 0.01804234, + "auxiliary_loss_mlp": 0.01088918, + "balance_loss_clip": 1.46648097, + "balance_loss_mlp": 1.04483461, + "epoch": 0.027416203216594017, + "flos": 20856341003760.0, + "grad_norm": 1.9944318776508232, + "language_loss": 0.81662607, + "learning_rate": 3.941980363893499e-06, + "loss": 0.84555769, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.44091797, + "step": 456, + "time_per_iteration": 2.8527400493621826 + }, + { + "auxiliary_loss_clip": 0.01790473, + "auxiliary_loss_mlp": 0.01095913, + "balance_loss_clip": 1.46469784, + "balance_loss_mlp": 1.044963, + "epoch": 0.027476326469261986, + "flos": 13228401588120.0, + "grad_norm": 2.330908956863554, + "language_loss": 0.82711232, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.85597622, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.50952148, + "step": 457, + "time_per_iteration": 2.8858041763305664 + }, + { + "auxiliary_loss_clip": 0.01811312, + "auxiliary_loss_mlp": 0.01097476, + "balance_loss_clip": 1.47721291, + "balance_loss_mlp": 1.04700267, + "epoch": 0.027536449721929958, + "flos": 24029868874320.0, + "grad_norm": 2.85998154231531, + "language_loss": 0.94474888, + "learning_rate": 3.944798102235412e-06, + "loss": 0.97383666, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.50561523, + "step": 458, + "time_per_iteration": 3.0048828125 + }, + { + "auxiliary_loss_clip": 0.01806442, + "auxiliary_loss_mlp": 0.01097186, + "balance_loss_clip": 1.47334266, + "balance_loss_mlp": 1.05198145, + "epoch": 0.027596572974597926, + "flos": 13009989890160.0, + "grad_norm": 3.823977170384063, + "language_loss": 0.7993961, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.82843244, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.4519043, + "step": 459, + "time_per_iteration": 2.7781331539154053 + }, + { + "auxiliary_loss_clip": 0.01799636, + "auxiliary_loss_mlp": 0.01110591, + "balance_loss_clip": 1.47358441, + "balance_loss_mlp": 1.06176305, + "epoch": 0.027656696227265895, + "flos": 26149005831960.0, + "grad_norm": 1.786369154563108, + "language_loss": 0.84136599, + "learning_rate": 3.947603562811407e-06, + "loss": 0.87046826, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.48876953, + "step": 460, + "time_per_iteration": 2.907898187637329 + }, + { + "auxiliary_loss_clip": 0.01613258, + "auxiliary_loss_mlp": 0.01081197, + "balance_loss_clip": 1.40011168, + "balance_loss_mlp": 1.06498456, + "epoch": 0.027716819479933864, + "flos": 60712706316240.0, + "grad_norm": 1.5890102193579294, + "language_loss": 0.73611736, + "learning_rate": 3.949001722282675e-06, + "loss": 0.76306188, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.16210938, + "step": 461, + "time_per_iteration": 3.2801787853240967 + }, + { + "auxiliary_loss_clip": 0.0179896, + "auxiliary_loss_mlp": 0.01105037, + "balance_loss_clip": 1.47856784, + "balance_loss_mlp": 1.0629797, + "epoch": 0.027776942732601832, + "flos": 31218182917200.0, + "grad_norm": 2.7945133926714156, + "language_loss": 0.83233273, + "learning_rate": 3.950396852153582e-06, + "loss": 0.86137271, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.4206543, + "step": 462, + "time_per_iteration": 2.909472942352295 + }, + { + "auxiliary_loss_clip": 0.01801502, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_clip": 1.47058022, + "balance_loss_mlp": 1.04272735, + "epoch": 0.027837065985269804, + "flos": 22679822516760.0, + "grad_norm": 2.206100961379775, + "language_loss": 0.91479731, + "learning_rate": 3.951788965525118e-06, + "loss": 0.94367379, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.43408203, + "step": 463, + "time_per_iteration": 2.8775243759155273 + }, + { + "auxiliary_loss_clip": 0.01610475, + "auxiliary_loss_mlp": 0.01023545, + "balance_loss_clip": 1.39919376, + "balance_loss_mlp": 1.00637901, + "epoch": 0.027897189237937773, + "flos": 62196773553720.0, + "grad_norm": 0.8865410661869846, + "language_loss": 0.59017307, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61651325, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.171875, + "step": 464, + "time_per_iteration": 3.245722532272339 + }, + { + "auxiliary_loss_clip": 0.01826542, + "auxiliary_loss_mlp": 0.01107281, + "balance_loss_clip": 1.48497248, + "balance_loss_mlp": 1.06260097, + "epoch": 0.02795731249060574, + "flos": 24497821901160.0, + "grad_norm": 2.378383985946387, + "language_loss": 0.83166134, + "learning_rate": 3.954564194750784e-06, + "loss": 0.86099958, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.44677734, + "step": 465, + "time_per_iteration": 2.8889410495758057 + }, + { + "auxiliary_loss_clip": 0.01818595, + "auxiliary_loss_mlp": 0.0110892, + "balance_loss_clip": 1.48259258, + "balance_loss_mlp": 1.05966234, + "epoch": 0.02801743574327371, + "flos": 23738355482400.0, + "grad_norm": 4.275060437016707, + "language_loss": 0.79199433, + "learning_rate": 3.955947336385828e-06, + "loss": 0.82126951, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.4934082, + "step": 466, + "time_per_iteration": 2.8443491458892822 + }, + { + "auxiliary_loss_clip": 0.01810529, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_clip": 1.48393583, + "balance_loss_mlp": 1.05388343, + "epoch": 0.02807755899594168, + "flos": 20633706036360.0, + "grad_norm": 2.0508940026005273, + "language_loss": 0.87666881, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90575683, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.44384766, + "step": 467, + "time_per_iteration": 2.830679416656494 + }, + { + "auxiliary_loss_clip": 0.01824701, + "auxiliary_loss_mlp": 0.01107555, + "balance_loss_clip": 1.49380004, + "balance_loss_mlp": 1.06099176, + "epoch": 0.02813768224860965, + "flos": 19249240770720.0, + "grad_norm": 2.2109849042512093, + "language_loss": 0.87501752, + "learning_rate": 3.958704737531818e-06, + "loss": 0.90434015, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.46582031, + "step": 468, + "time_per_iteration": 2.8813729286193848 + }, + { + "auxiliary_loss_clip": 0.01826152, + "auxiliary_loss_mlp": 0.01110814, + "balance_loss_clip": 1.48819137, + "balance_loss_mlp": 1.06484699, + "epoch": 0.02819780550127762, + "flos": 20818917077040.0, + "grad_norm": 2.337779052080577, + "language_loss": 0.92460555, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.9539752, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.4597168, + "step": 469, + "time_per_iteration": 2.809584617614746 + }, + { + "auxiliary_loss_clip": 0.01806917, + "auxiliary_loss_mlp": 0.01107757, + "balance_loss_clip": 1.4789772, + "balance_loss_mlp": 1.0629586, + "epoch": 0.028257928753945588, + "flos": 19978877026080.0, + "grad_norm": 2.8988173410061266, + "language_loss": 0.83616436, + "learning_rate": 3.96145038000181e-06, + "loss": 0.86531115, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.44775391, + "step": 470, + "time_per_iteration": 2.838177442550659 + }, + { + "auxiliary_loss_clip": 0.01823749, + "auxiliary_loss_mlp": 0.01116509, + "balance_loss_clip": 1.48952639, + "balance_loss_mlp": 1.0683248, + "epoch": 0.028318052006613557, + "flos": 20489248807920.0, + "grad_norm": 2.020666226814759, + "language_loss": 0.94679254, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9761951, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.48193359, + "step": 471, + "time_per_iteration": 2.834595203399658 + }, + { + "auxiliary_loss_clip": 0.01807437, + "auxiliary_loss_mlp": 0.01103457, + "balance_loss_clip": 1.47699249, + "balance_loss_mlp": 1.05198276, + "epoch": 0.02837817525928153, + "flos": 28521095220720.0, + "grad_norm": 26.485387527968353, + "language_loss": 0.76876485, + "learning_rate": 3.964184363657625e-06, + "loss": 0.7978738, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.51464844, + "step": 472, + "time_per_iteration": 2.99941086769104 + }, + { + "auxiliary_loss_clip": 0.01820134, + "auxiliary_loss_mlp": 0.01099684, + "balance_loss_clip": 1.48370779, + "balance_loss_mlp": 1.04990184, + "epoch": 0.028438298511949497, + "flos": 18556297491600.0, + "grad_norm": 2.217504029747933, + "language_loss": 0.94758654, + "learning_rate": 3.965547014290071e-06, + "loss": 0.97678471, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.49829102, + "step": 473, + "time_per_iteration": 2.9241981506347656 + }, + { + "auxiliary_loss_clip": 0.01828082, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_clip": 1.48806679, + "balance_loss_mlp": 1.07990515, + "epoch": 0.028498421764617466, + "flos": 16914697133760.0, + "grad_norm": 2.7170386259918025, + "language_loss": 0.90696514, + "learning_rate": 3.96690678709433e-06, + "loss": 0.93649822, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.453125, + "step": 474, + "time_per_iteration": 2.794342279434204 + }, + { + "auxiliary_loss_clip": 0.01808294, + "auxiliary_loss_mlp": 0.01103775, + "balance_loss_clip": 1.48090863, + "balance_loss_mlp": 1.05852294, + "epoch": 0.028558545017285435, + "flos": 27784027635480.0, + "grad_norm": 2.4648888144802603, + "language_loss": 0.80696237, + "learning_rate": 3.968263694200355e-06, + "loss": 0.83608299, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.45288086, + "step": 475, + "time_per_iteration": 2.9031598567962646 + }, + { + "auxiliary_loss_clip": 0.01603674, + "auxiliary_loss_mlp": 0.01131933, + "balance_loss_clip": 1.39763463, + "balance_loss_mlp": 1.10847235, + "epoch": 0.028618668269953403, + "flos": 65669855271480.0, + "grad_norm": 0.934559563009531, + "language_loss": 0.66919339, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69654948, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.234375, + "step": 476, + "time_per_iteration": 3.2295336723327637 + }, + { + "auxiliary_loss_clip": 0.01795684, + "auxiliary_loss_mlp": 0.01101833, + "balance_loss_clip": 1.47115362, + "balance_loss_mlp": 1.05374432, + "epoch": 0.028678791522621375, + "flos": 21941130597120.0, + "grad_norm": 2.4272287118186147, + "language_loss": 0.85686445, + "learning_rate": 3.970968959455509e-06, + "loss": 0.8858397, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.48071289, + "step": 477, + "time_per_iteration": 2.818530797958374 + }, + { + "auxiliary_loss_clip": 0.01805593, + "auxiliary_loss_mlp": 0.01114305, + "balance_loss_clip": 1.4791379, + "balance_loss_mlp": 1.06550097, + "epoch": 0.028738914775289344, + "flos": 24577583366160.0, + "grad_norm": 2.1815701549471886, + "language_loss": 0.83007324, + "learning_rate": 3.97231734148446e-06, + "loss": 0.85927224, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.48803711, + "step": 478, + "time_per_iteration": 2.837702751159668 + }, + { + "auxiliary_loss_clip": 0.01796734, + "auxiliary_loss_mlp": 0.01100216, + "balance_loss_clip": 1.46947992, + "balance_loss_mlp": 1.05303347, + "epoch": 0.028799038027957313, + "flos": 23263174167480.0, + "grad_norm": 2.74105296241228, + "language_loss": 0.81828362, + "learning_rate": 3.973662905576082e-06, + "loss": 0.84725314, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.47192383, + "step": 479, + "time_per_iteration": 2.8672423362731934 + }, + { + "auxiliary_loss_clip": 0.01792154, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_clip": 1.46958661, + "balance_loss_mlp": 1.04334402, + "epoch": 0.02885916128062528, + "flos": 22169247693120.0, + "grad_norm": 3.323685243882045, + "language_loss": 0.74577546, + "learning_rate": 3.975005663484038e-06, + "loss": 0.77460051, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.46972656, + "step": 480, + "time_per_iteration": 2.803575277328491 + }, + { + "auxiliary_loss_clip": 0.01801636, + "auxiliary_loss_mlp": 0.01110614, + "balance_loss_clip": 1.47770393, + "balance_loss_mlp": 1.05932999, + "epoch": 0.02891928453329325, + "flos": 22938419509920.0, + "grad_norm": 1.9736633502105425, + "language_loss": 0.88807207, + "learning_rate": 3.976345626888605e-06, + "loss": 0.9171946, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.51318359, + "step": 481, + "time_per_iteration": 4.385293483734131 + }, + { + "auxiliary_loss_clip": 0.01586455, + "auxiliary_loss_mlp": 0.0105869, + "balance_loss_clip": 1.38536787, + "balance_loss_mlp": 1.04285932, + "epoch": 0.028979407785961222, + "flos": 57445772571720.0, + "grad_norm": 0.8380234328964505, + "language_loss": 0.66020036, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68665183, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.15820312, + "step": 482, + "time_per_iteration": 4.5547285079956055 + }, + { + "auxiliary_loss_clip": 0.01820822, + "auxiliary_loss_mlp": 0.01109368, + "balance_loss_clip": 1.48008478, + "balance_loss_mlp": 1.04723585, + "epoch": 0.02903953103862919, + "flos": 16726521682800.0, + "grad_norm": 3.121439005648795, + "language_loss": 0.80493599, + "learning_rate": 3.979017216545415e-06, + "loss": 0.83423787, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.62084961, + "step": 483, + "time_per_iteration": 5.80743145942688 + }, + { + "auxiliary_loss_clip": 0.01812506, + "auxiliary_loss_mlp": 0.01113825, + "balance_loss_clip": 1.48068357, + "balance_loss_mlp": 1.06578374, + "epoch": 0.02909965429129716, + "flos": 16768006445520.0, + "grad_norm": 1.9271171193642807, + "language_loss": 0.76809716, + "learning_rate": 3.980348865796749e-06, + "loss": 0.79736042, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.48095703, + "step": 484, + "time_per_iteration": 2.855107545852661 + }, + { + "auxiliary_loss_clip": 0.01800762, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.47057295, + "balance_loss_mlp": 1.0434382, + "epoch": 0.029159777543965128, + "flos": 19789239674160.0, + "grad_norm": 2.0147572501533197, + "language_loss": 0.84329236, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.87232989, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.59545898, + "step": 485, + "time_per_iteration": 2.853874921798706 + }, + { + "auxiliary_loss_clip": 0.01800382, + "auxiliary_loss_mlp": 0.01108255, + "balance_loss_clip": 1.47542143, + "balance_loss_mlp": 1.05577922, + "epoch": 0.029219900796633096, + "flos": 19646975297160.0, + "grad_norm": 2.0641024950693083, + "language_loss": 0.85880375, + "learning_rate": 3.983003930109732e-06, + "loss": 0.8878901, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.5246582, + "step": 486, + "time_per_iteration": 2.7979328632354736 + }, + { + "auxiliary_loss_clip": 0.01795784, + "auxiliary_loss_mlp": 0.01100873, + "balance_loss_clip": 1.46629548, + "balance_loss_mlp": 1.05340374, + "epoch": 0.02928002404930107, + "flos": 25891342831080.0, + "grad_norm": 1.816053590723996, + "language_loss": 0.89433849, + "learning_rate": 3.984327367746315e-06, + "loss": 0.92330503, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.47460938, + "step": 487, + "time_per_iteration": 2.9448561668395996 + }, + { + "auxiliary_loss_clip": 0.01805255, + "auxiliary_loss_mlp": 0.01095798, + "balance_loss_clip": 1.47236013, + "balance_loss_mlp": 1.04923439, + "epoch": 0.029340147301969037, + "flos": 20664145325160.0, + "grad_norm": 2.286367143606925, + "language_loss": 0.8902303, + "learning_rate": 3.985648090637122e-06, + "loss": 0.91924077, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.46533203, + "step": 488, + "time_per_iteration": 2.9034128189086914 + }, + { + "auxiliary_loss_clip": 0.01789177, + "auxiliary_loss_mlp": 0.01098716, + "balance_loss_clip": 1.46580458, + "balance_loss_mlp": 1.05134201, + "epoch": 0.029400270554637006, + "flos": 24433694654760.0, + "grad_norm": 3.077065423715376, + "language_loss": 0.89162558, + "learning_rate": 3.986966109896785e-06, + "loss": 0.92050451, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.47363281, + "step": 489, + "time_per_iteration": 2.900831937789917 + }, + { + "auxiliary_loss_clip": 0.01776669, + "auxiliary_loss_mlp": 0.01097197, + "balance_loss_clip": 1.45806968, + "balance_loss_mlp": 1.04350567, + "epoch": 0.029460393807304974, + "flos": 20125770756120.0, + "grad_norm": 2.2475779043588315, + "language_loss": 0.89656949, + "learning_rate": 3.988281436571815e-06, + "loss": 0.92530817, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.53710938, + "step": 490, + "time_per_iteration": 2.8212130069732666 + }, + { + "auxiliary_loss_clip": 0.01801946, + "auxiliary_loss_mlp": 0.01094675, + "balance_loss_clip": 1.46974599, + "balance_loss_mlp": 1.04892302, + "epoch": 0.029520517059972943, + "flos": 17680417239960.0, + "grad_norm": 2.5675067635063167, + "language_loss": 0.9293772, + "learning_rate": 3.989594081641164e-06, + "loss": 0.95834339, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.45776367, + "step": 491, + "time_per_iteration": 2.89632511138916 + }, + { + "auxiliary_loss_clip": 0.01777508, + "auxiliary_loss_mlp": 0.0109524, + "balance_loss_clip": 1.46284211, + "balance_loss_mlp": 1.04874814, + "epoch": 0.029580640312640915, + "flos": 18957889812240.0, + "grad_norm": 1.8667440235843624, + "language_loss": 0.86342096, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.89214844, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.46533203, + "step": 492, + "time_per_iteration": 2.840514659881592 + }, + { + "auxiliary_loss_clip": 0.01798725, + "auxiliary_loss_mlp": 0.01125508, + "balance_loss_clip": 1.47347856, + "balance_loss_mlp": 1.07620263, + "epoch": 0.029640763565308884, + "flos": 18729732107880.0, + "grad_norm": 3.4480803089134446, + "language_loss": 0.85176539, + "learning_rate": 3.992211370544093e-06, + "loss": 0.88100779, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.49291992, + "step": 493, + "time_per_iteration": 2.813770055770874 + }, + { + "auxiliary_loss_clip": 0.01790337, + "auxiliary_loss_mlp": 0.01099874, + "balance_loss_clip": 1.46465492, + "balance_loss_mlp": 1.05595696, + "epoch": 0.029700886817976852, + "flos": 20600261728920.0, + "grad_norm": 1.72862846955858, + "language_loss": 0.8801254, + "learning_rate": 3.99351603600268e-06, + "loss": 0.90902758, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.43920898, + "step": 494, + "time_per_iteration": 2.81103253364563 + }, + { + "auxiliary_loss_clip": 0.01789423, + "auxiliary_loss_mlp": 0.01098271, + "balance_loss_clip": 1.46487737, + "balance_loss_mlp": 1.05590391, + "epoch": 0.02976101007064482, + "flos": 22241983911840.0, + "grad_norm": 2.1221396803034374, + "language_loss": 0.88259995, + "learning_rate": 3.994818063106668e-06, + "loss": 0.91147691, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.42358398, + "step": 495, + "time_per_iteration": 2.802389144897461 + }, + { + "auxiliary_loss_clip": 0.01773978, + "auxiliary_loss_mlp": 0.01090738, + "balance_loss_clip": 1.45947981, + "balance_loss_mlp": 1.04775143, + "epoch": 0.029821133323312793, + "flos": 23737949398800.0, + "grad_norm": 2.0958948464734877, + "language_loss": 0.64115483, + "learning_rate": 3.99611746250533e-06, + "loss": 0.66980201, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.42993164, + "step": 496, + "time_per_iteration": 2.9175286293029785 + }, + { + "auxiliary_loss_clip": 0.01769628, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.45679069, + "balance_loss_mlp": 1.05015421, + "epoch": 0.02988125657598076, + "flos": 22424677234200.0, + "grad_norm": 1.7749425008105837, + "language_loss": 0.90147203, + "learning_rate": 3.997414244783595e-06, + "loss": 0.93010497, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.43505859, + "step": 497, + "time_per_iteration": 2.852322578430176 + }, + { + "auxiliary_loss_clip": 0.01785491, + "auxiliary_loss_mlp": 0.01108786, + "balance_loss_clip": 1.45975447, + "balance_loss_mlp": 1.06115007, + "epoch": 0.02994137982864873, + "flos": 13849136557200.0, + "grad_norm": 2.7512581438893697, + "language_loss": 0.86863261, + "learning_rate": 3.998708420462557e-06, + "loss": 0.89757538, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.4765625, + "step": 498, + "time_per_iteration": 2.819840669631958 + }, + { + "auxiliary_loss_clip": 0.01780548, + "auxiliary_loss_mlp": 0.01095314, + "balance_loss_clip": 1.45856667, + "balance_loss_mlp": 1.05061078, + "epoch": 0.0300015030813167, + "flos": 23912805307680.0, + "grad_norm": 3.3725334889362686, + "language_loss": 0.80812395, + "learning_rate": 4e-06, + "loss": 0.83688259, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.44726562, + "step": 499, + "time_per_iteration": 2.8474159240722656 + }, + { + "auxiliary_loss_clip": 0.01771735, + "auxiliary_loss_mlp": 0.01102317, + "balance_loss_clip": 1.45560741, + "balance_loss_mlp": 1.06068897, + "epoch": 0.030061626333984667, + "flos": 22021744837680.0, + "grad_norm": 1.9512500540898843, + "language_loss": 0.83682269, + "learning_rate": 3.9999999620799e-06, + "loss": 0.86556315, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.41650391, + "step": 500, + "time_per_iteration": 2.796003818511963 + }, + { + "auxiliary_loss_clip": 0.01766681, + "auxiliary_loss_mlp": 0.01101093, + "balance_loss_clip": 1.45049357, + "balance_loss_mlp": 1.05300355, + "epoch": 0.03012174958665264, + "flos": 23045006119680.0, + "grad_norm": 3.6244611849671764, + "language_loss": 0.90209728, + "learning_rate": 3.9999998483196e-06, + "loss": 0.93077499, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.48095703, + "step": 501, + "time_per_iteration": 2.7917654514312744 + }, + { + "auxiliary_loss_clip": 0.01780731, + "auxiliary_loss_mlp": 0.01084013, + "balance_loss_clip": 1.45422864, + "balance_loss_mlp": 1.04174113, + "epoch": 0.030181872839320608, + "flos": 18957971028960.0, + "grad_norm": 2.43203336281541, + "language_loss": 0.88288832, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.91153574, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.42260742, + "step": 502, + "time_per_iteration": 2.790926218032837 + }, + { + "auxiliary_loss_clip": 0.01769445, + "auxiliary_loss_mlp": 0.01092416, + "balance_loss_clip": 1.45036125, + "balance_loss_mlp": 1.04797482, + "epoch": 0.030241996091988577, + "flos": 16732491111720.0, + "grad_norm": 2.581733612062025, + "language_loss": 0.85587108, + "learning_rate": 3.999999393278425e-06, + "loss": 0.88448972, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.44433594, + "step": 503, + "time_per_iteration": 2.8179876804351807 + }, + { + "auxiliary_loss_clip": 0.01763929, + "auxiliary_loss_mlp": 0.01108564, + "balance_loss_clip": 1.4508388, + "balance_loss_mlp": 1.06121397, + "epoch": 0.030302119344656545, + "flos": 28627153921800.0, + "grad_norm": 1.6618742056437203, + "language_loss": 0.89180547, + "learning_rate": 3.999999051997567e-06, + "loss": 0.92053044, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.47387695, + "step": 504, + "time_per_iteration": 2.879014730453491 + }, + { + "auxiliary_loss_clip": 0.01758526, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.44641638, + "balance_loss_mlp": 1.05370808, + "epoch": 0.030362242597324514, + "flos": 15673592670840.0, + "grad_norm": 1.7792116194713612, + "language_loss": 0.79620928, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.82475471, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.42333984, + "step": 505, + "time_per_iteration": 2.8419244289398193 + }, + { + "auxiliary_loss_clip": 0.01584683, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.3894099, + "balance_loss_mlp": 1.00928521, + "epoch": 0.030422365849992486, + "flos": 72144086060880.0, + "grad_norm": 0.840794215099111, + "language_loss": 0.54950052, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57560521, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.16503906, + "step": 506, + "time_per_iteration": 3.469745397567749 + }, + { + "auxiliary_loss_clip": 0.01781204, + "auxiliary_loss_mlp": 0.01119253, + "balance_loss_clip": 1.458426, + "balance_loss_mlp": 1.07352436, + "epoch": 0.030482489102660455, + "flos": 19432624435200.0, + "grad_norm": 2.7750717854796143, + "language_loss": 0.84270233, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8717069, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.45703125, + "step": 507, + "time_per_iteration": 2.831650495529175 + }, + { + "auxiliary_loss_clip": 0.01775737, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_clip": 1.45382524, + "balance_loss_mlp": 1.05866718, + "epoch": 0.030542612355328423, + "flos": 20380672388520.0, + "grad_norm": 2.5898301623016944, + "language_loss": 0.89080012, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91959071, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.4465332, + "step": 508, + "time_per_iteration": 2.928023099899292 + }, + { + "auxiliary_loss_clip": 0.01780952, + "auxiliary_loss_mlp": 0.0109905, + "balance_loss_clip": 1.45733666, + "balance_loss_mlp": 1.05415583, + "epoch": 0.030602735607996392, + "flos": 34683955130160.0, + "grad_norm": 1.7750515266362037, + "language_loss": 0.72356033, + "learning_rate": 3.999996207991165e-06, + "loss": 0.75236034, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.44897461, + "step": 509, + "time_per_iteration": 2.943779230117798 + }, + { + "auxiliary_loss_clip": 0.01775799, + "auxiliary_loss_mlp": 0.01119295, + "balance_loss_clip": 1.46338773, + "balance_loss_mlp": 1.07335138, + "epoch": 0.03066285886066436, + "flos": 23663832495840.0, + "grad_norm": 7.509088114450406, + "language_loss": 0.82741213, + "learning_rate": 3.999995411669614e-06, + "loss": 0.85636306, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.4597168, + "step": 510, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.0176998, + "auxiliary_loss_mlp": 0.01109324, + "balance_loss_clip": 1.45906317, + "balance_loss_mlp": 1.06545496, + "epoch": 0.030722982113332332, + "flos": 23008191318360.0, + "grad_norm": 3.749742429856392, + "language_loss": 0.8566817, + "learning_rate": 3.999994539508036e-06, + "loss": 0.8854748, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.4387207, + "step": 511, + "time_per_iteration": 2.842301368713379 + }, + { + "auxiliary_loss_clip": 0.01793839, + "auxiliary_loss_mlp": 0.01119171, + "balance_loss_clip": 1.46263576, + "balance_loss_mlp": 1.0682447, + "epoch": 0.0307831053660003, + "flos": 24755850377280.0, + "grad_norm": 1.9592495579801694, + "language_loss": 0.8299306, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.85906065, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.50878906, + "step": 512, + "time_per_iteration": 2.840771198272705 + }, + { + "auxiliary_loss_clip": 0.01780976, + "auxiliary_loss_mlp": 0.01134957, + "balance_loss_clip": 1.46163511, + "balance_loss_mlp": 1.08260036, + "epoch": 0.03084322861866827, + "flos": 26146528722000.0, + "grad_norm": 1.8442958657479205, + "language_loss": 0.88243866, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.91159797, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.52319336, + "step": 513, + "time_per_iteration": 2.879631757736206 + }, + { + "auxiliary_loss_clip": 0.01790011, + "auxiliary_loss_mlp": 0.01105575, + "balance_loss_clip": 1.46426296, + "balance_loss_mlp": 1.06015587, + "epoch": 0.03090335187133624, + "flos": 18776049265440.0, + "grad_norm": 3.2442406389825806, + "language_loss": 0.80831707, + "learning_rate": 3.999991467983491e-06, + "loss": 0.83727288, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.45410156, + "step": 514, + "time_per_iteration": 2.8324999809265137 + }, + { + "auxiliary_loss_clip": 0.01779931, + "auxiliary_loss_mlp": 0.01091687, + "balance_loss_clip": 1.46315742, + "balance_loss_mlp": 1.04724526, + "epoch": 0.030963475124004207, + "flos": 23227130925000.0, + "grad_norm": 2.7805225031517975, + "language_loss": 0.79398596, + "learning_rate": 3.999990292462167e-06, + "loss": 0.82270217, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.44433594, + "step": 515, + "time_per_iteration": 2.825822591781616 + }, + { + "auxiliary_loss_clip": 0.01786188, + "auxiliary_loss_mlp": 0.0110231, + "balance_loss_clip": 1.46032166, + "balance_loss_mlp": 1.05023944, + "epoch": 0.03102359837667218, + "flos": 42533717346000.0, + "grad_norm": 2.2928045630426626, + "language_loss": 0.84529984, + "learning_rate": 3.999989041101011e-06, + "loss": 0.87418485, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.52099609, + "step": 516, + "time_per_iteration": 3.0312485694885254 + }, + { + "auxiliary_loss_clip": 0.01776889, + "auxiliary_loss_mlp": 0.01116499, + "balance_loss_clip": 1.45841205, + "balance_loss_mlp": 1.0707469, + "epoch": 0.031083721629340148, + "flos": 21181704786720.0, + "grad_norm": 1.8096639213444607, + "language_loss": 0.79440033, + "learning_rate": 3.999987713900071e-06, + "loss": 0.82333422, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.45800781, + "step": 517, + "time_per_iteration": 2.8131563663482666 + }, + { + "auxiliary_loss_clip": 0.01779637, + "auxiliary_loss_mlp": 0.01103772, + "balance_loss_clip": 1.46455026, + "balance_loss_mlp": 1.05854392, + "epoch": 0.031143844882008116, + "flos": 29722258038600.0, + "grad_norm": 1.557667612056483, + "language_loss": 0.91062307, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93945718, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.45263672, + "step": 518, + "time_per_iteration": 2.874823570251465 + }, + { + "auxiliary_loss_clip": 0.01791877, + "auxiliary_loss_mlp": 0.01125481, + "balance_loss_clip": 1.4733088, + "balance_loss_mlp": 1.07591391, + "epoch": 0.031203968134676085, + "flos": 23117701730040.0, + "grad_norm": 2.1664175450523198, + "language_loss": 0.87537736, + "learning_rate": 3.999984831979039e-06, + "loss": 0.90455091, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.49584961, + "step": 519, + "time_per_iteration": 2.8526551723480225 + }, + { + "auxiliary_loss_clip": 0.01784788, + "auxiliary_loss_mlp": 0.01110746, + "balance_loss_clip": 1.46115017, + "balance_loss_mlp": 1.06637669, + "epoch": 0.03126409138734405, + "flos": 20958623127360.0, + "grad_norm": 2.1045551765936317, + "language_loss": 0.87787032, + "learning_rate": 3.999983277259057e-06, + "loss": 0.90682566, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.44384766, + "step": 520, + "time_per_iteration": 5.826273441314697 + }, + { + "auxiliary_loss_clip": 0.01786396, + "auxiliary_loss_mlp": 0.01097171, + "balance_loss_clip": 1.46446717, + "balance_loss_mlp": 1.05280185, + "epoch": 0.031324214640012026, + "flos": 21654774466920.0, + "grad_norm": 1.715517220533544, + "language_loss": 0.89947373, + "learning_rate": 3.999981646699509e-06, + "loss": 0.92830944, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.44360352, + "step": 521, + "time_per_iteration": 2.82307505607605 + }, + { + "auxiliary_loss_clip": 0.01772875, + "auxiliary_loss_mlp": 0.01094431, + "balance_loss_clip": 1.45837986, + "balance_loss_mlp": 1.04953694, + "epoch": 0.03138433789267999, + "flos": 23446679657040.0, + "grad_norm": 2.00040735835989, + "language_loss": 0.71631688, + "learning_rate": 3.999979940300456e-06, + "loss": 0.74498993, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.44946289, + "step": 522, + "time_per_iteration": 4.376165390014648 + }, + { + "auxiliary_loss_clip": 0.01790789, + "auxiliary_loss_mlp": 0.01103702, + "balance_loss_clip": 1.46295249, + "balance_loss_mlp": 1.05697227, + "epoch": 0.03144446114534796, + "flos": 18985892599440.0, + "grad_norm": 3.8780232630653915, + "language_loss": 0.85924834, + "learning_rate": 3.999978158061963e-06, + "loss": 0.88819319, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.46777344, + "step": 523, + "time_per_iteration": 4.3985936641693115 + }, + { + "auxiliary_loss_clip": 0.01783608, + "auxiliary_loss_mlp": 0.01092551, + "balance_loss_clip": 1.45461655, + "balance_loss_mlp": 1.04865789, + "epoch": 0.031504584398015935, + "flos": 22642804673640.0, + "grad_norm": 2.493153567868237, + "language_loss": 0.91518259, + "learning_rate": 3.999976299984099e-06, + "loss": 0.94394422, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.43896484, + "step": 524, + "time_per_iteration": 2.8164584636688232 + }, + { + "auxiliary_loss_clip": 0.01789023, + "auxiliary_loss_mlp": 0.01109685, + "balance_loss_clip": 1.46500814, + "balance_loss_mlp": 1.06219232, + "epoch": 0.0315647076506839, + "flos": 25301981143080.0, + "grad_norm": 2.452606812291489, + "language_loss": 0.82396871, + "learning_rate": 3.999974366066933e-06, + "loss": 0.85295582, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.47460938, + "step": 525, + "time_per_iteration": 2.913918972015381 + }, + { + "auxiliary_loss_clip": 0.01775222, + "auxiliary_loss_mlp": 0.01095851, + "balance_loss_clip": 1.45611858, + "balance_loss_mlp": 1.05376959, + "epoch": 0.03162483090335187, + "flos": 16987270919040.0, + "grad_norm": 2.1129875293407907, + "language_loss": 0.81694448, + "learning_rate": 3.999972356310538e-06, + "loss": 0.84565526, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.42114258, + "step": 526, + "time_per_iteration": 2.8098087310791016 + }, + { + "auxiliary_loss_clip": 0.01786183, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_clip": 1.46359098, + "balance_loss_mlp": 1.04948819, + "epoch": 0.03168495415601984, + "flos": 18739193855760.0, + "grad_norm": 3.188338255760935, + "language_loss": 0.82622051, + "learning_rate": 3.999970270714991e-06, + "loss": 0.85507882, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.5012207, + "step": 527, + "time_per_iteration": 2.804887294769287 + }, + { + "auxiliary_loss_clip": 0.01773698, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_clip": 1.45540166, + "balance_loss_mlp": 1.05412412, + "epoch": 0.03174507740868781, + "flos": 21219819055560.0, + "grad_norm": 3.9740845134477576, + "language_loss": 0.95595974, + "learning_rate": 3.999968109280371e-06, + "loss": 0.98471117, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.47363281, + "step": 528, + "time_per_iteration": 2.823347568511963 + }, + { + "auxiliary_loss_clip": 0.01767996, + "auxiliary_loss_mlp": 0.01104198, + "balance_loss_clip": 1.45391488, + "balance_loss_mlp": 1.05920815, + "epoch": 0.03180520066135578, + "flos": 24792705786960.0, + "grad_norm": 2.6202775428306833, + "language_loss": 0.85313052, + "learning_rate": 3.99996587200676e-06, + "loss": 0.88185245, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.45019531, + "step": 529, + "time_per_iteration": 2.837402820587158 + }, + { + "auxiliary_loss_clip": 0.017754, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_clip": 1.46314073, + "balance_loss_mlp": 1.05112529, + "epoch": 0.03186532391402375, + "flos": 24869787100200.0, + "grad_norm": 1.8147838015164452, + "language_loss": 0.90959597, + "learning_rate": 3.999963558894243e-06, + "loss": 0.93831348, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.45214844, + "step": 530, + "time_per_iteration": 2.8745932579040527 + }, + { + "auxiliary_loss_clip": 0.0175975, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_clip": 1.44694328, + "balance_loss_mlp": 1.0497669, + "epoch": 0.03192544716669172, + "flos": 21220022097360.0, + "grad_norm": 2.8750643172044574, + "language_loss": 0.77644145, + "learning_rate": 3.999961169942907e-06, + "loss": 0.80498195, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.44580078, + "step": 531, + "time_per_iteration": 2.877429723739624 + }, + { + "auxiliary_loss_clip": 0.01764602, + "auxiliary_loss_mlp": 0.01091752, + "balance_loss_clip": 1.45162332, + "balance_loss_mlp": 1.04597533, + "epoch": 0.03198557041935969, + "flos": 24358643759520.0, + "grad_norm": 2.052522018782261, + "language_loss": 0.9210912, + "learning_rate": 3.999958705152843e-06, + "loss": 0.94965476, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.45776367, + "step": 532, + "time_per_iteration": 3.024980068206787 + }, + { + "auxiliary_loss_clip": 0.01568466, + "auxiliary_loss_mlp": 0.01023504, + "balance_loss_clip": 1.3666544, + "balance_loss_mlp": 1.00481176, + "epoch": 0.032045693672027656, + "flos": 61842879074880.0, + "grad_norm": 0.7351635471468801, + "language_loss": 0.57929355, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60521328, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.18652344, + "step": 533, + "time_per_iteration": 3.3518497943878174 + }, + { + "auxiliary_loss_clip": 0.01761278, + "auxiliary_loss_mlp": 0.01107363, + "balance_loss_clip": 1.44953775, + "balance_loss_mlp": 1.06435275, + "epoch": 0.03210581692469563, + "flos": 28406874239280.0, + "grad_norm": 1.6133940779930087, + "language_loss": 0.87741554, + "learning_rate": 3.999953548056907e-06, + "loss": 0.90610194, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.43017578, + "step": 534, + "time_per_iteration": 2.8608951568603516 + }, + { + "auxiliary_loss_clip": 0.01762356, + "auxiliary_loss_mlp": 0.01083382, + "balance_loss_clip": 1.44907606, + "balance_loss_mlp": 1.03815413, + "epoch": 0.03216594017736359, + "flos": 24723137020320.0, + "grad_norm": 2.0972871235030777, + "language_loss": 0.78150618, + "learning_rate": 3.999950855751232e-06, + "loss": 0.80996358, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.45239258, + "step": 535, + "time_per_iteration": 2.84554123878479 + }, + { + "auxiliary_loss_clip": 0.01769513, + "auxiliary_loss_mlp": 0.01101142, + "balance_loss_clip": 1.45631051, + "balance_loss_mlp": 1.05894172, + "epoch": 0.032226063430031565, + "flos": 31181692982760.0, + "grad_norm": 5.116402739847662, + "language_loss": 0.81244731, + "learning_rate": 3.999948087607219e-06, + "loss": 0.84115386, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.421875, + "step": 536, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.01770457, + "auxiliary_loss_mlp": 0.0110046, + "balance_loss_clip": 1.4533217, + "balance_loss_mlp": 1.05709147, + "epoch": 0.03228618668269954, + "flos": 32204751222960.0, + "grad_norm": 2.3564495525125206, + "language_loss": 0.71189457, + "learning_rate": 3.999945243624975e-06, + "loss": 0.74060369, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.43359375, + "step": 537, + "time_per_iteration": 2.8981380462646484 + }, + { + "auxiliary_loss_clip": 0.0176396, + "auxiliary_loss_mlp": 0.01106733, + "balance_loss_clip": 1.45705128, + "balance_loss_mlp": 1.06143308, + "epoch": 0.0323463099353675, + "flos": 22674746471760.0, + "grad_norm": 2.086579196518479, + "language_loss": 0.84558094, + "learning_rate": 3.999942323804607e-06, + "loss": 0.8742879, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.45288086, + "step": 538, + "time_per_iteration": 2.794452667236328 + }, + { + "auxiliary_loss_clip": 0.01785689, + "auxiliary_loss_mlp": 0.01106083, + "balance_loss_clip": 1.46127832, + "balance_loss_mlp": 1.06161797, + "epoch": 0.032406433188035474, + "flos": 26910665102160.0, + "grad_norm": 1.837560576617673, + "language_loss": 0.81102169, + "learning_rate": 3.999939328146225e-06, + "loss": 0.83993936, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.44482422, + "step": 539, + "time_per_iteration": 2.895462989807129 + }, + { + "auxiliary_loss_clip": 0.01773824, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_clip": 1.461321, + "balance_loss_mlp": 1.05168712, + "epoch": 0.03246655644070344, + "flos": 31510224217800.0, + "grad_norm": 1.9676279043497047, + "language_loss": 0.78552985, + "learning_rate": 3.999936256649943e-06, + "loss": 0.81423444, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.44946289, + "step": 540, + "time_per_iteration": 2.911592483520508 + }, + { + "auxiliary_loss_clip": 0.01788571, + "auxiliary_loss_mlp": 0.01120816, + "balance_loss_clip": 1.47427762, + "balance_loss_mlp": 1.07739985, + "epoch": 0.03252667969337141, + "flos": 23223110697360.0, + "grad_norm": 2.2865571353900305, + "language_loss": 0.86436033, + "learning_rate": 3.999933109315878e-06, + "loss": 0.89345419, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.43383789, + "step": 541, + "time_per_iteration": 2.7842111587524414 + }, + { + "auxiliary_loss_clip": 0.01763391, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_clip": 1.45828211, + "balance_loss_mlp": 1.06727672, + "epoch": 0.032586802946039384, + "flos": 14761628568360.0, + "grad_norm": 2.388046469198701, + "language_loss": 0.90273058, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.93153435, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.49731445, + "step": 542, + "time_per_iteration": 2.779000759124756 + }, + { + "auxiliary_loss_clip": 0.0177061, + "auxiliary_loss_mlp": 0.01111946, + "balance_loss_clip": 1.45851731, + "balance_loss_mlp": 1.0689826, + "epoch": 0.03264692619870735, + "flos": 24286151190960.0, + "grad_norm": 1.9046693841762303, + "language_loss": 0.71911103, + "learning_rate": 3.999926587134879e-06, + "loss": 0.74793661, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.42944336, + "step": 543, + "time_per_iteration": 2.8256399631500244 + }, + { + "auxiliary_loss_clip": 0.01766947, + "auxiliary_loss_mlp": 0.01104159, + "balance_loss_clip": 1.45164585, + "balance_loss_mlp": 1.06315112, + "epoch": 0.03270704945137532, + "flos": 22898315431440.0, + "grad_norm": 2.354209087010259, + "language_loss": 0.93451762, + "learning_rate": 3.999923212288192e-06, + "loss": 0.9632287, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.40991211, + "step": 544, + "time_per_iteration": 2.8461804389953613 + }, + { + "auxiliary_loss_clip": 0.01773189, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_clip": 1.45581186, + "balance_loss_mlp": 1.05788398, + "epoch": 0.032767172704043286, + "flos": 18045600842880.0, + "grad_norm": 2.4759131582191434, + "language_loss": 0.68822336, + "learning_rate": 3.999919761604216e-06, + "loss": 0.7169413, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.40698242, + "step": 545, + "time_per_iteration": 2.7396957874298096 + }, + { + "auxiliary_loss_clip": 0.01777918, + "auxiliary_loss_mlp": 0.01081984, + "balance_loss_clip": 1.46205676, + "balance_loss_mlp": 1.04228699, + "epoch": 0.03282729595671126, + "flos": 22533903387360.0, + "grad_norm": 1.9926856327258073, + "language_loss": 0.93036151, + "learning_rate": 3.999916235083083e-06, + "loss": 0.95896053, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.39697266, + "step": 546, + "time_per_iteration": 2.8134548664093018 + }, + { + "auxiliary_loss_clip": 0.017802, + "auxiliary_loss_mlp": 0.01101105, + "balance_loss_clip": 1.46072888, + "balance_loss_mlp": 1.05609202, + "epoch": 0.03288741920937923, + "flos": 20415253730040.0, + "grad_norm": 2.7529076888582926, + "language_loss": 0.84429944, + "learning_rate": 3.999912632724925e-06, + "loss": 0.87311256, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.45019531, + "step": 547, + "time_per_iteration": 2.769829034805298 + }, + { + "auxiliary_loss_clip": 0.01768857, + "auxiliary_loss_mlp": 0.01105693, + "balance_loss_clip": 1.45252943, + "balance_loss_mlp": 1.05085635, + "epoch": 0.032947542462047195, + "flos": 20782995659640.0, + "grad_norm": 1.9246366110022692, + "language_loss": 0.82642257, + "learning_rate": 3.999908954529881e-06, + "loss": 0.8551681, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.54858398, + "step": 548, + "time_per_iteration": 2.8482823371887207 + }, + { + "auxiliary_loss_clip": 0.01791772, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_clip": 1.47305703, + "balance_loss_mlp": 1.05615854, + "epoch": 0.03300766571471517, + "flos": 19906018982280.0, + "grad_norm": 3.2474522176311056, + "language_loss": 0.6917972, + "learning_rate": 3.999905200498087e-06, + "loss": 0.7207467, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.4699707, + "step": 549, + "time_per_iteration": 2.8219995498657227 + }, + { + "auxiliary_loss_clip": 0.01777658, + "auxiliary_loss_mlp": 0.01111585, + "balance_loss_clip": 1.46682286, + "balance_loss_mlp": 1.07198405, + "epoch": 0.03306778896738313, + "flos": 17972296107120.0, + "grad_norm": 1.898265460686203, + "language_loss": 0.86685085, + "learning_rate": 3.999901370629689e-06, + "loss": 0.89574325, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.39599609, + "step": 550, + "time_per_iteration": 2.799626588821411 + }, + { + "auxiliary_loss_clip": 0.01768856, + "auxiliary_loss_mlp": 0.0109931, + "balance_loss_clip": 1.46323013, + "balance_loss_mlp": 1.0596844, + "epoch": 0.033127912220051105, + "flos": 21658469827680.0, + "grad_norm": 1.9966470247335157, + "language_loss": 0.82252377, + "learning_rate": 3.99989746492483e-06, + "loss": 0.85120547, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.39624023, + "step": 551, + "time_per_iteration": 2.8093552589416504 + }, + { + "auxiliary_loss_clip": 0.01799897, + "auxiliary_loss_mlp": 0.0110734, + "balance_loss_clip": 1.47411954, + "balance_loss_mlp": 1.06235051, + "epoch": 0.03318803547271908, + "flos": 30194231293080.0, + "grad_norm": 2.8582773269123893, + "language_loss": 0.87876117, + "learning_rate": 3.999893483383658e-06, + "loss": 0.90783352, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.45043945, + "step": 552, + "time_per_iteration": 2.817289113998413 + }, + { + "auxiliary_loss_clip": 0.01785975, + "auxiliary_loss_mlp": 0.01101478, + "balance_loss_clip": 1.4670682, + "balance_loss_mlp": 1.05565357, + "epoch": 0.03324815872538704, + "flos": 20380916038680.0, + "grad_norm": 2.536999880854815, + "language_loss": 0.93246138, + "learning_rate": 3.999889426006326e-06, + "loss": 0.96133596, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.45800781, + "step": 553, + "time_per_iteration": 2.8644461631774902 + }, + { + "auxiliary_loss_clip": 0.0177186, + "auxiliary_loss_mlp": 0.01092169, + "balance_loss_clip": 1.46194172, + "balance_loss_mlp": 1.04889584, + "epoch": 0.033308281978055014, + "flos": 24499405627200.0, + "grad_norm": 1.991931606316375, + "language_loss": 0.79930413, + "learning_rate": 3.999885292792986e-06, + "loss": 0.8279444, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.43261719, + "step": 554, + "time_per_iteration": 2.882340908050537 + }, + { + "auxiliary_loss_clip": 0.01780199, + "auxiliary_loss_mlp": 0.0110944, + "balance_loss_clip": 1.46681833, + "balance_loss_mlp": 1.06557155, + "epoch": 0.03336840523072298, + "flos": 23405073069240.0, + "grad_norm": 2.714063066379624, + "language_loss": 0.8242892, + "learning_rate": 3.999881083743795e-06, + "loss": 0.85318559, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.43847656, + "step": 555, + "time_per_iteration": 2.8028151988983154 + }, + { + "auxiliary_loss_clip": 0.01783311, + "auxiliary_loss_mlp": 0.01098159, + "balance_loss_clip": 1.46134305, + "balance_loss_mlp": 1.0522157, + "epoch": 0.03342852848339095, + "flos": 30556125618840.0, + "grad_norm": 2.9503989624582023, + "language_loss": 0.89753532, + "learning_rate": 3.999876798858914e-06, + "loss": 0.92635012, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.4597168, + "step": 556, + "time_per_iteration": 2.8868677616119385 + }, + { + "auxiliary_loss_clip": 0.01775989, + "auxiliary_loss_mlp": 0.01101752, + "balance_loss_clip": 1.46081948, + "balance_loss_mlp": 1.05335307, + "epoch": 0.03348865173605892, + "flos": 22898477864880.0, + "grad_norm": 2.466138014939698, + "language_loss": 0.85398555, + "learning_rate": 3.999872438138503e-06, + "loss": 0.88276291, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.484375, + "step": 557, + "time_per_iteration": 2.8040823936462402 + }, + { + "auxiliary_loss_clip": 0.01787486, + "auxiliary_loss_mlp": 0.01094727, + "balance_loss_clip": 1.47112942, + "balance_loss_mlp": 1.05002403, + "epoch": 0.03354877498872689, + "flos": 17680539065040.0, + "grad_norm": 3.366762757642055, + "language_loss": 0.97185463, + "learning_rate": 3.999868001582729e-06, + "loss": 1.00067675, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.44677734, + "step": 558, + "time_per_iteration": 4.241248369216919 + }, + { + "auxiliary_loss_clip": 0.01764088, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.44872355, + "balance_loss_mlp": 1.04383457, + "epoch": 0.03360889824139486, + "flos": 21658023135720.0, + "grad_norm": 2.354943768620181, + "language_loss": 0.79405832, + "learning_rate": 3.99986348919176e-06, + "loss": 0.82254291, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.4050293, + "step": 559, + "time_per_iteration": 4.440772294998169 + }, + { + "auxiliary_loss_clip": 0.01780261, + "auxiliary_loss_mlp": 0.01111046, + "balance_loss_clip": 1.46657491, + "balance_loss_mlp": 1.06822634, + "epoch": 0.033669021494062826, + "flos": 21800368729440.0, + "grad_norm": 1.816704652957994, + "language_loss": 0.88606578, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.91497886, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.42822266, + "step": 560, + "time_per_iteration": 4.356745719909668 + }, + { + "auxiliary_loss_clip": 0.01764621, + "auxiliary_loss_mlp": 0.0109271, + "balance_loss_clip": 1.45458269, + "balance_loss_mlp": 1.05272734, + "epoch": 0.0337291447467308, + "flos": 21870546621480.0, + "grad_norm": 2.3381086188510336, + "language_loss": 0.82457829, + "learning_rate": 3.999854236904925e-06, + "loss": 0.85315156, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.39941406, + "step": 561, + "time_per_iteration": 4.294705629348755 + }, + { + "auxiliary_loss_clip": 0.01762129, + "auxiliary_loss_mlp": 0.01100174, + "balance_loss_clip": 1.45647907, + "balance_loss_mlp": 1.05833173, + "epoch": 0.03378926799939877, + "flos": 24251651066160.0, + "grad_norm": 2.4224147464006656, + "language_loss": 0.82617629, + "learning_rate": 3.999849497009409e-06, + "loss": 0.85479933, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.41821289, + "step": 562, + "time_per_iteration": 2.837266206741333 + }, + { + "auxiliary_loss_clip": 0.01766668, + "auxiliary_loss_mlp": 0.01100471, + "balance_loss_clip": 1.45695341, + "balance_loss_mlp": 1.05488527, + "epoch": 0.033849391252066735, + "flos": 16512089604120.0, + "grad_norm": 2.0179220795909987, + "language_loss": 0.85230255, + "learning_rate": 3.999844681279401e-06, + "loss": 0.88097394, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.45605469, + "step": 563, + "time_per_iteration": 2.783952236175537 + }, + { + "auxiliary_loss_clip": 0.01770328, + "auxiliary_loss_mlp": 0.01103725, + "balance_loss_clip": 1.46051693, + "balance_loss_mlp": 1.06066704, + "epoch": 0.03390951450473471, + "flos": 15673552062480.0, + "grad_norm": 2.1082394480663074, + "language_loss": 0.95015877, + "learning_rate": 3.99983978971508e-06, + "loss": 0.9788993, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.4309082, + "step": 564, + "time_per_iteration": 2.7685647010803223 + }, + { + "auxiliary_loss_clip": 0.01772778, + "auxiliary_loss_mlp": 0.01102716, + "balance_loss_clip": 1.45747852, + "balance_loss_mlp": 1.0543406, + "epoch": 0.03396963775740267, + "flos": 22679984950200.0, + "grad_norm": 2.505470475294784, + "language_loss": 0.95233536, + "learning_rate": 3.999834822316635e-06, + "loss": 0.98109025, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.48364258, + "step": 565, + "time_per_iteration": 2.9344801902770996 + }, + { + "auxiliary_loss_clip": 0.01565088, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.3705008, + "balance_loss_mlp": 1.0220145, + "epoch": 0.034029761010070644, + "flos": 64410006727440.0, + "grad_norm": 1.1904404377159705, + "language_loss": 0.54871708, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57473588, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.14746094, + "step": 566, + "time_per_iteration": 3.448732852935791 + }, + { + "auxiliary_loss_clip": 0.01782039, + "auxiliary_loss_mlp": 0.01092811, + "balance_loss_clip": 1.45796132, + "balance_loss_mlp": 1.04634356, + "epoch": 0.034089884262738616, + "flos": 25008965241840.0, + "grad_norm": 1.9113682341317548, + "language_loss": 0.77665412, + "learning_rate": 3.999824660018126e-06, + "loss": 0.80540264, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.46435547, + "step": 567, + "time_per_iteration": 3.0263073444366455 + }, + { + "auxiliary_loss_clip": 0.01751253, + "auxiliary_loss_mlp": 0.01109629, + "balance_loss_clip": 1.45029116, + "balance_loss_mlp": 1.06924081, + "epoch": 0.03415000751540658, + "flos": 28444988508120.0, + "grad_norm": 2.144696568175175, + "language_loss": 0.81841844, + "learning_rate": 3.999819465118447e-06, + "loss": 0.8470273, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.40332031, + "step": 568, + "time_per_iteration": 2.9844534397125244 + }, + { + "auxiliary_loss_clip": 0.01761865, + "auxiliary_loss_mlp": 0.01101869, + "balance_loss_clip": 1.45432603, + "balance_loss_mlp": 1.0540427, + "epoch": 0.034210130768074554, + "flos": 21473502437160.0, + "grad_norm": 1.6444190270865666, + "language_loss": 0.87359399, + "learning_rate": 3.999814194385413e-06, + "loss": 0.90223128, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.47827148, + "step": 569, + "time_per_iteration": 2.967419147491455 + }, + { + "auxiliary_loss_clip": 0.01762257, + "auxiliary_loss_mlp": 0.01096826, + "balance_loss_clip": 1.4530437, + "balance_loss_mlp": 1.05336261, + "epoch": 0.03427025402074252, + "flos": 18702135404280.0, + "grad_norm": 1.7177840256646424, + "language_loss": 0.95910203, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98769289, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.43457031, + "step": 570, + "time_per_iteration": 2.842526435852051 + }, + { + "auxiliary_loss_clip": 0.01768629, + "auxiliary_loss_mlp": 0.0110464, + "balance_loss_clip": 1.45264769, + "balance_loss_mlp": 1.06005609, + "epoch": 0.03433037727341049, + "flos": 20854879102800.0, + "grad_norm": 2.1479614070776356, + "language_loss": 0.81924963, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.84798229, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.44604492, + "step": 571, + "time_per_iteration": 2.8512494564056396 + }, + { + "auxiliary_loss_clip": 0.01743831, + "auxiliary_loss_mlp": 0.01094312, + "balance_loss_clip": 1.44124544, + "balance_loss_mlp": 1.05437708, + "epoch": 0.03439050052607846, + "flos": 25415714824200.0, + "grad_norm": 3.967954542481707, + "language_loss": 0.82146871, + "learning_rate": 3.999797927188199e-06, + "loss": 0.84985018, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.39941406, + "step": 572, + "time_per_iteration": 2.813002824783325 + }, + { + "auxiliary_loss_clip": 0.01751902, + "auxiliary_loss_mlp": 0.01088122, + "balance_loss_clip": 1.44146979, + "balance_loss_mlp": 1.04487252, + "epoch": 0.03445062377874643, + "flos": 17644739472720.0, + "grad_norm": 1.8496332574759744, + "language_loss": 0.855461, + "learning_rate": 3.999792353123774e-06, + "loss": 0.88386124, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.43237305, + "step": 573, + "time_per_iteration": 2.772002696990967 + }, + { + "auxiliary_loss_clip": 0.01748283, + "auxiliary_loss_mlp": 0.01087411, + "balance_loss_clip": 1.43973398, + "balance_loss_mlp": 1.04456675, + "epoch": 0.0345107470314144, + "flos": 16768981046160.0, + "grad_norm": 2.5376663261191745, + "language_loss": 0.78367221, + "learning_rate": 3.999786703227023e-06, + "loss": 0.81202918, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.42871094, + "step": 574, + "time_per_iteration": 2.76306414604187 + }, + { + "auxiliary_loss_clip": 0.01756062, + "auxiliary_loss_mlp": 0.01093422, + "balance_loss_clip": 1.45111871, + "balance_loss_mlp": 1.05265188, + "epoch": 0.03457087028408237, + "flos": 14688608091120.0, + "grad_norm": 16.56152950394363, + "language_loss": 0.84192848, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.87042332, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.40795898, + "step": 575, + "time_per_iteration": 2.874455213546753 + }, + { + "auxiliary_loss_clip": 0.01747901, + "auxiliary_loss_mlp": 0.0109001, + "balance_loss_clip": 1.44778562, + "balance_loss_mlp": 1.05234003, + "epoch": 0.03463099353675034, + "flos": 20016138519360.0, + "grad_norm": 2.1750209934659783, + "language_loss": 0.8459999, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.87437904, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.37695312, + "step": 576, + "time_per_iteration": 2.7791121006011963 + }, + { + "auxiliary_loss_clip": 0.01745122, + "auxiliary_loss_mlp": 0.01093248, + "balance_loss_clip": 1.44822383, + "balance_loss_mlp": 1.05440915, + "epoch": 0.03469111678941831, + "flos": 25306610496120.0, + "grad_norm": 3.4331663568900423, + "language_loss": 0.8709166, + "learning_rate": 3.99976929854497e-06, + "loss": 0.89930028, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.38818359, + "step": 577, + "time_per_iteration": 2.8115310668945312 + }, + { + "auxiliary_loss_clip": 0.01749428, + "auxiliary_loss_mlp": 0.01083397, + "balance_loss_clip": 1.44977355, + "balance_loss_mlp": 1.04379547, + "epoch": 0.034751240042086275, + "flos": 23264798501880.0, + "grad_norm": 2.401807194905276, + "language_loss": 0.72965193, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.75798017, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.39599609, + "step": 578, + "time_per_iteration": 2.826514720916748 + }, + { + "auxiliary_loss_clip": 0.01754736, + "auxiliary_loss_mlp": 0.01093289, + "balance_loss_clip": 1.44655859, + "balance_loss_mlp": 1.05161369, + "epoch": 0.03481136329475425, + "flos": 23774561158320.0, + "grad_norm": 1.8368869152392482, + "language_loss": 0.78290892, + "learning_rate": 3.999757316265973e-06, + "loss": 0.81138909, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.41674805, + "step": 579, + "time_per_iteration": 2.8889365196228027 + }, + { + "auxiliary_loss_clip": 0.01735259, + "auxiliary_loss_mlp": 0.01088432, + "balance_loss_clip": 1.43193436, + "balance_loss_mlp": 1.04766262, + "epoch": 0.03487148654742222, + "flos": 20162423124000.0, + "grad_norm": 1.9095878247123392, + "language_loss": 0.87365675, + "learning_rate": 3.999751211379863e-06, + "loss": 0.90189362, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.40771484, + "step": 580, + "time_per_iteration": 2.837171792984009 + }, + { + "auxiliary_loss_clip": 0.01751215, + "auxiliary_loss_mlp": 0.01095138, + "balance_loss_clip": 1.44037247, + "balance_loss_mlp": 1.05556083, + "epoch": 0.034931609800090184, + "flos": 15673876929360.0, + "grad_norm": 2.305129243031845, + "language_loss": 0.83703905, + "learning_rate": 3.999745030662987e-06, + "loss": 0.8655026, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.39624023, + "step": 581, + "time_per_iteration": 2.786439895629883 + }, + { + "auxiliary_loss_clip": 0.01750792, + "auxiliary_loss_mlp": 0.0108642, + "balance_loss_clip": 1.44736147, + "balance_loss_mlp": 1.04674673, + "epoch": 0.034991733052758156, + "flos": 16366698383400.0, + "grad_norm": 2.0870930112077177, + "language_loss": 0.77303213, + "learning_rate": 3.99973877411558e-06, + "loss": 0.80140424, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.39648438, + "step": 582, + "time_per_iteration": 2.8386776447296143 + }, + { + "auxiliary_loss_clip": 0.01728993, + "auxiliary_loss_mlp": 0.01088957, + "balance_loss_clip": 1.4344523, + "balance_loss_mlp": 1.04885471, + "epoch": 0.03505185630542612, + "flos": 19391586364440.0, + "grad_norm": 2.1844683430298275, + "language_loss": 0.8849799, + "learning_rate": 3.999732441737877e-06, + "loss": 0.91315937, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.40112305, + "step": 583, + "time_per_iteration": 2.795156240463257 + }, + { + "auxiliary_loss_clip": 0.01742328, + "auxiliary_loss_mlp": 0.01097641, + "balance_loss_clip": 1.43550491, + "balance_loss_mlp": 1.05675244, + "epoch": 0.03511197955809409, + "flos": 21328720341840.0, + "grad_norm": 2.20801872897732, + "language_loss": 0.81766081, + "learning_rate": 3.99972603353012e-06, + "loss": 0.84606045, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.40869141, + "step": 584, + "time_per_iteration": 2.813551187515259 + }, + { + "auxiliary_loss_clip": 0.01744226, + "auxiliary_loss_mlp": 0.01094668, + "balance_loss_clip": 1.43769157, + "balance_loss_mlp": 1.05458975, + "epoch": 0.035172102810762065, + "flos": 14140609340760.0, + "grad_norm": 3.198775391256021, + "language_loss": 0.94106418, + "learning_rate": 3.999719549492551e-06, + "loss": 0.96945316, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.40112305, + "step": 585, + "time_per_iteration": 2.752291679382324 + }, + { + "auxiliary_loss_clip": 0.01742077, + "auxiliary_loss_mlp": 0.01094137, + "balance_loss_clip": 1.43573427, + "balance_loss_mlp": 1.05224657, + "epoch": 0.03523222606343003, + "flos": 20300951531880.0, + "grad_norm": 2.270955836382352, + "language_loss": 0.88876706, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.91712916, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.41870117, + "step": 586, + "time_per_iteration": 2.7953808307647705 + }, + { + "auxiliary_loss_clip": 0.01750359, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_clip": 1.44261098, + "balance_loss_mlp": 1.06193101, + "epoch": 0.035292349316098, + "flos": 20380672388520.0, + "grad_norm": 2.1390082467596323, + "language_loss": 0.78115499, + "learning_rate": 3.999706353928965e-06, + "loss": 0.80968821, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.41064453, + "step": 587, + "time_per_iteration": 2.7907397747039795 + }, + { + "auxiliary_loss_clip": 0.01751428, + "auxiliary_loss_mlp": 0.01097998, + "balance_loss_clip": 1.43791378, + "balance_loss_mlp": 1.05696595, + "epoch": 0.03535247256876597, + "flos": 21473664870600.0, + "grad_norm": 1.6770822960330563, + "language_loss": 0.79651845, + "learning_rate": 3.999699642403449e-06, + "loss": 0.82501262, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.41015625, + "step": 588, + "time_per_iteration": 2.8099541664123535 + }, + { + "auxiliary_loss_clip": 0.01736793, + "auxiliary_loss_mlp": 0.01090868, + "balance_loss_clip": 1.43001008, + "balance_loss_mlp": 1.05126667, + "epoch": 0.03541259582143394, + "flos": 23628317162040.0, + "grad_norm": 3.0988097880212933, + "language_loss": 0.95518154, + "learning_rate": 3.99969285504912e-06, + "loss": 0.98345816, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.39599609, + "step": 589, + "time_per_iteration": 2.8509373664855957 + }, + { + "auxiliary_loss_clip": 0.01745118, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_clip": 1.43540907, + "balance_loss_mlp": 1.0550468, + "epoch": 0.03547271907410191, + "flos": 33732536682960.0, + "grad_norm": 5.179482706109855, + "language_loss": 0.85201859, + "learning_rate": 3.99968599186624e-06, + "loss": 0.88039744, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.37719727, + "step": 590, + "time_per_iteration": 2.913105010986328 + }, + { + "auxiliary_loss_clip": 0.01735029, + "auxiliary_loss_mlp": 0.01098065, + "balance_loss_clip": 1.43200517, + "balance_loss_mlp": 1.05956066, + "epoch": 0.03553284232676988, + "flos": 21147732570600.0, + "grad_norm": 2.3641024332362925, + "language_loss": 0.88000262, + "learning_rate": 3.999679052855065e-06, + "loss": 0.90833354, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.38476562, + "step": 591, + "time_per_iteration": 2.8285813331604004 + }, + { + "auxiliary_loss_clip": 0.01744404, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_clip": 1.43128991, + "balance_loss_mlp": 1.0535804, + "epoch": 0.03559296557943785, + "flos": 20051288377920.0, + "grad_norm": 1.8725946561927362, + "language_loss": 0.84146702, + "learning_rate": 3.999672038015861e-06, + "loss": 0.86987388, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.42700195, + "step": 592, + "time_per_iteration": 2.7801244258880615 + }, + { + "auxiliary_loss_clip": 0.01567414, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.36842442, + "balance_loss_mlp": 1.01906681, + "epoch": 0.035653088832105814, + "flos": 60350324690160.0, + "grad_norm": 1.0125048867453201, + "language_loss": 0.5978421, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62385559, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.1484375, + "step": 593, + "time_per_iteration": 3.3308262825012207 + }, + { + "auxiliary_loss_clip": 0.01740392, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_clip": 1.43685114, + "balance_loss_mlp": 1.06177592, + "epoch": 0.035713212084773786, + "flos": 20116999350360.0, + "grad_norm": 1.7456960205318517, + "language_loss": 0.8786788, + "learning_rate": 3.999657780854429e-06, + "loss": 0.90709198, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.39160156, + "step": 594, + "time_per_iteration": 2.8005306720733643 + }, + { + "auxiliary_loss_clip": 0.0173771, + "auxiliary_loss_mlp": 0.01110867, + "balance_loss_clip": 1.43239284, + "balance_loss_mlp": 1.07183814, + "epoch": 0.03577333533744176, + "flos": 26291310817320.0, + "grad_norm": 2.0921520172977814, + "language_loss": 0.8526392, + "learning_rate": 3.999650538532742e-06, + "loss": 0.88112503, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.39013672, + "step": 595, + "time_per_iteration": 2.8147616386413574 + }, + { + "auxiliary_loss_clip": 0.01751206, + "auxiliary_loss_mlp": 0.01120724, + "balance_loss_clip": 1.44093156, + "balance_loss_mlp": 1.07954931, + "epoch": 0.035833458590109724, + "flos": 10893370650840.0, + "grad_norm": 2.347009282390936, + "language_loss": 0.96984994, + "learning_rate": 3.999643220384106e-06, + "loss": 0.99856931, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.41137695, + "step": 596, + "time_per_iteration": 2.788520574569702 + }, + { + "auxiliary_loss_clip": 0.0173511, + "auxiliary_loss_mlp": 0.01110852, + "balance_loss_clip": 1.42947364, + "balance_loss_mlp": 1.07258594, + "epoch": 0.035893581842777696, + "flos": 22095212006880.0, + "grad_norm": 2.2297648139524044, + "language_loss": 0.84408557, + "learning_rate": 3.999635826408799e-06, + "loss": 0.87254518, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.38256836, + "step": 597, + "time_per_iteration": 5.7131876945495605 + }, + { + "auxiliary_loss_clip": 0.01733245, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_clip": 1.43300986, + "balance_loss_mlp": 1.07226801, + "epoch": 0.03595370509544566, + "flos": 23043422393640.0, + "grad_norm": 1.7216115951763729, + "language_loss": 0.81998861, + "learning_rate": 3.999628356607101e-06, + "loss": 0.84845459, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.41088867, + "step": 598, + "time_per_iteration": 2.788677930831909 + }, + { + "auxiliary_loss_clip": 0.01728662, + "auxiliary_loss_mlp": 0.01108152, + "balance_loss_clip": 1.433496, + "balance_loss_mlp": 1.06991005, + "epoch": 0.03601382834811363, + "flos": 20782711401120.0, + "grad_norm": 1.7312633635271835, + "language_loss": 0.81749219, + "learning_rate": 3.999620810979295e-06, + "loss": 0.84586036, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.38232422, + "step": 599, + "time_per_iteration": 4.392529487609863 + }, + { + "auxiliary_loss_clip": 0.01751067, + "auxiliary_loss_mlp": 0.01103165, + "balance_loss_clip": 1.434829, + "balance_loss_mlp": 1.0651139, + "epoch": 0.036073951600781605, + "flos": 23956970222160.0, + "grad_norm": 2.325842325412506, + "language_loss": 0.87631816, + "learning_rate": 3.999613189525668e-06, + "loss": 0.9048605, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.38061523, + "step": 600, + "time_per_iteration": 4.301169157028198 + }, + { + "auxiliary_loss_clip": 0.01727249, + "auxiliary_loss_mlp": 0.01109755, + "balance_loss_clip": 1.42476606, + "balance_loss_mlp": 1.07039189, + "epoch": 0.03613407485344957, + "flos": 18916405049520.0, + "grad_norm": 2.431640480138494, + "language_loss": 0.8332901, + "learning_rate": 3.999605492246508e-06, + "loss": 0.86166018, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.39379883, + "step": 601, + "time_per_iteration": 2.907844066619873 + }, + { + "auxiliary_loss_clip": 0.01728806, + "auxiliary_loss_mlp": 0.01097057, + "balance_loss_clip": 1.4283967, + "balance_loss_mlp": 1.05576324, + "epoch": 0.03619419810611754, + "flos": 23043625435440.0, + "grad_norm": 2.9523393638864226, + "language_loss": 0.76553237, + "learning_rate": 3.999597719142107e-06, + "loss": 0.793791, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.41333008, + "step": 602, + "time_per_iteration": 2.8291308879852295 + }, + { + "auxiliary_loss_clip": 0.01728474, + "auxiliary_loss_mlp": 0.01096246, + "balance_loss_clip": 1.4296422, + "balance_loss_mlp": 1.05657291, + "epoch": 0.03625432135878551, + "flos": 29463417395280.0, + "grad_norm": 1.8884659869988865, + "language_loss": 0.80588901, + "learning_rate": 3.999589870212761e-06, + "loss": 0.83413625, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.39697266, + "step": 603, + "time_per_iteration": 2.8920738697052 + }, + { + "auxiliary_loss_clip": 0.01730172, + "auxiliary_loss_mlp": 0.01086343, + "balance_loss_clip": 1.43323588, + "balance_loss_mlp": 1.04876781, + "epoch": 0.03631444461145348, + "flos": 23513121579960.0, + "grad_norm": 9.451361093860513, + "language_loss": 0.8738302, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.9019953, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.37548828, + "step": 604, + "time_per_iteration": 2.8348987102508545 + }, + { + "auxiliary_loss_clip": 0.0173746, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_clip": 1.43287218, + "balance_loss_mlp": 1.04374373, + "epoch": 0.03637456786412145, + "flos": 16622534008080.0, + "grad_norm": 4.434921423489694, + "language_loss": 0.82926822, + "learning_rate": 3.999573944880424e-06, + "loss": 0.85748661, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.40649414, + "step": 605, + "time_per_iteration": 2.843594789505005 + }, + { + "auxiliary_loss_clip": 0.01740539, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_clip": 1.43512392, + "balance_loss_mlp": 1.0545516, + "epoch": 0.03643469111678942, + "flos": 15856123559760.0, + "grad_norm": 3.3369841348265434, + "language_loss": 0.87433827, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.90265536, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.36645508, + "step": 606, + "time_per_iteration": 2.837759494781494 + }, + { + "auxiliary_loss_clip": 0.01751282, + "auxiliary_loss_mlp": 0.01092237, + "balance_loss_clip": 1.43904829, + "balance_loss_mlp": 1.05416143, + "epoch": 0.03649481436945739, + "flos": 23625312143400.0, + "grad_norm": 2.025236029325102, + "language_loss": 0.82923043, + "learning_rate": 3.999557716251912e-06, + "loss": 0.85766566, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 607, + "time_per_iteration": 2.865360736846924 + }, + { + "auxiliary_loss_clip": 0.01735353, + "auxiliary_loss_mlp": 0.01095741, + "balance_loss_clip": 1.43577909, + "balance_loss_mlp": 1.05702162, + "epoch": 0.036554937622125354, + "flos": 21759736742280.0, + "grad_norm": 2.3367841061464936, + "language_loss": 0.84290671, + "learning_rate": 3.999549488202358e-06, + "loss": 0.87121767, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.38720703, + "step": 608, + "time_per_iteration": 2.8709702491760254 + }, + { + "auxiliary_loss_clip": 0.01745647, + "auxiliary_loss_mlp": 0.0108554, + "balance_loss_clip": 1.43978739, + "balance_loss_mlp": 1.0455339, + "epoch": 0.036615060874793326, + "flos": 17824021692840.0, + "grad_norm": 7.987029582003362, + "language_loss": 0.82813358, + "learning_rate": 3.999541184329688e-06, + "loss": 0.85644549, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.39990234, + "step": 609, + "time_per_iteration": 2.893648386001587 + }, + { + "auxiliary_loss_clip": 0.01760163, + "auxiliary_loss_mlp": 0.01105005, + "balance_loss_clip": 1.44912624, + "balance_loss_mlp": 1.06490266, + "epoch": 0.0366751841274613, + "flos": 26758573502040.0, + "grad_norm": 1.8926715319763552, + "language_loss": 0.8075465, + "learning_rate": 3.999532804634215e-06, + "loss": 0.83619815, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.40112305, + "step": 610, + "time_per_iteration": 2.8634402751922607 + }, + { + "auxiliary_loss_clip": 0.01753538, + "auxiliary_loss_mlp": 0.01095842, + "balance_loss_clip": 1.44206309, + "balance_loss_mlp": 1.05752778, + "epoch": 0.03673530738012926, + "flos": 22201636183200.0, + "grad_norm": 2.156802731823911, + "language_loss": 0.88784671, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.91634053, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.38330078, + "step": 611, + "time_per_iteration": 2.801431179046631 + }, + { + "auxiliary_loss_clip": 0.01748639, + "auxiliary_loss_mlp": 0.01112592, + "balance_loss_clip": 1.44572532, + "balance_loss_mlp": 1.07516074, + "epoch": 0.036795430632797235, + "flos": 24687134386200.0, + "grad_norm": 2.002265820771011, + "language_loss": 0.73767447, + "learning_rate": 3.999515817776136e-06, + "loss": 0.76628673, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.37451172, + "step": 612, + "time_per_iteration": 2.9010725021362305 + }, + { + "auxiliary_loss_clip": 0.01752863, + "auxiliary_loss_mlp": 0.01094777, + "balance_loss_clip": 1.44407976, + "balance_loss_mlp": 1.05527091, + "epoch": 0.0368555538854652, + "flos": 17753437717200.0, + "grad_norm": 2.48942198401856, + "language_loss": 0.81012422, + "learning_rate": 3.999507210614175e-06, + "loss": 0.83860058, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.39526367, + "step": 613, + "time_per_iteration": 2.8434197902679443 + }, + { + "auxiliary_loss_clip": 0.01734666, + "auxiliary_loss_mlp": 0.01089463, + "balance_loss_clip": 1.43302739, + "balance_loss_mlp": 1.04943299, + "epoch": 0.03691567713813317, + "flos": 20599408953360.0, + "grad_norm": 2.041949573750035, + "language_loss": 0.94933975, + "learning_rate": 3.9994985276307e-06, + "loss": 0.97758108, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.40014648, + "step": 614, + "time_per_iteration": 2.794320821762085 + }, + { + "auxiliary_loss_clip": 0.01759113, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_clip": 1.44492245, + "balance_loss_mlp": 1.05670702, + "epoch": 0.036975800390801145, + "flos": 33656145711840.0, + "grad_norm": 2.943233209660226, + "language_loss": 0.75844663, + "learning_rate": 3.999489768826041e-06, + "loss": 0.78704214, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.43701172, + "step": 615, + "time_per_iteration": 2.9676060676574707 + }, + { + "auxiliary_loss_clip": 0.01751844, + "auxiliary_loss_mlp": 0.0109246, + "balance_loss_clip": 1.44088995, + "balance_loss_mlp": 1.05474162, + "epoch": 0.03703592364346911, + "flos": 28299637895760.0, + "grad_norm": 1.6729741993166005, + "language_loss": 0.82939029, + "learning_rate": 3.999480934200528e-06, + "loss": 0.85783339, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.37744141, + "step": 616, + "time_per_iteration": 2.8732004165649414 + }, + { + "auxiliary_loss_clip": 0.01755742, + "auxiliary_loss_mlp": 0.0107578, + "balance_loss_clip": 1.44717348, + "balance_loss_mlp": 1.03801501, + "epoch": 0.03709604689613708, + "flos": 31510549084680.0, + "grad_norm": 2.273830704833927, + "language_loss": 0.69045889, + "learning_rate": 3.999472023754499e-06, + "loss": 0.71877414, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.37744141, + "step": 617, + "time_per_iteration": 2.8734357357025146 + }, + { + "auxiliary_loss_clip": 0.01751877, + "auxiliary_loss_mlp": 0.01085182, + "balance_loss_clip": 1.44483376, + "balance_loss_mlp": 1.04684377, + "epoch": 0.03715617014880505, + "flos": 19614099506760.0, + "grad_norm": 2.282521419999474, + "language_loss": 0.81571782, + "learning_rate": 3.99946303748829e-06, + "loss": 0.84408832, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.38354492, + "step": 618, + "time_per_iteration": 2.779531240463257 + }, + { + "auxiliary_loss_clip": 0.01748924, + "auxiliary_loss_mlp": 0.01092003, + "balance_loss_clip": 1.43858421, + "balance_loss_mlp": 1.04746604, + "epoch": 0.03721629340147302, + "flos": 15928413086520.0, + "grad_norm": 2.2742950313295505, + "language_loss": 0.91571593, + "learning_rate": 3.999453975402242e-06, + "loss": 0.94412518, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.4453125, + "step": 619, + "time_per_iteration": 2.8329262733459473 + }, + { + "auxiliary_loss_clip": 0.01744876, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_clip": 1.43956399, + "balance_loss_mlp": 1.06697071, + "epoch": 0.03727641665414099, + "flos": 21108846742920.0, + "grad_norm": 2.231965932463618, + "language_loss": 0.95431936, + "learning_rate": 3.9994448374967e-06, + "loss": 0.98285508, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.41723633, + "step": 620, + "time_per_iteration": 3.024001359939575 + }, + { + "auxiliary_loss_clip": 0.01755223, + "auxiliary_loss_mlp": 0.01082734, + "balance_loss_clip": 1.44457257, + "balance_loss_mlp": 1.04184532, + "epoch": 0.037336539906808956, + "flos": 24136739742600.0, + "grad_norm": 1.906430094090924, + "language_loss": 0.77816564, + "learning_rate": 3.999435623772008e-06, + "loss": 0.80654526, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.40844727, + "step": 621, + "time_per_iteration": 2.8849263191223145 + }, + { + "auxiliary_loss_clip": 0.01747407, + "auxiliary_loss_mlp": 0.01087135, + "balance_loss_clip": 1.44627738, + "balance_loss_mlp": 1.04877329, + "epoch": 0.03739666315947693, + "flos": 22351534931880.0, + "grad_norm": 2.482151897048137, + "language_loss": 0.87663877, + "learning_rate": 3.999426334228518e-06, + "loss": 0.90498418, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.38354492, + "step": 622, + "time_per_iteration": 2.885483741760254 + }, + { + "auxiliary_loss_clip": 0.01748024, + "auxiliary_loss_mlp": 0.01090245, + "balance_loss_clip": 1.43884981, + "balance_loss_mlp": 1.05057192, + "epoch": 0.0374567864121449, + "flos": 20454464424600.0, + "grad_norm": 2.2485369728315936, + "language_loss": 0.90818071, + "learning_rate": 3.999416968866581e-06, + "loss": 0.93656343, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.39697266, + "step": 623, + "time_per_iteration": 2.854170560836792 + }, + { + "auxiliary_loss_clip": 0.01754813, + "auxiliary_loss_mlp": 0.01089051, + "balance_loss_clip": 1.44776309, + "balance_loss_mlp": 1.05102277, + "epoch": 0.037516909664812866, + "flos": 19212710227920.0, + "grad_norm": 1.999283194214831, + "language_loss": 0.85054648, + "learning_rate": 3.999407527686551e-06, + "loss": 0.87898511, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.38037109, + "step": 624, + "time_per_iteration": 2.8516900539398193 + }, + { + "auxiliary_loss_clip": 0.01749057, + "auxiliary_loss_mlp": 0.01083409, + "balance_loss_clip": 1.44161022, + "balance_loss_mlp": 1.04406965, + "epoch": 0.03757703291748084, + "flos": 35011796023080.0, + "grad_norm": 3.5005748387148667, + "language_loss": 0.68101943, + "learning_rate": 3.999398010688788e-06, + "loss": 0.70934409, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.39331055, + "step": 625, + "time_per_iteration": 2.94978928565979 + }, + { + "auxiliary_loss_clip": 0.01741754, + "auxiliary_loss_mlp": 0.01101268, + "balance_loss_clip": 1.43973851, + "balance_loss_mlp": 1.05985475, + "epoch": 0.0376371561701488, + "flos": 25489425643560.0, + "grad_norm": 3.0119481257040475, + "language_loss": 0.77855194, + "learning_rate": 3.999388417873652e-06, + "loss": 0.80698216, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.4140625, + "step": 626, + "time_per_iteration": 2.8605289459228516 + }, + { + "auxiliary_loss_clip": 0.01750518, + "auxiliary_loss_mlp": 0.01101321, + "balance_loss_clip": 1.43952668, + "balance_loss_mlp": 1.06021762, + "epoch": 0.037697279422816775, + "flos": 18190058071320.0, + "grad_norm": 2.4605131815225687, + "language_loss": 0.81723076, + "learning_rate": 3.999378749241506e-06, + "loss": 0.84574914, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.41064453, + "step": 627, + "time_per_iteration": 2.8594563007354736 + }, + { + "auxiliary_loss_clip": 0.01749234, + "auxiliary_loss_mlp": 0.01089382, + "balance_loss_clip": 1.43909192, + "balance_loss_mlp": 1.05097294, + "epoch": 0.03775740267548475, + "flos": 24649629242760.0, + "grad_norm": 1.7509117102495069, + "language_loss": 0.89278579, + "learning_rate": 3.999369004792719e-06, + "loss": 0.92117202, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.38378906, + "step": 628, + "time_per_iteration": 2.9065074920654297 + }, + { + "auxiliary_loss_clip": 0.017412, + "auxiliary_loss_mlp": 0.01090619, + "balance_loss_clip": 1.43355143, + "balance_loss_mlp": 1.04930091, + "epoch": 0.03781752592815271, + "flos": 21293205008040.0, + "grad_norm": 3.401379602899984, + "language_loss": 0.80678099, + "learning_rate": 3.999359184527658e-06, + "loss": 0.83509922, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.41308594, + "step": 629, + "time_per_iteration": 2.8786680698394775 + }, + { + "auxiliary_loss_clip": 0.01742818, + "auxiliary_loss_mlp": 0.01084941, + "balance_loss_clip": 1.43443763, + "balance_loss_mlp": 1.04538727, + "epoch": 0.037877649180820684, + "flos": 22094562273120.0, + "grad_norm": 2.7648804268783196, + "language_loss": 0.78046721, + "learning_rate": 3.999349288446696e-06, + "loss": 0.80874479, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.39526367, + "step": 630, + "time_per_iteration": 2.8237051963806152 + }, + { + "auxiliary_loss_clip": 0.01759111, + "auxiliary_loss_mlp": 0.01080896, + "balance_loss_clip": 1.44400358, + "balance_loss_mlp": 1.04177189, + "epoch": 0.03793777243348865, + "flos": 14505386860080.0, + "grad_norm": 2.706297259923188, + "language_loss": 0.92652941, + "learning_rate": 3.99933931655021e-06, + "loss": 0.95492947, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.39135742, + "step": 631, + "time_per_iteration": 2.8244364261627197 + }, + { + "auxiliary_loss_clip": 0.01723078, + "auxiliary_loss_mlp": 0.01107726, + "balance_loss_clip": 1.42933178, + "balance_loss_mlp": 1.06416738, + "epoch": 0.03799789568615662, + "flos": 21913493285160.0, + "grad_norm": 1.595570176205809, + "language_loss": 0.92544079, + "learning_rate": 3.999329268838575e-06, + "loss": 0.95374876, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.43554688, + "step": 632, + "time_per_iteration": 2.823334217071533 + }, + { + "auxiliary_loss_clip": 0.01740085, + "auxiliary_loss_mlp": 0.01080313, + "balance_loss_clip": 1.43630886, + "balance_loss_mlp": 1.04402542, + "epoch": 0.03805801893882459, + "flos": 24832444390200.0, + "grad_norm": 1.755893524570022, + "language_loss": 0.84784949, + "learning_rate": 3.999319145312175e-06, + "loss": 0.87605351, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.36303711, + "step": 633, + "time_per_iteration": 2.8094611167907715 + }, + { + "auxiliary_loss_clip": 0.0174841, + "auxiliary_loss_mlp": 0.01088436, + "balance_loss_clip": 1.44134486, + "balance_loss_mlp": 1.04883432, + "epoch": 0.03811814219149256, + "flos": 30489196395600.0, + "grad_norm": 3.9342579356874015, + "language_loss": 0.70433128, + "learning_rate": 3.999308945971392e-06, + "loss": 0.73269969, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.39599609, + "step": 634, + "time_per_iteration": 2.864084482192993 + }, + { + "auxiliary_loss_clip": 0.01533423, + "auxiliary_loss_mlp": 0.01021111, + "balance_loss_clip": 1.33436966, + "balance_loss_mlp": 1.00489891, + "epoch": 0.03817826544416053, + "flos": 67006703912040.0, + "grad_norm": 0.9106168596212594, + "language_loss": 0.61576116, + "learning_rate": 3.999298670816614e-06, + "loss": 0.64130652, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.16210938, + "step": 635, + "time_per_iteration": 3.276822328567505 + }, + { + "auxiliary_loss_clip": 0.01737059, + "auxiliary_loss_mlp": 0.01089987, + "balance_loss_clip": 1.43932056, + "balance_loss_mlp": 1.05169666, + "epoch": 0.038238388696828496, + "flos": 20490223408560.0, + "grad_norm": 2.2869644470757295, + "language_loss": 0.84343415, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.87170458, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.38305664, + "step": 636, + "time_per_iteration": 5.819219350814819 + }, + { + "auxiliary_loss_clip": 0.01737545, + "auxiliary_loss_mlp": 0.01096452, + "balance_loss_clip": 1.43270409, + "balance_loss_mlp": 1.05968809, + "epoch": 0.03829851194949647, + "flos": 17970184472400.0, + "grad_norm": 2.4351804213978037, + "language_loss": 0.80949461, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8378346, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.36767578, + "step": 637, + "time_per_iteration": 4.267074108123779 + }, + { + "auxiliary_loss_clip": 0.01754691, + "auxiliary_loss_mlp": 0.01098617, + "balance_loss_clip": 1.44440448, + "balance_loss_mlp": 1.06020796, + "epoch": 0.03835863520216444, + "flos": 22461776294040.0, + "grad_norm": 1.7716103371246885, + "language_loss": 0.84803224, + "learning_rate": 3.999267390472215e-06, + "loss": 0.87656534, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.38427734, + "step": 638, + "time_per_iteration": 2.818012237548828 + }, + { + "auxiliary_loss_clip": 0.01759171, + "auxiliary_loss_mlp": 0.01091515, + "balance_loss_clip": 1.44519198, + "balance_loss_mlp": 1.05167532, + "epoch": 0.038418758454832405, + "flos": 22169613168360.0, + "grad_norm": 2.805135622758442, + "language_loss": 0.7149272, + "learning_rate": 3.999256812065381e-06, + "loss": 0.74343407, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.3984375, + "step": 639, + "time_per_iteration": 4.3317694664001465 + }, + { + "auxiliary_loss_clip": 0.01756238, + "auxiliary_loss_mlp": 0.01097214, + "balance_loss_clip": 1.44525886, + "balance_loss_mlp": 1.05739808, + "epoch": 0.03847888170750038, + "flos": 22752599343840.0, + "grad_norm": 3.346745423146823, + "language_loss": 0.8701297, + "learning_rate": 3.999246157846526e-06, + "loss": 0.89866424, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.39770508, + "step": 640, + "time_per_iteration": 2.828613758087158 + }, + { + "auxiliary_loss_clip": 0.01755278, + "auxiliary_loss_mlp": 0.0110101, + "balance_loss_clip": 1.44458389, + "balance_loss_mlp": 1.06271958, + "epoch": 0.03853900496016834, + "flos": 22716393667920.0, + "grad_norm": 2.070582297019251, + "language_loss": 0.82809258, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.85665548, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.3828125, + "step": 641, + "time_per_iteration": 2.9044408798217773 + }, + { + "auxiliary_loss_clip": 0.01526439, + "auxiliary_loss_mlp": 0.01056177, + "balance_loss_clip": 1.33079827, + "balance_loss_mlp": 1.03023672, + "epoch": 0.038599128212836314, + "flos": 70415455341600.0, + "grad_norm": 0.9104580037357486, + "language_loss": 0.6541723, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67999846, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.25976562, + "step": 642, + "time_per_iteration": 3.2710366249084473 + }, + { + "auxiliary_loss_clip": 0.01732075, + "auxiliary_loss_mlp": 0.01083522, + "balance_loss_clip": 1.42990255, + "balance_loss_mlp": 1.04492211, + "epoch": 0.03865925146550429, + "flos": 23300435660760.0, + "grad_norm": 3.395660718588866, + "language_loss": 0.80416071, + "learning_rate": 3.999213740321906e-06, + "loss": 0.8323167, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.38598633, + "step": 643, + "time_per_iteration": 2.8422749042510986 + }, + { + "auxiliary_loss_clip": 0.01735825, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.43290079, + "balance_loss_mlp": 1.05841804, + "epoch": 0.03871937471817225, + "flos": 21434982084720.0, + "grad_norm": 2.0803482457684965, + "language_loss": 0.82874084, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.37158203, + "step": 644, + "time_per_iteration": 2.819693088531494 + }, + { + "auxiliary_loss_clip": 0.01736787, + "auxiliary_loss_mlp": 0.01080849, + "balance_loss_clip": 1.43683434, + "balance_loss_mlp": 1.04117572, + "epoch": 0.038779497970840224, + "flos": 34284027752280.0, + "grad_norm": 2.051927630089661, + "language_loss": 0.83214414, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.86032045, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.39697266, + "step": 645, + "time_per_iteration": 2.9832377433776855 + }, + { + "auxiliary_loss_clip": 0.01741189, + "auxiliary_loss_mlp": 0.01091667, + "balance_loss_clip": 1.43565106, + "balance_loss_mlp": 1.04937124, + "epoch": 0.03883962122350819, + "flos": 22753452119400.0, + "grad_norm": 2.8220874199143653, + "language_loss": 0.82625175, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.85458028, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.42260742, + "step": 646, + "time_per_iteration": 2.862657070159912 + }, + { + "auxiliary_loss_clip": 0.01734438, + "auxiliary_loss_mlp": 0.01091793, + "balance_loss_clip": 1.43567789, + "balance_loss_mlp": 1.05502939, + "epoch": 0.03889974447617616, + "flos": 21951039036960.0, + "grad_norm": 2.1075158281558974, + "language_loss": 0.83334893, + "learning_rate": 3.999169455612323e-06, + "loss": 0.86161125, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.3671875, + "step": 647, + "time_per_iteration": 2.83406662940979 + }, + { + "auxiliary_loss_clip": 0.01731253, + "auxiliary_loss_mlp": 0.01087508, + "balance_loss_clip": 1.43241727, + "balance_loss_mlp": 1.04497421, + "epoch": 0.03895986772884413, + "flos": 31511239426800.0, + "grad_norm": 3.5575508086839367, + "language_loss": 0.8543191, + "learning_rate": 3.999158194912106e-06, + "loss": 0.88250673, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.42529297, + "step": 648, + "time_per_iteration": 2.9108800888061523 + }, + { + "auxiliary_loss_clip": 0.01726677, + "auxiliary_loss_mlp": 0.01087058, + "balance_loss_clip": 1.42904854, + "balance_loss_mlp": 1.05050838, + "epoch": 0.0390199909815121, + "flos": 19905734723760.0, + "grad_norm": 2.5046816200704676, + "language_loss": 0.848943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.87708032, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.36547852, + "step": 649, + "time_per_iteration": 2.8608932495117188 + }, + { + "auxiliary_loss_clip": 0.01745238, + "auxiliary_loss_mlp": 0.01085224, + "balance_loss_clip": 1.44046378, + "balance_loss_mlp": 1.04824543, + "epoch": 0.03908011423418007, + "flos": 21616985064960.0, + "grad_norm": 1.864947516901381, + "language_loss": 0.8066839, + "learning_rate": 3.999135446087263e-06, + "loss": 0.83498859, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.37011719, + "step": 650, + "time_per_iteration": 2.8918373584747314 + }, + { + "auxiliary_loss_clip": 0.01725736, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.4276818, + "balance_loss_mlp": 1.04697776, + "epoch": 0.039140237486848035, + "flos": 18666092161800.0, + "grad_norm": 2.045965884681031, + "language_loss": 0.79903924, + "learning_rate": 3.9991239579635e-06, + "loss": 0.82714379, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.37792969, + "step": 651, + "time_per_iteration": 2.828561544418335 + }, + { + "auxiliary_loss_clip": 0.01737617, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_clip": 1.43507206, + "balance_loss_mlp": 1.0557065, + "epoch": 0.03920036073951601, + "flos": 18665767294920.0, + "grad_norm": 2.6758544973508593, + "language_loss": 0.88087642, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90920043, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.39086914, + "step": 652, + "time_per_iteration": 2.7891926765441895 + }, + { + "auxiliary_loss_clip": 0.01719264, + "auxiliary_loss_mlp": 0.01083103, + "balance_loss_clip": 1.42234027, + "balance_loss_mlp": 1.04719758, + "epoch": 0.03926048399218398, + "flos": 31360284860760.0, + "grad_norm": 2.525039279254495, + "language_loss": 0.81006682, + "learning_rate": 3.999100754295471e-06, + "loss": 0.83809054, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.35913086, + "step": 653, + "time_per_iteration": 2.890406370162964 + }, + { + "auxiliary_loss_clip": 0.01749505, + "auxiliary_loss_mlp": 0.01102786, + "balance_loss_clip": 1.43572211, + "balance_loss_mlp": 1.06344652, + "epoch": 0.039320607244851945, + "flos": 29609133482880.0, + "grad_norm": 2.007639892763759, + "language_loss": 0.86745465, + "learning_rate": 3.999089038752085e-06, + "loss": 0.89597756, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.39331055, + "step": 654, + "time_per_iteration": 2.854051351547241 + }, + { + "auxiliary_loss_clip": 0.01518395, + "auxiliary_loss_mlp": 0.01015327, + "balance_loss_clip": 1.32054687, + "balance_loss_mlp": 1.00054538, + "epoch": 0.03938073049751992, + "flos": 66550811568120.0, + "grad_norm": 0.7370503418145627, + "language_loss": 0.49911875, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5244559, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.14746094, + "step": 655, + "time_per_iteration": 3.353440999984741 + }, + { + "auxiliary_loss_clip": 0.0171786, + "auxiliary_loss_mlp": 0.01086649, + "balance_loss_clip": 1.42778361, + "balance_loss_mlp": 1.05091, + "epoch": 0.03944085375018788, + "flos": 23373090662760.0, + "grad_norm": 1.9299882861533924, + "language_loss": 0.81319112, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.84123623, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.35742188, + "step": 656, + "time_per_iteration": 2.810244560241699 + }, + { + "auxiliary_loss_clip": 0.01740666, + "auxiliary_loss_mlp": 0.0110137, + "balance_loss_clip": 1.43687844, + "balance_loss_mlp": 1.06060004, + "epoch": 0.039500977002855854, + "flos": 18552480305760.0, + "grad_norm": 2.4298533993958875, + "language_loss": 0.77131176, + "learning_rate": 3.999053437289776e-06, + "loss": 0.79973209, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.40771484, + "step": 657, + "time_per_iteration": 2.763482093811035 + }, + { + "auxiliary_loss_clip": 0.01732624, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_clip": 1.42704487, + "balance_loss_mlp": 1.05312014, + "epoch": 0.039561100255523826, + "flos": 25343709555960.0, + "grad_norm": 1.8621402222276953, + "language_loss": 0.81980097, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84805256, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.39379883, + "step": 658, + "time_per_iteration": 2.8646836280822754 + }, + { + "auxiliary_loss_clip": 0.01722994, + "auxiliary_loss_mlp": 0.01095474, + "balance_loss_clip": 1.4268173, + "balance_loss_mlp": 1.05828023, + "epoch": 0.03962122350819179, + "flos": 18224030287440.0, + "grad_norm": 1.982814223750767, + "language_loss": 0.91641301, + "learning_rate": 3.999029323959287e-06, + "loss": 0.94459766, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.37207031, + "step": 659, + "time_per_iteration": 2.8513975143432617 + }, + { + "auxiliary_loss_clip": 0.01727173, + "auxiliary_loss_mlp": 0.01083347, + "balance_loss_clip": 1.42758393, + "balance_loss_mlp": 1.04562902, + "epoch": 0.03968134676085976, + "flos": 20527160034960.0, + "grad_norm": 2.6601875945637046, + "language_loss": 0.8039099, + "learning_rate": 3.999017153588724e-06, + "loss": 0.83201516, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.37719727, + "step": 660, + "time_per_iteration": 2.873192548751831 + }, + { + "auxiliary_loss_clip": 0.01724966, + "auxiliary_loss_mlp": 0.01078195, + "balance_loss_clip": 1.42885447, + "balance_loss_mlp": 1.03940439, + "epoch": 0.03974147001352773, + "flos": 22428169553160.0, + "grad_norm": 1.859246464909579, + "language_loss": 0.82336426, + "learning_rate": 3.999004907415231e-06, + "loss": 0.85139591, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.38818359, + "step": 661, + "time_per_iteration": 2.929036855697632 + }, + { + "auxiliary_loss_clip": 0.01502961, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.30856681, + "balance_loss_mlp": 1.01454663, + "epoch": 0.0398015932661957, + "flos": 71145172813680.0, + "grad_norm": 0.9180318166943674, + "language_loss": 0.69391674, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71922725, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.13574219, + "step": 662, + "time_per_iteration": 3.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01726004, + "auxiliary_loss_mlp": 0.01094943, + "balance_loss_clip": 1.42456579, + "balance_loss_mlp": 1.05801153, + "epoch": 0.03986171651886367, + "flos": 16805430372240.0, + "grad_norm": 1.7551748337250719, + "language_loss": 0.83617789, + "learning_rate": 3.998980187661314e-06, + "loss": 0.86438733, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.36938477, + "step": 663, + "time_per_iteration": 2.938512086868286 + }, + { + "auxiliary_loss_clip": 0.01734546, + "auxiliary_loss_mlp": 0.01092081, + "balance_loss_clip": 1.42996502, + "balance_loss_mlp": 1.05314708, + "epoch": 0.03992183977153164, + "flos": 24540443697960.0, + "grad_norm": 2.1821020424607305, + "language_loss": 0.88302648, + "learning_rate": 3.998967714081826e-06, + "loss": 0.91129279, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.38964844, + "step": 664, + "time_per_iteration": 2.903125047683716 + }, + { + "auxiliary_loss_clip": 0.01711253, + "auxiliary_loss_mlp": 0.01088615, + "balance_loss_clip": 1.42298031, + "balance_loss_mlp": 1.0490849, + "epoch": 0.03998196302419961, + "flos": 15600084893280.0, + "grad_norm": 2.2008175084949806, + "language_loss": 0.85569763, + "learning_rate": 3.998955164701281e-06, + "loss": 0.88369632, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.39526367, + "step": 665, + "time_per_iteration": 2.816967487335205 + }, + { + "auxiliary_loss_clip": 0.01724598, + "auxiliary_loss_mlp": 0.01092251, + "balance_loss_clip": 1.41919446, + "balance_loss_mlp": 1.05603576, + "epoch": 0.04004208627686758, + "flos": 25311036807360.0, + "grad_norm": 1.8301059425459878, + "language_loss": 0.82154739, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84971583, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.36230469, + "step": 666, + "time_per_iteration": 2.9347565174102783 + }, + { + "auxiliary_loss_clip": 0.01720717, + "auxiliary_loss_mlp": 0.01080161, + "balance_loss_clip": 1.4240284, + "balance_loss_mlp": 1.03953409, + "epoch": 0.04010220952953555, + "flos": 23480936131680.0, + "grad_norm": 2.6290050184411875, + "language_loss": 0.87601578, + "learning_rate": 3.998929838538932e-06, + "loss": 0.9040246, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.40625, + "step": 667, + "time_per_iteration": 2.8310601711273193 + }, + { + "auxiliary_loss_clip": 0.01715361, + "auxiliary_loss_mlp": 0.01090965, + "balance_loss_clip": 1.42197609, + "balance_loss_mlp": 1.05551195, + "epoch": 0.04016233278220352, + "flos": 18620668388160.0, + "grad_norm": 2.228015539645901, + "language_loss": 0.81536174, + "learning_rate": 3.998917061758087e-06, + "loss": 0.84342504, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.35449219, + "step": 668, + "time_per_iteration": 2.8028619289398193 + }, + { + "auxiliary_loss_clip": 0.01497672, + "auxiliary_loss_mlp": 0.01019964, + "balance_loss_clip": 1.30626273, + "balance_loss_mlp": 1.00756633, + "epoch": 0.040222456034871484, + "flos": 70922091154320.0, + "grad_norm": 0.7869263487618166, + "language_loss": 0.60066307, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62583947, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.12353516, + "step": 669, + "time_per_iteration": 3.386941432952881 + }, + { + "auxiliary_loss_clip": 0.01717994, + "auxiliary_loss_mlp": 0.01085658, + "balance_loss_clip": 1.41629577, + "balance_loss_mlp": 1.04979956, + "epoch": 0.040282579287539456, + "flos": 23769444504960.0, + "grad_norm": 1.8680892187398896, + "language_loss": 0.86558962, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.89362609, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.35864258, + "step": 670, + "time_per_iteration": 2.8324999809265137 + }, + { + "auxiliary_loss_clip": 0.01704442, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.41385853, + "balance_loss_mlp": 1.04827607, + "epoch": 0.04034270254020743, + "flos": 18483114580920.0, + "grad_norm": 1.955440199034746, + "language_loss": 0.75580102, + "learning_rate": 3.998878276622692e-06, + "loss": 0.78368413, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.35620117, + "step": 671, + "time_per_iteration": 2.831709861755371 + }, + { + "auxiliary_loss_clip": 0.01723769, + "auxiliary_loss_mlp": 0.01096318, + "balance_loss_clip": 1.42504239, + "balance_loss_mlp": 1.05926776, + "epoch": 0.040402825792875394, + "flos": 17206332350760.0, + "grad_norm": 2.2753433375111154, + "language_loss": 0.93194133, + "learning_rate": 3.998865196648242e-06, + "loss": 0.9601422, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.37060547, + "step": 672, + "time_per_iteration": 2.773070812225342 + }, + { + "auxiliary_loss_clip": 0.01716113, + "auxiliary_loss_mlp": 0.01089419, + "balance_loss_clip": 1.41788888, + "balance_loss_mlp": 1.04526401, + "epoch": 0.040462949045543366, + "flos": 19176910635600.0, + "grad_norm": 1.929787073447155, + "language_loss": 0.90896738, + "learning_rate": 3.998852040876622e-06, + "loss": 0.93702269, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.44140625, + "step": 673, + "time_per_iteration": 2.7839362621307373 + }, + { + "auxiliary_loss_clip": 0.01702514, + "auxiliary_loss_mlp": 0.01103538, + "balance_loss_clip": 1.41371822, + "balance_loss_mlp": 1.06489086, + "epoch": 0.04052307229821133, + "flos": 24024549179160.0, + "grad_norm": 2.0331061978724914, + "language_loss": 0.76011288, + "learning_rate": 3.998838809308334e-06, + "loss": 0.78817344, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.38623047, + "step": 674, + "time_per_iteration": 2.8094351291656494 + }, + { + "auxiliary_loss_clip": 0.01724184, + "auxiliary_loss_mlp": 0.01088884, + "balance_loss_clip": 1.42263031, + "balance_loss_mlp": 1.05331218, + "epoch": 0.0405831955508793, + "flos": 16441099544880.0, + "grad_norm": 2.471384781061086, + "language_loss": 0.79165292, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.81978369, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.35571289, + "step": 675, + "time_per_iteration": 4.237086534500122 + }, + { + "auxiliary_loss_clip": 0.01700769, + "auxiliary_loss_mlp": 0.01088589, + "balance_loss_clip": 1.40705335, + "balance_loss_mlp": 1.04994166, + "epoch": 0.040643318803547275, + "flos": 24285298415400.0, + "grad_norm": 1.7521295678754623, + "language_loss": 0.77156878, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79946232, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.38671875, + "step": 676, + "time_per_iteration": 4.3774683475494385 + }, + { + "auxiliary_loss_clip": 0.01714909, + "auxiliary_loss_mlp": 0.01087279, + "balance_loss_clip": 1.4151175, + "balance_loss_mlp": 1.04953742, + "epoch": 0.04070344205621524, + "flos": 17716541699160.0, + "grad_norm": 2.193698652065463, + "language_loss": 0.86311412, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.89113599, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.37744141, + "step": 677, + "time_per_iteration": 4.188544988632202 + }, + { + "auxiliary_loss_clip": 0.01700478, + "auxiliary_loss_mlp": 0.01083589, + "balance_loss_clip": 1.41017592, + "balance_loss_mlp": 1.04696751, + "epoch": 0.04076356530888321, + "flos": 26183343523320.0, + "grad_norm": 1.7283034546143137, + "language_loss": 0.77110302, + "learning_rate": 3.998785125078559e-06, + "loss": 0.7989437, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.3659668, + "step": 678, + "time_per_iteration": 2.827285051345825 + }, + { + "auxiliary_loss_clip": 0.01694075, + "auxiliary_loss_mlp": 0.01076617, + "balance_loss_clip": 1.40511894, + "balance_loss_mlp": 1.04319036, + "epoch": 0.04082368856155118, + "flos": 35780399322840.0, + "grad_norm": 3.6288756961520368, + "language_loss": 0.82725728, + "learning_rate": 3.998771514534505e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.33422852, + "step": 679, + "time_per_iteration": 2.9181907176971436 + }, + { + "auxiliary_loss_clip": 0.01710178, + "auxiliary_loss_mlp": 0.0107013, + "balance_loss_clip": 1.42132235, + "balance_loss_mlp": 1.03553522, + "epoch": 0.04088381181421915, + "flos": 28152419298840.0, + "grad_norm": 1.7774387738913682, + "language_loss": 0.77578688, + "learning_rate": 3.998757828196835e-06, + "loss": 0.80358994, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.34570312, + "step": 680, + "time_per_iteration": 2.8575310707092285 + }, + { + "auxiliary_loss_clip": 0.01713121, + "auxiliary_loss_mlp": 0.01082799, + "balance_loss_clip": 1.41379702, + "balance_loss_mlp": 1.0472033, + "epoch": 0.04094393506688712, + "flos": 27603161689320.0, + "grad_norm": 2.0652259739709975, + "language_loss": 0.83912176, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.86708093, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.35620117, + "step": 681, + "time_per_iteration": 2.9012093544006348 + }, + { + "auxiliary_loss_clip": 0.01711378, + "auxiliary_loss_mlp": 0.01075646, + "balance_loss_clip": 1.41379285, + "balance_loss_mlp": 1.03952575, + "epoch": 0.04100405831955509, + "flos": 23117214429720.0, + "grad_norm": 1.8640404988414818, + "language_loss": 0.72793812, + "learning_rate": 3.998730228142726e-06, + "loss": 0.75580835, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.840329647064209 + }, + { + "auxiliary_loss_clip": 0.01702573, + "auxiliary_loss_mlp": 0.01081656, + "balance_loss_clip": 1.41041541, + "balance_loss_mlp": 1.04715681, + "epoch": 0.04106418157222306, + "flos": 20161529740080.0, + "grad_norm": 1.7875204109185379, + "language_loss": 0.73530096, + "learning_rate": 3.998716314427333e-06, + "loss": 0.7631433, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.34472656, + "step": 683, + "time_per_iteration": 2.7983155250549316 + }, + { + "auxiliary_loss_clip": 0.01705279, + "auxiliary_loss_mlp": 0.01086934, + "balance_loss_clip": 1.41930628, + "balance_loss_mlp": 1.05221987, + "epoch": 0.041124304824891024, + "flos": 17425190740680.0, + "grad_norm": 2.2535081915921054, + "language_loss": 0.82037663, + "learning_rate": 3.998702324920417e-06, + "loss": 0.84829873, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.34716797, + "step": 684, + "time_per_iteration": 2.85282039642334 + }, + { + "auxiliary_loss_clip": 0.01708664, + "auxiliary_loss_mlp": 0.0107632, + "balance_loss_clip": 1.42006445, + "balance_loss_mlp": 1.03950822, + "epoch": 0.041184428077558996, + "flos": 25786299339000.0, + "grad_norm": 2.1335267992143154, + "language_loss": 0.91234338, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.94019318, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.36816406, + "step": 685, + "time_per_iteration": 2.9385225772857666 + }, + { + "auxiliary_loss_clip": 0.0170872, + "auxiliary_loss_mlp": 0.01074457, + "balance_loss_clip": 1.41651464, + "balance_loss_mlp": 1.03864646, + "epoch": 0.04124455133022697, + "flos": 22969752182640.0, + "grad_norm": 2.351612972653754, + "language_loss": 0.89272547, + "learning_rate": 3.998674118534141e-06, + "loss": 0.92055726, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.3581543, + "step": 686, + "time_per_iteration": 2.9533870220184326 + }, + { + "auxiliary_loss_clip": 0.01710793, + "auxiliary_loss_mlp": 0.01072436, + "balance_loss_clip": 1.4167347, + "balance_loss_mlp": 1.03798413, + "epoch": 0.04130467458289493, + "flos": 21294179608680.0, + "grad_norm": 1.9846945273570802, + "language_loss": 0.72404301, + "learning_rate": 3.998659901655851e-06, + "loss": 0.75187528, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.34472656, + "step": 687, + "time_per_iteration": 2.8272106647491455 + }, + { + "auxiliary_loss_clip": 0.01696099, + "auxiliary_loss_mlp": 0.01082302, + "balance_loss_clip": 1.41372335, + "balance_loss_mlp": 1.04737377, + "epoch": 0.041364797835562905, + "flos": 19978998851160.0, + "grad_norm": 1.4714226397737302, + "language_loss": 0.86578786, + "learning_rate": 3.998645608988177e-06, + "loss": 0.89357191, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.34912109, + "step": 688, + "time_per_iteration": 2.8284294605255127 + }, + { + "auxiliary_loss_clip": 0.01698968, + "auxiliary_loss_mlp": 0.01091026, + "balance_loss_clip": 1.41868961, + "balance_loss_mlp": 1.05619287, + "epoch": 0.04142492108823087, + "flos": 21911016175200.0, + "grad_norm": 1.9275039155509102, + "language_loss": 0.84115434, + "learning_rate": 3.998631240531661e-06, + "loss": 0.86905426, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.34863281, + "step": 689, + "time_per_iteration": 2.879783868789673 + }, + { + "auxiliary_loss_clip": 0.01701041, + "auxiliary_loss_mlp": 0.0109102, + "balance_loss_clip": 1.41272342, + "balance_loss_mlp": 1.05413699, + "epoch": 0.04148504434089884, + "flos": 27645824094480.0, + "grad_norm": 2.0033776203518032, + "language_loss": 0.68969637, + "learning_rate": 3.998616796286848e-06, + "loss": 0.71761698, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.36914062, + "step": 690, + "time_per_iteration": 2.858302593231201 + }, + { + "auxiliary_loss_clip": 0.01704845, + "auxiliary_loss_mlp": 0.01089251, + "balance_loss_clip": 1.41849542, + "balance_loss_mlp": 1.05446589, + "epoch": 0.041545167593566815, + "flos": 20522814940440.0, + "grad_norm": 1.7169285833086667, + "language_loss": 0.75598347, + "learning_rate": 3.998602276254286e-06, + "loss": 0.78392446, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.34790039, + "step": 691, + "time_per_iteration": 2.8392717838287354 + }, + { + "auxiliary_loss_clip": 0.01703705, + "auxiliary_loss_mlp": 0.01087411, + "balance_loss_clip": 1.4173286, + "balance_loss_mlp": 1.05148137, + "epoch": 0.04160529084623478, + "flos": 11871898501320.0, + "grad_norm": 2.0350209633358034, + "language_loss": 0.84946406, + "learning_rate": 3.998587680434526e-06, + "loss": 0.87737525, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.359375, + "step": 692, + "time_per_iteration": 2.7931416034698486 + }, + { + "auxiliary_loss_clip": 0.01718948, + "auxiliary_loss_mlp": 0.01087078, + "balance_loss_clip": 1.418751, + "balance_loss_mlp": 1.05000424, + "epoch": 0.04166541409890275, + "flos": 14832212544000.0, + "grad_norm": 2.8571967128664304, + "language_loss": 0.90147597, + "learning_rate": 3.99857300882812e-06, + "loss": 0.92953622, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.37084961, + "step": 693, + "time_per_iteration": 2.826098680496216 + }, + { + "auxiliary_loss_clip": 0.01717304, + "auxiliary_loss_mlp": 0.0108146, + "balance_loss_clip": 1.42747974, + "balance_loss_mlp": 1.04240727, + "epoch": 0.04172553735157072, + "flos": 25813165092120.0, + "grad_norm": 2.1903087844249147, + "language_loss": 0.83171773, + "learning_rate": 3.998558261435626e-06, + "loss": 0.85970539, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.390625, + "step": 694, + "time_per_iteration": 2.8517065048217773 + }, + { + "auxiliary_loss_clip": 0.01720304, + "auxiliary_loss_mlp": 0.01084552, + "balance_loss_clip": 1.42447722, + "balance_loss_mlp": 1.04900384, + "epoch": 0.04178566060423869, + "flos": 24285176590320.0, + "grad_norm": 1.9751887801279329, + "language_loss": 0.84431875, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.87236726, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.35546875, + "step": 695, + "time_per_iteration": 2.8330984115600586 + }, + { + "auxiliary_loss_clip": 0.01712593, + "auxiliary_loss_mlp": 0.0108602, + "balance_loss_clip": 1.42313731, + "balance_loss_mlp": 1.04782534, + "epoch": 0.04184578385690666, + "flos": 18226426180680.0, + "grad_norm": 2.4067857457002195, + "language_loss": 0.85432225, + "learning_rate": 3.99852853929461e-06, + "loss": 0.88230836, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.38208008, + "step": 696, + "time_per_iteration": 2.7764391899108887 + }, + { + "auxiliary_loss_clip": 0.01700617, + "auxiliary_loss_mlp": 0.01093131, + "balance_loss_clip": 1.41627192, + "balance_loss_mlp": 1.05546081, + "epoch": 0.041905907109574626, + "flos": 22780520914320.0, + "grad_norm": 2.2713128432049605, + "language_loss": 0.93869412, + "learning_rate": 3.998513564547216e-06, + "loss": 0.96663165, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.37646484, + "step": 697, + "time_per_iteration": 2.9427566528320312 + }, + { + "auxiliary_loss_clip": 0.01687451, + "auxiliary_loss_mlp": 0.01069068, + "balance_loss_clip": 1.40526962, + "balance_loss_mlp": 1.03680992, + "epoch": 0.0419660303622426, + "flos": 20161773390240.0, + "grad_norm": 2.609977267619126, + "language_loss": 0.84675211, + "learning_rate": 3.998498514015987e-06, + "loss": 0.87431729, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.32275391, + "step": 698, + "time_per_iteration": 2.865123987197876 + }, + { + "auxiliary_loss_clip": 0.01704049, + "auxiliary_loss_mlp": 0.01087155, + "balance_loss_clip": 1.4179256, + "balance_loss_mlp": 1.04938936, + "epoch": 0.042026153614910564, + "flos": 23081658487560.0, + "grad_norm": 1.8806380396329403, + "language_loss": 0.91726494, + "learning_rate": 3.998483387701495e-06, + "loss": 0.94517696, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.37768555, + "step": 699, + "time_per_iteration": 2.8146615028381348 + }, + { + "auxiliary_loss_clip": 0.01516697, + "auxiliary_loss_mlp": 0.01093951, + "balance_loss_clip": 1.3259902, + "balance_loss_mlp": 1.08298361, + "epoch": 0.042086276867578536, + "flos": 64511679725640.0, + "grad_norm": 0.908300073620388, + "language_loss": 0.67914265, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70524913, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.10986328, + "step": 700, + "time_per_iteration": 3.326219320297241 + }, + { + "auxiliary_loss_clip": 0.01703541, + "auxiliary_loss_mlp": 0.01095332, + "balance_loss_clip": 1.41615629, + "balance_loss_mlp": 1.05756617, + "epoch": 0.04214640012024651, + "flos": 15491264823720.0, + "grad_norm": 2.9520508364898075, + "language_loss": 0.89782548, + "learning_rate": 3.998452907725016e-06, + "loss": 0.92581415, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.37768555, + "step": 701, + "time_per_iteration": 2.856341600418091 + }, + { + "auxiliary_loss_clip": 0.01702227, + "auxiliary_loss_mlp": 0.01076592, + "balance_loss_clip": 1.41899967, + "balance_loss_mlp": 1.04118717, + "epoch": 0.04220652337291447, + "flos": 23882162977080.0, + "grad_norm": 2.188655496307791, + "language_loss": 0.68031079, + "learning_rate": 3.998437554064184e-06, + "loss": 0.70809901, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.35424805, + "step": 702, + "time_per_iteration": 2.8915157318115234 + }, + { + "auxiliary_loss_clip": 0.01516516, + "auxiliary_loss_mlp": 0.01021268, + "balance_loss_clip": 1.32824838, + "balance_loss_mlp": 1.0103482, + "epoch": 0.042266646625582445, + "flos": 63810939641400.0, + "grad_norm": 0.8479398776955545, + "language_loss": 0.60759968, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.63297755, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.109375, + "step": 703, + "time_per_iteration": 3.3767616748809814 + }, + { + "auxiliary_loss_clip": 0.01510513, + "auxiliary_loss_mlp": 0.01015509, + "balance_loss_clip": 1.32470536, + "balance_loss_mlp": 1.00406444, + "epoch": 0.04232676987825041, + "flos": 50034515675760.0, + "grad_norm": 1.0193704033024849, + "language_loss": 0.57711816, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.60237837, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.11425781, + "step": 704, + "time_per_iteration": 3.159438133239746 + }, + { + "auxiliary_loss_clip": 0.016931, + "auxiliary_loss_mlp": 0.01081669, + "balance_loss_clip": 1.40695274, + "balance_loss_mlp": 1.04714584, + "epoch": 0.04238689313091838, + "flos": 21621086509320.0, + "grad_norm": 2.4864027890273297, + "language_loss": 0.88572747, + "learning_rate": 3.998391038398319e-06, + "loss": 0.91347516, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.34545898, + "step": 705, + "time_per_iteration": 2.865469217300415 + }, + { + "auxiliary_loss_clip": 0.01678652, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_clip": 1.40770173, + "balance_loss_mlp": 1.04738355, + "epoch": 0.042447016383586354, + "flos": 19139770967400.0, + "grad_norm": 1.7720645330257494, + "language_loss": 0.72262919, + "learning_rate": 3.998375381617201e-06, + "loss": 0.75022185, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.33203125, + "step": 706, + "time_per_iteration": 2.8151230812072754 + }, + { + "auxiliary_loss_clip": 0.01688093, + "auxiliary_loss_mlp": 0.01091482, + "balance_loss_clip": 1.40759087, + "balance_loss_mlp": 1.0562439, + "epoch": 0.04250713963625432, + "flos": 24431420586600.0, + "grad_norm": 2.109828654527525, + "language_loss": 0.94026458, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.96806037, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.35253906, + "step": 707, + "time_per_iteration": 2.792814254760742 + }, + { + "auxiliary_loss_clip": 0.01703805, + "auxiliary_loss_mlp": 0.01087606, + "balance_loss_clip": 1.4189744, + "balance_loss_mlp": 1.05088985, + "epoch": 0.04256726288892229, + "flos": 30372823171080.0, + "grad_norm": 2.2891957456461647, + "language_loss": 0.82154238, + "learning_rate": 3.998343840719776e-06, + "loss": 0.84945649, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.3671875, + "step": 708, + "time_per_iteration": 2.969757556915283 + }, + { + "auxiliary_loss_clip": 0.01707256, + "auxiliary_loss_mlp": 0.01115156, + "balance_loss_clip": 1.41827762, + "balance_loss_mlp": 1.07352829, + "epoch": 0.04262738614159026, + "flos": 16367023250280.0, + "grad_norm": 2.156823148210704, + "language_loss": 0.83029556, + "learning_rate": 3.998327956604666e-06, + "loss": 0.85851973, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.41650391, + "step": 709, + "time_per_iteration": 2.8215856552124023 + }, + { + "auxiliary_loss_clip": 0.01710873, + "auxiliary_loss_mlp": 0.01105452, + "balance_loss_clip": 1.41859806, + "balance_loss_mlp": 1.06909323, + "epoch": 0.04268750939425823, + "flos": 20417243539680.0, + "grad_norm": 3.9099475365648053, + "language_loss": 0.85803282, + "learning_rate": 3.99831199671276e-06, + "loss": 0.88619608, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.36376953, + "step": 710, + "time_per_iteration": 2.796637535095215 + }, + { + "auxiliary_loss_clip": 0.01710916, + "auxiliary_loss_mlp": 0.01101487, + "balance_loss_clip": 1.42519832, + "balance_loss_mlp": 1.06672525, + "epoch": 0.0427476326469262, + "flos": 20307814344720.0, + "grad_norm": 3.163062465868504, + "language_loss": 0.85180414, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87992823, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.34790039, + "step": 711, + "time_per_iteration": 2.7898454666137695 + }, + { + "auxiliary_loss_clip": 0.01695666, + "auxiliary_loss_mlp": 0.01103458, + "balance_loss_clip": 1.41041279, + "balance_loss_mlp": 1.07139087, + "epoch": 0.042807755899594166, + "flos": 21655546025760.0, + "grad_norm": 1.8494459537992651, + "language_loss": 0.85671353, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.88470483, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.32080078, + "step": 712, + "time_per_iteration": 2.853097438812256 + }, + { + "auxiliary_loss_clip": 0.01711626, + "auxiliary_loss_mlp": 0.01118385, + "balance_loss_clip": 1.41828644, + "balance_loss_mlp": 1.08257508, + "epoch": 0.04286787915226214, + "flos": 21440139346440.0, + "grad_norm": 3.2407252320757682, + "language_loss": 0.92308968, + "learning_rate": 3.998263662382328e-06, + "loss": 0.95138979, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.3581543, + "step": 713, + "time_per_iteration": 2.7892820835113525 + }, + { + "auxiliary_loss_clip": 0.01504041, + "auxiliary_loss_mlp": 0.01075597, + "balance_loss_clip": 1.31954956, + "balance_loss_mlp": 1.06148279, + "epoch": 0.04292800240493011, + "flos": 66415369395600.0, + "grad_norm": 0.9011567004419851, + "language_loss": 0.63789093, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66368723, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.14160156, + "step": 714, + "time_per_iteration": 6.359567403793335 + }, + { + "auxiliary_loss_clip": 0.01688168, + "auxiliary_loss_mlp": 0.01086666, + "balance_loss_clip": 1.40908384, + "balance_loss_mlp": 1.05350208, + "epoch": 0.042988125657598075, + "flos": 31656630647520.0, + "grad_norm": 2.178110715653817, + "language_loss": 0.75071442, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77846277, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.33154297, + "step": 715, + "time_per_iteration": 2.8735382556915283 + }, + { + "auxiliary_loss_clip": 0.01703342, + "auxiliary_loss_mlp": 0.01101646, + "balance_loss_clip": 1.41695905, + "balance_loss_mlp": 1.06445265, + "epoch": 0.04304824891026605, + "flos": 33254106699240.0, + "grad_norm": 1.7656795117431965, + "language_loss": 0.73535538, + "learning_rate": 3.998214646082688e-06, + "loss": 0.76340526, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.37207031, + "step": 716, + "time_per_iteration": 4.346436023712158 + }, + { + "auxiliary_loss_clip": 0.01483899, + "auxiliary_loss_mlp": 0.01054309, + "balance_loss_clip": 1.30340409, + "balance_loss_mlp": 1.04067123, + "epoch": 0.04310837216293401, + "flos": 64082125226160.0, + "grad_norm": 1.0559249398504644, + "language_loss": 0.65620995, + "learning_rate": 3.998198155770314e-06, + "loss": 0.68159211, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.13671875, + "step": 717, + "time_per_iteration": 3.2976531982421875 + }, + { + "auxiliary_loss_clip": 0.01478815, + "auxiliary_loss_mlp": 0.01014195, + "balance_loss_clip": 1.29773855, + "balance_loss_mlp": 1.00141609, + "epoch": 0.043168495415601985, + "flos": 61357992361920.0, + "grad_norm": 0.9754093583602139, + "language_loss": 0.58772862, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61265874, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.12792969, + "step": 718, + "time_per_iteration": 3.09810209274292 + }, + { + "auxiliary_loss_clip": 0.01701503, + "auxiliary_loss_mlp": 0.01081583, + "balance_loss_clip": 1.41833138, + "balance_loss_mlp": 1.04629731, + "epoch": 0.04322861866826996, + "flos": 20709163015200.0, + "grad_norm": 2.41646033809703, + "language_loss": 0.92551994, + "learning_rate": 3.99816494783057e-06, + "loss": 0.95335078, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.35302734, + "step": 719, + "time_per_iteration": 2.8451802730560303 + }, + { + "auxiliary_loss_clip": 0.01701263, + "auxiliary_loss_mlp": 0.01098913, + "balance_loss_clip": 1.4151485, + "balance_loss_mlp": 1.06174374, + "epoch": 0.04328874192093792, + "flos": 30379564158840.0, + "grad_norm": 1.7792613232964847, + "language_loss": 0.67074776, + "learning_rate": 3.99814823020446e-06, + "loss": 0.69874954, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.37182617, + "step": 720, + "time_per_iteration": 2.9188406467437744 + }, + { + "auxiliary_loss_clip": 0.01698985, + "auxiliary_loss_mlp": 0.01079644, + "balance_loss_clip": 1.41441321, + "balance_loss_mlp": 1.04369092, + "epoch": 0.043348865173605894, + "flos": 21949861394520.0, + "grad_norm": 2.080659321327456, + "language_loss": 0.78433174, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.81211805, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.35961914, + "step": 721, + "time_per_iteration": 2.8688480854034424 + }, + { + "auxiliary_loss_clip": 0.01714478, + "auxiliary_loss_mlp": 0.01096738, + "balance_loss_clip": 1.42707825, + "balance_loss_mlp": 1.06219125, + "epoch": 0.04340898842627386, + "flos": 15267939514200.0, + "grad_norm": 2.475480294777698, + "language_loss": 0.89463449, + "learning_rate": 3.998114567642933e-06, + "loss": 0.92274666, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.34545898, + "step": 722, + "time_per_iteration": 2.8426005840301514 + }, + { + "auxiliary_loss_clip": 0.01714878, + "auxiliary_loss_mlp": 0.01099594, + "balance_loss_clip": 1.42407405, + "balance_loss_mlp": 1.06569099, + "epoch": 0.04346911167894183, + "flos": 27971350310880.0, + "grad_norm": 1.8003149967454466, + "language_loss": 0.85468674, + "learning_rate": 3.998097622708792e-06, + "loss": 0.88283139, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.33911133, + "step": 723, + "time_per_iteration": 2.842391014099121 + }, + { + "auxiliary_loss_clip": 0.01723767, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_clip": 1.43662882, + "balance_loss_mlp": 1.06040692, + "epoch": 0.0435292349316098, + "flos": 29248254366120.0, + "grad_norm": 1.7266893138774257, + "language_loss": 0.83256465, + "learning_rate": 3.99808060200659e-06, + "loss": 0.86081886, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.41235352, + "step": 724, + "time_per_iteration": 2.9101414680480957 + }, + { + "auxiliary_loss_clip": 0.01705916, + "auxiliary_loss_mlp": 0.01106521, + "balance_loss_clip": 1.42014813, + "balance_loss_mlp": 1.06699157, + "epoch": 0.04358935818427777, + "flos": 20563243885800.0, + "grad_norm": 1.8870425200859386, + "language_loss": 0.80901533, + "learning_rate": 3.998063505536971e-06, + "loss": 0.83713973, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.39477539, + "step": 725, + "time_per_iteration": 2.7865025997161865 + }, + { + "auxiliary_loss_clip": 0.01724515, + "auxiliary_loss_mlp": 0.01089353, + "balance_loss_clip": 1.42708778, + "balance_loss_mlp": 1.05015707, + "epoch": 0.04364948143694574, + "flos": 14468978142360.0, + "grad_norm": 1.9980040584285919, + "language_loss": 0.87981594, + "learning_rate": 3.998046333300584e-06, + "loss": 0.90795457, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.39160156, + "step": 726, + "time_per_iteration": 2.7718217372894287 + }, + { + "auxiliary_loss_clip": 0.01472669, + "auxiliary_loss_mlp": 0.01084602, + "balance_loss_clip": 1.28855348, + "balance_loss_mlp": 1.07353926, + "epoch": 0.043709604689613706, + "flos": 50079719426400.0, + "grad_norm": 0.9111241929303401, + "language_loss": 0.55898905, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58456177, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.11083984, + "step": 727, + "time_per_iteration": 3.398052930831909 + }, + { + "auxiliary_loss_clip": 0.01715585, + "auxiliary_loss_mlp": 0.01094066, + "balance_loss_clip": 1.42614985, + "balance_loss_mlp": 1.05630076, + "epoch": 0.04376972794228168, + "flos": 13995705420360.0, + "grad_norm": 1.9685807984184833, + "language_loss": 0.82608676, + "learning_rate": 3.998011761530112e-06, + "loss": 0.85418326, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.37792969, + "step": 728, + "time_per_iteration": 2.755796194076538 + }, + { + "auxiliary_loss_clip": 0.01705397, + "auxiliary_loss_mlp": 0.0107452, + "balance_loss_clip": 1.42510676, + "balance_loss_mlp": 1.0412364, + "epoch": 0.04382985119494965, + "flos": 22013948032560.0, + "grad_norm": 2.9446151211951515, + "language_loss": 0.77129477, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79909396, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.33300781, + "step": 729, + "time_per_iteration": 2.8307011127471924 + }, + { + "auxiliary_loss_clip": 0.01711672, + "auxiliary_loss_mlp": 0.01092915, + "balance_loss_clip": 1.42107892, + "balance_loss_mlp": 1.05398154, + "epoch": 0.043889974447617615, + "flos": 24211831246200.0, + "grad_norm": 2.005715235435175, + "language_loss": 0.96189326, + "learning_rate": 3.997976886700417e-06, + "loss": 0.98993909, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.38916016, + "step": 730, + "time_per_iteration": 2.829333782196045 + }, + { + "auxiliary_loss_clip": 0.01717151, + "auxiliary_loss_mlp": 0.01080532, + "balance_loss_clip": 1.42308903, + "balance_loss_mlp": 1.04259944, + "epoch": 0.04395009770028559, + "flos": 17278987352760.0, + "grad_norm": 2.061697313129339, + "language_loss": 0.88775516, + "learning_rate": 3.997959335640013e-06, + "loss": 0.91573197, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.37915039, + "step": 731, + "time_per_iteration": 2.79929256439209 + }, + { + "auxiliary_loss_clip": 0.01718805, + "auxiliary_loss_mlp": 0.01086948, + "balance_loss_clip": 1.43236041, + "balance_loss_mlp": 1.0487299, + "epoch": 0.04401022095295355, + "flos": 12313797942240.0, + "grad_norm": 3.531253761883262, + "language_loss": 0.90103412, + "learning_rate": 3.997941708816791e-06, + "loss": 0.92909169, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.38183594, + "step": 732, + "time_per_iteration": 2.9822187423706055 + }, + { + "auxiliary_loss_clip": 0.01722278, + "auxiliary_loss_mlp": 0.01094354, + "balance_loss_clip": 1.4288137, + "balance_loss_mlp": 1.05553973, + "epoch": 0.044070344205621524, + "flos": 20964592556280.0, + "grad_norm": 2.1035605113939755, + "language_loss": 0.86532146, + "learning_rate": 3.997924006231419e-06, + "loss": 0.89348781, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.38793945, + "step": 733, + "time_per_iteration": 2.8025200366973877 + }, + { + "auxiliary_loss_clip": 0.0172467, + "auxiliary_loss_mlp": 0.0110426, + "balance_loss_clip": 1.43306589, + "balance_loss_mlp": 1.06592262, + "epoch": 0.044130467458289496, + "flos": 13849339599000.0, + "grad_norm": 2.110236408775799, + "language_loss": 0.92246944, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.95075881, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.38330078, + "step": 734, + "time_per_iteration": 2.7799136638641357 + }, + { + "auxiliary_loss_clip": 0.01716195, + "auxiliary_loss_mlp": 0.01087832, + "balance_loss_clip": 1.43484628, + "balance_loss_mlp": 1.05121088, + "epoch": 0.04419059071095746, + "flos": 28660598229240.0, + "grad_norm": 1.866847930840157, + "language_loss": 0.79091537, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.81895566, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.36645508, + "step": 735, + "time_per_iteration": 2.859071731567383 + }, + { + "auxiliary_loss_clip": 0.01707117, + "auxiliary_loss_mlp": 0.0108778, + "balance_loss_clip": 1.41900039, + "balance_loss_mlp": 1.05068171, + "epoch": 0.04425071396362543, + "flos": 28189112275080.0, + "grad_norm": 2.0916243119681366, + "language_loss": 0.89312863, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.92107755, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.37109375, + "step": 736, + "time_per_iteration": 2.8907108306884766 + }, + { + "auxiliary_loss_clip": 0.0171162, + "auxiliary_loss_mlp": 0.01099355, + "balance_loss_clip": 1.42940199, + "balance_loss_mlp": 1.0608505, + "epoch": 0.0443108372162934, + "flos": 23663588845680.0, + "grad_norm": 1.7344399636012064, + "language_loss": 0.85131127, + "learning_rate": 3.997852438281901e-06, + "loss": 0.879421, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.38500977, + "step": 737, + "time_per_iteration": 2.8216710090637207 + }, + { + "auxiliary_loss_clip": 0.01711682, + "auxiliary_loss_mlp": 0.01088097, + "balance_loss_clip": 1.42806244, + "balance_loss_mlp": 1.04878187, + "epoch": 0.04437096046896137, + "flos": 33985813980960.0, + "grad_norm": 2.771961113506318, + "language_loss": 0.85860419, + "learning_rate": 3.997834356895906e-06, + "loss": 0.88660192, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.39331055, + "step": 738, + "time_per_iteration": 2.948781728744507 + }, + { + "auxiliary_loss_clip": 0.01497295, + "auxiliary_loss_mlp": 0.010265, + "balance_loss_clip": 1.31874478, + "balance_loss_mlp": 1.01553273, + "epoch": 0.04443108372162934, + "flos": 67412154026880.0, + "grad_norm": 0.8541166825880099, + "language_loss": 0.59154952, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61678743, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.10986328, + "step": 739, + "time_per_iteration": 3.2808892726898193 + }, + { + "auxiliary_loss_clip": 0.017077, + "auxiliary_loss_mlp": 0.01082871, + "balance_loss_clip": 1.42312765, + "balance_loss_mlp": 1.045892, + "epoch": 0.04449120697429731, + "flos": 29758747973040.0, + "grad_norm": 2.2525377755230807, + "language_loss": 0.92482328, + "learning_rate": 3.997797966850369e-06, + "loss": 0.95272893, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.36987305, + "step": 740, + "time_per_iteration": 2.9161577224731445 + }, + { + "auxiliary_loss_clip": 0.0171836, + "auxiliary_loss_mlp": 0.01098976, + "balance_loss_clip": 1.43060446, + "balance_loss_mlp": 1.05904067, + "epoch": 0.04455133022696528, + "flos": 36509020369200.0, + "grad_norm": 2.356971172758727, + "language_loss": 0.72348207, + "learning_rate": 3.997779658192205e-06, + "loss": 0.75165546, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.39941406, + "step": 741, + "time_per_iteration": 2.9245386123657227 + }, + { + "auxiliary_loss_clip": 0.01694907, + "auxiliary_loss_mlp": 0.01078602, + "balance_loss_clip": 1.41745853, + "balance_loss_mlp": 1.04334021, + "epoch": 0.044611453479633245, + "flos": 28809562985640.0, + "grad_norm": 1.7897458068864007, + "language_loss": 0.89278299, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9205181, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.35253906, + "step": 742, + "time_per_iteration": 2.9056034088134766 + }, + { + "auxiliary_loss_clip": 0.01701781, + "auxiliary_loss_mlp": 0.01082578, + "balance_loss_clip": 1.42016602, + "balance_loss_mlp": 1.04106927, + "epoch": 0.04467157673230122, + "flos": 20015773044120.0, + "grad_norm": 1.8233842856963998, + "language_loss": 0.84609401, + "learning_rate": 3.997742813608561e-06, + "loss": 0.87393755, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.4152832, + "step": 743, + "time_per_iteration": 2.809168815612793 + }, + { + "auxiliary_loss_clip": 0.01710923, + "auxiliary_loss_mlp": 0.01075079, + "balance_loss_clip": 1.42646194, + "balance_loss_mlp": 1.03821945, + "epoch": 0.04473169998496919, + "flos": 18009598208760.0, + "grad_norm": 2.199525081809097, + "language_loss": 0.80768138, + "learning_rate": 3.997724277684479e-06, + "loss": 0.83554143, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.36865234, + "step": 744, + "time_per_iteration": 2.855990409851074 + }, + { + "auxiliary_loss_clip": 0.01696886, + "auxiliary_loss_mlp": 0.01083913, + "balance_loss_clip": 1.41723824, + "balance_loss_mlp": 1.04810309, + "epoch": 0.044791823237637154, + "flos": 20636670446640.0, + "grad_norm": 2.968277596802702, + "language_loss": 0.8588708, + "learning_rate": 3.99770566600649e-06, + "loss": 0.88667881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.35839844, + "step": 745, + "time_per_iteration": 2.7676665782928467 + }, + { + "auxiliary_loss_clip": 0.01698003, + "auxiliary_loss_mlp": 0.01091984, + "balance_loss_clip": 1.41501546, + "balance_loss_mlp": 1.04034233, + "epoch": 0.04485194649030513, + "flos": 31182505149960.0, + "grad_norm": 1.9900647578447024, + "language_loss": 0.69583154, + "learning_rate": 3.997686978575302e-06, + "loss": 0.7237314, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.5168457, + "step": 746, + "time_per_iteration": 2.9128899574279785 + }, + { + "auxiliary_loss_clip": 0.01702226, + "auxiliary_loss_mlp": 0.01087086, + "balance_loss_clip": 1.4185617, + "balance_loss_mlp": 1.05086994, + "epoch": 0.04491206974297309, + "flos": 26149249482120.0, + "grad_norm": 2.0911628694499362, + "language_loss": 0.69822341, + "learning_rate": 3.997668215391625e-06, + "loss": 0.72611648, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.36181641, + "step": 747, + "time_per_iteration": 2.828549385070801 + }, + { + "auxiliary_loss_clip": 0.01696857, + "auxiliary_loss_mlp": 0.01082189, + "balance_loss_clip": 1.41241598, + "balance_loss_mlp": 1.04625916, + "epoch": 0.044972192995641064, + "flos": 20672551255680.0, + "grad_norm": 2.315563110730637, + "language_loss": 0.6692661, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69705653, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.359375, + "step": 748, + "time_per_iteration": 2.797659158706665 + }, + { + "auxiliary_loss_clip": 0.01701013, + "auxiliary_loss_mlp": 0.01094125, + "balance_loss_clip": 1.41527891, + "balance_loss_mlp": 1.05621672, + "epoch": 0.045032316248309036, + "flos": 16111106408880.0, + "grad_norm": 2.212307510848911, + "language_loss": 0.78034675, + "learning_rate": 3.997630461769647e-06, + "loss": 0.80829805, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.37890625, + "step": 749, + "time_per_iteration": 2.8286423683166504 + }, + { + "auxiliary_loss_clip": 0.01695767, + "auxiliary_loss_mlp": 0.01099215, + "balance_loss_clip": 1.41315579, + "balance_loss_mlp": 1.0615921, + "epoch": 0.045092439500977, + "flos": 17863435429200.0, + "grad_norm": 2.104479826644667, + "language_loss": 0.90312439, + "learning_rate": 3.997611471332778e-06, + "loss": 0.9310742, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.3762207, + "step": 750, + "time_per_iteration": 2.8212828636169434 + }, + { + "auxiliary_loss_clip": 0.01700921, + "auxiliary_loss_mlp": 0.01084111, + "balance_loss_clip": 1.41546011, + "balance_loss_mlp": 1.04429483, + "epoch": 0.04515256275364497, + "flos": 24468275996280.0, + "grad_norm": 2.2925933156187464, + "language_loss": 0.75848329, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.78633356, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.39819336, + "step": 751, + "time_per_iteration": 2.8726437091827393 + }, + { + "auxiliary_loss_clip": 0.01695471, + "auxiliary_loss_mlp": 0.01081611, + "balance_loss_clip": 1.41121995, + "balance_loss_mlp": 1.04651558, + "epoch": 0.04521268600631294, + "flos": 20920427641800.0, + "grad_norm": 2.27492458848176, + "language_loss": 0.70419824, + "learning_rate": 3.997573263210883e-06, + "loss": 0.73196906, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.35131836, + "step": 752, + "time_per_iteration": 5.057526588439941 + }, + { + "auxiliary_loss_clip": 0.01699368, + "auxiliary_loss_mlp": 0.01070172, + "balance_loss_clip": 1.4160794, + "balance_loss_mlp": 1.03195357, + "epoch": 0.04527280925898091, + "flos": 13375417143240.0, + "grad_norm": 2.656771949536378, + "language_loss": 0.94469655, + "learning_rate": 3.997554045527305e-06, + "loss": 0.97239196, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.38208008, + "step": 753, + "time_per_iteration": 4.178483486175537 + }, + { + "auxiliary_loss_clip": 0.01701405, + "auxiliary_loss_mlp": 0.01097978, + "balance_loss_clip": 1.41613066, + "balance_loss_mlp": 1.06242943, + "epoch": 0.04533293251164888, + "flos": 23259316373280.0, + "grad_norm": 1.9720881176160692, + "language_loss": 0.91152334, + "learning_rate": 3.997534752096277e-06, + "loss": 0.9395172, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.35522461, + "step": 754, + "time_per_iteration": 4.337465763092041 + }, + { + "auxiliary_loss_clip": 0.01671968, + "auxiliary_loss_mlp": 0.01089804, + "balance_loss_clip": 1.39864922, + "balance_loss_mlp": 1.05580568, + "epoch": 0.04539305576431685, + "flos": 12425826072240.0, + "grad_norm": 2.4612963317437386, + "language_loss": 0.79489934, + "learning_rate": 3.997515382918531e-06, + "loss": 0.82251704, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.33984375, + "step": 755, + "time_per_iteration": 2.869373321533203 + }, + { + "auxiliary_loss_clip": 0.01698248, + "auxiliary_loss_mlp": 0.01084634, + "balance_loss_clip": 1.41250539, + "balance_loss_mlp": 1.04715502, + "epoch": 0.04545317901698482, + "flos": 16075347424920.0, + "grad_norm": 2.2228406684942916, + "language_loss": 0.7978394, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.82566822, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.37475586, + "step": 756, + "time_per_iteration": 4.312723875045776 + }, + { + "auxiliary_loss_clip": 0.01477592, + "auxiliary_loss_mlp": 0.01057222, + "balance_loss_clip": 1.29030657, + "balance_loss_mlp": 1.04654109, + "epoch": 0.045513302269652785, + "flos": 66413298369240.0, + "grad_norm": 0.8099252566764107, + "language_loss": 0.62699634, + "learning_rate": 3.997476417325827e-06, + "loss": 0.65234447, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.10693359, + "step": 757, + "time_per_iteration": 3.326160192489624 + }, + { + "auxiliary_loss_clip": 0.01685142, + "auxiliary_loss_mlp": 0.01080738, + "balance_loss_clip": 1.40588617, + "balance_loss_mlp": 1.046978, + "epoch": 0.04557342552232076, + "flos": 21476263805640.0, + "grad_norm": 1.504744410937263, + "language_loss": 0.84680539, + "learning_rate": 3.997456820912346e-06, + "loss": 0.87446415, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.33789062, + "step": 758, + "time_per_iteration": 2.875411033630371 + }, + { + "auxiliary_loss_clip": 0.01682308, + "auxiliary_loss_mlp": 0.01072438, + "balance_loss_clip": 1.40586352, + "balance_loss_mlp": 1.03455353, + "epoch": 0.04563354877498873, + "flos": 23737990007160.0, + "grad_norm": 1.7548862813667692, + "language_loss": 0.88970208, + "learning_rate": 3.997437148755101e-06, + "loss": 0.9172495, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.37841797, + "step": 759, + "time_per_iteration": 2.871490478515625 + }, + { + "auxiliary_loss_clip": 0.01688613, + "auxiliary_loss_mlp": 0.01084182, + "balance_loss_clip": 1.40736055, + "balance_loss_mlp": 1.04491484, + "epoch": 0.045693672027656694, + "flos": 25740834957000.0, + "grad_norm": 3.164160578717661, + "language_loss": 0.75092125, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.77864921, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.39257812, + "step": 760, + "time_per_iteration": 2.8776607513427734 + }, + { + "auxiliary_loss_clip": 0.01685661, + "auxiliary_loss_mlp": 0.01090859, + "balance_loss_clip": 1.40851438, + "balance_loss_mlp": 1.05402374, + "epoch": 0.045753795280324666, + "flos": 19723772351880.0, + "grad_norm": 1.9477374761494466, + "language_loss": 0.84084952, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.86861479, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.36816406, + "step": 761, + "time_per_iteration": 2.849039316177368 + }, + { + "auxiliary_loss_clip": 0.01692854, + "auxiliary_loss_mlp": 0.01086813, + "balance_loss_clip": 1.41346645, + "balance_loss_mlp": 1.05233729, + "epoch": 0.04581391853299264, + "flos": 23260697057520.0, + "grad_norm": 1.7324718386215332, + "language_loss": 0.80255812, + "learning_rate": 3.997377677828266e-06, + "loss": 0.83035469, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.34472656, + "step": 762, + "time_per_iteration": 2.897305727005005 + }, + { + "auxiliary_loss_clip": 0.01475394, + "auxiliary_loss_mlp": 0.01015976, + "balance_loss_clip": 1.2928381, + "balance_loss_mlp": 1.00448406, + "epoch": 0.0458740417856606, + "flos": 64246422961440.0, + "grad_norm": 1.0126143073505343, + "language_loss": 0.58799207, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.61290574, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.11474609, + "step": 763, + "time_per_iteration": 3.3115930557250977 + }, + { + "auxiliary_loss_clip": 0.0169524, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_clip": 1.41327095, + "balance_loss_mlp": 1.05393028, + "epoch": 0.045934165038328575, + "flos": 20774102428800.0, + "grad_norm": 2.231868366166252, + "language_loss": 0.89333129, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.92119205, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.36914062, + "step": 764, + "time_per_iteration": 2.8003108501434326 + }, + { + "auxiliary_loss_clip": 0.01705451, + "auxiliary_loss_mlp": 0.01095886, + "balance_loss_clip": 1.42208791, + "balance_loss_mlp": 1.05494988, + "epoch": 0.04599428829099654, + "flos": 30268957321440.0, + "grad_norm": 2.1821006764971087, + "language_loss": 0.86838162, + "learning_rate": 3.997317525234592e-06, + "loss": 0.89639497, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.40942383, + "step": 765, + "time_per_iteration": 2.8890538215637207 + }, + { + "auxiliary_loss_clip": 0.01696302, + "auxiliary_loss_mlp": 0.01094498, + "balance_loss_clip": 1.41189229, + "balance_loss_mlp": 1.05708992, + "epoch": 0.04605441154366451, + "flos": 23044072127400.0, + "grad_norm": 2.419851909526934, + "language_loss": 0.88599181, + "learning_rate": 3.997297322892056e-06, + "loss": 0.91389978, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.37426758, + "step": 766, + "time_per_iteration": 2.8441309928894043 + }, + { + "auxiliary_loss_clip": 0.01684637, + "auxiliary_loss_mlp": 0.01073042, + "balance_loss_clip": 1.40392506, + "balance_loss_mlp": 1.04171348, + "epoch": 0.046114534796332485, + "flos": 22022435179800.0, + "grad_norm": 2.4482951280492897, + "language_loss": 0.85132378, + "learning_rate": 3.997277044811806e-06, + "loss": 0.87890053, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.31323242, + "step": 767, + "time_per_iteration": 2.8444406986236572 + }, + { + "auxiliary_loss_clip": 0.01693202, + "auxiliary_loss_mlp": 0.01076224, + "balance_loss_clip": 1.41556263, + "balance_loss_mlp": 1.03879225, + "epoch": 0.04617465804900045, + "flos": 29868502034880.0, + "grad_norm": 1.8937549545882064, + "language_loss": 0.87991834, + "learning_rate": 3.99725669099461e-06, + "loss": 0.90761256, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.37451172, + "step": 768, + "time_per_iteration": 2.869967460632324 + }, + { + "auxiliary_loss_clip": 0.01688658, + "auxiliary_loss_mlp": 0.01077061, + "balance_loss_clip": 1.40674281, + "balance_loss_mlp": 1.04342055, + "epoch": 0.04623478130166842, + "flos": 25635629031480.0, + "grad_norm": 1.9812857401509756, + "language_loss": 0.75862461, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.78628182, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.33642578, + "step": 769, + "time_per_iteration": 2.836698055267334 + }, + { + "auxiliary_loss_clip": 0.01674943, + "auxiliary_loss_mlp": 0.01080768, + "balance_loss_clip": 1.40117085, + "balance_loss_mlp": 1.04665017, + "epoch": 0.04629490455433639, + "flos": 20453977124280.0, + "grad_norm": 2.553243087252995, + "language_loss": 0.8684181, + "learning_rate": 3.997215756152471e-06, + "loss": 0.89597523, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.34106445, + "step": 770, + "time_per_iteration": 2.8184306621551514 + }, + { + "auxiliary_loss_clip": 0.01697189, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_clip": 1.41107607, + "balance_loss_mlp": 1.04043674, + "epoch": 0.04635502780700436, + "flos": 23153948014320.0, + "grad_norm": 2.1341814800694117, + "language_loss": 0.8829391, + "learning_rate": 3.99719517512908e-06, + "loss": 0.91066939, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.35375977, + "step": 771, + "time_per_iteration": 2.798421859741211 + }, + { + "auxiliary_loss_clip": 0.01701278, + "auxiliary_loss_mlp": 0.01088217, + "balance_loss_clip": 1.40992093, + "balance_loss_mlp": 1.05176234, + "epoch": 0.04641515105967233, + "flos": 23296902733440.0, + "grad_norm": 1.913235968318512, + "language_loss": 0.84700954, + "learning_rate": 3.997174518371848e-06, + "loss": 0.87490451, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.36425781, + "step": 772, + "time_per_iteration": 2.8803396224975586 + }, + { + "auxiliary_loss_clip": 0.01675378, + "auxiliary_loss_mlp": 0.01078504, + "balance_loss_clip": 1.3991344, + "balance_loss_mlp": 1.04402876, + "epoch": 0.046475274312340296, + "flos": 25120181204640.0, + "grad_norm": 1.904245811190089, + "language_loss": 0.74439359, + "learning_rate": 3.997153785881557e-06, + "loss": 0.77193236, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.34472656, + "step": 773, + "time_per_iteration": 2.8741822242736816 + }, + { + "auxiliary_loss_clip": 0.01667442, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_clip": 1.3942548, + "balance_loss_mlp": 1.04034233, + "epoch": 0.04653539756500827, + "flos": 25270404820200.0, + "grad_norm": 2.106395998625477, + "language_loss": 0.79595321, + "learning_rate": 3.997132977658996e-06, + "loss": 0.82337773, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.34643555, + "step": 774, + "time_per_iteration": 2.8570659160614014 + }, + { + "auxiliary_loss_clip": 0.01676431, + "auxiliary_loss_mlp": 0.01069226, + "balance_loss_clip": 1.4034977, + "balance_loss_mlp": 1.03742075, + "epoch": 0.046595520817676234, + "flos": 35410464541800.0, + "grad_norm": 2.048769455012076, + "language_loss": 0.74598598, + "learning_rate": 3.997112093704952e-06, + "loss": 0.77344251, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.31787109, + "step": 775, + "time_per_iteration": 3.011870861053467 + }, + { + "auxiliary_loss_clip": 0.01680128, + "auxiliary_loss_mlp": 0.01067639, + "balance_loss_clip": 1.40272284, + "balance_loss_mlp": 1.03418922, + "epoch": 0.046655644070344206, + "flos": 18116915769000.0, + "grad_norm": 1.570539094007346, + "language_loss": 0.77883047, + "learning_rate": 3.997091134020217e-06, + "loss": 0.80630815, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.33422852, + "step": 776, + "time_per_iteration": 2.834771156311035 + }, + { + "auxiliary_loss_clip": 0.01672371, + "auxiliary_loss_mlp": 0.01074306, + "balance_loss_clip": 1.4006331, + "balance_loss_mlp": 1.0413332, + "epoch": 0.04671576732301218, + "flos": 29211277131360.0, + "grad_norm": 2.7647532295241883, + "language_loss": 0.72433418, + "learning_rate": 3.997070098605585e-06, + "loss": 0.75180095, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.32983398, + "step": 777, + "time_per_iteration": 2.9128363132476807 + }, + { + "auxiliary_loss_clip": 0.01679377, + "auxiliary_loss_mlp": 0.0107957, + "balance_loss_clip": 1.40423083, + "balance_loss_mlp": 1.04168499, + "epoch": 0.04677589057568014, + "flos": 30483957917160.0, + "grad_norm": 1.7300456356112999, + "language_loss": 0.76795942, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79554892, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.37866211, + "step": 778, + "time_per_iteration": 2.9191226959228516 + }, + { + "auxiliary_loss_clip": 0.01671645, + "auxiliary_loss_mlp": 0.01082177, + "balance_loss_clip": 1.3995831, + "balance_loss_mlp": 1.04834509, + "epoch": 0.046836013828348115, + "flos": 20562472326960.0, + "grad_norm": 1.8648968551755278, + "language_loss": 0.79749751, + "learning_rate": 3.997027800589829e-06, + "loss": 0.82503575, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.33837891, + "step": 779, + "time_per_iteration": 2.858020782470703 + }, + { + "auxiliary_loss_clip": 0.0166334, + "auxiliary_loss_mlp": 0.01069099, + "balance_loss_clip": 1.39339709, + "balance_loss_mlp": 1.03655493, + "epoch": 0.04689613708101608, + "flos": 25452895100760.0, + "grad_norm": 3.2861117351064717, + "language_loss": 0.77955711, + "learning_rate": 3.997006537990308e-06, + "loss": 0.80688143, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.32543945, + "step": 780, + "time_per_iteration": 2.9543404579162598 + }, + { + "auxiliary_loss_clip": 0.01660053, + "auxiliary_loss_mlp": 0.01078402, + "balance_loss_clip": 1.39274991, + "balance_loss_mlp": 1.04798019, + "epoch": 0.04695626033368405, + "flos": 23006404550520.0, + "grad_norm": 1.9211058237809704, + "language_loss": 0.76765037, + "learning_rate": 3.996985199664099e-06, + "loss": 0.79503489, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.30395508, + "step": 781, + "time_per_iteration": 2.864748001098633 + }, + { + "auxiliary_loss_clip": 0.01688149, + "auxiliary_loss_mlp": 0.01088808, + "balance_loss_clip": 1.40315926, + "balance_loss_mlp": 1.05616844, + "epoch": 0.047016383586352024, + "flos": 29139474904920.0, + "grad_norm": 2.176525373667253, + "language_loss": 0.74297011, + "learning_rate": 3.99696378561201e-06, + "loss": 0.77073967, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.32641602, + "step": 782, + "time_per_iteration": 2.9476590156555176 + }, + { + "auxiliary_loss_clip": 0.0166945, + "auxiliary_loss_mlp": 0.01070377, + "balance_loss_clip": 1.39583111, + "balance_loss_mlp": 1.0371654, + "epoch": 0.04707650683901999, + "flos": 14980527566640.0, + "grad_norm": 2.2039469458895784, + "language_loss": 0.81057382, + "learning_rate": 3.996942295834855e-06, + "loss": 0.83797204, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.33215332, + "step": 783, + "time_per_iteration": 2.8290984630584717 + }, + { + "auxiliary_loss_clip": 0.01660473, + "auxiliary_loss_mlp": 0.01071228, + "balance_loss_clip": 1.39188612, + "balance_loss_mlp": 1.03663349, + "epoch": 0.04713663009168796, + "flos": 21655911501000.0, + "grad_norm": 2.6764949425794216, + "language_loss": 0.82467479, + "learning_rate": 3.996920730333448e-06, + "loss": 0.85199177, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.34594727, + "step": 784, + "time_per_iteration": 2.8233721256256104 + }, + { + "auxiliary_loss_clip": 0.01663378, + "auxiliary_loss_mlp": 0.01084287, + "balance_loss_clip": 1.39004171, + "balance_loss_mlp": 1.05081272, + "epoch": 0.04719675334435593, + "flos": 21330507109680.0, + "grad_norm": 2.156776509313447, + "language_loss": 0.80868608, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83616269, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.3347168, + "step": 785, + "time_per_iteration": 2.8719921112060547 + }, + { + "auxiliary_loss_clip": 0.01673772, + "auxiliary_loss_mlp": 0.01077751, + "balance_loss_clip": 1.40061879, + "balance_loss_mlp": 1.0451827, + "epoch": 0.0472568765970239, + "flos": 17935927997760.0, + "grad_norm": 2.1792401174114744, + "language_loss": 0.90407175, + "learning_rate": 3.996877372161152e-06, + "loss": 0.93158698, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.32543945, + "step": 786, + "time_per_iteration": 2.774346113204956 + }, + { + "auxiliary_loss_clip": 0.01679189, + "auxiliary_loss_mlp": 0.01087217, + "balance_loss_clip": 1.39515662, + "balance_loss_mlp": 1.05016685, + "epoch": 0.04731699984969187, + "flos": 18081887735520.0, + "grad_norm": 2.6459410350994848, + "language_loss": 0.78091741, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.80858141, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.37060547, + "step": 787, + "time_per_iteration": 2.8147127628326416 + }, + { + "auxiliary_loss_clip": 0.01680553, + "auxiliary_loss_mlp": 0.01069892, + "balance_loss_clip": 1.40333056, + "balance_loss_mlp": 1.03884947, + "epoch": 0.047377123102359836, + "flos": 23190275515320.0, + "grad_norm": 2.3114329638062094, + "language_loss": 0.81174028, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83924472, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.31030273, + "step": 788, + "time_per_iteration": 2.8473074436187744 + }, + { + "auxiliary_loss_clip": 0.0167502, + "auxiliary_loss_mlp": 0.01087478, + "balance_loss_clip": 1.40311217, + "balance_loss_mlp": 1.04577851, + "epoch": 0.04743724635502781, + "flos": 22753005427440.0, + "grad_norm": 2.9669446185869326, + "language_loss": 0.8537845, + "learning_rate": 3.996811766991355e-06, + "loss": 0.88140947, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.41650391, + "step": 789, + "time_per_iteration": 2.872643232345581 + }, + { + "auxiliary_loss_clip": 0.01676624, + "auxiliary_loss_mlp": 0.01076829, + "balance_loss_clip": 1.40424228, + "balance_loss_mlp": 1.04430842, + "epoch": 0.04749736960769577, + "flos": 17243228368800.0, + "grad_norm": 2.3445972416970395, + "language_loss": 0.82379228, + "learning_rate": 3.996789747161709e-06, + "loss": 0.85132682, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.32519531, + "step": 790, + "time_per_iteration": 2.79720401763916 + }, + { + "auxiliary_loss_clip": 0.01672952, + "auxiliary_loss_mlp": 0.0107557, + "balance_loss_clip": 1.39794624, + "balance_loss_mlp": 1.04049826, + "epoch": 0.047557492860363745, + "flos": 40486707481680.0, + "grad_norm": 2.064319835086564, + "language_loss": 0.8864733, + "learning_rate": 3.996767651613597e-06, + "loss": 0.91395849, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.3503418, + "step": 791, + "time_per_iteration": 4.456761121749878 + }, + { + "auxiliary_loss_clip": 0.01683266, + "auxiliary_loss_mlp": 0.01089251, + "balance_loss_clip": 1.40737557, + "balance_loss_mlp": 1.05618215, + "epoch": 0.04761761611303172, + "flos": 18702703921320.0, + "grad_norm": 2.0530783423805614, + "language_loss": 0.90991426, + "learning_rate": 3.996745480347854e-06, + "loss": 0.93763942, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.33081055, + "step": 792, + "time_per_iteration": 6.226160287857056 + }, + { + "auxiliary_loss_clip": 0.01688834, + "auxiliary_loss_mlp": 0.0108886, + "balance_loss_clip": 1.41189206, + "balance_loss_mlp": 1.05815196, + "epoch": 0.04767773936569968, + "flos": 20926762545960.0, + "grad_norm": 1.8364146789977263, + "language_loss": 0.73345816, + "learning_rate": 3.996723233365324e-06, + "loss": 0.76123512, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.30688477, + "step": 793, + "time_per_iteration": 2.888947010040283 + }, + { + "auxiliary_loss_clip": 0.0168515, + "auxiliary_loss_mlp": 0.01071332, + "balance_loss_clip": 1.40438378, + "balance_loss_mlp": 1.0370717, + "epoch": 0.047737862618367655, + "flos": 23737990007160.0, + "grad_norm": 2.5729621459204495, + "language_loss": 0.87326878, + "learning_rate": 3.996700910666847e-06, + "loss": 0.90083361, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.3425293, + "step": 794, + "time_per_iteration": 4.247905015945435 + }, + { + "auxiliary_loss_clip": 0.01693957, + "auxiliary_loss_mlp": 0.0108527, + "balance_loss_clip": 1.41307187, + "balance_loss_mlp": 1.05091381, + "epoch": 0.04779798587103562, + "flos": 23701012772400.0, + "grad_norm": 3.4047318762028516, + "language_loss": 0.7165764, + "learning_rate": 3.996678512253272e-06, + "loss": 0.74436867, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.34350586, + "step": 795, + "time_per_iteration": 2.9104361534118652 + }, + { + "auxiliary_loss_clip": 0.01678452, + "auxiliary_loss_mlp": 0.01085347, + "balance_loss_clip": 1.4045881, + "balance_loss_mlp": 1.04922676, + "epoch": 0.04785810912370359, + "flos": 23188651180920.0, + "grad_norm": 1.823464530839535, + "language_loss": 0.81262863, + "learning_rate": 3.996656038125449e-06, + "loss": 0.84026659, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.36108398, + "step": 796, + "time_per_iteration": 2.838137626647949 + }, + { + "auxiliary_loss_clip": 0.01685164, + "auxiliary_loss_mlp": 0.01078361, + "balance_loss_clip": 1.40692425, + "balance_loss_mlp": 1.04328942, + "epoch": 0.047918232376371564, + "flos": 18045154150920.0, + "grad_norm": 2.018925601710993, + "language_loss": 0.82413793, + "learning_rate": 3.996633488284228e-06, + "loss": 0.85177314, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.35107422, + "step": 797, + "time_per_iteration": 2.810835838317871 + }, + { + "auxiliary_loss_clip": 0.01483007, + "auxiliary_loss_mlp": 0.01058655, + "balance_loss_clip": 1.30332088, + "balance_loss_mlp": 1.04716337, + "epoch": 0.04797835562903953, + "flos": 62457522789960.0, + "grad_norm": 0.9310499901579076, + "language_loss": 0.64472032, + "learning_rate": 3.996610862730465e-06, + "loss": 0.67013693, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.11474609, + "step": 798, + "time_per_iteration": 3.222289562225342 + }, + { + "auxiliary_loss_clip": 0.01697887, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_clip": 1.41095734, + "balance_loss_mlp": 1.05095375, + "epoch": 0.0480384788817075, + "flos": 21512347656480.0, + "grad_norm": 2.5519463424067643, + "language_loss": 0.91753423, + "learning_rate": 3.996588161465018e-06, + "loss": 0.94539583, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.37304688, + "step": 799, + "time_per_iteration": 2.8659234046936035 + }, + { + "auxiliary_loss_clip": 0.01694172, + "auxiliary_loss_mlp": 0.01084593, + "balance_loss_clip": 1.41479492, + "balance_loss_mlp": 1.04723239, + "epoch": 0.048098602134375466, + "flos": 21731855780160.0, + "grad_norm": 2.0695055822464634, + "language_loss": 0.87292087, + "learning_rate": 3.996565384488748e-06, + "loss": 0.90070856, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.37353516, + "step": 800, + "time_per_iteration": 2.8984713554382324 + }, + { + "auxiliary_loss_clip": 0.0169911, + "auxiliary_loss_mlp": 0.01081284, + "balance_loss_clip": 1.41755986, + "balance_loss_mlp": 1.04568803, + "epoch": 0.04815872538704344, + "flos": 22936429700280.0, + "grad_norm": 2.2281678270357026, + "language_loss": 0.85207856, + "learning_rate": 3.996542531802518e-06, + "loss": 0.87988245, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.35595703, + "step": 801, + "time_per_iteration": 2.870368719100952 + }, + { + "auxiliary_loss_clip": 0.01702349, + "auxiliary_loss_mlp": 0.01080331, + "balance_loss_clip": 1.41694295, + "balance_loss_mlp": 1.04371047, + "epoch": 0.04821884863971141, + "flos": 43181927193600.0, + "grad_norm": 1.781736193608115, + "language_loss": 0.80610317, + "learning_rate": 3.996519603407196e-06, + "loss": 0.83392996, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.36669922, + "step": 802, + "time_per_iteration": 3.0421690940856934 + }, + { + "auxiliary_loss_clip": 0.01688141, + "auxiliary_loss_mlp": 0.01069304, + "balance_loss_clip": 1.41241288, + "balance_loss_mlp": 1.0363307, + "epoch": 0.048278971892379376, + "flos": 18624038882040.0, + "grad_norm": 2.142760307058116, + "language_loss": 0.86985016, + "learning_rate": 3.996496599303649e-06, + "loss": 0.89742458, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.32958984, + "step": 803, + "time_per_iteration": 2.7812137603759766 + }, + { + "auxiliary_loss_clip": 0.01690696, + "auxiliary_loss_mlp": 0.01068841, + "balance_loss_clip": 1.41191959, + "balance_loss_mlp": 1.03529549, + "epoch": 0.04833909514504735, + "flos": 20234915692560.0, + "grad_norm": 2.5320204654851066, + "language_loss": 0.86224723, + "learning_rate": 3.996473519492753e-06, + "loss": 0.88984257, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.33569336, + "step": 804, + "time_per_iteration": 2.8119359016418457 + }, + { + "auxiliary_loss_clip": 0.01688727, + "auxiliary_loss_mlp": 0.01080467, + "balance_loss_clip": 1.41173851, + "balance_loss_mlp": 1.04315495, + "epoch": 0.04839921839771532, + "flos": 24650644451760.0, + "grad_norm": 1.873882223232823, + "language_loss": 0.86846077, + "learning_rate": 3.99645036397538e-06, + "loss": 0.89615268, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.37280273, + "step": 805, + "time_per_iteration": 2.8364503383636475 + }, + { + "auxiliary_loss_clip": 0.01693089, + "auxiliary_loss_mlp": 0.01080517, + "balance_loss_clip": 1.41662121, + "balance_loss_mlp": 1.04468274, + "epoch": 0.048459341650383285, + "flos": 24832931690520.0, + "grad_norm": 1.8466445125133102, + "language_loss": 0.68500662, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.71274269, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.35864258, + "step": 806, + "time_per_iteration": 2.8288092613220215 + }, + { + "auxiliary_loss_clip": 0.01677851, + "auxiliary_loss_mlp": 0.0107694, + "balance_loss_clip": 1.40860415, + "balance_loss_mlp": 1.04244041, + "epoch": 0.04851946490305126, + "flos": 22168232484120.0, + "grad_norm": 2.8332086924637663, + "language_loss": 0.77560854, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.8031565, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.34472656, + "step": 807, + "time_per_iteration": 2.866222858428955 + }, + { + "auxiliary_loss_clip": 0.01680092, + "auxiliary_loss_mlp": 0.01084633, + "balance_loss_clip": 1.40624523, + "balance_loss_mlp": 1.04822636, + "epoch": 0.04857958815571922, + "flos": 19796792829120.0, + "grad_norm": 2.3520033793684956, + "language_loss": 0.87604284, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.9036901, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.36450195, + "step": 808, + "time_per_iteration": 2.7999839782714844 + }, + { + "auxiliary_loss_clip": 0.01703149, + "auxiliary_loss_mlp": 0.01069142, + "balance_loss_clip": 1.42313218, + "balance_loss_mlp": 1.03492844, + "epoch": 0.048639711408387194, + "flos": 18702663312960.0, + "grad_norm": 1.7500987662117744, + "language_loss": 0.90834343, + "learning_rate": 3.996356984858732e-06, + "loss": 0.93606633, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.34204102, + "step": 809, + "time_per_iteration": 2.823153495788574 + }, + { + "auxiliary_loss_clip": 0.01690558, + "auxiliary_loss_mlp": 0.01080879, + "balance_loss_clip": 1.41531336, + "balance_loss_mlp": 1.04697597, + "epoch": 0.048699834661055166, + "flos": 24868934324640.0, + "grad_norm": 1.782216010698459, + "language_loss": 0.85432041, + "learning_rate": 3.996333450822208e-06, + "loss": 0.88203478, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.33911133, + "step": 810, + "time_per_iteration": 2.8535726070404053 + }, + { + "auxiliary_loss_clip": 0.01704299, + "auxiliary_loss_mlp": 0.01073757, + "balance_loss_clip": 1.42286301, + "balance_loss_mlp": 1.04116488, + "epoch": 0.04875995791372313, + "flos": 20708878756680.0, + "grad_norm": 1.7807476297336533, + "language_loss": 0.81292701, + "learning_rate": 3.99630984108452e-06, + "loss": 0.84070754, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.32568359, + "step": 811, + "time_per_iteration": 2.8225796222686768 + }, + { + "auxiliary_loss_clip": 0.01688244, + "auxiliary_loss_mlp": 0.01078505, + "balance_loss_clip": 1.41752446, + "balance_loss_mlp": 1.04310024, + "epoch": 0.048820081166391104, + "flos": 18592949859480.0, + "grad_norm": 3.0448392921817495, + "language_loss": 0.74662346, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77429092, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.35400391, + "step": 812, + "time_per_iteration": 2.856377601623535 + }, + { + "auxiliary_loss_clip": 0.01676332, + "auxiliary_loss_mlp": 0.01081218, + "balance_loss_clip": 1.41140926, + "balance_loss_mlp": 1.05024755, + "epoch": 0.04888020441905907, + "flos": 22712007965040.0, + "grad_norm": 1.8359038958188387, + "language_loss": 0.9059664, + "learning_rate": 3.996262394509233e-06, + "loss": 0.93354195, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.30957031, + "step": 813, + "time_per_iteration": 2.8642492294311523 + }, + { + "auxiliary_loss_clip": 0.01688464, + "auxiliary_loss_mlp": 0.01068365, + "balance_loss_clip": 1.41781449, + "balance_loss_mlp": 1.03789496, + "epoch": 0.04894032767172704, + "flos": 22789576578600.0, + "grad_norm": 2.4434162965830035, + "language_loss": 0.75750774, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.78507596, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.30493164, + "step": 814, + "time_per_iteration": 2.832897663116455 + }, + { + "auxiliary_loss_clip": 0.01685236, + "auxiliary_loss_mlp": 0.01077786, + "balance_loss_clip": 1.41118503, + "balance_loss_mlp": 1.04424012, + "epoch": 0.04900045092439501, + "flos": 25521123791520.0, + "grad_norm": 2.7695745249487054, + "language_loss": 0.8465848, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.87421501, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.33544922, + "step": 815, + "time_per_iteration": 2.8267645835876465 + }, + { + "auxiliary_loss_clip": 0.01699101, + "auxiliary_loss_mlp": 0.01083666, + "balance_loss_clip": 1.42081785, + "balance_loss_mlp": 1.05035925, + "epoch": 0.04906057417706298, + "flos": 25963713574560.0, + "grad_norm": 2.0642298866444473, + "language_loss": 0.91384351, + "learning_rate": 3.996190656910043e-06, + "loss": 0.94167125, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.33325195, + "step": 816, + "time_per_iteration": 2.8775949478149414 + }, + { + "auxiliary_loss_clip": 0.01704135, + "auxiliary_loss_mlp": 0.01074453, + "balance_loss_clip": 1.42103851, + "balance_loss_mlp": 1.0414077, + "epoch": 0.04912069742973095, + "flos": 18629277360480.0, + "grad_norm": 2.1691042830231098, + "language_loss": 0.80480194, + "learning_rate": 3.996166592984268e-06, + "loss": 0.83258784, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.33007812, + "step": 817, + "time_per_iteration": 2.7809736728668213 + }, + { + "auxiliary_loss_clip": 0.01690543, + "auxiliary_loss_mlp": 0.01088754, + "balance_loss_clip": 1.41788602, + "balance_loss_mlp": 1.05661511, + "epoch": 0.049180820682398915, + "flos": 23705560908720.0, + "grad_norm": 1.6710314541517581, + "language_loss": 0.85573351, + "learning_rate": 3.996142453363656e-06, + "loss": 0.88352644, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.32128906, + "step": 818, + "time_per_iteration": 2.870185375213623 + }, + { + "auxiliary_loss_clip": 0.01701393, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_clip": 1.41919947, + "balance_loss_mlp": 1.04927373, + "epoch": 0.04924094393506689, + "flos": 22425692443200.0, + "grad_norm": 2.164486958245614, + "language_loss": 0.77295506, + "learning_rate": 3.996118238049124e-06, + "loss": 0.80078554, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.32421875, + "step": 819, + "time_per_iteration": 2.846682071685791 + }, + { + "auxiliary_loss_clip": 0.01691876, + "auxiliary_loss_mlp": 0.01075619, + "balance_loss_clip": 1.41733849, + "balance_loss_mlp": 1.04615009, + "epoch": 0.04930106718773486, + "flos": 15741983795040.0, + "grad_norm": 2.031878904263717, + "language_loss": 0.8564862, + "learning_rate": 3.996093947041586e-06, + "loss": 0.88416111, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.29492188, + "step": 820, + "time_per_iteration": 2.830597162246704 + }, + { + "auxiliary_loss_clip": 0.01682503, + "auxiliary_loss_mlp": 0.01065781, + "balance_loss_clip": 1.40758121, + "balance_loss_mlp": 1.03485763, + "epoch": 0.049361190440402825, + "flos": 26256323392200.0, + "grad_norm": 2.0778065470444917, + "language_loss": 0.91083282, + "learning_rate": 3.996069580341966e-06, + "loss": 0.93831563, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.30908203, + "step": 821, + "time_per_iteration": 2.8383779525756836 + }, + { + "auxiliary_loss_clip": 0.01681487, + "auxiliary_loss_mlp": 0.01083271, + "balance_loss_clip": 1.40707564, + "balance_loss_mlp": 1.05342126, + "epoch": 0.0494213136930708, + "flos": 21257446024080.0, + "grad_norm": 1.7969842545835517, + "language_loss": 0.90448338, + "learning_rate": 3.996045137951188e-06, + "loss": 0.93213093, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.29833984, + "step": 822, + "time_per_iteration": 2.7638652324676514 + }, + { + "auxiliary_loss_clip": 0.01689373, + "auxiliary_loss_mlp": 0.01069589, + "balance_loss_clip": 1.41675079, + "balance_loss_mlp": 1.03640091, + "epoch": 0.04948143694573876, + "flos": 27971431527600.0, + "grad_norm": 1.7487062113287188, + "language_loss": 0.68561637, + "learning_rate": 3.996020619870178e-06, + "loss": 0.71320593, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.33178711, + "step": 823, + "time_per_iteration": 2.9053597450256348 + }, + { + "auxiliary_loss_clip": 0.01472345, + "auxiliary_loss_mlp": 0.01011959, + "balance_loss_clip": 1.29434872, + "balance_loss_mlp": 1.00151622, + "epoch": 0.049541560198406734, + "flos": 66193854481080.0, + "grad_norm": 1.3050961597754749, + "language_loss": 0.62206078, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64690381, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.10449219, + "step": 824, + "time_per_iteration": 3.320638418197632 + }, + { + "auxiliary_loss_clip": 0.01692957, + "auxiliary_loss_mlp": 0.01089743, + "balance_loss_clip": 1.41315937, + "balance_loss_mlp": 1.05748522, + "epoch": 0.049601683451074706, + "flos": 22897787522760.0, + "grad_norm": 2.0257896743093835, + "language_loss": 0.90805829, + "learning_rate": 3.995971356641185e-06, + "loss": 0.93588531, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.32226562, + "step": 825, + "time_per_iteration": 2.816014528274536 + }, + { + "auxiliary_loss_clip": 0.01687368, + "auxiliary_loss_mlp": 0.01072357, + "balance_loss_clip": 1.41332901, + "balance_loss_mlp": 1.04048073, + "epoch": 0.04966180670374267, + "flos": 21438596228760.0, + "grad_norm": 3.167767584958804, + "language_loss": 0.6745981, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.70219541, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.3190918, + "step": 826, + "time_per_iteration": 2.8032641410827637 + }, + { + "auxiliary_loss_clip": 0.0168031, + "auxiliary_loss_mlp": 0.01076673, + "balance_loss_clip": 1.40100527, + "balance_loss_mlp": 1.04549968, + "epoch": 0.04972192995641064, + "flos": 23111935342920.0, + "grad_norm": 1.879395881462743, + "language_loss": 0.78889835, + "learning_rate": 3.995921790662459e-06, + "loss": 0.81646824, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.31164551, + "step": 827, + "time_per_iteration": 2.8648312091827393 + }, + { + "auxiliary_loss_clip": 0.01689589, + "auxiliary_loss_mlp": 0.01082098, + "balance_loss_clip": 1.40667999, + "balance_loss_mlp": 1.04697919, + "epoch": 0.04978205320907861, + "flos": 40413565179360.0, + "grad_norm": 2.203772649417441, + "language_loss": 0.79359746, + "learning_rate": 3.995896894144294e-06, + "loss": 0.82131439, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.35107422, + "step": 828, + "time_per_iteration": 2.9873287677764893 + }, + { + "auxiliary_loss_clip": 0.01659944, + "auxiliary_loss_mlp": 0.01057824, + "balance_loss_clip": 1.39016271, + "balance_loss_mlp": 1.02857006, + "epoch": 0.04984217646174658, + "flos": 25234036710840.0, + "grad_norm": 1.8190427866646481, + "language_loss": 0.85158122, + "learning_rate": 3.995871921941519e-06, + "loss": 0.87875891, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.29248047, + "step": 829, + "time_per_iteration": 2.8443679809570312 + }, + { + "auxiliary_loss_clip": 0.01672061, + "auxiliary_loss_mlp": 0.01087968, + "balance_loss_clip": 1.39570546, + "balance_loss_mlp": 1.0529207, + "epoch": 0.04990229971441455, + "flos": 15963725378520.0, + "grad_norm": 2.2971770779661993, + "language_loss": 0.7645334, + "learning_rate": 3.99584687405508e-06, + "loss": 0.79213369, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.35058594, + "step": 830, + "time_per_iteration": 4.257680654525757 + }, + { + "auxiliary_loss_clip": 0.01668131, + "auxiliary_loss_mlp": 0.01087497, + "balance_loss_clip": 1.39241302, + "balance_loss_mlp": 1.05359411, + "epoch": 0.04996242296708252, + "flos": 18409647411720.0, + "grad_norm": 1.8849519144160003, + "language_loss": 0.80435276, + "learning_rate": 3.995821750485929e-06, + "loss": 0.831909, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.33862305, + "step": 831, + "time_per_iteration": 6.106913805007935 + }, + { + "auxiliary_loss_clip": 0.01669427, + "auxiliary_loss_mlp": 0.01090364, + "balance_loss_clip": 1.39299285, + "balance_loss_mlp": 1.0597508, + "epoch": 0.05002254621975049, + "flos": 17862542045280.0, + "grad_norm": 2.849663543731141, + "language_loss": 0.9301343, + "learning_rate": 3.995796551235016e-06, + "loss": 0.9577322, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.30615234, + "step": 832, + "time_per_iteration": 2.8269472122192383 + }, + { + "auxiliary_loss_clip": 0.01655063, + "auxiliary_loss_mlp": 0.0108152, + "balance_loss_clip": 1.38694119, + "balance_loss_mlp": 1.05238545, + "epoch": 0.050082669472418455, + "flos": 45668399997240.0, + "grad_norm": 3.4511267715601615, + "language_loss": 0.8382653, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.86563122, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.29101562, + "step": 833, + "time_per_iteration": 3.0232014656066895 + }, + { + "auxiliary_loss_clip": 0.01668757, + "auxiliary_loss_mlp": 0.01071497, + "balance_loss_clip": 1.39497197, + "balance_loss_mlp": 1.04016912, + "epoch": 0.05014279272508643, + "flos": 37969226872200.0, + "grad_norm": 2.0487733489189224, + "language_loss": 0.82454473, + "learning_rate": 3.995745925691733e-06, + "loss": 0.85194731, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.31298828, + "step": 834, + "time_per_iteration": 4.3447160720825195 + }, + { + "auxiliary_loss_clip": 0.01669798, + "auxiliary_loss_mlp": 0.01081761, + "balance_loss_clip": 1.39363325, + "balance_loss_mlp": 1.05092144, + "epoch": 0.0502029159777544, + "flos": 21001123099080.0, + "grad_norm": 2.2662293110388565, + "language_loss": 0.93737519, + "learning_rate": 3.995720499401282e-06, + "loss": 0.96489072, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.30822754, + "step": 835, + "time_per_iteration": 2.879499673843384 + }, + { + "auxiliary_loss_clip": 0.01665106, + "auxiliary_loss_mlp": 0.01093291, + "balance_loss_clip": 1.38878393, + "balance_loss_mlp": 1.05764771, + "epoch": 0.050263039230422364, + "flos": 15892329235680.0, + "grad_norm": 2.006160210433074, + "language_loss": 0.78690374, + "learning_rate": 3.995694997432911e-06, + "loss": 0.8144877, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.35644531, + "step": 836, + "time_per_iteration": 2.9173665046691895 + }, + { + "auxiliary_loss_clip": 0.01646607, + "auxiliary_loss_mlp": 0.01079671, + "balance_loss_clip": 1.38277388, + "balance_loss_mlp": 1.05057168, + "epoch": 0.050323162483090336, + "flos": 23738071223880.0, + "grad_norm": 2.143226799007526, + "language_loss": 0.84959328, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.87685609, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.29101562, + "step": 837, + "time_per_iteration": 2.8594424724578857 + }, + { + "auxiliary_loss_clip": 0.01651443, + "auxiliary_loss_mlp": 0.0107394, + "balance_loss_clip": 1.37844706, + "balance_loss_mlp": 1.04065633, + "epoch": 0.0503832857357583, + "flos": 20271040151760.0, + "grad_norm": 2.344109544137011, + "language_loss": 0.7369926, + "learning_rate": 3.995643766466275e-06, + "loss": 0.76424646, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.33276367, + "step": 838, + "time_per_iteration": 2.8702056407928467 + }, + { + "auxiliary_loss_clip": 0.01655987, + "auxiliary_loss_mlp": 0.01072802, + "balance_loss_clip": 1.38171649, + "balance_loss_mlp": 1.04187965, + "epoch": 0.05044340898842627, + "flos": 17789480959680.0, + "grad_norm": 1.8163213644814422, + "language_loss": 0.83744323, + "learning_rate": 3.995618037469953e-06, + "loss": 0.86473107, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.30932617, + "step": 839, + "time_per_iteration": 2.795464515686035 + }, + { + "auxiliary_loss_clip": 0.01657672, + "auxiliary_loss_mlp": 0.01080012, + "balance_loss_clip": 1.38823938, + "balance_loss_mlp": 1.04997098, + "epoch": 0.050503532241094246, + "flos": 22971741992280.0, + "grad_norm": 1.7847951980168784, + "language_loss": 0.8610574, + "learning_rate": 3.995592232799595e-06, + "loss": 0.88843423, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.30053711, + "step": 840, + "time_per_iteration": 2.8512792587280273 + }, + { + "auxiliary_loss_clip": 0.01664825, + "auxiliary_loss_mlp": 0.01077598, + "balance_loss_clip": 1.39117885, + "balance_loss_mlp": 1.0459125, + "epoch": 0.05056365549376221, + "flos": 22781251864800.0, + "grad_norm": 3.0318718172944035, + "language_loss": 0.94953668, + "learning_rate": 3.99556635245618e-06, + "loss": 0.9769609, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.31689453, + "step": 841, + "time_per_iteration": 2.810992479324341 + }, + { + "auxiliary_loss_clip": 0.01660801, + "auxiliary_loss_mlp": 0.01076921, + "balance_loss_clip": 1.38937759, + "balance_loss_mlp": 1.04566407, + "epoch": 0.05062377874643018, + "flos": 30922771122720.0, + "grad_norm": 2.2067646631095315, + "language_loss": 0.78130078, + "learning_rate": 3.995540396440688e-06, + "loss": 0.80867797, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.31225586, + "step": 842, + "time_per_iteration": 2.9374754428863525 + }, + { + "auxiliary_loss_clip": 0.016765, + "auxiliary_loss_mlp": 0.01086278, + "balance_loss_clip": 1.39648128, + "balance_loss_mlp": 1.05490255, + "epoch": 0.05068390199909815, + "flos": 19652051342160.0, + "grad_norm": 2.2308164726376596, + "language_loss": 0.78902018, + "learning_rate": 3.995514364754105e-06, + "loss": 0.81664795, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.3137207, + "step": 843, + "time_per_iteration": 2.8538167476654053 + }, + { + "auxiliary_loss_clip": 0.01667999, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.38887739, + "balance_loss_mlp": 1.03752375, + "epoch": 0.05074402525176612, + "flos": 37969267480560.0, + "grad_norm": 1.8545089182370542, + "language_loss": 0.83520538, + "learning_rate": 3.995488257397417e-06, + "loss": 0.86256438, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.30419922, + "step": 844, + "time_per_iteration": 2.9648587703704834 + }, + { + "auxiliary_loss_clip": 0.01664142, + "auxiliary_loss_mlp": 0.01079914, + "balance_loss_clip": 1.38679981, + "balance_loss_mlp": 1.04961145, + "epoch": 0.05080414850443409, + "flos": 22059737281440.0, + "grad_norm": 1.9947441974872733, + "language_loss": 0.76712859, + "learning_rate": 3.995462074371614e-06, + "loss": 0.79456913, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.30297852, + "step": 845, + "time_per_iteration": 2.8206734657287598 + }, + { + "auxiliary_loss_clip": 0.01657641, + "auxiliary_loss_mlp": 0.01066626, + "balance_loss_clip": 1.38455081, + "balance_loss_mlp": 1.03565514, + "epoch": 0.05086427175710206, + "flos": 20230083297720.0, + "grad_norm": 1.7412270573900017, + "language_loss": 0.882653, + "learning_rate": 3.99543581567769e-06, + "loss": 0.90989566, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.31005859, + "step": 846, + "time_per_iteration": 2.8212027549743652 + }, + { + "auxiliary_loss_clip": 0.01660705, + "auxiliary_loss_mlp": 0.01078531, + "balance_loss_clip": 1.3881042, + "balance_loss_mlp": 1.04736948, + "epoch": 0.05092439500977003, + "flos": 15163505147520.0, + "grad_norm": 2.0244283134988676, + "language_loss": 0.88288796, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.91028035, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.31152344, + "step": 847, + "time_per_iteration": 2.8128139972686768 + }, + { + "auxiliary_loss_clip": 0.01647815, + "auxiliary_loss_mlp": 0.01074669, + "balance_loss_clip": 1.37925959, + "balance_loss_mlp": 1.04365122, + "epoch": 0.050984518262437994, + "flos": 22060346406840.0, + "grad_norm": 2.789169498092908, + "language_loss": 0.83479786, + "learning_rate": 3.995383071289462e-06, + "loss": 0.86202276, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.31005859, + "step": 848, + "time_per_iteration": 2.8616580963134766 + }, + { + "auxiliary_loss_clip": 0.01659871, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_clip": 1.38748527, + "balance_loss_mlp": 1.05215836, + "epoch": 0.05104464151510597, + "flos": 30231249136200.0, + "grad_norm": 1.8717127513966179, + "language_loss": 0.88272345, + "learning_rate": 3.995356585597158e-06, + "loss": 0.9101516, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.30786133, + "step": 849, + "time_per_iteration": 2.9721908569335938 + }, + { + "auxiliary_loss_clip": 0.01652307, + "auxiliary_loss_mlp": 0.01072628, + "balance_loss_clip": 1.38166022, + "balance_loss_mlp": 1.04282546, + "epoch": 0.05110476476777394, + "flos": 18337520318400.0, + "grad_norm": 2.2440999469180825, + "language_loss": 0.84020638, + "learning_rate": 3.995330024240732e-06, + "loss": 0.86745572, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.29760742, + "step": 850, + "time_per_iteration": 2.9185123443603516 + }, + { + "auxiliary_loss_clip": 0.01659253, + "auxiliary_loss_mlp": 0.01069847, + "balance_loss_clip": 1.38481832, + "balance_loss_mlp": 1.03853059, + "epoch": 0.051164888020441904, + "flos": 38005919848440.0, + "grad_norm": 2.511497455753873, + "language_loss": 0.66547447, + "learning_rate": 3.995303387221192e-06, + "loss": 0.69276547, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.31311035, + "step": 851, + "time_per_iteration": 2.929441452026367 + }, + { + "auxiliary_loss_clip": 0.01658826, + "auxiliary_loss_mlp": 0.01086309, + "balance_loss_clip": 1.38402545, + "balance_loss_mlp": 1.05445671, + "epoch": 0.051225011273109876, + "flos": 23043747260520.0, + "grad_norm": 2.087874336411489, + "language_loss": 0.84244728, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86989868, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.31884766, + "step": 852, + "time_per_iteration": 2.820202589035034 + }, + { + "auxiliary_loss_clip": 0.01666263, + "auxiliary_loss_mlp": 0.01094382, + "balance_loss_clip": 1.39015007, + "balance_loss_mlp": 1.06238663, + "epoch": 0.05128513452577785, + "flos": 18264418624440.0, + "grad_norm": 2.118112949261323, + "language_loss": 0.80743319, + "learning_rate": 3.995249886196811e-06, + "loss": 0.83503962, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.31982422, + "step": 853, + "time_per_iteration": 2.8467938899993896 + }, + { + "auxiliary_loss_clip": 0.01659148, + "auxiliary_loss_mlp": 0.01076627, + "balance_loss_clip": 1.3880893, + "balance_loss_mlp": 1.04472625, + "epoch": 0.05134525777844581, + "flos": 27204736820760.0, + "grad_norm": 1.8395958152143481, + "language_loss": 0.76297408, + "learning_rate": 3.995223022193999e-06, + "loss": 0.79033178, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.3190918, + "step": 854, + "time_per_iteration": 2.8857808113098145 + }, + { + "auxiliary_loss_clip": 0.0165702, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.38313568, + "balance_loss_mlp": 1.04104257, + "epoch": 0.051405381031113785, + "flos": 28367622936360.0, + "grad_norm": 2.1646651781257034, + "language_loss": 0.82781357, + "learning_rate": 3.99519608253213e-06, + "loss": 0.85510576, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.31152344, + "step": 855, + "time_per_iteration": 2.853815793991089 + }, + { + "auxiliary_loss_clip": 0.01478546, + "auxiliary_loss_mlp": 0.01011939, + "balance_loss_clip": 1.29566526, + "balance_loss_mlp": 1.00230682, + "epoch": 0.05146550428378175, + "flos": 65633690203920.0, + "grad_norm": 0.9830029940821357, + "language_loss": 0.65635848, + "learning_rate": 3.995169067212227e-06, + "loss": 0.68126333, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.09619141, + "step": 856, + "time_per_iteration": 3.2680108547210693 + }, + { + "auxiliary_loss_clip": 0.01647551, + "auxiliary_loss_mlp": 0.01069465, + "balance_loss_clip": 1.37968206, + "balance_loss_mlp": 1.03825617, + "epoch": 0.05152562753644972, + "flos": 22060021539960.0, + "grad_norm": 1.933487988905089, + "language_loss": 0.77335548, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.80052561, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.31176758, + "step": 857, + "time_per_iteration": 2.840022087097168 + }, + { + "auxiliary_loss_clip": 0.01657836, + "auxiliary_loss_mlp": 0.01078904, + "balance_loss_clip": 1.38374698, + "balance_loss_mlp": 1.04724216, + "epoch": 0.051585750789117694, + "flos": 18513432044640.0, + "grad_norm": 1.9918266711096484, + "language_loss": 0.89514571, + "learning_rate": 3.995114809602412e-06, + "loss": 0.92251313, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.31665039, + "step": 858, + "time_per_iteration": 2.8394339084625244 + }, + { + "auxiliary_loss_clip": 0.01658985, + "auxiliary_loss_mlp": 0.01076932, + "balance_loss_clip": 1.38759446, + "balance_loss_mlp": 1.04467344, + "epoch": 0.05164587404178566, + "flos": 23735228638680.0, + "grad_norm": 1.987827576046141, + "language_loss": 0.77076888, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.79812801, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.32250977, + "step": 859, + "time_per_iteration": 2.899503469467163 + }, + { + "auxiliary_loss_clip": 0.01679971, + "auxiliary_loss_mlp": 0.01089285, + "balance_loss_clip": 1.39775372, + "balance_loss_mlp": 1.0549531, + "epoch": 0.05170599729445363, + "flos": 16257187971720.0, + "grad_norm": 2.2731828854096134, + "language_loss": 0.91266173, + "learning_rate": 3.995060249372788e-06, + "loss": 0.94035435, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.34326172, + "step": 860, + "time_per_iteration": 2.782334089279175 + }, + { + "auxiliary_loss_clip": 0.01661746, + "auxiliary_loss_mlp": 0.01076723, + "balance_loss_clip": 1.39035511, + "balance_loss_mlp": 1.04491758, + "epoch": 0.0517661205471216, + "flos": 23990983046640.0, + "grad_norm": 1.9626650752812604, + "language_loss": 0.82238597, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84977067, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.31762695, + "step": 861, + "time_per_iteration": 2.8776912689208984 + }, + { + "auxiliary_loss_clip": 0.01651572, + "auxiliary_loss_mlp": 0.01075368, + "balance_loss_clip": 1.37921822, + "balance_loss_mlp": 1.04434967, + "epoch": 0.05182624379978957, + "flos": 29283647874840.0, + "grad_norm": 1.7527268146615906, + "language_loss": 0.79660845, + "learning_rate": 3.995005386531627e-06, + "loss": 0.82387781, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.31005859, + "step": 862, + "time_per_iteration": 3.0124804973602295 + }, + { + "auxiliary_loss_clip": 0.01649498, + "auxiliary_loss_mlp": 0.0107216, + "balance_loss_clip": 1.38244677, + "balance_loss_mlp": 1.04226279, + "epoch": 0.05188636705245754, + "flos": 24176234695680.0, + "grad_norm": 1.9536763439673492, + "language_loss": 0.89838588, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.92560244, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.29882812, + "step": 863, + "time_per_iteration": 2.905703544616699 + }, + { + "auxiliary_loss_clip": 0.01666486, + "auxiliary_loss_mlp": 0.01077547, + "balance_loss_clip": 1.39563251, + "balance_loss_mlp": 1.04433537, + "epoch": 0.051946490305125506, + "flos": 26766370307160.0, + "grad_norm": 2.0251571415976937, + "language_loss": 0.76831353, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.79575384, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33203125, + "step": 864, + "time_per_iteration": 2.8571298122406006 + }, + { + "auxiliary_loss_clip": 0.016767, + "auxiliary_loss_mlp": 0.01075803, + "balance_loss_clip": 1.4004004, + "balance_loss_mlp": 1.04130423, + "epoch": 0.05200661355779348, + "flos": 21506946744600.0, + "grad_norm": 2.824789998077408, + "language_loss": 0.79573536, + "learning_rate": 3.994922524891474e-06, + "loss": 0.82326031, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.3449707, + "step": 865, + "time_per_iteration": 2.8299174308776855 + }, + { + "auxiliary_loss_clip": 0.01644756, + "auxiliary_loss_mlp": 0.01076784, + "balance_loss_clip": 1.37579846, + "balance_loss_mlp": 1.04524112, + "epoch": 0.05206673681046144, + "flos": 18119149228800.0, + "grad_norm": 4.86234448425575, + "language_loss": 0.87151444, + "learning_rate": 3.994894753048032e-06, + "loss": 0.8987298, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.31542969, + "step": 866, + "time_per_iteration": 2.8076207637786865 + }, + { + "auxiliary_loss_clip": 0.01668637, + "auxiliary_loss_mlp": 0.01083905, + "balance_loss_clip": 1.39953208, + "balance_loss_mlp": 1.0530771, + "epoch": 0.052126860063129415, + "flos": 17527919556240.0, + "grad_norm": 3.931657339083066, + "language_loss": 0.87999809, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.90752351, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.30834961, + "step": 867, + "time_per_iteration": 2.780090570449829 + }, + { + "auxiliary_loss_clip": 0.01644889, + "auxiliary_loss_mlp": 0.01076082, + "balance_loss_clip": 1.38378215, + "balance_loss_mlp": 1.04682791, + "epoch": 0.05218698331579739, + "flos": 32604434950680.0, + "grad_norm": 1.5422060812397869, + "language_loss": 0.63955212, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.66676188, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.29248047, + "step": 868, + "time_per_iteration": 2.9331552982330322 + }, + { + "auxiliary_loss_clip": 0.01666641, + "auxiliary_loss_mlp": 0.01069903, + "balance_loss_clip": 1.3926909, + "balance_loss_mlp": 1.03833663, + "epoch": 0.05224710656846535, + "flos": 22132392283440.0, + "grad_norm": 3.01809453575266, + "language_loss": 0.84109104, + "learning_rate": 3.994810983642281e-06, + "loss": 0.86845642, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.31591797, + "step": 869, + "time_per_iteration": 4.313327789306641 + }, + { + "auxiliary_loss_clip": 0.01661061, + "auxiliary_loss_mlp": 0.01076811, + "balance_loss_clip": 1.38637686, + "balance_loss_mlp": 1.04615057, + "epoch": 0.052307229821133325, + "flos": 11148759583560.0, + "grad_norm": 2.0464260655520956, + "language_loss": 0.88329977, + "learning_rate": 3.994782909218751e-06, + "loss": 0.91067851, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.30688477, + "step": 870, + "time_per_iteration": 6.275209903717041 + }, + { + "auxiliary_loss_clip": 0.01663269, + "auxiliary_loss_mlp": 0.0107713, + "balance_loss_clip": 1.39290154, + "balance_loss_mlp": 1.04608798, + "epoch": 0.05236735307380129, + "flos": 19131811728840.0, + "grad_norm": 1.934976421126367, + "language_loss": 0.81221211, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83961606, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.31054688, + "step": 871, + "time_per_iteration": 2.9096670150756836 + }, + { + "auxiliary_loss_clip": 0.01648225, + "auxiliary_loss_mlp": 0.01077141, + "balance_loss_clip": 1.38499188, + "balance_loss_mlp": 1.04797065, + "epoch": 0.05242747632646926, + "flos": 20965729590360.0, + "grad_norm": 1.597625229870592, + "language_loss": 0.81775004, + "learning_rate": 3.994726533445656e-06, + "loss": 0.84500372, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.29187012, + "step": 872, + "time_per_iteration": 4.388044595718384 + }, + { + "auxiliary_loss_clip": 0.0146417, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.2852726, + "balance_loss_mlp": 1.0217253, + "epoch": 0.052487599579137234, + "flos": 65034176425920.0, + "grad_norm": 0.8866898368350178, + "language_loss": 0.61571145, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.64067483, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.10449219, + "step": 873, + "time_per_iteration": 3.183199167251587 + }, + { + "auxiliary_loss_clip": 0.01656083, + "auxiliary_loss_mlp": 0.01075507, + "balance_loss_clip": 1.38838172, + "balance_loss_mlp": 1.04458368, + "epoch": 0.0525477228318052, + "flos": 23293329197760.0, + "grad_norm": 1.8853166120060978, + "language_loss": 0.89580154, + "learning_rate": 3.994669855111643e-06, + "loss": 0.9231174, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.30908203, + "step": 874, + "time_per_iteration": 2.8887040615081787 + }, + { + "auxiliary_loss_clip": 0.0165895, + "auxiliary_loss_mlp": 0.01072161, + "balance_loss_clip": 1.38909268, + "balance_loss_mlp": 1.04246581, + "epoch": 0.05260784608447317, + "flos": 32236165112400.0, + "grad_norm": 2.3639645949094716, + "language_loss": 0.7506088, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77791989, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.29699707, + "step": 875, + "time_per_iteration": 2.877539873123169 + }, + { + "auxiliary_loss_clip": 0.01651625, + "auxiliary_loss_mlp": 0.01065887, + "balance_loss_clip": 1.38537335, + "balance_loss_mlp": 1.03618002, + "epoch": 0.052667969337141136, + "flos": 24468722688240.0, + "grad_norm": 1.7000827636720404, + "language_loss": 0.92712402, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95429909, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.29736328, + "step": 876, + "time_per_iteration": 2.843127727508545 + }, + { + "auxiliary_loss_clip": 0.01457584, + "auxiliary_loss_mlp": 0.01015019, + "balance_loss_clip": 1.28089499, + "balance_loss_mlp": 1.00605476, + "epoch": 0.05272809258980911, + "flos": 57800211940440.0, + "grad_norm": 0.8189918206636504, + "language_loss": 0.62908733, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65381336, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.08984375, + "step": 877, + "time_per_iteration": 3.279761791229248 + }, + { + "auxiliary_loss_clip": 0.01645237, + "auxiliary_loss_mlp": 0.01082742, + "balance_loss_clip": 1.37643945, + "balance_loss_mlp": 1.04893422, + "epoch": 0.05278821584247708, + "flos": 17425312565760.0, + "grad_norm": 2.282907584121975, + "language_loss": 0.86331332, + "learning_rate": 3.994555590795299e-06, + "loss": 0.89059317, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.33837891, + "step": 878, + "time_per_iteration": 2.792269706726074 + }, + { + "auxiliary_loss_clip": 0.01664441, + "auxiliary_loss_mlp": 0.01073455, + "balance_loss_clip": 1.39133728, + "balance_loss_mlp": 1.04367614, + "epoch": 0.052848339095145046, + "flos": 26142427277640.0, + "grad_norm": 1.7707869543347137, + "language_loss": 0.83675337, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.86413229, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.29772949, + "step": 879, + "time_per_iteration": 2.8874709606170654 + }, + { + "auxiliary_loss_clip": 0.01648896, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_clip": 1.38512993, + "balance_loss_mlp": 1.04483521, + "epoch": 0.05290846234781302, + "flos": 16476452445240.0, + "grad_norm": 1.909281991964712, + "language_loss": 0.84755445, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.87481129, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.31933594, + "step": 880, + "time_per_iteration": 2.7811551094055176 + }, + { + "auxiliary_loss_clip": 0.01655289, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.38640952, + "balance_loss_mlp": 1.04040003, + "epoch": 0.05296858560048098, + "flos": 19869975739800.0, + "grad_norm": 2.1734017606101146, + "language_loss": 0.87801301, + "learning_rate": 3.994469098399906e-06, + "loss": 0.90526819, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.29858398, + "step": 881, + "time_per_iteration": 2.890244960784912 + }, + { + "auxiliary_loss_clip": 0.01650487, + "auxiliary_loss_mlp": 0.01074454, + "balance_loss_clip": 1.38230741, + "balance_loss_mlp": 1.04213643, + "epoch": 0.053028708853148955, + "flos": 24393874834800.0, + "grad_norm": 1.8185783143220318, + "language_loss": 0.88466126, + "learning_rate": 3.994440116339046e-06, + "loss": 0.91191065, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.32312012, + "step": 882, + "time_per_iteration": 2.874959707260132 + }, + { + "auxiliary_loss_clip": 0.01650918, + "auxiliary_loss_mlp": 0.01064742, + "balance_loss_clip": 1.38293648, + "balance_loss_mlp": 1.03446317, + "epoch": 0.05308883210581693, + "flos": 36400119082920.0, + "grad_norm": 2.3643389778967485, + "language_loss": 0.70722681, + "learning_rate": 3.994411058648816e-06, + "loss": 0.7343834, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.30322266, + "step": 883, + "time_per_iteration": 2.9710495471954346 + }, + { + "auxiliary_loss_clip": 0.01644155, + "auxiliary_loss_mlp": 0.01062765, + "balance_loss_clip": 1.38110209, + "balance_loss_mlp": 1.03408337, + "epoch": 0.05314895535848489, + "flos": 22860038729160.0, + "grad_norm": 2.2153073686210827, + "language_loss": 0.77127087, + "learning_rate": 3.994381925330319e-06, + "loss": 0.79834008, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.28674316, + "step": 884, + "time_per_iteration": 2.834656000137329 + }, + { + "auxiliary_loss_clip": 0.0163415, + "auxiliary_loss_mlp": 0.01064793, + "balance_loss_clip": 1.37484133, + "balance_loss_mlp": 1.03613496, + "epoch": 0.053209078611152864, + "flos": 12864273802560.0, + "grad_norm": 2.5142774145035425, + "language_loss": 0.86550057, + "learning_rate": 3.994352716384659e-06, + "loss": 0.89249003, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.28662109, + "step": 885, + "time_per_iteration": 2.7683048248291016 + }, + { + "auxiliary_loss_clip": 0.01646164, + "auxiliary_loss_mlp": 0.01070632, + "balance_loss_clip": 1.37871218, + "balance_loss_mlp": 1.03896999, + "epoch": 0.05326920186382083, + "flos": 12168122463000.0, + "grad_norm": 2.5912290896418093, + "language_loss": 0.87042916, + "learning_rate": 3.994323431812945e-06, + "loss": 0.89759713, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.31640625, + "step": 886, + "time_per_iteration": 2.7660233974456787 + }, + { + "auxiliary_loss_clip": 0.0163489, + "auxiliary_loss_mlp": 0.01058674, + "balance_loss_clip": 1.37291455, + "balance_loss_mlp": 1.02853823, + "epoch": 0.0533293251164888, + "flos": 22708799904600.0, + "grad_norm": 1.9520289491497858, + "language_loss": 0.90005493, + "learning_rate": 3.994294071616286e-06, + "loss": 0.92699057, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.30102539, + "step": 887, + "time_per_iteration": 2.7765190601348877 + }, + { + "auxiliary_loss_clip": 0.01649889, + "auxiliary_loss_mlp": 0.01075607, + "balance_loss_clip": 1.38157296, + "balance_loss_mlp": 1.04220474, + "epoch": 0.053389448369156774, + "flos": 26946342869400.0, + "grad_norm": 1.9599087565548323, + "language_loss": 0.75742656, + "learning_rate": 3.994264635795796e-06, + "loss": 0.78468156, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.33398438, + "step": 888, + "time_per_iteration": 2.847461700439453 + }, + { + "auxiliary_loss_clip": 0.01638971, + "auxiliary_loss_mlp": 0.0106905, + "balance_loss_clip": 1.375911, + "balance_loss_mlp": 1.03760302, + "epoch": 0.05344957162182474, + "flos": 25561796387040.0, + "grad_norm": 2.4838154455803174, + "language_loss": 0.88811469, + "learning_rate": 3.994235124352592e-06, + "loss": 0.91519487, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.31445312, + "step": 889, + "time_per_iteration": 2.8200268745422363 + }, + { + "auxiliary_loss_clip": 0.01634624, + "auxiliary_loss_mlp": 0.0106317, + "balance_loss_clip": 1.37320542, + "balance_loss_mlp": 1.03446436, + "epoch": 0.05350969487449271, + "flos": 19724422085640.0, + "grad_norm": 1.8358355772801835, + "language_loss": 0.89061284, + "learning_rate": 3.994205537287791e-06, + "loss": 0.91759074, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2869873, + "step": 890, + "time_per_iteration": 2.814079523086548 + }, + { + "auxiliary_loss_clip": 0.01632761, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_clip": 1.36609161, + "balance_loss_mlp": 1.05377674, + "epoch": 0.053569818127160676, + "flos": 27022084106760.0, + "grad_norm": 1.9930042339480967, + "language_loss": 0.94086945, + "learning_rate": 3.994175874602517e-06, + "loss": 0.96800923, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.27453613, + "step": 891, + "time_per_iteration": 2.8844878673553467 + }, + { + "auxiliary_loss_clip": 0.01636343, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_clip": 1.37060189, + "balance_loss_mlp": 1.04214418, + "epoch": 0.05362994137982865, + "flos": 13192277128920.0, + "grad_norm": 2.067672650128698, + "language_loss": 0.72582078, + "learning_rate": 3.994146136297893e-06, + "loss": 0.7529282, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.32250977, + "step": 892, + "time_per_iteration": 2.8172409534454346 + }, + { + "auxiliary_loss_clip": 0.01643178, + "auxiliary_loss_mlp": 0.01074648, + "balance_loss_clip": 1.37849677, + "balance_loss_mlp": 1.04601347, + "epoch": 0.05369006463249662, + "flos": 28663928114760.0, + "grad_norm": 1.895128589831253, + "language_loss": 0.8336938, + "learning_rate": 3.994116322375049e-06, + "loss": 0.86087203, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.28637695, + "step": 893, + "time_per_iteration": 2.9124810695648193 + }, + { + "auxiliary_loss_clip": 0.01642357, + "auxiliary_loss_mlp": 0.01068072, + "balance_loss_clip": 1.37521303, + "balance_loss_mlp": 1.03695822, + "epoch": 0.053750187885164585, + "flos": 28919073397320.0, + "grad_norm": 2.0717689421979864, + "language_loss": 0.82157445, + "learning_rate": 3.994086432835114e-06, + "loss": 0.84867877, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.31103516, + "step": 894, + "time_per_iteration": 2.861941337585449 + }, + { + "auxiliary_loss_clip": 0.01637167, + "auxiliary_loss_mlp": 0.01067907, + "balance_loss_clip": 1.37458885, + "balance_loss_mlp": 1.04002428, + "epoch": 0.05381031113783256, + "flos": 15163058455560.0, + "grad_norm": 2.9587859863462196, + "language_loss": 0.76952875, + "learning_rate": 3.994056467679221e-06, + "loss": 0.79657948, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.27844238, + "step": 895, + "time_per_iteration": 2.94858717918396 + }, + { + "auxiliary_loss_clip": 0.01655714, + "auxiliary_loss_mlp": 0.01077223, + "balance_loss_clip": 1.38536429, + "balance_loss_mlp": 1.04689646, + "epoch": 0.05387043439050053, + "flos": 21840269766120.0, + "grad_norm": 1.8439615928385025, + "language_loss": 0.87511069, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.90244007, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.30322266, + "step": 896, + "time_per_iteration": 2.9265410900115967 + }, + { + "auxiliary_loss_clip": 0.01654034, + "auxiliary_loss_mlp": 0.01073036, + "balance_loss_clip": 1.38366187, + "balance_loss_mlp": 1.03913295, + "epoch": 0.053930557643168495, + "flos": 17314665120000.0, + "grad_norm": 2.2446677993388326, + "language_loss": 0.88881528, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.91608602, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.33886719, + "step": 897, + "time_per_iteration": 2.905001401901245 + }, + { + "auxiliary_loss_clip": 0.01633237, + "auxiliary_loss_mlp": 0.01074501, + "balance_loss_clip": 1.37128258, + "balance_loss_mlp": 1.04004908, + "epoch": 0.05399068089583647, + "flos": 17353063647360.0, + "grad_norm": 1.8154148767170444, + "language_loss": 0.91198647, + "learning_rate": 3.993966118527175e-06, + "loss": 0.93906385, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.34448242, + "step": 898, + "time_per_iteration": 2.8321433067321777 + }, + { + "auxiliary_loss_clip": 0.01648666, + "auxiliary_loss_mlp": 0.010893, + "balance_loss_clip": 1.37571335, + "balance_loss_mlp": 1.05900908, + "epoch": 0.05405080414850443, + "flos": 17490576846240.0, + "grad_norm": 2.9527312047291296, + "language_loss": 0.93060553, + "learning_rate": 3.993935850918845e-06, + "loss": 0.95798516, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.30310059, + "step": 899, + "time_per_iteration": 2.7817416191101074 + }, + { + "auxiliary_loss_clip": 0.01635363, + "auxiliary_loss_mlp": 0.01066699, + "balance_loss_clip": 1.37603021, + "balance_loss_mlp": 1.03695655, + "epoch": 0.054110927401172404, + "flos": 24501923345520.0, + "grad_norm": 1.9263262611682257, + "language_loss": 0.7683636, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.79538417, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.29748535, + "step": 900, + "time_per_iteration": 2.8138134479522705 + }, + { + "auxiliary_loss_clip": 0.01647346, + "auxiliary_loss_mlp": 0.01064369, + "balance_loss_clip": 1.3784318, + "balance_loss_mlp": 1.03482842, + "epoch": 0.054171050653840376, + "flos": 22935252057840.0, + "grad_norm": 2.4296009578700715, + "language_loss": 0.7552976, + "learning_rate": 3.993875088872592e-06, + "loss": 0.78241479, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.29516602, + "step": 901, + "time_per_iteration": 2.8063530921936035 + }, + { + "auxiliary_loss_clip": 0.01630643, + "auxiliary_loss_mlp": 0.0107751, + "balance_loss_clip": 1.37204218, + "balance_loss_mlp": 1.04749298, + "epoch": 0.05423117390650834, + "flos": 12937131846360.0, + "grad_norm": 2.1199094094851927, + "language_loss": 0.85613197, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.88321352, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.30004883, + "step": 902, + "time_per_iteration": 2.847837448120117 + }, + { + "auxiliary_loss_clip": 0.01642691, + "auxiliary_loss_mlp": 0.01080281, + "balance_loss_clip": 1.37465477, + "balance_loss_mlp": 1.04790342, + "epoch": 0.05429129715917631, + "flos": 19906100199000.0, + "grad_norm": 1.6609831605759549, + "language_loss": 0.87031955, + "learning_rate": 3.993814024394569e-06, + "loss": 0.89754927, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.32385254, + "step": 903, + "time_per_iteration": 2.8243205547332764 + }, + { + "auxiliary_loss_clip": 0.01643104, + "auxiliary_loss_mlp": 0.01075185, + "balance_loss_clip": 1.37795401, + "balance_loss_mlp": 1.04502523, + "epoch": 0.05435142041184428, + "flos": 16912869757560.0, + "grad_norm": 2.5172893799702933, + "language_loss": 0.76915294, + "learning_rate": 3.993783378746537e-06, + "loss": 0.79633582, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.30151367, + "step": 904, + "time_per_iteration": 2.8182079792022705 + }, + { + "auxiliary_loss_clip": 0.01644165, + "auxiliary_loss_mlp": 0.01074398, + "balance_loss_clip": 1.37260568, + "balance_loss_mlp": 1.04533458, + "epoch": 0.05441154366451225, + "flos": 23953234253040.0, + "grad_norm": 3.789983273780129, + "language_loss": 0.87143928, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8986249, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.29077148, + "step": 905, + "time_per_iteration": 2.882066249847412 + }, + { + "auxiliary_loss_clip": 0.01644839, + "auxiliary_loss_mlp": 0.01083576, + "balance_loss_clip": 1.38349664, + "balance_loss_mlp": 1.05379796, + "epoch": 0.05447166691718022, + "flos": 19980298318680.0, + "grad_norm": 1.7792928377701422, + "language_loss": 0.74611324, + "learning_rate": 3.993721860638241e-06, + "loss": 0.77339745, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.29785156, + "step": 906, + "time_per_iteration": 2.7654430866241455 + }, + { + "auxiliary_loss_clip": 0.01645218, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_clip": 1.37775016, + "balance_loss_mlp": 1.05263448, + "epoch": 0.05453179016984819, + "flos": 24941954801880.0, + "grad_norm": 1.9469098673432186, + "language_loss": 0.8845064, + "learning_rate": 3.993690988180309e-06, + "loss": 0.91180944, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.32446289, + "step": 907, + "time_per_iteration": 2.8153252601623535 + }, + { + "auxiliary_loss_clip": 0.0164495, + "auxiliary_loss_mlp": 0.01079402, + "balance_loss_clip": 1.37517929, + "balance_loss_mlp": 1.04731107, + "epoch": 0.05459191342251616, + "flos": 18119839570920.0, + "grad_norm": 1.6794165599324375, + "language_loss": 0.87796497, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.90520847, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.32080078, + "step": 908, + "time_per_iteration": 6.04713773727417 + }, + { + "auxiliary_loss_clip": 0.01639907, + "auxiliary_loss_mlp": 0.01075318, + "balance_loss_clip": 1.37225342, + "balance_loss_mlp": 1.04217756, + "epoch": 0.054652036675184125, + "flos": 19212994486440.0, + "grad_norm": 2.019987279115677, + "language_loss": 0.9072684, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.9344207, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.33129883, + "step": 909, + "time_per_iteration": 4.311524868011475 + }, + { + "auxiliary_loss_clip": 0.01646008, + "auxiliary_loss_mlp": 0.01089185, + "balance_loss_clip": 1.37529075, + "balance_loss_mlp": 1.057881, + "epoch": 0.0547121599278521, + "flos": 16330046015520.0, + "grad_norm": 2.224118336290216, + "language_loss": 0.72439545, + "learning_rate": 3.99359791720544e-06, + "loss": 0.75174743, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.31298828, + "step": 910, + "time_per_iteration": 2.7885689735412598 + }, + { + "auxiliary_loss_clip": 0.01627487, + "auxiliary_loss_mlp": 0.01071358, + "balance_loss_clip": 1.36288404, + "balance_loss_mlp": 1.04233062, + "epoch": 0.05477228318052007, + "flos": 20343816978840.0, + "grad_norm": 1.9895008565699117, + "language_loss": 0.83879924, + "learning_rate": 3.993566742350714e-06, + "loss": 0.86578774, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.29016113, + "step": 911, + "time_per_iteration": 4.202486515045166 + }, + { + "auxiliary_loss_clip": 0.01642877, + "auxiliary_loss_mlp": 0.01080485, + "balance_loss_clip": 1.37529945, + "balance_loss_mlp": 1.04839349, + "epoch": 0.054832406433188034, + "flos": 21977092622880.0, + "grad_norm": 2.4507466905215125, + "language_loss": 0.7716701, + "learning_rate": 3.993535491899736e-06, + "loss": 0.79890376, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.32104492, + "step": 912, + "time_per_iteration": 2.865156888961792 + }, + { + "auxiliary_loss_clip": 0.01624326, + "auxiliary_loss_mlp": 0.01068173, + "balance_loss_clip": 1.36541641, + "balance_loss_mlp": 1.03841829, + "epoch": 0.054892529685856006, + "flos": 16403269534560.0, + "grad_norm": 2.252590845002573, + "language_loss": 0.83310366, + "learning_rate": 3.993504165853694e-06, + "loss": 0.86002868, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.29760742, + "step": 913, + "time_per_iteration": 2.8363664150238037 + }, + { + "auxiliary_loss_clip": 0.01630605, + "auxiliary_loss_mlp": 0.01073525, + "balance_loss_clip": 1.36936641, + "balance_loss_mlp": 1.04479516, + "epoch": 0.05495265293852397, + "flos": 23917312835640.0, + "grad_norm": 1.5904489225618694, + "language_loss": 0.84102166, + "learning_rate": 3.993472764213772e-06, + "loss": 0.86806297, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.28735352, + "step": 914, + "time_per_iteration": 2.914576768875122 + }, + { + "auxiliary_loss_clip": 0.01636405, + "auxiliary_loss_mlp": 0.01071214, + "balance_loss_clip": 1.37372315, + "balance_loss_mlp": 1.04108953, + "epoch": 0.055012776191191944, + "flos": 23592273919560.0, + "grad_norm": 2.0904792644343044, + "language_loss": 0.91750979, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.94458598, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.3013916, + "step": 915, + "time_per_iteration": 2.835880994796753 + }, + { + "auxiliary_loss_clip": 0.0163368, + "auxiliary_loss_mlp": 0.01074921, + "balance_loss_clip": 1.37195301, + "balance_loss_mlp": 1.04585767, + "epoch": 0.055072899443859916, + "flos": 17532792559440.0, + "grad_norm": 1.7741461118467328, + "language_loss": 0.90319479, + "learning_rate": 3.993409734157064e-06, + "loss": 0.93028086, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.29040527, + "step": 916, + "time_per_iteration": 2.766127824783325 + }, + { + "auxiliary_loss_clip": 0.01646376, + "auxiliary_loss_mlp": 0.01075995, + "balance_loss_clip": 1.37460113, + "balance_loss_mlp": 1.04435682, + "epoch": 0.05513302269652788, + "flos": 21691914135120.0, + "grad_norm": 1.7621587000538579, + "language_loss": 0.808599, + "learning_rate": 3.993378105742666e-06, + "loss": 0.8358227, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.31640625, + "step": 917, + "time_per_iteration": 2.885098934173584 + }, + { + "auxiliary_loss_clip": 0.01635831, + "auxiliary_loss_mlp": 0.01081065, + "balance_loss_clip": 1.36696029, + "balance_loss_mlp": 1.0438242, + "epoch": 0.05519314594919585, + "flos": 21617959665600.0, + "grad_norm": 2.1146419352062953, + "language_loss": 0.8056038, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.83277273, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.37231445, + "step": 918, + "time_per_iteration": 2.863462209701538 + }, + { + "auxiliary_loss_clip": 0.01640351, + "auxiliary_loss_mlp": 0.01070406, + "balance_loss_clip": 1.3733753, + "balance_loss_mlp": 1.04000795, + "epoch": 0.05525326920186382, + "flos": 21803576789880.0, + "grad_norm": 2.145008461587701, + "language_loss": 0.89098907, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91809666, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.30395508, + "step": 919, + "time_per_iteration": 2.83479905128479 + }, + { + "auxiliary_loss_clip": 0.01632867, + "auxiliary_loss_mlp": 0.01072763, + "balance_loss_clip": 1.3694706, + "balance_loss_mlp": 1.0416739, + "epoch": 0.05531339245453179, + "flos": 28445597633520.0, + "grad_norm": 2.29088834960177, + "language_loss": 0.88518816, + "learning_rate": 3.993282766969699e-06, + "loss": 0.91224444, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.31079102, + "step": 920, + "time_per_iteration": 2.8044419288635254 + }, + { + "auxiliary_loss_clip": 0.01634829, + "auxiliary_loss_mlp": 0.01075297, + "balance_loss_clip": 1.37305367, + "balance_loss_mlp": 1.04387367, + "epoch": 0.05537351570719976, + "flos": 37381814385480.0, + "grad_norm": 2.036932027173268, + "language_loss": 0.6696986, + "learning_rate": 3.993250836206136e-06, + "loss": 0.69679987, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.31420898, + "step": 921, + "time_per_iteration": 2.9281458854675293 + }, + { + "auxiliary_loss_clip": 0.01649791, + "auxiliary_loss_mlp": 0.01070167, + "balance_loss_clip": 1.38259256, + "balance_loss_mlp": 1.03678811, + "epoch": 0.05543363895986773, + "flos": 20089362038400.0, + "grad_norm": 2.122846676103935, + "language_loss": 0.73106122, + "learning_rate": 3.993218829858301e-06, + "loss": 0.75826079, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.33398438, + "step": 922, + "time_per_iteration": 2.8004117012023926 + }, + { + "auxiliary_loss_clip": 0.01639097, + "auxiliary_loss_mlp": 0.01074432, + "balance_loss_clip": 1.37303972, + "balance_loss_mlp": 1.04188788, + "epoch": 0.0554937622125357, + "flos": 24538332063240.0, + "grad_norm": 2.5243879077579496, + "language_loss": 0.83944005, + "learning_rate": 3.993186747927408e-06, + "loss": 0.86657536, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.32543945, + "step": 923, + "time_per_iteration": 2.8882484436035156 + }, + { + "auxiliary_loss_clip": 0.01626905, + "auxiliary_loss_mlp": 0.01061268, + "balance_loss_clip": 1.36440992, + "balance_loss_mlp": 1.03079855, + "epoch": 0.055553885465203665, + "flos": 14324236655400.0, + "grad_norm": 1.9500370410416628, + "language_loss": 0.79940248, + "learning_rate": 3.993154590414675e-06, + "loss": 0.82628417, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.3046875, + "step": 924, + "time_per_iteration": 2.7945358753204346 + }, + { + "auxiliary_loss_clip": 0.0161904, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.36152434, + "balance_loss_mlp": 1.03669536, + "epoch": 0.05561400871787164, + "flos": 27386902234440.0, + "grad_norm": 2.0300820972873788, + "language_loss": 1.02801204, + "learning_rate": 3.993122357321319e-06, + "loss": 1.05488122, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.31176758, + "step": 925, + "time_per_iteration": 2.8414642810821533 + }, + { + "auxiliary_loss_clip": 0.0163684, + "auxiliary_loss_mlp": 0.01066163, + "balance_loss_clip": 1.3743819, + "balance_loss_mlp": 1.03471541, + "epoch": 0.05567413197053961, + "flos": 23226237541080.0, + "grad_norm": 1.8973333158687478, + "language_loss": 0.8146081, + "learning_rate": 3.993090048648564e-06, + "loss": 0.84163809, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.3145752, + "step": 926, + "time_per_iteration": 2.8054659366607666 + }, + { + "auxiliary_loss_clip": 0.01661462, + "auxiliary_loss_mlp": 0.01077094, + "balance_loss_clip": 1.38636303, + "balance_loss_mlp": 1.04593277, + "epoch": 0.055734255223207574, + "flos": 25270445428560.0, + "grad_norm": 3.452695615292325, + "language_loss": 0.74300224, + "learning_rate": 3.993057664397634e-06, + "loss": 0.77038777, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.3112793, + "step": 927, + "time_per_iteration": 2.8420629501342773 + }, + { + "auxiliary_loss_clip": 0.01472859, + "auxiliary_loss_mlp": 0.01045159, + "balance_loss_clip": 1.30008507, + "balance_loss_mlp": 1.03495491, + "epoch": 0.055794378475875546, + "flos": 66519966195720.0, + "grad_norm": 0.7935207359955573, + "language_loss": 0.59841621, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62359643, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.10205078, + "step": 928, + "time_per_iteration": 3.32108736038208 + }, + { + "auxiliary_loss_clip": 0.01645565, + "auxiliary_loss_mlp": 0.01075038, + "balance_loss_clip": 1.38257468, + "balance_loss_mlp": 1.04158783, + "epoch": 0.05585450172854351, + "flos": 25343100430560.0, + "grad_norm": 4.852459691869738, + "language_loss": 0.95745158, + "learning_rate": 3.992992669166168e-06, + "loss": 0.98465753, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.3347168, + "step": 929, + "time_per_iteration": 2.8340957164764404 + }, + { + "auxiliary_loss_clip": 0.01642345, + "auxiliary_loss_mlp": 0.01075049, + "balance_loss_clip": 1.3785491, + "balance_loss_mlp": 1.04183722, + "epoch": 0.05591462498121148, + "flos": 33918316240680.0, + "grad_norm": 2.1743083341935967, + "language_loss": 0.72618258, + "learning_rate": 3.992960058188094e-06, + "loss": 0.75335652, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.33227539, + "step": 930, + "time_per_iteration": 2.9270308017730713 + }, + { + "auxiliary_loss_clip": 0.01654073, + "auxiliary_loss_mlp": 0.01084342, + "balance_loss_clip": 1.38789785, + "balance_loss_mlp": 1.05098736, + "epoch": 0.055974748233879455, + "flos": 17935359480720.0, + "grad_norm": 4.760809114022739, + "language_loss": 0.8629092, + "learning_rate": 3.992927371636776e-06, + "loss": 0.8902933, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.33349609, + "step": 931, + "time_per_iteration": 2.8229799270629883 + }, + { + "auxiliary_loss_clip": 0.01648616, + "auxiliary_loss_mlp": 0.01088253, + "balance_loss_clip": 1.38278091, + "balance_loss_mlp": 1.05494571, + "epoch": 0.05603487148654742, + "flos": 24026985680760.0, + "grad_norm": 1.6735389219739891, + "language_loss": 0.8432045, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.87057316, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.33276367, + "step": 932, + "time_per_iteration": 2.800485610961914 + }, + { + "auxiliary_loss_clip": 0.0165603, + "auxiliary_loss_mlp": 0.01093289, + "balance_loss_clip": 1.39031839, + "balance_loss_mlp": 1.06019616, + "epoch": 0.05609499473921539, + "flos": 17311781926440.0, + "grad_norm": 2.0004800358652046, + "language_loss": 0.74606681, + "learning_rate": 3.992861771819365e-06, + "loss": 0.77355993, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.33081055, + "step": 933, + "time_per_iteration": 2.8250110149383545 + }, + { + "auxiliary_loss_clip": 0.01639508, + "auxiliary_loss_mlp": 0.01085444, + "balance_loss_clip": 1.37551689, + "balance_loss_mlp": 1.05435419, + "epoch": 0.05615511799188336, + "flos": 20999498764680.0, + "grad_norm": 2.221522705768895, + "language_loss": 0.8763119, + "learning_rate": 3.99282885855576e-06, + "loss": 0.90356147, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.31103516, + "step": 934, + "time_per_iteration": 2.8330516815185547 + }, + { + "auxiliary_loss_clip": 0.01642441, + "auxiliary_loss_mlp": 0.01084162, + "balance_loss_clip": 1.38644242, + "balance_loss_mlp": 1.05326247, + "epoch": 0.05621524124455133, + "flos": 17277850318680.0, + "grad_norm": 2.417504022110678, + "language_loss": 0.81114209, + "learning_rate": 3.992795869723885e-06, + "loss": 0.83840823, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30883789, + "step": 935, + "time_per_iteration": 2.776341199874878 + }, + { + "auxiliary_loss_clip": 0.01476213, + "auxiliary_loss_mlp": 0.01015971, + "balance_loss_clip": 1.30669391, + "balance_loss_mlp": 1.00586247, + "epoch": 0.0562753644972193, + "flos": 58733600275800.0, + "grad_norm": 0.8122382622839404, + "language_loss": 0.6908015, + "learning_rate": 3.99276280532499e-06, + "loss": 0.7157234, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.10107422, + "step": 936, + "time_per_iteration": 3.1701266765594482 + }, + { + "auxiliary_loss_clip": 0.01659184, + "auxiliary_loss_mlp": 0.01087183, + "balance_loss_clip": 1.39264262, + "balance_loss_mlp": 1.05368495, + "epoch": 0.05633548774988727, + "flos": 17461518241680.0, + "grad_norm": 1.9811414920891177, + "language_loss": 0.77074766, + "learning_rate": 3.992729665360331e-06, + "loss": 0.79821134, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.33520508, + "step": 937, + "time_per_iteration": 2.810732364654541 + }, + { + "auxiliary_loss_clip": 0.01470443, + "auxiliary_loss_mlp": 0.01009753, + "balance_loss_clip": 1.30190754, + "balance_loss_mlp": 0.99964428, + "epoch": 0.05639561100255524, + "flos": 70671331574640.0, + "grad_norm": 0.852169748705517, + "language_loss": 0.64349669, + "learning_rate": 3.992696449831162e-06, + "loss": 0.6682986, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.10107422, + "step": 938, + "time_per_iteration": 3.2702512741088867 + }, + { + "auxiliary_loss_clip": 0.01672321, + "auxiliary_loss_mlp": 0.01083203, + "balance_loss_clip": 1.39942741, + "balance_loss_mlp": 1.04958558, + "epoch": 0.056455734255223204, + "flos": 20490954359040.0, + "grad_norm": 2.1539206293249378, + "language_loss": 0.80190957, + "learning_rate": 3.992663158738745e-06, + "loss": 0.82946479, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.33618164, + "step": 939, + "time_per_iteration": 2.8276853561401367 + }, + { + "auxiliary_loss_clip": 0.01648675, + "auxiliary_loss_mlp": 0.01089634, + "balance_loss_clip": 1.38710356, + "balance_loss_mlp": 1.05859184, + "epoch": 0.056515857507891176, + "flos": 22058559639000.0, + "grad_norm": 1.5848521720548931, + "language_loss": 0.74576378, + "learning_rate": 3.992629792084341e-06, + "loss": 0.77314687, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.31054688, + "step": 940, + "time_per_iteration": 2.8307952880859375 + }, + { + "auxiliary_loss_clip": 0.0164629, + "auxiliary_loss_mlp": 0.01087878, + "balance_loss_clip": 1.38465965, + "balance_loss_mlp": 1.05432105, + "epoch": 0.05657598076055915, + "flos": 24030762258240.0, + "grad_norm": 2.103678762766912, + "language_loss": 0.72511172, + "learning_rate": 3.992596349869216e-06, + "loss": 0.75245339, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.33557129, + "step": 941, + "time_per_iteration": 2.828629493713379 + }, + { + "auxiliary_loss_clip": 0.01644674, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_clip": 1.38281178, + "balance_loss_mlp": 1.05380321, + "epoch": 0.05663610401322711, + "flos": 20484863105040.0, + "grad_norm": 1.7832368476523603, + "language_loss": 0.81010365, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83740449, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.31640625, + "step": 942, + "time_per_iteration": 2.828335762023926 + }, + { + "auxiliary_loss_clip": 0.01645344, + "auxiliary_loss_mlp": 0.01096646, + "balance_loss_clip": 1.38231826, + "balance_loss_mlp": 1.06319594, + "epoch": 0.056696227265895086, + "flos": 21073981142880.0, + "grad_norm": 2.0296337705347067, + "language_loss": 0.89674711, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.92416704, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.3347168, + "step": 943, + "time_per_iteration": 2.857083320617676 + }, + { + "auxiliary_loss_clip": 0.01648481, + "auxiliary_loss_mlp": 0.01078027, + "balance_loss_clip": 1.38684535, + "balance_loss_mlp": 1.04772365, + "epoch": 0.05675635051856306, + "flos": 17825767852320.0, + "grad_norm": 2.533534315261512, + "language_loss": 0.76141542, + "learning_rate": 3.992495569872206e-06, + "loss": 0.78868049, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.30322266, + "step": 944, + "time_per_iteration": 2.791524887084961 + }, + { + "auxiliary_loss_clip": 0.01643612, + "auxiliary_loss_mlp": 0.01086766, + "balance_loss_clip": 1.38226187, + "balance_loss_mlp": 1.0569278, + "epoch": 0.05681647377123102, + "flos": 23120503706880.0, + "grad_norm": 1.830155338228621, + "language_loss": 0.80233586, + "learning_rate": 3.992461825426906e-06, + "loss": 0.82963961, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.2980957, + "step": 945, + "time_per_iteration": 2.889770746231079 + }, + { + "auxiliary_loss_clip": 0.01653606, + "auxiliary_loss_mlp": 0.01081857, + "balance_loss_clip": 1.39138556, + "balance_loss_mlp": 1.05112433, + "epoch": 0.056876597023898995, + "flos": 16074819516240.0, + "grad_norm": 3.2848502416166783, + "language_loss": 0.83252984, + "learning_rate": 3.992428005427252e-06, + "loss": 0.85988444, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.30761719, + "step": 946, + "time_per_iteration": 4.315656900405884 + }, + { + "auxiliary_loss_clip": 0.01664845, + "auxiliary_loss_mlp": 0.01088686, + "balance_loss_clip": 1.39555538, + "balance_loss_mlp": 1.05516398, + "epoch": 0.05693672027656696, + "flos": 16839889888680.0, + "grad_norm": 7.742832811214825, + "language_loss": 0.80512398, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8326593, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.33544922, + "step": 947, + "time_per_iteration": 2.8151955604553223 + }, + { + "auxiliary_loss_clip": 0.01661094, + "auxiliary_loss_mlp": 0.01080115, + "balance_loss_clip": 1.39208913, + "balance_loss_mlp": 1.04814339, + "epoch": 0.05699684352923493, + "flos": 21393010021680.0, + "grad_norm": 2.729670375168216, + "language_loss": 0.86241639, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88982844, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.31982422, + "step": 948, + "time_per_iteration": 6.247646808624268 + }, + { + "auxiliary_loss_clip": 0.01639543, + "auxiliary_loss_mlp": 0.01077193, + "balance_loss_clip": 1.37721562, + "balance_loss_mlp": 1.04317069, + "epoch": 0.057056966781902904, + "flos": 15564610167840.0, + "grad_norm": 1.7265630698780015, + "language_loss": 0.88077807, + "learning_rate": 3.992326092115019e-06, + "loss": 0.90794539, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.34033203, + "step": 949, + "time_per_iteration": 4.212140321731567 + }, + { + "auxiliary_loss_clip": 0.01645541, + "auxiliary_loss_mlp": 0.01089159, + "balance_loss_clip": 1.38734007, + "balance_loss_mlp": 1.05601871, + "epoch": 0.05711709003457087, + "flos": 19942630741800.0, + "grad_norm": 2.009529348305162, + "language_loss": 0.79802889, + "learning_rate": 3.992291969910811e-06, + "loss": 0.82537585, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.33105469, + "step": 950, + "time_per_iteration": 2.8238117694854736 + }, + { + "auxiliary_loss_clip": 0.01661506, + "auxiliary_loss_mlp": 0.01079904, + "balance_loss_clip": 1.39457715, + "balance_loss_mlp": 1.04833722, + "epoch": 0.05717721328723884, + "flos": 30338363654640.0, + "grad_norm": 2.5027763742359856, + "language_loss": 0.83141255, + "learning_rate": 3.992257772158691e-06, + "loss": 0.85882664, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.31555176, + "step": 951, + "time_per_iteration": 2.862776517868042 + }, + { + "auxiliary_loss_clip": 0.01651708, + "auxiliary_loss_mlp": 0.01073305, + "balance_loss_clip": 1.38709915, + "balance_loss_mlp": 1.04054594, + "epoch": 0.05723733653990681, + "flos": 23658959492640.0, + "grad_norm": 2.470527025655416, + "language_loss": 0.87851202, + "learning_rate": 3.992223498859958e-06, + "loss": 0.9057622, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.32763672, + "step": 952, + "time_per_iteration": 2.8727309703826904 + }, + { + "auxiliary_loss_clip": 0.01675653, + "auxiliary_loss_mlp": 0.01083298, + "balance_loss_clip": 1.40369928, + "balance_loss_mlp": 1.04472208, + "epoch": 0.05729745979257478, + "flos": 22061036748960.0, + "grad_norm": 1.856866725501511, + "language_loss": 0.80113292, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.82872248, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.38549805, + "step": 953, + "time_per_iteration": 2.899303913116455 + }, + { + "auxiliary_loss_clip": 0.01647642, + "auxiliary_loss_mlp": 0.01081862, + "balance_loss_clip": 1.38354945, + "balance_loss_mlp": 1.047387, + "epoch": 0.05735758304524275, + "flos": 19608414336360.0, + "grad_norm": 1.8063444493728065, + "language_loss": 0.8790499, + "learning_rate": 3.992154725627848e-06, + "loss": 0.90634495, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.3449707, + "step": 954, + "time_per_iteration": 2.848426103591919 + }, + { + "auxiliary_loss_clip": 0.01651082, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_clip": 1.38674009, + "balance_loss_mlp": 1.03786993, + "epoch": 0.057417706297910716, + "flos": 19103727724920.0, + "grad_norm": 2.419190087821412, + "language_loss": 0.89473081, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.92193431, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.31396484, + "step": 955, + "time_per_iteration": 2.7798922061920166 + }, + { + "auxiliary_loss_clip": 0.01647301, + "auxiliary_loss_mlp": 0.01088142, + "balance_loss_clip": 1.38530183, + "balance_loss_mlp": 1.0550015, + "epoch": 0.05747782955057869, + "flos": 16658902117440.0, + "grad_norm": 2.374127197520338, + "language_loss": 0.89738989, + "learning_rate": 3.992085650224914e-06, + "loss": 0.92474437, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.33129883, + "step": 956, + "time_per_iteration": 2.749410390853882 + }, + { + "auxiliary_loss_clip": 0.01637054, + "auxiliary_loss_mlp": 0.01071971, + "balance_loss_clip": 1.3833847, + "balance_loss_mlp": 1.03849673, + "epoch": 0.05753795280324665, + "flos": 14506199027280.0, + "grad_norm": 1.8952452640682884, + "language_loss": 0.75898242, + "learning_rate": 3.99205099921266e-06, + "loss": 0.78607261, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.3347168, + "step": 957, + "time_per_iteration": 2.8173463344573975 + }, + { + "auxiliary_loss_clip": 0.01647562, + "auxiliary_loss_mlp": 0.01087356, + "balance_loss_clip": 1.38489175, + "balance_loss_mlp": 1.05290496, + "epoch": 0.057598076055914625, + "flos": 18080750701440.0, + "grad_norm": 2.0085668379279906, + "language_loss": 0.80512893, + "learning_rate": 3.992016272661633e-06, + "loss": 0.83247817, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.34423828, + "step": 958, + "time_per_iteration": 2.774944305419922 + }, + { + "auxiliary_loss_clip": 0.01647453, + "auxiliary_loss_mlp": 0.01074939, + "balance_loss_clip": 1.38516498, + "balance_loss_mlp": 1.04451692, + "epoch": 0.0576581993085826, + "flos": 22129346656440.0, + "grad_norm": 2.766837709208571, + "language_loss": 0.89173561, + "learning_rate": 3.99198147057315e-06, + "loss": 0.9189595, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.30444336, + "step": 959, + "time_per_iteration": 2.7985122203826904 + }, + { + "auxiliary_loss_clip": 0.01632032, + "auxiliary_loss_mlp": 0.01070529, + "balance_loss_clip": 1.37553573, + "balance_loss_mlp": 1.04003513, + "epoch": 0.05771832256125056, + "flos": 33188192685000.0, + "grad_norm": 2.0120789620164605, + "language_loss": 0.79286861, + "learning_rate": 3.991946592948529e-06, + "loss": 0.81989419, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30493164, + "step": 960, + "time_per_iteration": 2.9159858226776123 + }, + { + "auxiliary_loss_clip": 0.0164368, + "auxiliary_loss_mlp": 0.010811, + "balance_loss_clip": 1.38001263, + "balance_loss_mlp": 1.04583824, + "epoch": 0.057778445813918534, + "flos": 24175463136840.0, + "grad_norm": 2.2503586313274315, + "language_loss": 0.94030607, + "learning_rate": 3.991911639789094e-06, + "loss": 0.96755385, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.35253906, + "step": 961, + "time_per_iteration": 2.811948537826538 + }, + { + "auxiliary_loss_clip": 0.01652354, + "auxiliary_loss_mlp": 0.01079997, + "balance_loss_clip": 1.38774467, + "balance_loss_mlp": 1.04688048, + "epoch": 0.0578385690665865, + "flos": 29649075127920.0, + "grad_norm": 2.4379793589264085, + "language_loss": 0.69097167, + "learning_rate": 3.991876611096169e-06, + "loss": 0.71829516, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.33105469, + "step": 962, + "time_per_iteration": 2.8749783039093018 + }, + { + "auxiliary_loss_clip": 0.01642189, + "auxiliary_loss_mlp": 0.0107413, + "balance_loss_clip": 1.37989855, + "balance_loss_mlp": 1.042539, + "epoch": 0.05789869231925447, + "flos": 20890191394800.0, + "grad_norm": 2.1291907345920835, + "language_loss": 0.89380711, + "learning_rate": 3.991841506871084e-06, + "loss": 0.92097032, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.31591797, + "step": 963, + "time_per_iteration": 2.7559309005737305 + }, + { + "auxiliary_loss_clip": 0.01659854, + "auxiliary_loss_mlp": 0.0106588, + "balance_loss_clip": 1.39539266, + "balance_loss_mlp": 1.03328824, + "epoch": 0.057958815571922444, + "flos": 26036693443440.0, + "grad_norm": 2.432218013015896, + "language_loss": 0.85723364, + "learning_rate": 3.99180632711517e-06, + "loss": 0.88449097, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.32592773, + "step": 964, + "time_per_iteration": 2.892508029937744 + }, + { + "auxiliary_loss_clip": 0.01654786, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_clip": 1.39160609, + "balance_loss_mlp": 1.04427958, + "epoch": 0.05801893882459041, + "flos": 18082293819120.0, + "grad_norm": 2.771213061750279, + "language_loss": 0.77783954, + "learning_rate": 3.99177107182976e-06, + "loss": 0.8051616, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.33129883, + "step": 965, + "time_per_iteration": 2.820509672164917 + }, + { + "auxiliary_loss_clip": 0.01643674, + "auxiliary_loss_mlp": 0.01072417, + "balance_loss_clip": 1.38482118, + "balance_loss_mlp": 1.03956246, + "epoch": 0.05807906207725838, + "flos": 17753397108840.0, + "grad_norm": 1.8661563545711843, + "language_loss": 0.82192969, + "learning_rate": 3.99173574101619e-06, + "loss": 0.84909052, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.32861328, + "step": 966, + "time_per_iteration": 2.9415078163146973 + }, + { + "auxiliary_loss_clip": 0.01656268, + "auxiliary_loss_mlp": 0.01071093, + "balance_loss_clip": 1.39544129, + "balance_loss_mlp": 1.0404799, + "epoch": 0.058139185329926346, + "flos": 18044869892400.0, + "grad_norm": 2.009901619572653, + "language_loss": 0.76996028, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.79723394, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.30615234, + "step": 967, + "time_per_iteration": 2.839970350265503 + }, + { + "auxiliary_loss_clip": 0.01474096, + "auxiliary_loss_mlp": 0.01016154, + "balance_loss_clip": 1.30615783, + "balance_loss_mlp": 1.00175369, + "epoch": 0.05819930858259432, + "flos": 62377395849720.0, + "grad_norm": 0.785405866980449, + "language_loss": 0.57312727, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59802973, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.14355469, + "step": 968, + "time_per_iteration": 3.224307060241699 + }, + { + "auxiliary_loss_clip": 0.01649967, + "auxiliary_loss_mlp": 0.01068416, + "balance_loss_clip": 1.38746667, + "balance_loss_mlp": 1.03599107, + "epoch": 0.05825943183526229, + "flos": 19140055225920.0, + "grad_norm": 2.3653710340661114, + "language_loss": 0.82852536, + "learning_rate": 3.991629295419945e-06, + "loss": 0.8557092, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.32421875, + "step": 969, + "time_per_iteration": 2.8363454341888428 + }, + { + "auxiliary_loss_clip": 0.01653754, + "auxiliary_loss_mlp": 0.01074063, + "balance_loss_clip": 1.3886025, + "balance_loss_mlp": 1.04325914, + "epoch": 0.058319555087930255, + "flos": 29028380767200.0, + "grad_norm": 2.153423075392471, + "language_loss": 0.78625929, + "learning_rate": 3.991593662507167e-06, + "loss": 0.81353742, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.30773926, + "step": 970, + "time_per_iteration": 2.846461772918701 + }, + { + "auxiliary_loss_clip": 0.01648993, + "auxiliary_loss_mlp": 0.01071103, + "balance_loss_clip": 1.38670719, + "balance_loss_mlp": 1.03922665, + "epoch": 0.05837967834059823, + "flos": 18884666293200.0, + "grad_norm": 2.5367305106239733, + "language_loss": 0.92966199, + "learning_rate": 3.991557954072958e-06, + "loss": 0.95686293, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.31884766, + "step": 971, + "time_per_iteration": 2.8036675453186035 + }, + { + "auxiliary_loss_clip": 0.01643341, + "auxiliary_loss_mlp": 0.01066209, + "balance_loss_clip": 1.38181305, + "balance_loss_mlp": 1.03566778, + "epoch": 0.05843980159326619, + "flos": 25708649508720.0, + "grad_norm": 1.6928218213487394, + "language_loss": 0.86259627, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88969183, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.30517578, + "step": 972, + "time_per_iteration": 2.8315834999084473 + }, + { + "auxiliary_loss_clip": 0.01653169, + "auxiliary_loss_mlp": 0.01087528, + "balance_loss_clip": 1.3933692, + "balance_loss_mlp": 1.0571537, + "epoch": 0.058499924845934165, + "flos": 25557045208920.0, + "grad_norm": 2.116146721183934, + "language_loss": 0.87680918, + "learning_rate": 3.991486310645667e-06, + "loss": 0.90421617, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.30371094, + "step": 973, + "time_per_iteration": 2.865337610244751 + }, + { + "auxiliary_loss_clip": 0.0164066, + "auxiliary_loss_mlp": 0.01077739, + "balance_loss_clip": 1.38417578, + "balance_loss_mlp": 1.04550433, + "epoch": 0.05856004809860214, + "flos": 16440612244560.0, + "grad_norm": 1.9386260435903202, + "language_loss": 0.75642383, + "learning_rate": 3.991450375655301e-06, + "loss": 0.7836079, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.32250977, + "step": 974, + "time_per_iteration": 2.8021011352539062 + }, + { + "auxiliary_loss_clip": 0.0163556, + "auxiliary_loss_mlp": 0.01070393, + "balance_loss_clip": 1.37674212, + "balance_loss_mlp": 1.03973222, + "epoch": 0.0586201713512701, + "flos": 39466288784880.0, + "grad_norm": 1.5463280597784428, + "language_loss": 0.77261645, + "learning_rate": 3.991414365148936e-06, + "loss": 0.79967594, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.30664062, + "step": 975, + "time_per_iteration": 2.9419105052948 + }, + { + "auxiliary_loss_clip": 0.01640398, + "auxiliary_loss_mlp": 0.01072525, + "balance_loss_clip": 1.37758517, + "balance_loss_mlp": 1.04241276, + "epoch": 0.058680294603938074, + "flos": 23370085644120.0, + "grad_norm": 2.0506451591170047, + "language_loss": 0.76916742, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79629672, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.30102539, + "step": 976, + "time_per_iteration": 2.829138994216919 + }, + { + "auxiliary_loss_clip": 0.01636551, + "auxiliary_loss_mlp": 0.01080188, + "balance_loss_clip": 1.3771559, + "balance_loss_mlp": 1.05046892, + "epoch": 0.05874041785660604, + "flos": 32238195530400.0, + "grad_norm": 1.8054807915756272, + "language_loss": 0.87936312, + "learning_rate": 3.991342117593679e-06, + "loss": 0.9065305, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.29724121, + "step": 977, + "time_per_iteration": 2.874281406402588 + }, + { + "auxiliary_loss_clip": 0.01641966, + "auxiliary_loss_mlp": 0.01073317, + "balance_loss_clip": 1.38375878, + "balance_loss_mlp": 1.04201269, + "epoch": 0.05880054110927401, + "flos": 22315207430880.0, + "grad_norm": 1.5805340198514288, + "language_loss": 0.79929197, + "learning_rate": 3.991305880547527e-06, + "loss": 0.82644475, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.31274414, + "step": 978, + "time_per_iteration": 2.8341376781463623 + }, + { + "auxiliary_loss_clip": 0.01648424, + "auxiliary_loss_mlp": 0.01087777, + "balance_loss_clip": 1.38261402, + "balance_loss_mlp": 1.05427957, + "epoch": 0.05886066436194198, + "flos": 27386130675600.0, + "grad_norm": 1.828290406991551, + "language_loss": 0.81212342, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83948541, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.33496094, + "step": 979, + "time_per_iteration": 2.8596227169036865 + }, + { + "auxiliary_loss_clip": 0.01455706, + "auxiliary_loss_mlp": 0.01024881, + "balance_loss_clip": 1.28023016, + "balance_loss_mlp": 1.0127213, + "epoch": 0.05892078761460995, + "flos": 59597460452880.0, + "grad_norm": 0.91724616082087, + "language_loss": 0.59014928, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61495519, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.12158203, + "step": 980, + "time_per_iteration": 3.2086358070373535 + }, + { + "auxiliary_loss_clip": 0.01628391, + "auxiliary_loss_mlp": 0.01071588, + "balance_loss_clip": 1.37529767, + "balance_loss_mlp": 1.04159522, + "epoch": 0.05898091086727792, + "flos": 15418325563200.0, + "grad_norm": 2.096181599524345, + "language_loss": 0.87398773, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.90098757, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.29968262, + "step": 981, + "time_per_iteration": 2.862734317779541 + }, + { + "auxiliary_loss_clip": 0.01642483, + "auxiliary_loss_mlp": 0.01066844, + "balance_loss_clip": 1.38204861, + "balance_loss_mlp": 1.0375185, + "epoch": 0.059041034119945886, + "flos": 23660055918360.0, + "grad_norm": 2.07095488249618, + "language_loss": 0.79587853, + "learning_rate": 3.991160177271513e-06, + "loss": 0.82297182, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.29321289, + "step": 982, + "time_per_iteration": 2.816887378692627 + }, + { + "auxiliary_loss_clip": 0.01654766, + "auxiliary_loss_mlp": 0.01074358, + "balance_loss_clip": 1.38728607, + "balance_loss_mlp": 1.04362619, + "epoch": 0.05910115737261386, + "flos": 24759505129680.0, + "grad_norm": 2.225419700497755, + "language_loss": 0.8490988, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.8763901, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.30737305, + "step": 983, + "time_per_iteration": 2.8748226165771484 + }, + { + "auxiliary_loss_clip": 0.01632833, + "auxiliary_loss_mlp": 0.01072851, + "balance_loss_clip": 1.37571049, + "balance_loss_mlp": 1.04157019, + "epoch": 0.05916128062528183, + "flos": 11732476709520.0, + "grad_norm": 1.770142055763222, + "language_loss": 0.85058272, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.87763959, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.31262207, + "step": 984, + "time_per_iteration": 2.77239727973938 + }, + { + "auxiliary_loss_clip": 0.01620592, + "auxiliary_loss_mlp": 0.01058701, + "balance_loss_clip": 1.36688447, + "balance_loss_mlp": 1.02930427, + "epoch": 0.059221403877949795, + "flos": 21907361422800.0, + "grad_norm": 2.4462466285105693, + "language_loss": 0.77319622, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79998916, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.29382324, + "step": 985, + "time_per_iteration": 2.902536392211914 + }, + { + "auxiliary_loss_clip": 0.01638407, + "auxiliary_loss_mlp": 0.01081645, + "balance_loss_clip": 1.37712359, + "balance_loss_mlp": 1.04526234, + "epoch": 0.05928152713061777, + "flos": 20518510454280.0, + "grad_norm": 3.239431706304938, + "language_loss": 0.91097271, + "learning_rate": 3.991013265915661e-06, + "loss": 0.93817329, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.36376953, + "step": 986, + "time_per_iteration": 4.315902233123779 + }, + { + "auxiliary_loss_clip": 0.01640023, + "auxiliary_loss_mlp": 0.01074662, + "balance_loss_clip": 1.3769989, + "balance_loss_mlp": 1.03982925, + "epoch": 0.05934165038328574, + "flos": 24500055360960.0, + "grad_norm": 2.2884536079900406, + "language_loss": 0.76474404, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.79189086, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.34863281, + "step": 987, + "time_per_iteration": 4.803138732910156 + }, + { + "auxiliary_loss_clip": 0.0164344, + "auxiliary_loss_mlp": 0.01062357, + "balance_loss_clip": 1.377985, + "balance_loss_mlp": 1.03021801, + "epoch": 0.059401773635953704, + "flos": 38735271845280.0, + "grad_norm": 2.0522980962195803, + "language_loss": 0.72793806, + "learning_rate": 3.990939357235621e-06, + "loss": 0.75499606, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.3215332, + "step": 988, + "time_per_iteration": 2.931974172592163 + }, + { + "auxiliary_loss_clip": 0.01435071, + "auxiliary_loss_mlp": 0.01010265, + "balance_loss_clip": 1.26134193, + "balance_loss_mlp": 0.99891645, + "epoch": 0.059461896888621676, + "flos": 58037367719520.0, + "grad_norm": 0.9305016252142301, + "language_loss": 0.71117431, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73562765, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.11328125, + "step": 989, + "time_per_iteration": 4.571424722671509 + }, + { + "auxiliary_loss_clip": 0.01639244, + "auxiliary_loss_mlp": 0.01073781, + "balance_loss_clip": 1.37471461, + "balance_loss_mlp": 1.04123676, + "epoch": 0.05952202014128964, + "flos": 22132920192120.0, + "grad_norm": 3.635885437770474, + "language_loss": 0.79694891, + "learning_rate": 3.990865146569105e-06, + "loss": 0.82407916, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.32543945, + "step": 990, + "time_per_iteration": 2.980736255645752 + }, + { + "auxiliary_loss_clip": 0.01625388, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.366225, + "balance_loss_mlp": 1.03040671, + "epoch": 0.059582143393957614, + "flos": 20450241155160.0, + "grad_norm": 1.9551804505956714, + "language_loss": 0.87022096, + "learning_rate": 3.990827927994434e-06, + "loss": 0.89710552, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.32641602, + "step": 991, + "time_per_iteration": 2.795100450515747 + }, + { + "auxiliary_loss_clip": 0.01642445, + "auxiliary_loss_mlp": 0.01074348, + "balance_loss_clip": 1.37687242, + "balance_loss_mlp": 1.04275727, + "epoch": 0.059642266646625586, + "flos": 20599774428600.0, + "grad_norm": 1.8905926409204141, + "language_loss": 0.78196341, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.80913138, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.31567383, + "step": 992, + "time_per_iteration": 2.805495500564575 + }, + { + "auxiliary_loss_clip": 0.01642928, + "auxiliary_loss_mlp": 0.0107073, + "balance_loss_clip": 1.38240516, + "balance_loss_mlp": 1.0400455, + "epoch": 0.05970238989929355, + "flos": 19357167456360.0, + "grad_norm": 2.4508493396358335, + "language_loss": 0.75551474, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.78265131, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.30712891, + "step": 993, + "time_per_iteration": 2.7757246494293213 + }, + { + "auxiliary_loss_clip": 0.01632463, + "auxiliary_loss_mlp": 0.01065871, + "balance_loss_clip": 1.37346649, + "balance_loss_mlp": 1.03336239, + "epoch": 0.05976251315196152, + "flos": 30270256788960.0, + "grad_norm": 1.8173761615551949, + "language_loss": 0.79049468, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81747806, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.32507324, + "step": 994, + "time_per_iteration": 2.8814611434936523 + }, + { + "auxiliary_loss_clip": 0.01635085, + "auxiliary_loss_mlp": 0.01081386, + "balance_loss_clip": 1.37380815, + "balance_loss_mlp": 1.04974806, + "epoch": 0.05982263640462949, + "flos": 23190356732040.0, + "grad_norm": 2.386576081147415, + "language_loss": 0.80392516, + "learning_rate": 3.99067829878596e-06, + "loss": 0.83108985, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.31591797, + "step": 995, + "time_per_iteration": 2.826805830001831 + }, + { + "auxiliary_loss_clip": 0.01634881, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_clip": 1.37536907, + "balance_loss_mlp": 1.03704762, + "epoch": 0.05988275965729746, + "flos": 27856114120440.0, + "grad_norm": 1.95419123562763, + "language_loss": 0.87213916, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89917266, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.31445312, + "step": 996, + "time_per_iteration": 2.831880569458008 + }, + { + "auxiliary_loss_clip": 0.01630459, + "auxiliary_loss_mlp": 0.01072816, + "balance_loss_clip": 1.36987209, + "balance_loss_mlp": 1.03874588, + "epoch": 0.05994288290996543, + "flos": 24685266401640.0, + "grad_norm": 3.879615808050045, + "language_loss": 0.89860463, + "learning_rate": 3.990603031255718e-06, + "loss": 0.92563736, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.34082031, + "step": 997, + "time_per_iteration": 2.824367046356201 + }, + { + "auxiliary_loss_clip": 0.01426632, + "auxiliary_loss_mlp": 0.01012858, + "balance_loss_clip": 1.25000477, + "balance_loss_mlp": 1.00198615, + "epoch": 0.0600030061626334, + "flos": 69946446497400.0, + "grad_norm": 1.0071700576963865, + "language_loss": 0.75397885, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77837372, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.10888672, + "step": 998, + "time_per_iteration": 3.398818016052246 + }, + { + "auxiliary_loss_clip": 0.01624849, + "auxiliary_loss_mlp": 0.01067927, + "balance_loss_clip": 1.37030244, + "balance_loss_mlp": 1.03769505, + "epoch": 0.06006312941530137, + "flos": 26545562715960.0, + "grad_norm": 1.8125500113877384, + "language_loss": 0.75951099, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78643882, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.30224609, + "step": 999, + "time_per_iteration": 2.838785409927368 + }, + { + "auxiliary_loss_clip": 0.01637834, + "auxiliary_loss_mlp": 0.01069573, + "balance_loss_clip": 1.37478149, + "balance_loss_mlp": 1.03918672, + "epoch": 0.060123252667969335, + "flos": 27349681349520.0, + "grad_norm": 2.685611889946991, + "language_loss": 0.8297624, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85683644, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30383301, + "step": 1000, + "time_per_iteration": 2.849569082260132 + }, + { + "auxiliary_loss_clip": 0.01632811, + "auxiliary_loss_mlp": 0.01068711, + "balance_loss_clip": 1.37273157, + "balance_loss_mlp": 1.03843164, + "epoch": 0.06018337592063731, + "flos": 27022490190360.0, + "grad_norm": 2.234364796310801, + "language_loss": 0.86592305, + "learning_rate": 3.990451590400309e-06, + "loss": 0.89293826, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.30249023, + "step": 1001, + "time_per_iteration": 2.880573272705078 + }, + { + "auxiliary_loss_clip": 0.01626914, + "auxiliary_loss_mlp": 0.01074202, + "balance_loss_clip": 1.3696301, + "balance_loss_mlp": 1.04468584, + "epoch": 0.06024349917330528, + "flos": 25598448754920.0, + "grad_norm": 2.163111988645698, + "language_loss": 0.74881864, + "learning_rate": 3.990413541487551e-06, + "loss": 0.77582979, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.29504395, + "step": 1002, + "time_per_iteration": 2.892428159713745 + }, + { + "auxiliary_loss_clip": 0.01635635, + "auxiliary_loss_mlp": 0.01074655, + "balance_loss_clip": 1.37589645, + "balance_loss_mlp": 1.0454489, + "epoch": 0.060303622425973244, + "flos": 26137919749680.0, + "grad_norm": 2.3845542009924223, + "language_loss": 0.77297592, + "learning_rate": 3.990375417098112e-06, + "loss": 0.80007887, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.29199219, + "step": 1003, + "time_per_iteration": 2.8480660915374756 + }, + { + "auxiliary_loss_clip": 0.01637032, + "auxiliary_loss_mlp": 0.0107741, + "balance_loss_clip": 1.37464368, + "balance_loss_mlp": 1.04643965, + "epoch": 0.060363745678641216, + "flos": 20382215506200.0, + "grad_norm": 2.083561594042578, + "language_loss": 0.70407659, + "learning_rate": 3.990337217233437e-06, + "loss": 0.73122102, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.30957031, + "step": 1004, + "time_per_iteration": 2.849156141281128 + }, + { + "auxiliary_loss_clip": 0.0164625, + "auxiliary_loss_mlp": 0.01078285, + "balance_loss_clip": 1.38124752, + "balance_loss_mlp": 1.04788637, + "epoch": 0.06042386893130918, + "flos": 17753721975720.0, + "grad_norm": 2.4602307147275297, + "language_loss": 0.84137917, + "learning_rate": 3.990298941894976e-06, + "loss": 0.86862445, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.30395508, + "step": 1005, + "time_per_iteration": 2.875619649887085 + }, + { + "auxiliary_loss_clip": 0.01426796, + "auxiliary_loss_mlp": 0.01012663, + "balance_loss_clip": 1.250283, + "balance_loss_mlp": 1.00217271, + "epoch": 0.06048399218397715, + "flos": 68554590510240.0, + "grad_norm": 0.9329104348828603, + "language_loss": 0.59044385, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61483848, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.10498047, + "step": 1006, + "time_per_iteration": 3.3740406036376953 + }, + { + "auxiliary_loss_clip": 0.01632772, + "auxiliary_loss_mlp": 0.01069842, + "balance_loss_clip": 1.37036395, + "balance_loss_mlp": 1.03913414, + "epoch": 0.060544115436645125, + "flos": 23263783292880.0, + "grad_norm": 4.341993534942644, + "language_loss": 0.7506218, + "learning_rate": 3.990222164802503e-06, + "loss": 0.77764791, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30700684, + "step": 1007, + "time_per_iteration": 2.858966827392578 + }, + { + "auxiliary_loss_clip": 0.01642721, + "auxiliary_loss_mlp": 0.01070078, + "balance_loss_clip": 1.38019466, + "balance_loss_mlp": 1.0373435, + "epoch": 0.06060423868931309, + "flos": 23883746703120.0, + "grad_norm": 2.2246002770551705, + "language_loss": 0.81323695, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.84036499, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.32739258, + "step": 1008, + "time_per_iteration": 2.8616554737091064 + }, + { + "auxiliary_loss_clip": 0.01627807, + "auxiliary_loss_mlp": 0.01067933, + "balance_loss_clip": 1.37222242, + "balance_loss_mlp": 1.03727257, + "epoch": 0.06066436194198106, + "flos": 18731924959320.0, + "grad_norm": 1.8587867984983264, + "language_loss": 0.78405917, + "learning_rate": 3.990145085832335e-06, + "loss": 0.8110165, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.30664062, + "step": 1009, + "time_per_iteration": 2.911329984664917 + }, + { + "auxiliary_loss_clip": 0.01618854, + "auxiliary_loss_mlp": 0.01060393, + "balance_loss_clip": 1.36627805, + "balance_loss_mlp": 1.03214097, + "epoch": 0.06072448519464903, + "flos": 24645324756600.0, + "grad_norm": 1.7846230730557644, + "language_loss": 0.93553817, + "learning_rate": 3.990106433146769e-06, + "loss": 0.96233064, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.28234863, + "step": 1010, + "time_per_iteration": 2.904902935028076 + }, + { + "auxiliary_loss_clip": 0.01637695, + "auxiliary_loss_mlp": 0.01071726, + "balance_loss_clip": 1.36971498, + "balance_loss_mlp": 1.03994441, + "epoch": 0.060784608447317, + "flos": 17382609552240.0, + "grad_norm": 2.4519099066542025, + "language_loss": 0.73042428, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.75751847, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.31762695, + "step": 1011, + "time_per_iteration": 2.8150954246520996 + }, + { + "auxiliary_loss_clip": 0.01631043, + "auxiliary_loss_mlp": 0.010751, + "balance_loss_clip": 1.37132359, + "balance_loss_mlp": 1.0424124, + "epoch": 0.06084473169998497, + "flos": 23696992544760.0, + "grad_norm": 1.7898635654224306, + "language_loss": 0.87951452, + "learning_rate": 3.990028901381999e-06, + "loss": 0.90657598, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.32702637, + "step": 1012, + "time_per_iteration": 2.852555990219116 + }, + { + "auxiliary_loss_clip": 0.01625331, + "auxiliary_loss_mlp": 0.01073457, + "balance_loss_clip": 1.36589003, + "balance_loss_mlp": 1.04291534, + "epoch": 0.06090485495265294, + "flos": 23551114023720.0, + "grad_norm": 1.8959600481867669, + "language_loss": 0.77933598, + "learning_rate": 3.989990022305734e-06, + "loss": 0.80632377, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.30541992, + "step": 1013, + "time_per_iteration": 2.890223503112793 + }, + { + "auxiliary_loss_clip": 0.01640708, + "auxiliary_loss_mlp": 0.01067591, + "balance_loss_clip": 1.37659717, + "balance_loss_mlp": 1.03745449, + "epoch": 0.06096497820532091, + "flos": 20344141845720.0, + "grad_norm": 2.2625178262848116, + "language_loss": 0.86788279, + "learning_rate": 3.98995106776885e-06, + "loss": 0.89496577, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.30126953, + "step": 1014, + "time_per_iteration": 2.833261251449585 + }, + { + "auxiliary_loss_clip": 0.01636631, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.37082553, + "balance_loss_mlp": 1.03959632, + "epoch": 0.061025101457988874, + "flos": 26944190626320.0, + "grad_norm": 3.11815352405068, + "language_loss": 0.74097419, + "learning_rate": 3.98991203777282e-06, + "loss": 0.76806569, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.32910156, + "step": 1015, + "time_per_iteration": 2.895167827606201 + }, + { + "auxiliary_loss_clip": 0.01626291, + "auxiliary_loss_mlp": 0.0107505, + "balance_loss_clip": 1.37335825, + "balance_loss_mlp": 1.04586756, + "epoch": 0.061085224710656846, + "flos": 25380646182360.0, + "grad_norm": 1.6483382379680946, + "language_loss": 0.7964642, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.82347763, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.29199219, + "step": 1016, + "time_per_iteration": 2.862722396850586 + }, + { + "auxiliary_loss_clip": 0.01630795, + "auxiliary_loss_mlp": 0.01063452, + "balance_loss_clip": 1.37150419, + "balance_loss_mlp": 1.03441262, + "epoch": 0.06114534796332482, + "flos": 24829886063520.0, + "grad_norm": 2.272800105036564, + "language_loss": 0.76701254, + "learning_rate": 3.989833751409254e-06, + "loss": 0.79395497, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.29052734, + "step": 1017, + "time_per_iteration": 2.967468500137329 + }, + { + "auxiliary_loss_clip": 0.01643136, + "auxiliary_loss_mlp": 0.01076624, + "balance_loss_clip": 1.37640584, + "balance_loss_mlp": 1.04572463, + "epoch": 0.061205471215992784, + "flos": 20636345579760.0, + "grad_norm": 1.9500840455257664, + "language_loss": 0.86504358, + "learning_rate": 3.989794495044685e-06, + "loss": 0.89224112, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.30908203, + "step": 1018, + "time_per_iteration": 2.830322504043579 + }, + { + "auxiliary_loss_clip": 0.01627065, + "auxiliary_loss_mlp": 0.01078291, + "balance_loss_clip": 1.37126184, + "balance_loss_mlp": 1.0477736, + "epoch": 0.061265594468660756, + "flos": 16512698729520.0, + "grad_norm": 2.4813807758975512, + "language_loss": 0.7821995, + "learning_rate": 3.989755163226909e-06, + "loss": 0.8092531, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.30517578, + "step": 1019, + "time_per_iteration": 2.7919836044311523 + }, + { + "auxiliary_loss_clip": 0.01633001, + "auxiliary_loss_mlp": 0.01070797, + "balance_loss_clip": 1.37508655, + "balance_loss_mlp": 1.04068458, + "epoch": 0.06132571772132872, + "flos": 26251328563920.0, + "grad_norm": 2.217067192770426, + "language_loss": 0.84931815, + "learning_rate": 3.989715755957418e-06, + "loss": 0.87635612, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.30102539, + "step": 1020, + "time_per_iteration": 2.821068048477173 + }, + { + "auxiliary_loss_clip": 0.01640111, + "auxiliary_loss_mlp": 0.01071532, + "balance_loss_clip": 1.38009775, + "balance_loss_mlp": 1.04225385, + "epoch": 0.06138584097399669, + "flos": 37421877855600.0, + "grad_norm": 1.8666263097261115, + "language_loss": 0.79528034, + "learning_rate": 3.989676273237705e-06, + "loss": 0.82239681, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.29272461, + "step": 1021, + "time_per_iteration": 2.9465785026550293 + }, + { + "auxiliary_loss_clip": 0.01623124, + "auxiliary_loss_mlp": 0.01067515, + "balance_loss_clip": 1.36690032, + "balance_loss_mlp": 1.04103816, + "epoch": 0.061445964226664665, + "flos": 17425150132320.0, + "grad_norm": 2.4057403771540558, + "language_loss": 0.88626051, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.91316694, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.26477051, + "step": 1022, + "time_per_iteration": 2.793696165084839 + }, + { + "auxiliary_loss_clip": 0.01632017, + "auxiliary_loss_mlp": 0.01070154, + "balance_loss_clip": 1.37408543, + "balance_loss_mlp": 1.04016089, + "epoch": 0.06150608747933263, + "flos": 22604974663320.0, + "grad_norm": 1.6857648647717984, + "language_loss": 0.83246899, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85949063, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.29980469, + "step": 1023, + "time_per_iteration": 2.8525102138519287 + }, + { + "auxiliary_loss_clip": 0.01415367, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.24216437, + "balance_loss_mlp": 1.02676988, + "epoch": 0.0615662107320006, + "flos": 56754453627000.0, + "grad_norm": 0.9109227959906593, + "language_loss": 0.64998806, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67449856, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.08935547, + "step": 1024, + "time_per_iteration": 4.79477071762085 + }, + { + "auxiliary_loss_clip": 0.01649713, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_clip": 1.3891356, + "balance_loss_mlp": 1.04463768, + "epoch": 0.06162633398466857, + "flos": 22569906021480.0, + "grad_norm": 1.9472248647743604, + "language_loss": 0.8904953, + "learning_rate": 3.989517587886636e-06, + "loss": 0.91772592, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.28686523, + "step": 1025, + "time_per_iteration": 2.903167486190796 + }, + { + "auxiliary_loss_clip": 0.01638899, + "auxiliary_loss_mlp": 0.01073295, + "balance_loss_clip": 1.37909532, + "balance_loss_mlp": 1.04357588, + "epoch": 0.06168645723733654, + "flos": 25598611188360.0, + "grad_norm": 2.1702085865086884, + "language_loss": 0.84800243, + "learning_rate": 3.989477727938335e-06, + "loss": 0.87512439, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.29760742, + "step": 1026, + "time_per_iteration": 4.752484560012817 + }, + { + "auxiliary_loss_clip": 0.01641929, + "auxiliary_loss_mlp": 0.01067009, + "balance_loss_clip": 1.37885904, + "balance_loss_mlp": 1.03614616, + "epoch": 0.06174658049000451, + "flos": 16002245730960.0, + "grad_norm": 2.004269120500599, + "language_loss": 0.82492203, + "learning_rate": 3.989437792548839e-06, + "loss": 0.85201144, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.30859375, + "step": 1027, + "time_per_iteration": 4.241824626922607 + }, + { + "auxiliary_loss_clip": 0.01632209, + "auxiliary_loss_mlp": 0.01065195, + "balance_loss_clip": 1.37456799, + "balance_loss_mlp": 1.03744304, + "epoch": 0.06180670374267248, + "flos": 11288993542560.0, + "grad_norm": 3.436991989176661, + "language_loss": 0.8520546, + "learning_rate": 3.989397781719663e-06, + "loss": 0.87902862, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.27783203, + "step": 1028, + "time_per_iteration": 2.8366925716400146 + }, + { + "auxiliary_loss_clip": 0.014115, + "auxiliary_loss_mlp": 0.01021545, + "balance_loss_clip": 1.23928571, + "balance_loss_mlp": 1.01296163, + "epoch": 0.06186682699534045, + "flos": 65143808662680.0, + "grad_norm": 0.9314270091064536, + "language_loss": 0.60448986, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6288203, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.0859375, + "step": 1029, + "time_per_iteration": 3.1536831855773926 + }, + { + "auxiliary_loss_clip": 0.01624635, + "auxiliary_loss_mlp": 0.01073433, + "balance_loss_clip": 1.36757278, + "balance_loss_mlp": 1.04543126, + "epoch": 0.061926950248008414, + "flos": 21110714727480.0, + "grad_norm": 2.678528902740711, + "language_loss": 0.82925338, + "learning_rate": 3.98931753374834e-06, + "loss": 0.85623407, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.28015137, + "step": 1030, + "time_per_iteration": 2.84378981590271 + }, + { + "auxiliary_loss_clip": 0.01648733, + "auxiliary_loss_mlp": 0.01079114, + "balance_loss_clip": 1.38770723, + "balance_loss_mlp": 1.04683256, + "epoch": 0.061987073500676386, + "flos": 17752909808520.0, + "grad_norm": 2.4010391760888803, + "language_loss": 0.81535369, + "learning_rate": 3.989277296609237e-06, + "loss": 0.84263217, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.32299805, + "step": 1031, + "time_per_iteration": 2.797272205352783 + }, + { + "auxiliary_loss_clip": 0.01630707, + "auxiliary_loss_mlp": 0.01067765, + "balance_loss_clip": 1.37265921, + "balance_loss_mlp": 1.03836823, + "epoch": 0.06204719675334436, + "flos": 21841406800200.0, + "grad_norm": 1.621675313564612, + "language_loss": 0.77753341, + "learning_rate": 3.98923698403654e-06, + "loss": 0.8045181, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.29382324, + "step": 1032, + "time_per_iteration": 2.788634777069092 + }, + { + "auxiliary_loss_clip": 0.01637359, + "auxiliary_loss_mlp": 0.01077505, + "balance_loss_clip": 1.37740886, + "balance_loss_mlp": 1.04534268, + "epoch": 0.06210732000601232, + "flos": 19358345098800.0, + "grad_norm": 2.4053977329942877, + "language_loss": 0.8974092, + "learning_rate": 3.989196596031776e-06, + "loss": 0.92455786, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.3215332, + "step": 1033, + "time_per_iteration": 2.8283138275146484 + }, + { + "auxiliary_loss_clip": 0.01627869, + "auxiliary_loss_mlp": 0.01062503, + "balance_loss_clip": 1.36890984, + "balance_loss_mlp": 1.03341556, + "epoch": 0.062167443258680295, + "flos": 24754185434520.0, + "grad_norm": 1.9961292718466597, + "language_loss": 0.85626268, + "learning_rate": 3.989156132596479e-06, + "loss": 0.88316631, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.29101562, + "step": 1034, + "time_per_iteration": 2.919807195663452 + }, + { + "auxiliary_loss_clip": 0.0161889, + "auxiliary_loss_mlp": 0.01064779, + "balance_loss_clip": 1.36991322, + "balance_loss_mlp": 1.03647852, + "epoch": 0.06222756651134827, + "flos": 34465056131880.0, + "grad_norm": 1.975166487930252, + "language_loss": 0.82067758, + "learning_rate": 3.989115593732182e-06, + "loss": 0.84751427, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.28320312, + "step": 1035, + "time_per_iteration": 2.9213006496429443 + }, + { + "auxiliary_loss_clip": 0.01628172, + "auxiliary_loss_mlp": 0.0107354, + "balance_loss_clip": 1.37056065, + "balance_loss_mlp": 1.04311752, + "epoch": 0.06228768976401623, + "flos": 25672037749200.0, + "grad_norm": 2.955831174610752, + "language_loss": 0.79241413, + "learning_rate": 3.989074979440421e-06, + "loss": 0.81943119, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.30419922, + "step": 1036, + "time_per_iteration": 2.8495588302612305 + }, + { + "auxiliary_loss_clip": 0.01624171, + "auxiliary_loss_mlp": 0.01072819, + "balance_loss_clip": 1.37135601, + "balance_loss_mlp": 1.043648, + "epoch": 0.062347813016684205, + "flos": 25300356808680.0, + "grad_norm": 1.6447624283150595, + "language_loss": 0.8739534, + "learning_rate": 3.989034289722739e-06, + "loss": 0.90092325, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.29138184, + "step": 1037, + "time_per_iteration": 2.8743927478790283 + }, + { + "auxiliary_loss_clip": 0.01623492, + "auxiliary_loss_mlp": 0.01068195, + "balance_loss_clip": 1.3705802, + "balance_loss_mlp": 1.03751063, + "epoch": 0.06240793626935217, + "flos": 26912776736880.0, + "grad_norm": 2.829710358377263, + "language_loss": 0.81379557, + "learning_rate": 3.988993524580676e-06, + "loss": 0.84071249, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.30639648, + "step": 1038, + "time_per_iteration": 2.8622097969055176 + }, + { + "auxiliary_loss_clip": 0.0163291, + "auxiliary_loss_mlp": 0.01074926, + "balance_loss_clip": 1.38176775, + "balance_loss_mlp": 1.04360986, + "epoch": 0.06246805952202014, + "flos": 21620721034080.0, + "grad_norm": 1.8533471895995564, + "language_loss": 0.86228132, + "learning_rate": 3.98895268401578e-06, + "loss": 0.88935971, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.31335449, + "step": 1039, + "time_per_iteration": 2.7710683345794678 + }, + { + "auxiliary_loss_clip": 0.01636414, + "auxiliary_loss_mlp": 0.0107712, + "balance_loss_clip": 1.38076591, + "balance_loss_mlp": 1.04656696, + "epoch": 0.0625281827746881, + "flos": 19315845127080.0, + "grad_norm": 1.7417275698045964, + "language_loss": 0.81022632, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83736163, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.30554199, + "step": 1040, + "time_per_iteration": 2.8338940143585205 + }, + { + "auxiliary_loss_clip": 0.01637074, + "auxiliary_loss_mlp": 0.01065786, + "balance_loss_clip": 1.3810879, + "balance_loss_mlp": 1.0366993, + "epoch": 0.06258830602735609, + "flos": 27751557928680.0, + "grad_norm": 2.1163751414999354, + "language_loss": 0.70369709, + "learning_rate": 3.988870776623685e-06, + "loss": 0.73072565, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.29101562, + "step": 1041, + "time_per_iteration": 2.823910713195801 + }, + { + "auxiliary_loss_clip": 0.01635532, + "auxiliary_loss_mlp": 0.01067292, + "balance_loss_clip": 1.37814605, + "balance_loss_mlp": 1.03663099, + "epoch": 0.06264842928002405, + "flos": 23227943092200.0, + "grad_norm": 1.9902828237744248, + "language_loss": 0.82312143, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.85014963, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.30664062, + "step": 1042, + "time_per_iteration": 2.8387176990509033 + }, + { + "auxiliary_loss_clip": 0.01639658, + "auxiliary_loss_mlp": 0.01067431, + "balance_loss_clip": 1.38470483, + "balance_loss_mlp": 1.03789043, + "epoch": 0.06270855253269202, + "flos": 38406659393520.0, + "grad_norm": 1.6730783233383033, + "language_loss": 0.77854764, + "learning_rate": 3.988788567558874e-06, + "loss": 0.80561852, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.2956543, + "step": 1043, + "time_per_iteration": 2.988520622253418 + }, + { + "auxiliary_loss_clip": 0.01616012, + "auxiliary_loss_mlp": 0.0106469, + "balance_loss_clip": 1.36726439, + "balance_loss_mlp": 1.03581738, + "epoch": 0.06276867578535998, + "flos": 22458405800160.0, + "grad_norm": 2.118970226904351, + "language_loss": 0.93753338, + "learning_rate": 3.988747349903097e-06, + "loss": 0.96434033, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.28857422, + "step": 1044, + "time_per_iteration": 2.896826982498169 + }, + { + "auxiliary_loss_clip": 0.01641257, + "auxiliary_loss_mlp": 0.01086052, + "balance_loss_clip": 1.38570917, + "balance_loss_mlp": 1.05515313, + "epoch": 0.06282879903802796, + "flos": 22935698749800.0, + "grad_norm": 1.7934459907076221, + "language_loss": 0.85879302, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88606608, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.30883789, + "step": 1045, + "time_per_iteration": 2.940657138824463 + }, + { + "auxiliary_loss_clip": 0.01627514, + "auxiliary_loss_mlp": 0.01073898, + "balance_loss_clip": 1.37729812, + "balance_loss_mlp": 1.0418663, + "epoch": 0.06288892229069593, + "flos": 34825326123240.0, + "grad_norm": 2.2721403544453014, + "language_loss": 0.79062891, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.81764305, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.32043457, + "step": 1046, + "time_per_iteration": 2.9129409790039062 + }, + { + "auxiliary_loss_clip": 0.01632679, + "auxiliary_loss_mlp": 0.0106976, + "balance_loss_clip": 1.37919545, + "balance_loss_mlp": 1.04118586, + "epoch": 0.06294904554336389, + "flos": 19431852876360.0, + "grad_norm": 2.0429622355866295, + "language_loss": 0.78139317, + "learning_rate": 3.988623244461039e-06, + "loss": 0.80841756, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.28527832, + "step": 1047, + "time_per_iteration": 2.8673086166381836 + }, + { + "auxiliary_loss_clip": 0.0165119, + "auxiliary_loss_mlp": 0.01079856, + "balance_loss_clip": 1.38902152, + "balance_loss_mlp": 1.04883802, + "epoch": 0.06300916879603187, + "flos": 40669360195680.0, + "grad_norm": 1.9715067154006125, + "language_loss": 0.77728474, + "learning_rate": 3.988581725160672e-06, + "loss": 0.80459523, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.31030273, + "step": 1048, + "time_per_iteration": 3.0170981884002686 + }, + { + "auxiliary_loss_clip": 0.01637265, + "auxiliary_loss_mlp": 0.01079615, + "balance_loss_clip": 1.38060415, + "balance_loss_mlp": 1.04883564, + "epoch": 0.06306929204869983, + "flos": 23809264324920.0, + "grad_norm": 2.567677311165326, + "language_loss": 0.78966027, + "learning_rate": 3.988540130453087e-06, + "loss": 0.81682909, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.30810547, + "step": 1049, + "time_per_iteration": 2.886970043182373 + }, + { + "auxiliary_loss_clip": 0.0163832, + "auxiliary_loss_mlp": 0.01074244, + "balance_loss_clip": 1.382774, + "balance_loss_mlp": 1.04248667, + "epoch": 0.0631294153013678, + "flos": 18920384668800.0, + "grad_norm": 2.2843660288190115, + "language_loss": 0.83145851, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85858417, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31762695, + "step": 1050, + "time_per_iteration": 2.8579540252685547 + }, + { + "auxiliary_loss_clip": 0.01633054, + "auxiliary_loss_mlp": 0.01072538, + "balance_loss_clip": 1.38337433, + "balance_loss_mlp": 1.04423738, + "epoch": 0.06318953855403578, + "flos": 24285582673920.0, + "grad_norm": 1.9328806795150788, + "language_loss": 0.7831533, + "learning_rate": 3.988456714822575e-06, + "loss": 0.81020916, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.28320312, + "step": 1051, + "time_per_iteration": 2.8026187419891357 + }, + { + "auxiliary_loss_clip": 0.01644527, + "auxiliary_loss_mlp": 0.01081871, + "balance_loss_clip": 1.38852239, + "balance_loss_mlp": 1.05009007, + "epoch": 0.06324966180670374, + "flos": 22534106429160.0, + "grad_norm": 2.1003055193860987, + "language_loss": 0.81593704, + "learning_rate": 3.98841489390281e-06, + "loss": 0.84320098, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.31787109, + "step": 1052, + "time_per_iteration": 2.8066279888153076 + }, + { + "auxiliary_loss_clip": 0.01640612, + "auxiliary_loss_mlp": 0.01077925, + "balance_loss_clip": 1.38168979, + "balance_loss_mlp": 1.04762244, + "epoch": 0.06330978505937171, + "flos": 15782290915320.0, + "grad_norm": 2.1844893749081034, + "language_loss": 0.78510046, + "learning_rate": 3.988372997582155e-06, + "loss": 0.81228584, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.30322266, + "step": 1053, + "time_per_iteration": 2.751340866088867 + }, + { + "auxiliary_loss_clip": 0.01632303, + "auxiliary_loss_mlp": 0.01067852, + "balance_loss_clip": 1.37634587, + "balance_loss_mlp": 1.0395751, + "epoch": 0.06336990831203967, + "flos": 21476182588920.0, + "grad_norm": 1.840458240270999, + "language_loss": 0.85373759, + "learning_rate": 3.988331025862195e-06, + "loss": 0.88073909, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.28271484, + "step": 1054, + "time_per_iteration": 2.950801134109497 + }, + { + "auxiliary_loss_clip": 0.01633896, + "auxiliary_loss_mlp": 0.01066882, + "balance_loss_clip": 1.37994897, + "balance_loss_mlp": 1.03634059, + "epoch": 0.06343003156470765, + "flos": 18483520664520.0, + "grad_norm": 1.926811520198083, + "language_loss": 0.85930288, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.8863107, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30541992, + "step": 1055, + "time_per_iteration": 2.8010175228118896 + }, + { + "auxiliary_loss_clip": 0.01645437, + "auxiliary_loss_mlp": 0.01079608, + "balance_loss_clip": 1.38190889, + "balance_loss_mlp": 1.04928112, + "epoch": 0.06349015481737562, + "flos": 25160285283120.0, + "grad_norm": 2.841536613466711, + "language_loss": 0.82735896, + "learning_rate": 3.988246856230734e-06, + "loss": 0.85460943, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.30322266, + "step": 1056, + "time_per_iteration": 2.9110536575317383 + }, + { + "auxiliary_loss_clip": 0.01648362, + "auxiliary_loss_mlp": 0.01077417, + "balance_loss_clip": 1.38343859, + "balance_loss_mlp": 1.04241717, + "epoch": 0.06355027807004358, + "flos": 26877667486680.0, + "grad_norm": 2.3387502596848933, + "language_loss": 0.81825244, + "learning_rate": 3.988204658322426e-06, + "loss": 0.84551024, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.34985352, + "step": 1057, + "time_per_iteration": 2.9012069702148438 + }, + { + "auxiliary_loss_clip": 0.01608354, + "auxiliary_loss_mlp": 0.01069612, + "balance_loss_clip": 1.36197066, + "balance_loss_mlp": 1.04115713, + "epoch": 0.06361040132271156, + "flos": 21401578385640.0, + "grad_norm": 2.1246399764616735, + "language_loss": 0.83709508, + "learning_rate": 3.988162385021196e-06, + "loss": 0.86387473, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.28479004, + "step": 1058, + "time_per_iteration": 2.7650628089904785 + }, + { + "auxiliary_loss_clip": 0.01632472, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_clip": 1.37972796, + "balance_loss_mlp": 1.04723144, + "epoch": 0.06367052457537953, + "flos": 25738601497200.0, + "grad_norm": 1.907020156912386, + "language_loss": 0.88092196, + "learning_rate": 3.988120036328651e-06, + "loss": 0.90805542, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.33666992, + "step": 1059, + "time_per_iteration": 2.846078395843506 + }, + { + "auxiliary_loss_clip": 0.0164264, + "auxiliary_loss_mlp": 0.01073404, + "balance_loss_clip": 1.38506281, + "balance_loss_mlp": 1.04269576, + "epoch": 0.0637306478280475, + "flos": 17634871641240.0, + "grad_norm": 3.267856332059352, + "language_loss": 0.92133749, + "learning_rate": 3.988077612246394e-06, + "loss": 0.94849789, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.30737305, + "step": 1060, + "time_per_iteration": 2.83256459236145 + }, + { + "auxiliary_loss_clip": 0.01622919, + "auxiliary_loss_mlp": 0.01070885, + "balance_loss_clip": 1.37033677, + "balance_loss_mlp": 1.04167891, + "epoch": 0.06379077108071547, + "flos": 13666605668280.0, + "grad_norm": 2.0612008600991882, + "language_loss": 0.88065588, + "learning_rate": 3.988035112776035e-06, + "loss": 0.90759391, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.29199219, + "step": 1061, + "time_per_iteration": 2.899599075317383 + }, + { + "auxiliary_loss_clip": 0.01652044, + "auxiliary_loss_mlp": 0.0107183, + "balance_loss_clip": 1.3850255, + "balance_loss_mlp": 1.03976309, + "epoch": 0.06385089433338344, + "flos": 28485173803320.0, + "grad_norm": 2.415876284594753, + "language_loss": 0.77421999, + "learning_rate": 3.987992537919185e-06, + "loss": 0.80145872, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.32080078, + "step": 1062, + "time_per_iteration": 2.924149751663208 + }, + { + "auxiliary_loss_clip": 0.01639179, + "auxiliary_loss_mlp": 0.01073544, + "balance_loss_clip": 1.38077688, + "balance_loss_mlp": 1.0435394, + "epoch": 0.0639110175860514, + "flos": 24315737704200.0, + "grad_norm": 1.936321884223686, + "language_loss": 0.86541671, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89254391, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.3001709, + "step": 1063, + "time_per_iteration": 6.18410325050354 + }, + { + "auxiliary_loss_clip": 0.01644897, + "auxiliary_loss_mlp": 0.01064283, + "balance_loss_clip": 1.38639867, + "balance_loss_mlp": 1.03157222, + "epoch": 0.06397114083871938, + "flos": 22095861740640.0, + "grad_norm": 2.2361779874875, + "language_loss": 0.81065041, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.83774221, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.32714844, + "step": 1064, + "time_per_iteration": 2.784796714782715 + }, + { + "auxiliary_loss_clip": 0.01636077, + "auxiliary_loss_mlp": 0.01073898, + "balance_loss_clip": 1.37887526, + "balance_loss_mlp": 1.04270053, + "epoch": 0.06403126409138735, + "flos": 19577528355600.0, + "grad_norm": 2.228459226290483, + "language_loss": 0.84826267, + "learning_rate": 3.987864361045851e-06, + "loss": 0.87536246, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.31213379, + "step": 1065, + "time_per_iteration": 4.288462162017822 + }, + { + "auxiliary_loss_clip": 0.01642794, + "auxiliary_loss_mlp": 0.01059016, + "balance_loss_clip": 1.38553381, + "balance_loss_mlp": 1.03060877, + "epoch": 0.06409138734405531, + "flos": 40814873241480.0, + "grad_norm": 1.491800197532277, + "language_loss": 0.68643391, + "learning_rate": 3.987821484659211e-06, + "loss": 0.71345204, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.28417969, + "step": 1066, + "time_per_iteration": 3.0190439224243164 + }, + { + "auxiliary_loss_clip": 0.01646285, + "auxiliary_loss_mlp": 0.01074689, + "balance_loss_clip": 1.3906157, + "balance_loss_mlp": 1.04405272, + "epoch": 0.06415151059672328, + "flos": 20445530585400.0, + "grad_norm": 2.353495366826825, + "language_loss": 0.90916687, + "learning_rate": 3.987778532894181e-06, + "loss": 0.93637669, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.3059082, + "step": 1067, + "time_per_iteration": 2.7905609607696533 + }, + { + "auxiliary_loss_clip": 0.01647692, + "auxiliary_loss_mlp": 0.01074154, + "balance_loss_clip": 1.38873386, + "balance_loss_mlp": 1.04447067, + "epoch": 0.06421163384939126, + "flos": 18075918306600.0, + "grad_norm": 3.005907173884748, + "language_loss": 0.84560776, + "learning_rate": 3.987735505752391e-06, + "loss": 0.87282622, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.29663086, + "step": 1068, + "time_per_iteration": 2.8907792568206787 + }, + { + "auxiliary_loss_clip": 0.01643641, + "auxiliary_loss_mlp": 0.01071638, + "balance_loss_clip": 1.39076531, + "balance_loss_mlp": 1.04178858, + "epoch": 0.06427175710205922, + "flos": 25124972991120.0, + "grad_norm": 2.508359642836277, + "language_loss": 0.90144503, + "learning_rate": 3.987692403235471e-06, + "loss": 0.92859781, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.29833984, + "step": 1069, + "time_per_iteration": 2.888148546218872 + }, + { + "auxiliary_loss_clip": 0.01652235, + "auxiliary_loss_mlp": 0.01086257, + "balance_loss_clip": 1.3895781, + "balance_loss_mlp": 1.05449939, + "epoch": 0.06433188035472719, + "flos": 17384558753520.0, + "grad_norm": 3.0245045201475365, + "language_loss": 0.96433079, + "learning_rate": 3.987649225345056e-06, + "loss": 0.99171567, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.31762695, + "step": 1070, + "time_per_iteration": 2.751068115234375 + }, + { + "auxiliary_loss_clip": 0.01653526, + "auxiliary_loss_mlp": 0.01068391, + "balance_loss_clip": 1.3941474, + "balance_loss_mlp": 1.03632355, + "epoch": 0.06439200360739517, + "flos": 23551032807000.0, + "grad_norm": 1.480422672350403, + "language_loss": 0.88375986, + "learning_rate": 3.987605972082782e-06, + "loss": 0.91097897, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.32104492, + "step": 1071, + "time_per_iteration": 2.8791747093200684 + }, + { + "auxiliary_loss_clip": 0.01640247, + "auxiliary_loss_mlp": 0.0106763, + "balance_loss_clip": 1.38528681, + "balance_loss_mlp": 1.03584921, + "epoch": 0.06445212686006313, + "flos": 21984645777840.0, + "grad_norm": 1.6747696305804018, + "language_loss": 0.77189112, + "learning_rate": 3.987562643450292e-06, + "loss": 0.79896992, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.31774902, + "step": 1072, + "time_per_iteration": 2.922267198562622 + }, + { + "auxiliary_loss_clip": 0.01657313, + "auxiliary_loss_mlp": 0.01076453, + "balance_loss_clip": 1.39582527, + "balance_loss_mlp": 1.0450058, + "epoch": 0.0645122501127311, + "flos": 25926817556520.0, + "grad_norm": 1.9480885803094998, + "language_loss": 0.81507576, + "learning_rate": 3.987519239449226e-06, + "loss": 0.84241337, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.31445312, + "step": 1073, + "time_per_iteration": 2.9054794311523438 + }, + { + "auxiliary_loss_clip": 0.0163613, + "auxiliary_loss_mlp": 0.01078627, + "balance_loss_clip": 1.38722754, + "balance_loss_mlp": 1.04782319, + "epoch": 0.06457237336539907, + "flos": 25631080895160.0, + "grad_norm": 1.7137746834336067, + "language_loss": 0.8085022, + "learning_rate": 3.987475760081233e-06, + "loss": 0.83564979, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.30786133, + "step": 1074, + "time_per_iteration": 2.8870036602020264 + }, + { + "auxiliary_loss_clip": 0.01646754, + "auxiliary_loss_mlp": 0.01069936, + "balance_loss_clip": 1.39029741, + "balance_loss_mlp": 1.04001451, + "epoch": 0.06463249661806704, + "flos": 19468748894400.0, + "grad_norm": 1.6334848860750926, + "language_loss": 0.79913151, + "learning_rate": 3.987432205347958e-06, + "loss": 0.82629842, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.29931641, + "step": 1075, + "time_per_iteration": 2.8710474967956543 + }, + { + "auxiliary_loss_clip": 0.01641045, + "auxiliary_loss_mlp": 0.01068622, + "balance_loss_clip": 1.3865937, + "balance_loss_mlp": 1.04029775, + "epoch": 0.064692619870735, + "flos": 24503344638120.0, + "grad_norm": 2.624619086821488, + "language_loss": 0.89143223, + "learning_rate": 3.987388575251055e-06, + "loss": 0.91852885, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.28344727, + "step": 1076, + "time_per_iteration": 2.918598175048828 + }, + { + "auxiliary_loss_clip": 0.0164583, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_clip": 1.3927629, + "balance_loss_mlp": 1.03878045, + "epoch": 0.06475274312340297, + "flos": 17023029903000.0, + "grad_norm": 2.332324045218483, + "language_loss": 0.81373942, + "learning_rate": 3.98734486979218e-06, + "loss": 0.84088314, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.29785156, + "step": 1077, + "time_per_iteration": 2.9157166481018066 + }, + { + "auxiliary_loss_clip": 0.01667318, + "auxiliary_loss_mlp": 0.01069197, + "balance_loss_clip": 1.40395761, + "balance_loss_mlp": 1.03791654, + "epoch": 0.06481286637607095, + "flos": 24577867624680.0, + "grad_norm": 3.316566152148257, + "language_loss": 0.92272103, + "learning_rate": 3.987301088972986e-06, + "loss": 0.95008618, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.31286621, + "step": 1078, + "time_per_iteration": 2.932846784591675 + }, + { + "auxiliary_loss_clip": 0.0167372, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.41002202, + "balance_loss_mlp": 1.04069412, + "epoch": 0.06487298962873891, + "flos": 21110592902400.0, + "grad_norm": 2.17188029385554, + "language_loss": 0.7913624, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81881416, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.30761719, + "step": 1079, + "time_per_iteration": 2.7867271900177 + }, + { + "auxiliary_loss_clip": 0.01654642, + "auxiliary_loss_mlp": 0.01077759, + "balance_loss_clip": 1.39646757, + "balance_loss_mlp": 1.04604983, + "epoch": 0.06493311288140688, + "flos": 24613586000280.0, + "grad_norm": 2.0491515908094615, + "language_loss": 0.69974792, + "learning_rate": 3.987213301260294e-06, + "loss": 0.72707194, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.31689453, + "step": 1080, + "time_per_iteration": 2.9223556518554688 + }, + { + "auxiliary_loss_clip": 0.01655588, + "auxiliary_loss_mlp": 0.01075258, + "balance_loss_clip": 1.3971597, + "balance_loss_mlp": 1.04288125, + "epoch": 0.06499323613407486, + "flos": 25343506514160.0, + "grad_norm": 2.0218554806397697, + "language_loss": 0.72864616, + "learning_rate": 3.987169294370123e-06, + "loss": 0.75595462, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.32348633, + "step": 1081, + "time_per_iteration": 2.890613555908203 + }, + { + "auxiliary_loss_clip": 0.01649683, + "auxiliary_loss_mlp": 0.01075317, + "balance_loss_clip": 1.39480519, + "balance_loss_mlp": 1.04320204, + "epoch": 0.06505335938674282, + "flos": 20380550563440.0, + "grad_norm": 15.858933463420232, + "language_loss": 0.84369814, + "learning_rate": 3.987125212126294e-06, + "loss": 0.8709482, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.32128906, + "step": 1082, + "time_per_iteration": 2.833679437637329 + }, + { + "auxiliary_loss_clip": 0.01674031, + "auxiliary_loss_mlp": 0.01081712, + "balance_loss_clip": 1.40571487, + "balance_loss_mlp": 1.04947782, + "epoch": 0.06511348263941079, + "flos": 25343425297440.0, + "grad_norm": 2.362662890049169, + "language_loss": 0.83288312, + "learning_rate": 3.987081054530478e-06, + "loss": 0.86044061, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.32250977, + "step": 1083, + "time_per_iteration": 2.8683080673217773 + }, + { + "auxiliary_loss_clip": 0.01657385, + "auxiliary_loss_mlp": 0.01081145, + "balance_loss_clip": 1.396716, + "balance_loss_mlp": 1.0492208, + "epoch": 0.06517360589207877, + "flos": 20336791732560.0, + "grad_norm": 2.7664531589617485, + "language_loss": 0.80624962, + "learning_rate": 3.987036821584348e-06, + "loss": 0.83363491, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.31933594, + "step": 1084, + "time_per_iteration": 2.872598648071289 + }, + { + "auxiliary_loss_clip": 0.01652197, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_clip": 1.39362121, + "balance_loss_mlp": 1.04363251, + "epoch": 0.06523372914474673, + "flos": 31687354194840.0, + "grad_norm": 2.0589213019419357, + "language_loss": 0.67141163, + "learning_rate": 3.986992513289584e-06, + "loss": 0.69870722, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.33764648, + "step": 1085, + "time_per_iteration": 2.9211337566375732 + }, + { + "auxiliary_loss_clip": 0.01654547, + "auxiliary_loss_mlp": 0.010764, + "balance_loss_clip": 1.39826488, + "balance_loss_mlp": 1.04595399, + "epoch": 0.0652938523974147, + "flos": 20783198701440.0, + "grad_norm": 1.9490343325332329, + "language_loss": 0.77687085, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.80418032, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.30444336, + "step": 1086, + "time_per_iteration": 2.904407024383545 + }, + { + "auxiliary_loss_clip": 0.01657009, + "auxiliary_loss_mlp": 0.0107755, + "balance_loss_clip": 1.39972472, + "balance_loss_mlp": 1.04307437, + "epoch": 0.06535397565008266, + "flos": 16695392051880.0, + "grad_norm": 3.535792504346402, + "language_loss": 0.86125422, + "learning_rate": 3.986903670660872e-06, + "loss": 0.88859975, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.34460449, + "step": 1087, + "time_per_iteration": 2.8286449909210205 + }, + { + "auxiliary_loss_clip": 0.01661025, + "auxiliary_loss_mlp": 0.01073918, + "balance_loss_clip": 1.40078115, + "balance_loss_mlp": 1.04137397, + "epoch": 0.06541409890275064, + "flos": 26873647259040.0, + "grad_norm": 1.8273633078456213, + "language_loss": 0.78729677, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.81464624, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.32568359, + "step": 1088, + "time_per_iteration": 2.8274264335632324 + }, + { + "auxiliary_loss_clip": 0.01650256, + "auxiliary_loss_mlp": 0.01076454, + "balance_loss_clip": 1.39331532, + "balance_loss_mlp": 1.04739094, + "epoch": 0.06547422215541861, + "flos": 20526469692840.0, + "grad_norm": 1.8090479710952736, + "language_loss": 0.7191385, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.7464056, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.29077148, + "step": 1089, + "time_per_iteration": 2.898729085922241 + }, + { + "auxiliary_loss_clip": 0.01646831, + "auxiliary_loss_mlp": 0.01065576, + "balance_loss_clip": 1.39154255, + "balance_loss_mlp": 1.03675103, + "epoch": 0.06553434540808657, + "flos": 22021541795880.0, + "grad_norm": 1.7382349579060101, + "language_loss": 0.86039585, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.88751996, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.28796387, + "step": 1090, + "time_per_iteration": 2.829694986343384 + }, + { + "auxiliary_loss_clip": 0.01653048, + "auxiliary_loss_mlp": 0.01068687, + "balance_loss_clip": 1.39625931, + "balance_loss_mlp": 1.03847885, + "epoch": 0.06559446866075455, + "flos": 24614113908960.0, + "grad_norm": 3.6940481635037807, + "language_loss": 0.72798473, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.75520205, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.30224609, + "step": 1091, + "time_per_iteration": 2.826653242111206 + }, + { + "auxiliary_loss_clip": 0.01642771, + "auxiliary_loss_mlp": 0.01075509, + "balance_loss_clip": 1.38946283, + "balance_loss_mlp": 1.04470515, + "epoch": 0.06565459191342252, + "flos": 24279369594840.0, + "grad_norm": 2.2373632059310813, + "language_loss": 0.82852125, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85570401, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.30786133, + "step": 1092, + "time_per_iteration": 2.8218071460723877 + }, + { + "auxiliary_loss_clip": 0.01645776, + "auxiliary_loss_mlp": 0.0107011, + "balance_loss_clip": 1.38525879, + "balance_loss_mlp": 1.03749406, + "epoch": 0.06571471516609048, + "flos": 24792096661560.0, + "grad_norm": 1.935184441428991, + "language_loss": 0.71546221, + "learning_rate": 3.986635334582814e-06, + "loss": 0.74262106, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.32592773, + "step": 1093, + "time_per_iteration": 2.7992794513702393 + }, + { + "auxiliary_loss_clip": 0.01648023, + "auxiliary_loss_mlp": 0.01074257, + "balance_loss_clip": 1.39275408, + "balance_loss_mlp": 1.04018748, + "epoch": 0.06577483841875846, + "flos": 26219752241040.0, + "grad_norm": 1.7287738137204, + "language_loss": 0.88049024, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90771306, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.34033203, + "step": 1094, + "time_per_iteration": 2.848726272583008 + }, + { + "auxiliary_loss_clip": 0.01652934, + "auxiliary_loss_mlp": 0.01078568, + "balance_loss_clip": 1.3954128, + "balance_loss_mlp": 1.0431149, + "epoch": 0.06583496167142643, + "flos": 25086046555080.0, + "grad_norm": 1.7514572235025359, + "language_loss": 0.81819373, + "learning_rate": 3.986545286538044e-06, + "loss": 0.84550881, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.35473633, + "step": 1095, + "time_per_iteration": 2.8122520446777344 + }, + { + "auxiliary_loss_clip": 0.01639349, + "auxiliary_loss_mlp": 0.01062957, + "balance_loss_clip": 1.38586974, + "balance_loss_mlp": 1.0332737, + "epoch": 0.06589508492409439, + "flos": 25635344772960.0, + "grad_norm": 2.1372442337725146, + "language_loss": 0.71007764, + "learning_rate": 3.986500149519811e-06, + "loss": 0.73710072, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.29675293, + "step": 1096, + "time_per_iteration": 2.842923402786255 + }, + { + "auxiliary_loss_clip": 0.01649841, + "auxiliary_loss_mlp": 0.01086569, + "balance_loss_clip": 1.39549947, + "balance_loss_mlp": 1.0572437, + "epoch": 0.06595520817676236, + "flos": 23626286744040.0, + "grad_norm": 1.8113370194967826, + "language_loss": 0.77556878, + "learning_rate": 3.986454937173292e-06, + "loss": 0.80293286, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.29321289, + "step": 1097, + "time_per_iteration": 2.906064033508301 + }, + { + "auxiliary_loss_clip": 0.016548, + "auxiliary_loss_mlp": 0.01069076, + "balance_loss_clip": 1.39364541, + "balance_loss_mlp": 1.04051387, + "epoch": 0.06601533142943034, + "flos": 33808196703600.0, + "grad_norm": 1.7216425935157307, + "language_loss": 0.78772098, + "learning_rate": 3.986409649500203e-06, + "loss": 0.81495976, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.28540039, + "step": 1098, + "time_per_iteration": 2.9285035133361816 + }, + { + "auxiliary_loss_clip": 0.0164407, + "auxiliary_loss_mlp": 0.01080568, + "balance_loss_clip": 1.38969421, + "balance_loss_mlp": 1.04990697, + "epoch": 0.0660754546820983, + "flos": 20263080913200.0, + "grad_norm": 1.7195189143901504, + "language_loss": 0.82299685, + "learning_rate": 3.986364286502261e-06, + "loss": 0.85024321, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30664062, + "step": 1099, + "time_per_iteration": 2.816408395767212 + }, + { + "auxiliary_loss_clip": 0.01628171, + "auxiliary_loss_mlp": 0.01061415, + "balance_loss_clip": 1.37735939, + "balance_loss_mlp": 1.03273296, + "epoch": 0.06613557793476627, + "flos": 19358873007480.0, + "grad_norm": 2.753307528306749, + "language_loss": 0.83299518, + "learning_rate": 3.986318848181186e-06, + "loss": 0.859891, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.28662109, + "step": 1100, + "time_per_iteration": 2.7975692749023438 + }, + { + "auxiliary_loss_clip": 0.01633089, + "auxiliary_loss_mlp": 0.01066799, + "balance_loss_clip": 1.38157809, + "balance_loss_mlp": 1.03785539, + "epoch": 0.06619570118743424, + "flos": 13776643988640.0, + "grad_norm": 2.8579861382301814, + "language_loss": 0.73875707, + "learning_rate": 3.986273334538702e-06, + "loss": 0.76575589, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.28955078, + "step": 1101, + "time_per_iteration": 4.205621242523193 + }, + { + "auxiliary_loss_clip": 0.01629996, + "auxiliary_loss_mlp": 0.01067727, + "balance_loss_clip": 1.37833428, + "balance_loss_mlp": 1.0395937, + "epoch": 0.06625582444010221, + "flos": 17862339003480.0, + "grad_norm": 2.5298131662716727, + "language_loss": 0.87393093, + "learning_rate": 3.986227745576533e-06, + "loss": 0.90090823, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.28173828, + "step": 1102, + "time_per_iteration": 4.5649168491363525 + }, + { + "auxiliary_loss_clip": 0.0163789, + "auxiliary_loss_mlp": 0.01076243, + "balance_loss_clip": 1.38529801, + "balance_loss_mlp": 1.04656005, + "epoch": 0.06631594769277017, + "flos": 11842515029880.0, + "grad_norm": 2.049124620108521, + "language_loss": 0.82802498, + "learning_rate": 3.98618208129641e-06, + "loss": 0.85516632, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.29663086, + "step": 1103, + "time_per_iteration": 4.330958127975464 + }, + { + "auxiliary_loss_clip": 0.01630112, + "auxiliary_loss_mlp": 0.01075303, + "balance_loss_clip": 1.3787365, + "balance_loss_mlp": 1.04595327, + "epoch": 0.06637607094543815, + "flos": 19798295338440.0, + "grad_norm": 1.8192573317898926, + "language_loss": 0.82397348, + "learning_rate": 3.986136341700063e-06, + "loss": 0.85102767, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.29345703, + "step": 1104, + "time_per_iteration": 4.351717710494995 + }, + { + "auxiliary_loss_clip": 0.01621552, + "auxiliary_loss_mlp": 0.01063225, + "balance_loss_clip": 1.37225413, + "balance_loss_mlp": 1.03244555, + "epoch": 0.06643619419810612, + "flos": 25491415453200.0, + "grad_norm": 1.5684593558548803, + "language_loss": 0.80769658, + "learning_rate": 3.986090526789227e-06, + "loss": 0.83454436, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30749512, + "step": 1105, + "time_per_iteration": 2.840337038040161 + }, + { + "auxiliary_loss_clip": 0.01629091, + "auxiliary_loss_mlp": 0.01079983, + "balance_loss_clip": 1.38414621, + "balance_loss_mlp": 1.051265, + "epoch": 0.06649631745077408, + "flos": 16950984026400.0, + "grad_norm": 2.0350391228629943, + "language_loss": 0.97135842, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99844915, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.28686523, + "step": 1106, + "time_per_iteration": 2.845766544342041 + }, + { + "auxiliary_loss_clip": 0.01637061, + "auxiliary_loss_mlp": 0.01067392, + "balance_loss_clip": 1.38323092, + "balance_loss_mlp": 1.03732705, + "epoch": 0.06655644070344206, + "flos": 17863151170680.0, + "grad_norm": 3.6048358879589277, + "language_loss": 0.83454525, + "learning_rate": 3.985998671031039e-06, + "loss": 0.86158979, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.30053711, + "step": 1107, + "time_per_iteration": 2.8294637203216553 + }, + { + "auxiliary_loss_clip": 0.01438814, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.25868642, + "balance_loss_mlp": 1.01602602, + "epoch": 0.06661656395611003, + "flos": 61433936641080.0, + "grad_norm": 0.8004227383221519, + "language_loss": 0.56740761, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.5920819, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.12597656, + "step": 1108, + "time_per_iteration": 3.2571403980255127 + }, + { + "auxiliary_loss_clip": 0.01625336, + "auxiliary_loss_mlp": 0.01068228, + "balance_loss_clip": 1.37408066, + "balance_loss_mlp": 1.03450394, + "epoch": 0.066676687208778, + "flos": 20667109735440.0, + "grad_norm": 2.885672986711973, + "language_loss": 0.73025757, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75719321, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.33703613, + "step": 1109, + "time_per_iteration": 2.8072361946105957 + }, + { + "auxiliary_loss_clip": 0.01622424, + "auxiliary_loss_mlp": 0.01067084, + "balance_loss_clip": 1.3733511, + "balance_loss_mlp": 1.03687596, + "epoch": 0.06673681046144596, + "flos": 20928224446920.0, + "grad_norm": 1.805549340651984, + "language_loss": 0.78457707, + "learning_rate": 3.985860322578614e-06, + "loss": 0.81147218, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.30200195, + "step": 1110, + "time_per_iteration": 2.8439738750457764 + }, + { + "auxiliary_loss_clip": 0.01632662, + "auxiliary_loss_mlp": 0.01066461, + "balance_loss_clip": 1.38101423, + "balance_loss_mlp": 1.03909051, + "epoch": 0.06679693371411394, + "flos": 31072020137640.0, + "grad_norm": 1.9864849965785025, + "language_loss": 0.71652699, + "learning_rate": 3.985814055817427e-06, + "loss": 0.74351823, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.27355957, + "step": 1111, + "time_per_iteration": 2.8832619190216064 + }, + { + "auxiliary_loss_clip": 0.01626643, + "auxiliary_loss_mlp": 0.0107289, + "balance_loss_clip": 1.37543321, + "balance_loss_mlp": 1.04504287, + "epoch": 0.0668570569667819, + "flos": 21731206046400.0, + "grad_norm": 2.0574228175651217, + "language_loss": 0.78777736, + "learning_rate": 3.985767713753971e-06, + "loss": 0.81477273, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.27819824, + "step": 1112, + "time_per_iteration": 2.7897818088531494 + }, + { + "auxiliary_loss_clip": 0.01620378, + "auxiliary_loss_mlp": 0.01079638, + "balance_loss_clip": 1.37263477, + "balance_loss_mlp": 1.05024111, + "epoch": 0.06691718021944987, + "flos": 22752396302040.0, + "grad_norm": 2.057895060146541, + "language_loss": 0.80122393, + "learning_rate": 3.985721296390005e-06, + "loss": 0.82822406, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.29394531, + "step": 1113, + "time_per_iteration": 2.7984678745269775 + }, + { + "auxiliary_loss_clip": 0.01607514, + "auxiliary_loss_mlp": 0.01069245, + "balance_loss_clip": 1.36458659, + "balance_loss_mlp": 1.04099274, + "epoch": 0.06697730347211785, + "flos": 16550082047880.0, + "grad_norm": 2.360509153794992, + "language_loss": 0.83436906, + "learning_rate": 3.985674803727289e-06, + "loss": 0.86113667, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.28271484, + "step": 1114, + "time_per_iteration": 2.807293176651001 + }, + { + "auxiliary_loss_clip": 0.0142172, + "auxiliary_loss_mlp": 0.01016861, + "balance_loss_clip": 1.24868441, + "balance_loss_mlp": 1.00536907, + "epoch": 0.06703742672478581, + "flos": 59797331111520.0, + "grad_norm": 0.8402378807258373, + "language_loss": 0.58127356, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60565937, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.11474609, + "step": 1115, + "time_per_iteration": 3.248243570327759 + }, + { + "auxiliary_loss_clip": 0.01619447, + "auxiliary_loss_mlp": 0.01075093, + "balance_loss_clip": 1.37117398, + "balance_loss_mlp": 1.04254925, + "epoch": 0.06709754997745378, + "flos": 16804577596680.0, + "grad_norm": 2.4608696963172565, + "language_loss": 0.92110795, + "learning_rate": 3.985581592512658e-06, + "loss": 0.94805336, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.32543945, + "step": 1116, + "time_per_iteration": 2.7910194396972656 + }, + { + "auxiliary_loss_clip": 0.01624902, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_clip": 1.37273026, + "balance_loss_mlp": 1.04169989, + "epoch": 0.06715767323012176, + "flos": 22128575097600.0, + "grad_norm": 3.240049060224233, + "language_loss": 0.87428367, + "learning_rate": 3.985534873964279e-06, + "loss": 0.90125316, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.30322266, + "step": 1117, + "time_per_iteration": 2.8459081649780273 + }, + { + "auxiliary_loss_clip": 0.01412535, + "auxiliary_loss_mlp": 0.01015614, + "balance_loss_clip": 1.24005914, + "balance_loss_mlp": 1.00469398, + "epoch": 0.06721779648278972, + "flos": 66630921527160.0, + "grad_norm": 0.897323228350087, + "language_loss": 0.59701121, + "learning_rate": 3.985488080124218e-06, + "loss": 0.62129271, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.109375, + "step": 1118, + "time_per_iteration": 3.213833808898926 + }, + { + "auxiliary_loss_clip": 0.01618834, + "auxiliary_loss_mlp": 0.01062903, + "balance_loss_clip": 1.36656487, + "balance_loss_mlp": 1.03469825, + "epoch": 0.06727791973545769, + "flos": 22387578174360.0, + "grad_norm": 2.8848355696250993, + "language_loss": 0.85047734, + "learning_rate": 3.985441210994251e-06, + "loss": 0.87729466, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.28222656, + "step": 1119, + "time_per_iteration": 2.8372836112976074 + }, + { + "auxiliary_loss_clip": 0.01612717, + "auxiliary_loss_mlp": 0.01074776, + "balance_loss_clip": 1.36485744, + "balance_loss_mlp": 1.04781127, + "epoch": 0.06733804298812565, + "flos": 24285623282280.0, + "grad_norm": 2.101577979096278, + "language_loss": 0.85495794, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.88183284, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.26989746, + "step": 1120, + "time_per_iteration": 2.8225929737091064 + }, + { + "auxiliary_loss_clip": 0.01625974, + "auxiliary_loss_mlp": 0.01084152, + "balance_loss_clip": 1.3751471, + "balance_loss_mlp": 1.05427814, + "epoch": 0.06739816624079363, + "flos": 15921672098760.0, + "grad_norm": 1.7597025610738866, + "language_loss": 0.78912961, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81623077, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.29858398, + "step": 1121, + "time_per_iteration": 2.9502112865448 + }, + { + "auxiliary_loss_clip": 0.01406327, + "auxiliary_loss_mlp": 0.01019208, + "balance_loss_clip": 1.23436892, + "balance_loss_mlp": 1.008479, + "epoch": 0.0674582894934616, + "flos": 71416886307120.0, + "grad_norm": 0.7752117901179358, + "language_loss": 0.58306617, + "learning_rate": 3.985300151882694e-06, + "loss": 0.6073215, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10742188, + "step": 1122, + "time_per_iteration": 3.4040908813476562 + }, + { + "auxiliary_loss_clip": 0.01619764, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_clip": 1.37052083, + "balance_loss_mlp": 1.04164743, + "epoch": 0.06751841274612956, + "flos": 25270445428560.0, + "grad_norm": 2.243407929129317, + "language_loss": 0.72335756, + "learning_rate": 3.985252981610901e-06, + "loss": 0.75026131, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.29003906, + "step": 1123, + "time_per_iteration": 2.8357620239257812 + }, + { + "auxiliary_loss_clip": 0.01611797, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_clip": 1.36137319, + "balance_loss_mlp": 1.04382622, + "epoch": 0.06757853599879754, + "flos": 23807680598880.0, + "grad_norm": 1.7310307145561703, + "language_loss": 0.79185724, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81870425, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.29052734, + "step": 1124, + "time_per_iteration": 2.8135929107666016 + }, + { + "auxiliary_loss_clip": 0.01610477, + "auxiliary_loss_mlp": 0.01061542, + "balance_loss_clip": 1.36366069, + "balance_loss_mlp": 1.03402877, + "epoch": 0.0676386592514655, + "flos": 21038831284320.0, + "grad_norm": 1.8028048107487953, + "language_loss": 0.72601652, + "learning_rate": 3.985158415226128e-06, + "loss": 0.75273669, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.27514648, + "step": 1125, + "time_per_iteration": 2.8153722286224365 + }, + { + "auxiliary_loss_clip": 0.01619584, + "auxiliary_loss_mlp": 0.01075282, + "balance_loss_clip": 1.37156284, + "balance_loss_mlp": 1.04524183, + "epoch": 0.06769878250413347, + "flos": 25561796387040.0, + "grad_norm": 2.9534745253374917, + "language_loss": 0.81754589, + "learning_rate": 3.985111019116736e-06, + "loss": 0.84449458, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.30004883, + "step": 1126, + "time_per_iteration": 2.836240291595459 + }, + { + "auxiliary_loss_clip": 0.01408962, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.23683488, + "balance_loss_mlp": 1.01278436, + "epoch": 0.06775890575680145, + "flos": 70671697049880.0, + "grad_norm": 0.9742815955324282, + "language_loss": 0.59761596, + "learning_rate": 3.985063547731735e-06, + "loss": 0.62192643, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.09277344, + "step": 1127, + "time_per_iteration": 3.337501287460327 + }, + { + "auxiliary_loss_clip": 0.01612787, + "auxiliary_loss_mlp": 0.01059874, + "balance_loss_clip": 1.36799896, + "balance_loss_mlp": 1.03073883, + "epoch": 0.06781902900946941, + "flos": 24239834033400.0, + "grad_norm": 1.9457830505742726, + "language_loss": 0.82167304, + "learning_rate": 3.985016001072925e-06, + "loss": 0.84839958, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.29174805, + "step": 1128, + "time_per_iteration": 2.821357250213623 + }, + { + "auxiliary_loss_clip": 0.01630446, + "auxiliary_loss_mlp": 0.01069532, + "balance_loss_clip": 1.37704384, + "balance_loss_mlp": 1.03693986, + "epoch": 0.06787915226213738, + "flos": 22422443774400.0, + "grad_norm": 2.5074514269820356, + "language_loss": 0.76903319, + "learning_rate": 3.984968379142109e-06, + "loss": 0.79603297, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.32592773, + "step": 1129, + "time_per_iteration": 2.7891130447387695 + }, + { + "auxiliary_loss_clip": 0.01615019, + "auxiliary_loss_mlp": 0.01069192, + "balance_loss_clip": 1.36384296, + "balance_loss_mlp": 1.03916287, + "epoch": 0.06793927551480534, + "flos": 37714081589640.0, + "grad_norm": 1.8753617105463938, + "language_loss": 0.73055404, + "learning_rate": 3.984920681941094e-06, + "loss": 0.7573961, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.30041504, + "step": 1130, + "time_per_iteration": 2.9346909523010254 + }, + { + "auxiliary_loss_clip": 0.01611248, + "auxiliary_loss_mlp": 0.01068579, + "balance_loss_clip": 1.36452508, + "balance_loss_mlp": 1.03844261, + "epoch": 0.06799939876747332, + "flos": 20636304971400.0, + "grad_norm": 2.1262414289227403, + "language_loss": 0.81414974, + "learning_rate": 3.984872909471688e-06, + "loss": 0.84094799, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.30151367, + "step": 1131, + "time_per_iteration": 2.8785903453826904 + }, + { + "auxiliary_loss_clip": 0.01603615, + "auxiliary_loss_mlp": 0.01068888, + "balance_loss_clip": 1.36053753, + "balance_loss_mlp": 1.03899062, + "epoch": 0.06805952202014129, + "flos": 14868661870080.0, + "grad_norm": 1.9461380682123244, + "language_loss": 0.81039935, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83712435, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.29931641, + "step": 1132, + "time_per_iteration": 2.870535373687744 + }, + { + "auxiliary_loss_clip": 0.01617471, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_clip": 1.37023425, + "balance_loss_mlp": 1.03818309, + "epoch": 0.06811964527280925, + "flos": 48918237622200.0, + "grad_norm": 1.5971125398456725, + "language_loss": 0.64116263, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.6680001, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.28088379, + "step": 1133, + "time_per_iteration": 3.0714504718780518 + }, + { + "auxiliary_loss_clip": 0.01632755, + "auxiliary_loss_mlp": 0.01061817, + "balance_loss_clip": 1.37914324, + "balance_loss_mlp": 1.02893925, + "epoch": 0.06817976852547723, + "flos": 15381470153520.0, + "grad_norm": 1.8736104497478305, + "language_loss": 0.75836867, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.78531432, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.32910156, + "step": 1134, + "time_per_iteration": 2.8025150299072266 + }, + { + "auxiliary_loss_clip": 0.01612256, + "auxiliary_loss_mlp": 0.01067366, + "balance_loss_clip": 1.36849427, + "balance_loss_mlp": 1.03962612, + "epoch": 0.0682398917781452, + "flos": 20160352097640.0, + "grad_norm": 1.940259732981038, + "language_loss": 0.87815452, + "learning_rate": 3.984681066946423e-06, + "loss": 0.90495074, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.27709961, + "step": 1135, + "time_per_iteration": 2.8318428993225098 + }, + { + "auxiliary_loss_clip": 0.01621882, + "auxiliary_loss_mlp": 0.01059483, + "balance_loss_clip": 1.37169385, + "balance_loss_mlp": 1.02941895, + "epoch": 0.06830001503081316, + "flos": 23445867489840.0, + "grad_norm": 2.7649825132421517, + "language_loss": 0.79088473, + "learning_rate": 3.984632918162291e-06, + "loss": 0.81769836, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.30078125, + "step": 1136, + "time_per_iteration": 2.8191592693328857 + }, + { + "auxiliary_loss_clip": 0.01624524, + "auxiliary_loss_mlp": 0.0106492, + "balance_loss_clip": 1.37722182, + "balance_loss_mlp": 1.03399706, + "epoch": 0.06836013828348114, + "flos": 34356479712480.0, + "grad_norm": 2.3399613368831775, + "language_loss": 0.85000026, + "learning_rate": 3.984584694120679e-06, + "loss": 0.87689465, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.30932617, + "step": 1137, + "time_per_iteration": 2.9376399517059326 + }, + { + "auxiliary_loss_clip": 0.01613348, + "auxiliary_loss_mlp": 0.01067749, + "balance_loss_clip": 1.36974823, + "balance_loss_mlp": 1.03961539, + "epoch": 0.06842026153614911, + "flos": 23154069839400.0, + "grad_norm": 2.2747141467382415, + "language_loss": 0.8008455, + "learning_rate": 3.984536394823418e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.28161621, + "step": 1138, + "time_per_iteration": 2.83107852935791 + }, + { + "auxiliary_loss_clip": 0.01625127, + "auxiliary_loss_mlp": 0.01062763, + "balance_loss_clip": 1.37675095, + "balance_loss_mlp": 1.03386617, + "epoch": 0.06848038478881707, + "flos": 24614479384200.0, + "grad_norm": 2.1936091964887643, + "language_loss": 0.86367691, + "learning_rate": 3.984488020272336e-06, + "loss": 0.89055574, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2890625, + "step": 1139, + "time_per_iteration": 2.8279552459716797 + }, + { + "auxiliary_loss_clip": 0.01614406, + "auxiliary_loss_mlp": 0.01064698, + "balance_loss_clip": 1.36940217, + "balance_loss_mlp": 1.03505063, + "epoch": 0.06854050804148504, + "flos": 40888543452480.0, + "grad_norm": 1.7160640684614195, + "language_loss": 0.75044298, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77723396, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.29626465, + "step": 1140, + "time_per_iteration": 4.312354326248169 + }, + { + "auxiliary_loss_clip": 0.01621565, + "auxiliary_loss_mlp": 0.01073641, + "balance_loss_clip": 1.37546003, + "balance_loss_mlp": 1.04276538, + "epoch": 0.06860063129415302, + "flos": 31692673890000.0, + "grad_norm": 2.290745376196471, + "language_loss": 0.69093764, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.71788973, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.30883789, + "step": 1141, + "time_per_iteration": 4.758192539215088 + }, + { + "auxiliary_loss_clip": 0.01630837, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.37813663, + "balance_loss_mlp": 1.03423238, + "epoch": 0.06866075454682098, + "flos": 26547674350680.0, + "grad_norm": 1.898075318870065, + "language_loss": 0.79460365, + "learning_rate": 3.984342445114538e-06, + "loss": 0.82153946, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.28540039, + "step": 1142, + "time_per_iteration": 2.945744514465332 + }, + { + "auxiliary_loss_clip": 0.01610648, + "auxiliary_loss_mlp": 0.01066779, + "balance_loss_clip": 1.36784673, + "balance_loss_mlp": 1.0383954, + "epoch": 0.06872087779948895, + "flos": 29795928249600.0, + "grad_norm": 1.7191404910318229, + "language_loss": 0.69554842, + "learning_rate": 3.984293769566553e-06, + "loss": 0.7223227, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.28381348, + "step": 1143, + "time_per_iteration": 4.464113235473633 + }, + { + "auxiliary_loss_clip": 0.01607632, + "auxiliary_loss_mlp": 0.01068345, + "balance_loss_clip": 1.36929488, + "balance_loss_mlp": 1.04133248, + "epoch": 0.06878100105215693, + "flos": 26946545911200.0, + "grad_norm": 1.6667268232529018, + "language_loss": 0.74998546, + "learning_rate": 3.98424501877395e-06, + "loss": 0.77674526, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.26989746, + "step": 1144, + "time_per_iteration": 2.855591058731079 + }, + { + "auxiliary_loss_clip": 0.01623368, + "auxiliary_loss_mlp": 0.01072056, + "balance_loss_clip": 1.37283337, + "balance_loss_mlp": 1.04273081, + "epoch": 0.06884112430482489, + "flos": 10674349827480.0, + "grad_norm": 2.0284393084423074, + "language_loss": 0.92115247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.94810677, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.29321289, + "step": 1145, + "time_per_iteration": 2.908156394958496 + }, + { + "auxiliary_loss_clip": 0.01636264, + "auxiliary_loss_mlp": 0.01072389, + "balance_loss_clip": 1.38442421, + "balance_loss_mlp": 1.04168057, + "epoch": 0.06890124755749286, + "flos": 20198506974840.0, + "grad_norm": 2.495027036282462, + "language_loss": 0.83004546, + "learning_rate": 3.984147291462285e-06, + "loss": 0.85713196, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.30664062, + "step": 1146, + "time_per_iteration": 2.86258864402771 + }, + { + "auxiliary_loss_clip": 0.01605459, + "auxiliary_loss_mlp": 0.01069867, + "balance_loss_clip": 1.36459541, + "balance_loss_mlp": 1.04229414, + "epoch": 0.06896137081016084, + "flos": 20454139557720.0, + "grad_norm": 1.846821826348696, + "language_loss": 0.85825306, + "learning_rate": 3.98409831494693e-06, + "loss": 0.88500631, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.27587891, + "step": 1147, + "time_per_iteration": 2.785552740097046 + }, + { + "auxiliary_loss_clip": 0.01619908, + "auxiliary_loss_mlp": 0.01072227, + "balance_loss_clip": 1.37521231, + "balance_loss_mlp": 1.04209089, + "epoch": 0.0690214940628288, + "flos": 18373238694000.0, + "grad_norm": 1.8057256255498697, + "language_loss": 0.85838264, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88530397, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.30102539, + "step": 1148, + "time_per_iteration": 2.774078607559204 + }, + { + "auxiliary_loss_clip": 0.01614507, + "auxiliary_loss_mlp": 0.01066614, + "balance_loss_clip": 1.36903393, + "balance_loss_mlp": 1.03790808, + "epoch": 0.06908161731549677, + "flos": 20562878410560.0, + "grad_norm": 2.170555777812611, + "language_loss": 0.69530326, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.72211444, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.28662109, + "step": 1149, + "time_per_iteration": 2.7875940799713135 + }, + { + "auxiliary_loss_clip": 0.01622421, + "auxiliary_loss_mlp": 0.01063274, + "balance_loss_clip": 1.37285733, + "balance_loss_mlp": 1.03354359, + "epoch": 0.06914174056816474, + "flos": 27569473731720.0, + "grad_norm": 1.893104590912188, + "language_loss": 0.84582114, + "learning_rate": 3.983950933985064e-06, + "loss": 0.87267804, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.29736328, + "step": 1150, + "time_per_iteration": 2.8061509132385254 + }, + { + "auxiliary_loss_clip": 0.01633355, + "auxiliary_loss_mlp": 0.01077593, + "balance_loss_clip": 1.3848207, + "balance_loss_mlp": 1.04852974, + "epoch": 0.06920186382083271, + "flos": 15308165417760.0, + "grad_norm": 3.003416148993441, + "language_loss": 0.84105408, + "learning_rate": 3.983901656532052e-06, + "loss": 0.86816359, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.29077148, + "step": 1151, + "time_per_iteration": 2.788503885269165 + }, + { + "auxiliary_loss_clip": 0.01630581, + "auxiliary_loss_mlp": 0.01075107, + "balance_loss_clip": 1.38321614, + "balance_loss_mlp": 1.04790378, + "epoch": 0.06926198707350067, + "flos": 25196612784120.0, + "grad_norm": 3.2459211519049416, + "language_loss": 0.85812747, + "learning_rate": 3.983852303849291e-06, + "loss": 0.88518441, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.2722168, + "step": 1152, + "time_per_iteration": 2.827958822250366 + }, + { + "auxiliary_loss_clip": 0.01618879, + "auxiliary_loss_mlp": 0.01071838, + "balance_loss_clip": 1.37381637, + "balance_loss_mlp": 1.04381216, + "epoch": 0.06932211032616864, + "flos": 13259409393960.0, + "grad_norm": 1.9374986944299584, + "language_loss": 0.91208488, + "learning_rate": 3.983802875938651e-06, + "loss": 0.93899214, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.28039551, + "step": 1153, + "time_per_iteration": 2.753959894180298 + }, + { + "auxiliary_loss_clip": 0.0162609, + "auxiliary_loss_mlp": 0.01063944, + "balance_loss_clip": 1.37764287, + "balance_loss_mlp": 1.03501248, + "epoch": 0.06938223357883662, + "flos": 24832850473800.0, + "grad_norm": 2.2178493624279936, + "language_loss": 0.81923401, + "learning_rate": 3.983753372802008e-06, + "loss": 0.84613431, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.28918457, + "step": 1154, + "time_per_iteration": 2.9045121669769287 + }, + { + "auxiliary_loss_clip": 0.01626601, + "auxiliary_loss_mlp": 0.01071203, + "balance_loss_clip": 1.38055778, + "balance_loss_mlp": 1.04106688, + "epoch": 0.06944235683150458, + "flos": 27273249770040.0, + "grad_norm": 2.152503276239238, + "language_loss": 0.75610793, + "learning_rate": 3.983703794441237e-06, + "loss": 0.78308594, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.30126953, + "step": 1155, + "time_per_iteration": 2.837709426879883 + }, + { + "auxiliary_loss_clip": 0.01621567, + "auxiliary_loss_mlp": 0.01067315, + "balance_loss_clip": 1.37689388, + "balance_loss_mlp": 1.03934896, + "epoch": 0.06950248008417255, + "flos": 25813083875400.0, + "grad_norm": 1.6485831776072886, + "language_loss": 0.71282816, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73971701, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.27978516, + "step": 1156, + "time_per_iteration": 2.8431320190429688 + }, + { + "auxiliary_loss_clip": 0.01621226, + "auxiliary_loss_mlp": 0.01070286, + "balance_loss_clip": 1.37227786, + "balance_loss_mlp": 1.03948236, + "epoch": 0.06956260333684053, + "flos": 22276443428280.0, + "grad_norm": 1.8266235174739203, + "language_loss": 0.75492835, + "learning_rate": 3.98360441205484e-06, + "loss": 0.78184348, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30786133, + "step": 1157, + "time_per_iteration": 2.850950241088867 + }, + { + "auxiliary_loss_clip": 0.01629674, + "auxiliary_loss_mlp": 0.01071609, + "balance_loss_clip": 1.38272595, + "balance_loss_mlp": 1.04187775, + "epoch": 0.0696227265895085, + "flos": 29687676697080.0, + "grad_norm": 1.555524175723284, + "language_loss": 0.72151792, + "learning_rate": 3.983554608032982e-06, + "loss": 0.74853075, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.29760742, + "step": 1158, + "time_per_iteration": 2.8785839080810547 + }, + { + "auxiliary_loss_clip": 0.01632182, + "auxiliary_loss_mlp": 0.01063291, + "balance_loss_clip": 1.3827076, + "balance_loss_mlp": 1.0323205, + "epoch": 0.06968284984217646, + "flos": 25529489113680.0, + "grad_norm": 1.8859673355684716, + "language_loss": 0.80251443, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82946914, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30981445, + "step": 1159, + "time_per_iteration": 2.878901958465576 + }, + { + "auxiliary_loss_clip": 0.01629907, + "auxiliary_loss_mlp": 0.01071899, + "balance_loss_clip": 1.38334203, + "balance_loss_mlp": 1.0401895, + "epoch": 0.06974297309484444, + "flos": 20702300202360.0, + "grad_norm": 3.03671543121584, + "language_loss": 0.82414532, + "learning_rate": 3.983454774341387e-06, + "loss": 0.85116339, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.31677246, + "step": 1160, + "time_per_iteration": 2.781386613845825 + }, + { + "auxiliary_loss_clip": 0.01622189, + "auxiliary_loss_mlp": 0.01073741, + "balance_loss_clip": 1.37517834, + "balance_loss_mlp": 1.03852689, + "epoch": 0.0698030963475124, + "flos": 26510818941000.0, + "grad_norm": 1.9770256654609384, + "language_loss": 0.76763034, + "learning_rate": 3.983404744675437e-06, + "loss": 0.79458964, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.35168457, + "step": 1161, + "time_per_iteration": 2.856696605682373 + }, + { + "auxiliary_loss_clip": 0.01619783, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_clip": 1.37489319, + "balance_loss_mlp": 1.03828251, + "epoch": 0.06986321960018037, + "flos": 23045777678520.0, + "grad_norm": 1.9962408290666365, + "language_loss": 0.8373282, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.86421716, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.30822754, + "step": 1162, + "time_per_iteration": 2.795883893966675 + }, + { + "auxiliary_loss_clip": 0.01620883, + "auxiliary_loss_mlp": 0.01061962, + "balance_loss_clip": 1.37736738, + "balance_loss_mlp": 1.03273153, + "epoch": 0.06992334285284833, + "flos": 28590704595720.0, + "grad_norm": 1.836122657480585, + "language_loss": 0.80157697, + "learning_rate": 3.983304459712716e-06, + "loss": 0.82840544, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.29223633, + "step": 1163, + "time_per_iteration": 2.8783209323883057 + }, + { + "auxiliary_loss_clip": 0.01628294, + "auxiliary_loss_mlp": 0.01068512, + "balance_loss_clip": 1.37943649, + "balance_loss_mlp": 1.03537178, + "epoch": 0.06998346610551631, + "flos": 20600302337280.0, + "grad_norm": 2.8568952598062385, + "language_loss": 0.79370821, + "learning_rate": 3.983254204419749e-06, + "loss": 0.82067633, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.33154297, + "step": 1164, + "time_per_iteration": 2.8046300411224365 + }, + { + "auxiliary_loss_clip": 0.01623819, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_clip": 1.37630248, + "balance_loss_mlp": 1.04506135, + "epoch": 0.07004358935818428, + "flos": 22533984604080.0, + "grad_norm": 1.6394309332366384, + "language_loss": 0.73050213, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75749558, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.30444336, + "step": 1165, + "time_per_iteration": 2.884996175765991 + }, + { + "auxiliary_loss_clip": 0.01621673, + "auxiliary_loss_mlp": 0.01057381, + "balance_loss_clip": 1.37612212, + "balance_loss_mlp": 1.0300343, + "epoch": 0.07010371261085224, + "flos": 28955969415360.0, + "grad_norm": 3.3430969709187663, + "language_loss": 0.81704271, + "learning_rate": 3.983153468220128e-06, + "loss": 0.84383321, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.27355957, + "step": 1166, + "time_per_iteration": 2.942800521850586 + }, + { + "auxiliary_loss_clip": 0.01626367, + "auxiliary_loss_mlp": 0.01055299, + "balance_loss_clip": 1.38046646, + "balance_loss_mlp": 1.02583086, + "epoch": 0.07016383586352022, + "flos": 23664401012880.0, + "grad_norm": 1.9410800336436664, + "language_loss": 0.85190976, + "learning_rate": 3.983102987317295e-06, + "loss": 0.87872642, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.29443359, + "step": 1167, + "time_per_iteration": 2.823213577270508 + }, + { + "auxiliary_loss_clip": 0.01628546, + "auxiliary_loss_mlp": 0.01064694, + "balance_loss_clip": 1.38096499, + "balance_loss_mlp": 1.03532124, + "epoch": 0.07022395911618819, + "flos": 19796914654200.0, + "grad_norm": 2.0142567215256326, + "language_loss": 0.909558, + "learning_rate": 3.983052431214997e-06, + "loss": 0.93649042, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.29370117, + "step": 1168, + "time_per_iteration": 2.8230197429656982 + }, + { + "auxiliary_loss_clip": 0.01637197, + "auxiliary_loss_mlp": 0.01065428, + "balance_loss_clip": 1.38301182, + "balance_loss_mlp": 1.03228807, + "epoch": 0.07028408236885615, + "flos": 21694147594920.0, + "grad_norm": 2.0279097714182734, + "language_loss": 0.89443606, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9214623, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.33154297, + "step": 1169, + "time_per_iteration": 2.7641870975494385 + }, + { + "auxiliary_loss_clip": 0.01632261, + "auxiliary_loss_mlp": 0.0106492, + "balance_loss_clip": 1.3845911, + "balance_loss_mlp": 1.03502202, + "epoch": 0.07034420562152413, + "flos": 25635872681640.0, + "grad_norm": 2.0815885801176357, + "language_loss": 0.84307534, + "learning_rate": 3.982951093419681e-06, + "loss": 0.87004721, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.29907227, + "step": 1170, + "time_per_iteration": 2.8323614597320557 + }, + { + "auxiliary_loss_clip": 0.01624041, + "auxiliary_loss_mlp": 0.01070882, + "balance_loss_clip": 1.3803997, + "balance_loss_mlp": 1.04098415, + "epoch": 0.0704043288741921, + "flos": 20814937457760.0, + "grad_norm": 1.876510503910318, + "language_loss": 0.76392436, + "learning_rate": 3.982900311730506e-06, + "loss": 0.79087365, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.29907227, + "step": 1171, + "time_per_iteration": 2.7659904956817627 + }, + { + "auxiliary_loss_clip": 0.01621816, + "auxiliary_loss_mlp": 0.01059376, + "balance_loss_clip": 1.37827563, + "balance_loss_mlp": 1.03109956, + "epoch": 0.07046445212686006, + "flos": 25598692405080.0, + "grad_norm": 5.145248028388511, + "language_loss": 0.89540386, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.92221582, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.28295898, + "step": 1172, + "time_per_iteration": 2.8278167247772217 + }, + { + "auxiliary_loss_clip": 0.016266, + "auxiliary_loss_mlp": 0.01064842, + "balance_loss_clip": 1.37443829, + "balance_loss_mlp": 1.03470564, + "epoch": 0.07052457537952803, + "flos": 25562527337520.0, + "grad_norm": 1.7500127456837713, + "language_loss": 0.82196045, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84887493, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.30126953, + "step": 1173, + "time_per_iteration": 2.8398964405059814 + }, + { + "auxiliary_loss_clip": 0.01623031, + "auxiliary_loss_mlp": 0.01060022, + "balance_loss_clip": 1.37813866, + "balance_loss_mlp": 1.03165054, + "epoch": 0.070584698632196, + "flos": 17973148882680.0, + "grad_norm": 2.392438561635411, + "language_loss": 0.82744932, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.85427988, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.28369141, + "step": 1174, + "time_per_iteration": 2.758960723876953 + }, + { + "auxiliary_loss_clip": 0.01617236, + "auxiliary_loss_mlp": 0.0107334, + "balance_loss_clip": 1.37302089, + "balance_loss_mlp": 1.04340625, + "epoch": 0.07064482188486397, + "flos": 25375935612600.0, + "grad_norm": 2.019048508938711, + "language_loss": 0.85309076, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87999648, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29919434, + "step": 1175, + "time_per_iteration": 2.8403868675231934 + }, + { + "auxiliary_loss_clip": 0.01631811, + "auxiliary_loss_mlp": 0.01075974, + "balance_loss_clip": 1.38714683, + "balance_loss_mlp": 1.04822195, + "epoch": 0.07070494513753194, + "flos": 24905261825640.0, + "grad_norm": 1.7335810560357507, + "language_loss": 0.83746356, + "learning_rate": 3.982645275446563e-06, + "loss": 0.86454141, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.27758789, + "step": 1176, + "time_per_iteration": 2.8130905628204346 + }, + { + "auxiliary_loss_clip": 0.01622816, + "auxiliary_loss_mlp": 0.01068597, + "balance_loss_clip": 1.3776958, + "balance_loss_mlp": 1.03743553, + "epoch": 0.07076506839019991, + "flos": 22342560484320.0, + "grad_norm": 3.0795334673960673, + "language_loss": 0.74915934, + "learning_rate": 3.982594042635701e-06, + "loss": 0.77607346, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.31152344, + "step": 1177, + "time_per_iteration": 2.8732073307037354 + }, + { + "auxiliary_loss_clip": 0.01628785, + "auxiliary_loss_mlp": 0.01071102, + "balance_loss_clip": 1.38013601, + "balance_loss_mlp": 1.04060817, + "epoch": 0.07082519164286788, + "flos": 18665401819680.0, + "grad_norm": 1.797013838568258, + "language_loss": 0.86523831, + "learning_rate": 3.982542734644673e-06, + "loss": 0.89223719, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.3046875, + "step": 1178, + "time_per_iteration": 4.281923532485962 + }, + { + "auxiliary_loss_clip": 0.01433289, + "auxiliary_loss_mlp": 0.01023516, + "balance_loss_clip": 1.26265752, + "balance_loss_mlp": 1.01054609, + "epoch": 0.07088531489553584, + "flos": 63670177773720.0, + "grad_norm": 0.8339083324291694, + "language_loss": 0.63321984, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65778792, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.12988281, + "step": 1179, + "time_per_iteration": 3.40169095993042 + }, + { + "auxiliary_loss_clip": 0.01625015, + "auxiliary_loss_mlp": 0.01072446, + "balance_loss_clip": 1.37620831, + "balance_loss_mlp": 1.04414511, + "epoch": 0.07094543814820382, + "flos": 21576759161400.0, + "grad_norm": 2.9191873962290527, + "language_loss": 0.85256791, + "learning_rate": 3.98243989312991e-06, + "loss": 0.87954253, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.28320312, + "step": 1180, + "time_per_iteration": 4.849510908126831 + }, + { + "auxiliary_loss_clip": 0.01616509, + "auxiliary_loss_mlp": 0.01067399, + "balance_loss_clip": 1.37051034, + "balance_loss_mlp": 1.03566575, + "epoch": 0.07100556140087179, + "flos": 22094927748360.0, + "grad_norm": 2.3223249834154474, + "language_loss": 0.88935375, + "learning_rate": 3.982388359610074e-06, + "loss": 0.91619289, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.31713867, + "step": 1181, + "time_per_iteration": 5.711491823196411 + }, + { + "auxiliary_loss_clip": 0.01608305, + "auxiliary_loss_mlp": 0.01066806, + "balance_loss_clip": 1.36704791, + "balance_loss_mlp": 1.03717089, + "epoch": 0.07106568465353975, + "flos": 47930694715800.0, + "grad_norm": 1.8142816374910395, + "language_loss": 0.84055394, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.86730504, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.29650879, + "step": 1182, + "time_per_iteration": 3.0506882667541504 + }, + { + "auxiliary_loss_clip": 0.01614876, + "auxiliary_loss_mlp": 0.01066379, + "balance_loss_clip": 1.37155843, + "balance_loss_mlp": 1.03634977, + "epoch": 0.07112580790620772, + "flos": 23446070531640.0, + "grad_norm": 1.9121792193673233, + "language_loss": 0.80095291, + "learning_rate": 3.982285067055262e-06, + "loss": 0.82776546, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.30041504, + "step": 1183, + "time_per_iteration": 2.892096757888794 + }, + { + "auxiliary_loss_clip": 0.01620481, + "auxiliary_loss_mlp": 0.01066254, + "balance_loss_clip": 1.36891675, + "balance_loss_mlp": 1.03583205, + "epoch": 0.0711859311588757, + "flos": 31875245387280.0, + "grad_norm": 2.0571416754765504, + "language_loss": 0.79478604, + "learning_rate": 3.982233308024204e-06, + "loss": 0.82165337, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.30371094, + "step": 1184, + "time_per_iteration": 2.8924810886383057 + }, + { + "auxiliary_loss_clip": 0.01599832, + "auxiliary_loss_mlp": 0.01063433, + "balance_loss_clip": 1.36049795, + "balance_loss_mlp": 1.03379798, + "epoch": 0.07124605441154366, + "flos": 19615155324120.0, + "grad_norm": 1.7117316534748843, + "language_loss": 0.77195883, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79859149, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.29614258, + "step": 1185, + "time_per_iteration": 2.793976068496704 + }, + { + "auxiliary_loss_clip": 0.01612859, + "auxiliary_loss_mlp": 0.01072747, + "balance_loss_clip": 1.36871409, + "balance_loss_mlp": 1.04338598, + "epoch": 0.07130617766421163, + "flos": 14688608091120.0, + "grad_norm": 2.31823720497792, + "language_loss": 0.66428232, + "learning_rate": 3.982129564464596e-06, + "loss": 0.69113839, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29333496, + "step": 1186, + "time_per_iteration": 2.8996336460113525 + }, + { + "auxiliary_loss_clip": 0.0160103, + "auxiliary_loss_mlp": 0.01064121, + "balance_loss_clip": 1.36020577, + "balance_loss_mlp": 1.03422379, + "epoch": 0.07136630091687961, + "flos": 26073548853120.0, + "grad_norm": 1.8351775010153113, + "language_loss": 0.70286226, + "learning_rate": 3.98207757993998e-06, + "loss": 0.72951382, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.2989502, + "step": 1187, + "time_per_iteration": 2.889500856399536 + }, + { + "auxiliary_loss_clip": 0.01605808, + "auxiliary_loss_mlp": 0.0106374, + "balance_loss_clip": 1.37045217, + "balance_loss_mlp": 1.0337466, + "epoch": 0.07142642416954757, + "flos": 15673430237400.0, + "grad_norm": 2.454130816684428, + "language_loss": 0.79093015, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81762564, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.30004883, + "step": 1188, + "time_per_iteration": 2.8170158863067627 + }, + { + "auxiliary_loss_clip": 0.01604414, + "auxiliary_loss_mlp": 0.01064426, + "balance_loss_clip": 1.36607623, + "balance_loss_mlp": 1.03558922, + "epoch": 0.07148654742221554, + "flos": 19760059244520.0, + "grad_norm": 1.7984770745754133, + "language_loss": 0.85753155, + "learning_rate": 3.981973385410981e-06, + "loss": 0.88422, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.28808594, + "step": 1189, + "time_per_iteration": 2.8367440700531006 + }, + { + "auxiliary_loss_clip": 0.01611617, + "auxiliary_loss_mlp": 0.01068984, + "balance_loss_clip": 1.37241411, + "balance_loss_mlp": 1.03729784, + "epoch": 0.07154667067488352, + "flos": 23476387995360.0, + "grad_norm": 1.6807626559336706, + "language_loss": 0.77617872, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.80298471, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.31665039, + "step": 1190, + "time_per_iteration": 2.8756539821624756 + }, + { + "auxiliary_loss_clip": 0.01621265, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_clip": 1.37441754, + "balance_loss_mlp": 1.04403019, + "epoch": 0.07160679392755148, + "flos": 18337560926760.0, + "grad_norm": 1.9142458442757615, + "language_loss": 0.76452637, + "learning_rate": 3.981868890255468e-06, + "loss": 0.79150879, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.32910156, + "step": 1191, + "time_per_iteration": 2.773775339126587 + }, + { + "auxiliary_loss_clip": 0.01610981, + "auxiliary_loss_mlp": 0.01069953, + "balance_loss_clip": 1.36597955, + "balance_loss_mlp": 1.03922057, + "epoch": 0.07166691718021945, + "flos": 17751285474120.0, + "grad_norm": 2.6461071278594868, + "language_loss": 0.74656856, + "learning_rate": 3.981816529947719e-06, + "loss": 0.7733779, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.30737305, + "step": 1192, + "time_per_iteration": 2.81162691116333 + }, + { + "auxiliary_loss_clip": 0.01604819, + "auxiliary_loss_mlp": 0.01069322, + "balance_loss_clip": 1.36155915, + "balance_loss_mlp": 1.03940094, + "epoch": 0.07172704043288743, + "flos": 22456578423960.0, + "grad_norm": 1.9258931682031837, + "language_loss": 0.78858125, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.81532264, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.2989502, + "step": 1193, + "time_per_iteration": 2.804783821105957 + }, + { + "auxiliary_loss_clip": 0.01617813, + "auxiliary_loss_mlp": 0.01063715, + "balance_loss_clip": 1.37683213, + "balance_loss_mlp": 1.03269637, + "epoch": 0.07178716368555539, + "flos": 23227496400240.0, + "grad_norm": 2.1800084346165067, + "language_loss": 0.86109287, + "learning_rate": 3.981711583882166e-06, + "loss": 0.8879081, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.30981445, + "step": 1194, + "time_per_iteration": 2.871061325073242 + }, + { + "auxiliary_loss_clip": 0.01613188, + "auxiliary_loss_mlp": 0.01084696, + "balance_loss_clip": 1.37397981, + "balance_loss_mlp": 1.05422604, + "epoch": 0.07184728693822336, + "flos": 25155696538440.0, + "grad_norm": 1.977995163649277, + "language_loss": 0.8259511, + "learning_rate": 3.981658998128341e-06, + "loss": 0.85292995, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.30444336, + "step": 1195, + "time_per_iteration": 2.8566060066223145 + }, + { + "auxiliary_loss_clip": 0.01617551, + "auxiliary_loss_mlp": 0.01059009, + "balance_loss_clip": 1.37641096, + "balance_loss_mlp": 1.03085196, + "epoch": 0.07190741019089132, + "flos": 22716434276280.0, + "grad_norm": 1.7670866327037842, + "language_loss": 0.79697049, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82373607, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.28149414, + "step": 1196, + "time_per_iteration": 2.849759340286255 + }, + { + "auxiliary_loss_clip": 0.0161377, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.37314713, + "balance_loss_mlp": 1.03888452, + "epoch": 0.0719675334435593, + "flos": 29355612534720.0, + "grad_norm": 2.164383487020967, + "language_loss": 0.72032928, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.74715745, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.30151367, + "step": 1197, + "time_per_iteration": 2.9105584621429443 + }, + { + "auxiliary_loss_clip": 0.01611568, + "auxiliary_loss_mlp": 0.01069622, + "balance_loss_clip": 1.37389052, + "balance_loss_mlp": 1.04122639, + "epoch": 0.07202765669622727, + "flos": 17644333389120.0, + "grad_norm": 1.8807383155540338, + "language_loss": 0.86052984, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88734174, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28356934, + "step": 1198, + "time_per_iteration": 2.8747808933258057 + }, + { + "auxiliary_loss_clip": 0.01619974, + "auxiliary_loss_mlp": 0.0106977, + "balance_loss_clip": 1.38028598, + "balance_loss_mlp": 1.04032528, + "epoch": 0.07208777994889523, + "flos": 21439042920720.0, + "grad_norm": 1.8450280746001992, + "language_loss": 0.83744085, + "learning_rate": 3.981447903685947e-06, + "loss": 0.86433828, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.29443359, + "step": 1199, + "time_per_iteration": 2.8430118560791016 + }, + { + "auxiliary_loss_clip": 0.0162547, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_clip": 1.38412869, + "balance_loss_mlp": 1.04149592, + "epoch": 0.07214790320156321, + "flos": 26946261652680.0, + "grad_norm": 1.9230503530904666, + "language_loss": 0.76842678, + "learning_rate": 3.981394942228581e-06, + "loss": 0.79539496, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.29821777, + "step": 1200, + "time_per_iteration": 2.8441474437713623 + }, + { + "auxiliary_loss_clip": 0.01622365, + "auxiliary_loss_mlp": 0.01081827, + "balance_loss_clip": 1.38104033, + "balance_loss_mlp": 1.05276358, + "epoch": 0.07220802645423118, + "flos": 23885452254240.0, + "grad_norm": 2.0817082274081176, + "language_loss": 0.82454062, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85158253, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.29052734, + "step": 1201, + "time_per_iteration": 2.881164073944092 + }, + { + "auxiliary_loss_clip": 0.01619041, + "auxiliary_loss_mlp": 0.01069328, + "balance_loss_clip": 1.37288404, + "balance_loss_mlp": 1.03969264, + "epoch": 0.07226814970689914, + "flos": 19248184953360.0, + "grad_norm": 2.2710872675064633, + "language_loss": 0.69422996, + "learning_rate": 3.981288793911775e-06, + "loss": 0.72111368, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.29638672, + "step": 1202, + "time_per_iteration": 2.84975004196167 + }, + { + "auxiliary_loss_clip": 0.01613728, + "auxiliary_loss_mlp": 0.01073625, + "balance_loss_clip": 1.37227678, + "balance_loss_mlp": 1.04401338, + "epoch": 0.07232827295956712, + "flos": 19176748202160.0, + "grad_norm": 1.926301312009928, + "language_loss": 0.88109463, + "learning_rate": 3.98123560705636e-06, + "loss": 0.90796816, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.29614258, + "step": 1203, + "time_per_iteration": 2.8260653018951416 + }, + { + "auxiliary_loss_clip": 0.01619965, + "auxiliary_loss_mlp": 0.01073542, + "balance_loss_clip": 1.37398005, + "balance_loss_mlp": 1.04357314, + "epoch": 0.07238839621223508, + "flos": 17644292780760.0, + "grad_norm": 1.7398416264676564, + "language_loss": 0.79436588, + "learning_rate": 3.981182345072293e-06, + "loss": 0.82130098, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.29968262, + "step": 1204, + "time_per_iteration": 2.827958106994629 + }, + { + "auxiliary_loss_clip": 0.01618279, + "auxiliary_loss_mlp": 0.01069709, + "balance_loss_clip": 1.37799942, + "balance_loss_mlp": 1.04190946, + "epoch": 0.07244851946490305, + "flos": 28298175994800.0, + "grad_norm": 1.46616098155785, + "language_loss": 0.82512581, + "learning_rate": 3.981129007961593e-06, + "loss": 0.85200572, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.27783203, + "step": 1205, + "time_per_iteration": 2.8423891067504883 + }, + { + "auxiliary_loss_clip": 0.01620875, + "auxiliary_loss_mlp": 0.01078283, + "balance_loss_clip": 1.37960339, + "balance_loss_mlp": 1.04995859, + "epoch": 0.07250864271757101, + "flos": 22569946629840.0, + "grad_norm": 1.6589371083281557, + "language_loss": 0.77285671, + "learning_rate": 3.981075595726283e-06, + "loss": 0.79984832, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.28356934, + "step": 1206, + "time_per_iteration": 2.864222764968872 + }, + { + "auxiliary_loss_clip": 0.01609642, + "auxiliary_loss_mlp": 0.01066815, + "balance_loss_clip": 1.36937606, + "balance_loss_mlp": 1.03779972, + "epoch": 0.072568765970239, + "flos": 21767452330680.0, + "grad_norm": 1.6692692110941114, + "language_loss": 0.78165507, + "learning_rate": 3.981022108368387e-06, + "loss": 0.8084197, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.29052734, + "step": 1207, + "time_per_iteration": 2.778046131134033 + }, + { + "auxiliary_loss_clip": 0.01601895, + "auxiliary_loss_mlp": 0.01066746, + "balance_loss_clip": 1.36344802, + "balance_loss_mlp": 1.0383625, + "epoch": 0.07262888922290696, + "flos": 25525144019160.0, + "grad_norm": 1.8619465153294206, + "language_loss": 0.80548, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.83216643, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.28405762, + "step": 1208, + "time_per_iteration": 2.8766205310821533 + }, + { + "auxiliary_loss_clip": 0.01599559, + "auxiliary_loss_mlp": 0.01066563, + "balance_loss_clip": 1.36274672, + "balance_loss_mlp": 1.03991938, + "epoch": 0.07268901247557492, + "flos": 21250867469760.0, + "grad_norm": 2.7038294617890344, + "language_loss": 0.78704625, + "learning_rate": 3.980914908292955e-06, + "loss": 0.81370741, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.26611328, + "step": 1209, + "time_per_iteration": 2.839289903640747 + }, + { + "auxiliary_loss_clip": 0.01615849, + "auxiliary_loss_mlp": 0.01078739, + "balance_loss_clip": 1.37659502, + "balance_loss_mlp": 1.05029571, + "epoch": 0.0727491357282429, + "flos": 25484227773480.0, + "grad_norm": 2.188514027678098, + "language_loss": 0.81850898, + "learning_rate": 3.980861195579486e-06, + "loss": 0.84545487, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.28466797, + "step": 1210, + "time_per_iteration": 2.8703229427337646 + }, + { + "auxiliary_loss_clip": 0.01606839, + "auxiliary_loss_mlp": 0.01070495, + "balance_loss_clip": 1.37127817, + "balance_loss_mlp": 1.04176545, + "epoch": 0.07280925898091087, + "flos": 24467545045800.0, + "grad_norm": 1.8784577373725575, + "language_loss": 0.85184574, + "learning_rate": 3.98080740775156e-06, + "loss": 0.87861907, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.28710938, + "step": 1211, + "time_per_iteration": 2.881141185760498 + }, + { + "auxiliary_loss_clip": 0.01596687, + "auxiliary_loss_mlp": 0.01065515, + "balance_loss_clip": 1.35926628, + "balance_loss_mlp": 1.03571272, + "epoch": 0.07286938223357883, + "flos": 18291731069520.0, + "grad_norm": 2.2532327278570263, + "language_loss": 0.91775399, + "learning_rate": 3.98075354481122e-06, + "loss": 0.94437599, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.29858398, + "step": 1212, + "time_per_iteration": 2.7586610317230225 + }, + { + "auxiliary_loss_clip": 0.01605664, + "auxiliary_loss_mlp": 0.01062207, + "balance_loss_clip": 1.36651003, + "balance_loss_mlp": 1.03321576, + "epoch": 0.07292950548624681, + "flos": 21219737838840.0, + "grad_norm": 1.6463478817967006, + "language_loss": 0.73012906, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.7568078, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.28967285, + "step": 1213, + "time_per_iteration": 2.8306314945220947 + }, + { + "auxiliary_loss_clip": 0.01601137, + "auxiliary_loss_mlp": 0.01063121, + "balance_loss_clip": 1.36140501, + "balance_loss_mlp": 1.03417671, + "epoch": 0.07298962873891478, + "flos": 24647070916080.0, + "grad_norm": 1.8209703424651604, + "language_loss": 0.85404503, + "learning_rate": 3.980645593601465e-06, + "loss": 0.88068759, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.28894043, + "step": 1214, + "time_per_iteration": 2.8361587524414062 + }, + { + "auxiliary_loss_clip": 0.01604434, + "auxiliary_loss_mlp": 0.01063025, + "balance_loss_clip": 1.36325121, + "balance_loss_mlp": 1.03524971, + "epoch": 0.07304975199158274, + "flos": 27058492824480.0, + "grad_norm": 1.997774869065986, + "language_loss": 0.84869003, + "learning_rate": 3.980591505336144e-06, + "loss": 0.87536466, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.27807617, + "step": 1215, + "time_per_iteration": 2.9613802433013916 + }, + { + "auxiliary_loss_clip": 0.01598435, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_clip": 1.35889161, + "balance_loss_mlp": 1.03835857, + "epoch": 0.07310987524425071, + "flos": 33556421914920.0, + "grad_norm": 1.5253891644496873, + "language_loss": 0.8205815, + "learning_rate": 3.980537341966595e-06, + "loss": 0.84725052, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.30114746, + "step": 1216, + "time_per_iteration": 4.363391637802124 + }, + { + "auxiliary_loss_clip": 0.01611173, + "auxiliary_loss_mlp": 0.01071242, + "balance_loss_clip": 1.36903048, + "balance_loss_mlp": 1.04325128, + "epoch": 0.07316999849691869, + "flos": 28116822748320.0, + "grad_norm": 1.8788379892438825, + "language_loss": 0.77232277, + "learning_rate": 3.980483103494872e-06, + "loss": 0.79914695, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.27978516, + "step": 1217, + "time_per_iteration": 2.878166437149048 + }, + { + "auxiliary_loss_clip": 0.01600725, + "auxiliary_loss_mlp": 0.01063363, + "balance_loss_clip": 1.35921979, + "balance_loss_mlp": 1.03599238, + "epoch": 0.07323012174958665, + "flos": 14396729223960.0, + "grad_norm": 1.999045703082806, + "language_loss": 0.86827278, + "learning_rate": 3.98042878992303e-06, + "loss": 0.89491367, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.27380371, + "step": 1218, + "time_per_iteration": 2.8338749408721924 + }, + { + "auxiliary_loss_clip": 0.01594847, + "auxiliary_loss_mlp": 0.01072151, + "balance_loss_clip": 1.35612273, + "balance_loss_mlp": 1.0446136, + "epoch": 0.07329024500225462, + "flos": 21621451984560.0, + "grad_norm": 1.7760977857892764, + "language_loss": 0.87471068, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9013806, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.2755127, + "step": 1219, + "time_per_iteration": 6.676003456115723 + }, + { + "auxiliary_loss_clip": 0.01592458, + "auxiliary_loss_mlp": 0.01066867, + "balance_loss_clip": 1.353351, + "balance_loss_mlp": 1.03888893, + "epoch": 0.0733503682549226, + "flos": 13227995504520.0, + "grad_norm": 2.0537099919663513, + "language_loss": 0.84962314, + "learning_rate": 3.980319937487235e-06, + "loss": 0.87621641, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.27990723, + "step": 1220, + "time_per_iteration": 4.221264839172363 + }, + { + "auxiliary_loss_clip": 0.01601228, + "auxiliary_loss_mlp": 0.01067364, + "balance_loss_clip": 1.35966957, + "balance_loss_mlp": 1.03851581, + "epoch": 0.07341049150759056, + "flos": 20891937554280.0, + "grad_norm": 2.65717633875086, + "language_loss": 0.78059417, + "learning_rate": 3.98026539862741e-06, + "loss": 0.80728006, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.28845215, + "step": 1221, + "time_per_iteration": 2.8238513469696045 + }, + { + "auxiliary_loss_clip": 0.0159496, + "auxiliary_loss_mlp": 0.01058728, + "balance_loss_clip": 1.35605264, + "balance_loss_mlp": 1.03053463, + "epoch": 0.07347061476025853, + "flos": 15417797654520.0, + "grad_norm": 2.0421939047916977, + "language_loss": 0.92498028, + "learning_rate": 3.980210784675722e-06, + "loss": 0.95151722, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.28173828, + "step": 1222, + "time_per_iteration": 2.757739543914795 + }, + { + "auxiliary_loss_clip": 0.01606083, + "auxiliary_loss_mlp": 0.01066202, + "balance_loss_clip": 1.36310673, + "balance_loss_mlp": 1.03908181, + "epoch": 0.0735307380129265, + "flos": 11112553907640.0, + "grad_norm": 2.237453341855543, + "language_loss": 0.91463816, + "learning_rate": 3.980156095634242e-06, + "loss": 0.94136095, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.2713623, + "step": 1223, + "time_per_iteration": 2.765638589859009 + }, + { + "auxiliary_loss_clip": 0.01607591, + "auxiliary_loss_mlp": 0.01079414, + "balance_loss_clip": 1.36896074, + "balance_loss_mlp": 1.05204403, + "epoch": 0.07359086126559447, + "flos": 23737665140280.0, + "grad_norm": 1.7864085739021511, + "language_loss": 0.83074021, + "learning_rate": 3.980101331505045e-06, + "loss": 0.85761029, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.27380371, + "step": 1224, + "time_per_iteration": 2.826908588409424 + }, + { + "auxiliary_loss_clip": 0.01595301, + "auxiliary_loss_mlp": 0.01078179, + "balance_loss_clip": 1.35353744, + "balance_loss_mlp": 1.04933059, + "epoch": 0.07365098451826244, + "flos": 20997833821920.0, + "grad_norm": 1.9063085665552442, + "language_loss": 0.84528375, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.87201858, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.28833008, + "step": 1225, + "time_per_iteration": 2.813711166381836 + }, + { + "auxiliary_loss_clip": 0.01598974, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.3583653, + "balance_loss_mlp": 1.03686392, + "epoch": 0.0737111077709304, + "flos": 19937798346960.0, + "grad_norm": 1.764998967964697, + "language_loss": 0.90394694, + "learning_rate": 3.979991577991808e-06, + "loss": 0.93059427, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2890625, + "step": 1226, + "time_per_iteration": 2.786180019378662 + }, + { + "auxiliary_loss_clip": 0.01618916, + "auxiliary_loss_mlp": 0.01061989, + "balance_loss_clip": 1.36887038, + "balance_loss_mlp": 1.03254426, + "epoch": 0.07377123102359838, + "flos": 16585922248560.0, + "grad_norm": 3.2972312307071423, + "language_loss": 0.77276033, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79956943, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.29394531, + "step": 1227, + "time_per_iteration": 2.822887659072876 + }, + { + "auxiliary_loss_clip": 0.01592022, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_clip": 1.35486317, + "balance_loss_mlp": 1.03349018, + "epoch": 0.07383135427626634, + "flos": 28334300454000.0, + "grad_norm": 1.5644926220418547, + "language_loss": 0.8618139, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.88835299, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.28381348, + "step": 1228, + "time_per_iteration": 2.8953073024749756 + }, + { + "auxiliary_loss_clip": 0.01592145, + "auxiliary_loss_mlp": 0.01059637, + "balance_loss_clip": 1.34987867, + "balance_loss_mlp": 1.03183711, + "epoch": 0.07389147752893431, + "flos": 20051938111680.0, + "grad_norm": 1.910330105778886, + "language_loss": 0.8045224, + "learning_rate": 3.97982638461608e-06, + "loss": 0.83104014, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.27770996, + "step": 1229, + "time_per_iteration": 2.7449519634246826 + }, + { + "auxiliary_loss_clip": 0.01598803, + "auxiliary_loss_mlp": 0.0107116, + "balance_loss_clip": 1.35693657, + "balance_loss_mlp": 1.04207253, + "epoch": 0.07395160078160229, + "flos": 18118743145200.0, + "grad_norm": 2.2036327683967087, + "language_loss": 0.79445922, + "learning_rate": 3.979771170004287e-06, + "loss": 0.82115877, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.29101562, + "step": 1230, + "time_per_iteration": 2.8815736770629883 + }, + { + "auxiliary_loss_clip": 0.01589077, + "auxiliary_loss_mlp": 0.0106326, + "balance_loss_clip": 1.35125923, + "balance_loss_mlp": 1.03391063, + "epoch": 0.07401172403427025, + "flos": 23592233311200.0, + "grad_norm": 3.0193159474958184, + "language_loss": 0.82145095, + "learning_rate": 3.979715880319372e-06, + "loss": 0.8479743, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29345703, + "step": 1231, + "time_per_iteration": 2.799421548843384 + }, + { + "auxiliary_loss_clip": 0.01593181, + "auxiliary_loss_mlp": 0.01072599, + "balance_loss_clip": 1.34776139, + "balance_loss_mlp": 1.04209304, + "epoch": 0.07407184728693822, + "flos": 26365590153720.0, + "grad_norm": 1.930616942160573, + "language_loss": 0.95461124, + "learning_rate": 3.979660515563434e-06, + "loss": 0.981269, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.30505371, + "step": 1232, + "time_per_iteration": 2.8378777503967285 + }, + { + "auxiliary_loss_clip": 0.01585626, + "auxiliary_loss_mlp": 0.01069885, + "balance_loss_clip": 1.3491255, + "balance_loss_mlp": 1.04383814, + "epoch": 0.0741319705396062, + "flos": 22205575194120.0, + "grad_norm": 1.8043301845869746, + "language_loss": 0.81978965, + "learning_rate": 3.979605075738569e-06, + "loss": 0.84634471, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.26074219, + "step": 1233, + "time_per_iteration": 2.8035829067230225 + }, + { + "auxiliary_loss_clip": 0.01595622, + "auxiliary_loss_mlp": 0.01066202, + "balance_loss_clip": 1.35155416, + "balance_loss_mlp": 1.0357796, + "epoch": 0.07419209379227416, + "flos": 39207529358280.0, + "grad_norm": 2.101572109112746, + "language_loss": 0.71557891, + "learning_rate": 3.979549560846883e-06, + "loss": 0.74219716, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30419922, + "step": 1234, + "time_per_iteration": 2.955695152282715 + }, + { + "auxiliary_loss_clip": 0.01588216, + "auxiliary_loss_mlp": 0.01063474, + "balance_loss_clip": 1.34791529, + "balance_loss_mlp": 1.03683019, + "epoch": 0.07425221704494213, + "flos": 22786530951600.0, + "grad_norm": 1.7291738706141486, + "language_loss": 0.77587998, + "learning_rate": 3.979493970890478e-06, + "loss": 0.80239689, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.26623535, + "step": 1235, + "time_per_iteration": 2.8754661083221436 + }, + { + "auxiliary_loss_clip": 0.01583153, + "auxiliary_loss_mlp": 0.010595, + "balance_loss_clip": 1.34753561, + "balance_loss_mlp": 1.02979279, + "epoch": 0.0743123402976101, + "flos": 22278067762680.0, + "grad_norm": 1.8036268623081169, + "language_loss": 0.83391356, + "learning_rate": 3.979438305871464e-06, + "loss": 0.86034006, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.29736328, + "step": 1236, + "time_per_iteration": 2.803776741027832 + }, + { + "auxiliary_loss_clip": 0.01587041, + "auxiliary_loss_mlp": 0.01068201, + "balance_loss_clip": 1.34631336, + "balance_loss_mlp": 1.0406754, + "epoch": 0.07437246355027807, + "flos": 29321071801560.0, + "grad_norm": 1.689878156858221, + "language_loss": 0.76511312, + "learning_rate": 3.979382565791951e-06, + "loss": 0.79166555, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.27502441, + "step": 1237, + "time_per_iteration": 2.8978137969970703 + }, + { + "auxiliary_loss_clip": 0.01574522, + "auxiliary_loss_mlp": 0.01064082, + "balance_loss_clip": 1.3377223, + "balance_loss_mlp": 1.03755784, + "epoch": 0.07443258680294604, + "flos": 31951473924960.0, + "grad_norm": 1.5930141534024769, + "language_loss": 0.78397882, + "learning_rate": 3.979326750654053e-06, + "loss": 0.81036484, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.26538086, + "step": 1238, + "time_per_iteration": 2.905189037322998 + }, + { + "auxiliary_loss_clip": 0.01590306, + "auxiliary_loss_mlp": 0.01057508, + "balance_loss_clip": 1.34652686, + "balance_loss_mlp": 1.02947044, + "epoch": 0.074492710055614, + "flos": 22680553467240.0, + "grad_norm": 1.8703207331642766, + "language_loss": 0.87140334, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.89788151, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.28015137, + "step": 1239, + "time_per_iteration": 2.892028331756592 + }, + { + "auxiliary_loss_clip": 0.01582564, + "auxiliary_loss_mlp": 0.01060465, + "balance_loss_clip": 1.34085584, + "balance_loss_mlp": 1.03171134, + "epoch": 0.07455283330828198, + "flos": 21289469038920.0, + "grad_norm": 1.8210339482408622, + "language_loss": 0.90297192, + "learning_rate": 3.979214895211569e-06, + "loss": 0.92940223, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.28735352, + "step": 1240, + "time_per_iteration": 2.796041250228882 + }, + { + "auxiliary_loss_clip": 0.01581627, + "auxiliary_loss_mlp": 0.01065477, + "balance_loss_clip": 1.34108138, + "balance_loss_mlp": 1.0388453, + "epoch": 0.07461295656094995, + "flos": 24393671793000.0, + "grad_norm": 1.8246052151823144, + "language_loss": 0.89737678, + "learning_rate": 3.979158854911225e-06, + "loss": 0.92384779, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.26635742, + "step": 1241, + "time_per_iteration": 2.8106284141540527 + }, + { + "auxiliary_loss_clip": 0.01477003, + "auxiliary_loss_mlp": 0.01056212, + "balance_loss_clip": 1.29629409, + "balance_loss_mlp": 1.04514968, + "epoch": 0.07467307981361791, + "flos": 62123956118280.0, + "grad_norm": 0.9170305500571599, + "language_loss": 0.63154018, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65687227, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.11083984, + "step": 1242, + "time_per_iteration": 3.3599517345428467 + }, + { + "auxiliary_loss_clip": 0.01601345, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.35305595, + "balance_loss_mlp": 1.02923429, + "epoch": 0.07473320306628589, + "flos": 24868528241040.0, + "grad_norm": 3.1673459952670537, + "language_loss": 0.63971663, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.66630924, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.28674316, + "step": 1243, + "time_per_iteration": 2.800135850906372 + }, + { + "auxiliary_loss_clip": 0.01576739, + "auxiliary_loss_mlp": 0.0106831, + "balance_loss_clip": 1.33867598, + "balance_loss_mlp": 1.04072452, + "epoch": 0.07479332631895386, + "flos": 24902541065520.0, + "grad_norm": 1.8348420041634024, + "language_loss": 0.77048993, + "learning_rate": 3.978990283719296e-06, + "loss": 0.79694039, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27600098, + "step": 1244, + "time_per_iteration": 2.833726167678833 + }, + { + "auxiliary_loss_clip": 0.0157913, + "auxiliary_loss_mlp": 0.01059828, + "balance_loss_clip": 1.33933187, + "balance_loss_mlp": 1.0320996, + "epoch": 0.07485344957162182, + "flos": 17818905039480.0, + "grad_norm": 3.220597657071342, + "language_loss": 0.70436025, + "learning_rate": 3.978933943232123e-06, + "loss": 0.73074979, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.27709961, + "step": 1245, + "time_per_iteration": 2.716365337371826 + }, + { + "auxiliary_loss_clip": 0.01580699, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_clip": 1.34161592, + "balance_loss_mlp": 1.0392946, + "epoch": 0.0749135728242898, + "flos": 25015746837960.0, + "grad_norm": 2.210828984821568, + "language_loss": 0.89048481, + "learning_rate": 3.978877527703576e-06, + "loss": 0.91698468, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.30004883, + "step": 1246, + "time_per_iteration": 2.8199167251586914 + }, + { + "auxiliary_loss_clip": 0.01603564, + "auxiliary_loss_mlp": 0.01074639, + "balance_loss_clip": 1.35376978, + "balance_loss_mlp": 1.04381168, + "epoch": 0.07497369607695777, + "flos": 17826620627880.0, + "grad_norm": 2.687344997782284, + "language_loss": 0.89129704, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.91807902, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.30810547, + "step": 1247, + "time_per_iteration": 2.7711117267608643 + }, + { + "auxiliary_loss_clip": 0.01573957, + "auxiliary_loss_mlp": 0.01071119, + "balance_loss_clip": 1.33727562, + "balance_loss_mlp": 1.04243684, + "epoch": 0.07503381932962573, + "flos": 15125147228520.0, + "grad_norm": 2.0622621369697893, + "language_loss": 0.65434957, + "learning_rate": 3.978764471530921e-06, + "loss": 0.68080032, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28674316, + "step": 1248, + "time_per_iteration": 2.7329938411712646 + }, + { + "auxiliary_loss_clip": 0.01568901, + "auxiliary_loss_mlp": 0.01063595, + "balance_loss_clip": 1.33575833, + "balance_loss_mlp": 1.03763115, + "epoch": 0.0750939425822937, + "flos": 12819743412840.0, + "grad_norm": 3.73861165609065, + "language_loss": 0.7481299, + "learning_rate": 3.978707830891102e-06, + "loss": 0.77445489, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25976562, + "step": 1249, + "time_per_iteration": 2.790360450744629 + }, + { + "auxiliary_loss_clip": 0.01586663, + "auxiliary_loss_mlp": 0.01075836, + "balance_loss_clip": 1.34371138, + "balance_loss_mlp": 1.04738045, + "epoch": 0.07515406583496168, + "flos": 24212196721440.0, + "grad_norm": 2.793868130009025, + "language_loss": 0.82881522, + "learning_rate": 3.978651115218482e-06, + "loss": 0.85544026, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.2845459, + "step": 1250, + "time_per_iteration": 2.763383626937866 + }, + { + "auxiliary_loss_clip": 0.01577121, + "auxiliary_loss_mlp": 0.01060577, + "balance_loss_clip": 1.33918023, + "balance_loss_mlp": 1.0339452, + "epoch": 0.07521418908762964, + "flos": 26693877738600.0, + "grad_norm": 2.5805491875508726, + "language_loss": 0.68438786, + "learning_rate": 3.978594324515215e-06, + "loss": 0.71076483, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.26611328, + "step": 1251, + "time_per_iteration": 2.809906244277954 + }, + { + "auxiliary_loss_clip": 0.01461658, + "auxiliary_loss_mlp": 0.01017389, + "balance_loss_clip": 1.28249228, + "balance_loss_mlp": 1.00828159, + "epoch": 0.0752743123402976, + "flos": 59110844561640.0, + "grad_norm": 0.9001658693533702, + "language_loss": 0.70385408, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72864455, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09130859, + "step": 1252, + "time_per_iteration": 3.3309450149536133 + }, + { + "auxiliary_loss_clip": 0.01575804, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_clip": 1.3390398, + "balance_loss_mlp": 1.03946781, + "epoch": 0.07533443559296558, + "flos": 23482276207560.0, + "grad_norm": 2.1077960873377473, + "language_loss": 0.80653453, + "learning_rate": 3.97848051802535e-06, + "loss": 0.83295804, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.27075195, + "step": 1253, + "time_per_iteration": 2.896716833114624 + }, + { + "auxiliary_loss_clip": 0.01580619, + "auxiliary_loss_mlp": 0.01069556, + "balance_loss_clip": 1.33932316, + "balance_loss_mlp": 1.04418802, + "epoch": 0.07539455884563355, + "flos": 20883044323440.0, + "grad_norm": 2.457016024401177, + "language_loss": 0.94762254, + "learning_rate": 3.978423502243069e-06, + "loss": 0.97412425, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.25378418, + "step": 1254, + "time_per_iteration": 2.7516796588897705 + }, + { + "auxiliary_loss_clip": 0.01567511, + "auxiliary_loss_mlp": 0.01057388, + "balance_loss_clip": 1.33408582, + "balance_loss_mlp": 1.03174639, + "epoch": 0.07545468209830151, + "flos": 27678699884880.0, + "grad_norm": 1.7524634437183728, + "language_loss": 0.88770503, + "learning_rate": 3.97836641143877e-06, + "loss": 0.91395402, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.2565918, + "step": 1255, + "time_per_iteration": 2.801436185836792 + }, + { + "auxiliary_loss_clip": 0.01565262, + "auxiliary_loss_mlp": 0.01070098, + "balance_loss_clip": 1.33016849, + "balance_loss_mlp": 1.04332399, + "epoch": 0.0755148053509695, + "flos": 14141056032720.0, + "grad_norm": 1.6996462515523294, + "language_loss": 0.79929483, + "learning_rate": 3.978309245614618e-06, + "loss": 0.82564843, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.26782227, + "step": 1256, + "time_per_iteration": 4.161005020141602 + }, + { + "auxiliary_loss_clip": 0.01458812, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.28062344, + "balance_loss_mlp": 1.02320313, + "epoch": 0.07557492860363746, + "flos": 58248145045800.0, + "grad_norm": 0.7934019276430269, + "language_loss": 0.58086276, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60582018, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.13769531, + "step": 1257, + "time_per_iteration": 4.974039316177368 + }, + { + "auxiliary_loss_clip": 0.01577837, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_clip": 1.33719707, + "balance_loss_mlp": 1.04298306, + "epoch": 0.07563505185630542, + "flos": 24649751067840.0, + "grad_norm": 2.74696985853254, + "language_loss": 0.90566659, + "learning_rate": 3.978194688915432e-06, + "loss": 0.9321475, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.27258301, + "step": 1258, + "time_per_iteration": 4.374457836151123 + }, + { + "auxiliary_loss_clip": 0.01561955, + "auxiliary_loss_mlp": 0.01064078, + "balance_loss_clip": 1.33010364, + "balance_loss_mlp": 1.03760123, + "epoch": 0.07569517510897339, + "flos": 15527186241120.0, + "grad_norm": 2.03271195808807, + "language_loss": 0.81408811, + "learning_rate": 3.978137298044741e-06, + "loss": 0.84034842, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26489258, + "step": 1259, + "time_per_iteration": 4.206702470779419 + }, + { + "auxiliary_loss_clip": 0.01569922, + "auxiliary_loss_mlp": 0.01064059, + "balance_loss_clip": 1.33273292, + "balance_loss_mlp": 1.03878617, + "epoch": 0.07575529836164137, + "flos": 22933546506720.0, + "grad_norm": 1.6546530671943722, + "language_loss": 0.76180363, + "learning_rate": 3.978079832162885e-06, + "loss": 0.7881434, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.25256348, + "step": 1260, + "time_per_iteration": 2.824075222015381 + }, + { + "auxiliary_loss_clip": 0.01563072, + "auxiliary_loss_mlp": 0.01065153, + "balance_loss_clip": 1.32733238, + "balance_loss_mlp": 1.03798532, + "epoch": 0.07581542161430933, + "flos": 19505198220480.0, + "grad_norm": 1.8360364267353142, + "language_loss": 0.85221314, + "learning_rate": 3.978022291272044e-06, + "loss": 0.8784954, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27185059, + "step": 1261, + "time_per_iteration": 2.7937772274017334 + }, + { + "auxiliary_loss_clip": 0.01567617, + "auxiliary_loss_mlp": 0.01060782, + "balance_loss_clip": 1.33000886, + "balance_loss_mlp": 1.03585517, + "epoch": 0.0758755448669773, + "flos": 24978972645000.0, + "grad_norm": 1.8784447877835366, + "language_loss": 0.8336193, + "learning_rate": 3.977964675374399e-06, + "loss": 0.85990328, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.24938965, + "step": 1262, + "time_per_iteration": 2.7854018211364746 + }, + { + "auxiliary_loss_clip": 0.01570464, + "auxiliary_loss_mlp": 0.01064552, + "balance_loss_clip": 1.3303957, + "balance_loss_mlp": 1.03595376, + "epoch": 0.07593566811964528, + "flos": 22753208469240.0, + "grad_norm": 2.9260962981554695, + "language_loss": 0.83320212, + "learning_rate": 3.977906984472136e-06, + "loss": 0.85955226, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.2857666, + "step": 1263, + "time_per_iteration": 2.838287591934204 + }, + { + "auxiliary_loss_clip": 0.01565767, + "auxiliary_loss_mlp": 0.01061494, + "balance_loss_clip": 1.32608676, + "balance_loss_mlp": 1.03543496, + "epoch": 0.07599579137231324, + "flos": 23117620513320.0, + "grad_norm": 1.934194064880421, + "language_loss": 0.76277351, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78904617, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.26049805, + "step": 1264, + "time_per_iteration": 2.730295181274414 + }, + { + "auxiliary_loss_clip": 0.01569357, + "auxiliary_loss_mlp": 0.01063442, + "balance_loss_clip": 1.33044863, + "balance_loss_mlp": 1.03646469, + "epoch": 0.07605591462498121, + "flos": 14506199027280.0, + "grad_norm": 2.2820182648527423, + "language_loss": 0.81074834, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83707631, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.26989746, + "step": 1265, + "time_per_iteration": 2.7636547088623047 + }, + { + "auxiliary_loss_clip": 0.01577044, + "auxiliary_loss_mlp": 0.01055688, + "balance_loss_clip": 1.33335769, + "balance_loss_mlp": 1.02959359, + "epoch": 0.07611603787764919, + "flos": 23519537700840.0, + "grad_norm": 2.0467437055180024, + "language_loss": 0.65250784, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67883515, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26098633, + "step": 1266, + "time_per_iteration": 2.808925151824951 + }, + { + "auxiliary_loss_clip": 0.01571809, + "auxiliary_loss_mlp": 0.01071477, + "balance_loss_clip": 1.32931495, + "balance_loss_mlp": 1.04383194, + "epoch": 0.07617616113031715, + "flos": 21512225831400.0, + "grad_norm": 2.0701859548557198, + "language_loss": 0.8063162, + "learning_rate": 3.977675470860691e-06, + "loss": 0.83274901, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.27624512, + "step": 1267, + "time_per_iteration": 2.736250877380371 + }, + { + "auxiliary_loss_clip": 0.01559015, + "auxiliary_loss_mlp": 0.01057027, + "balance_loss_clip": 1.32020462, + "balance_loss_mlp": 1.03219545, + "epoch": 0.07623628438298512, + "flos": 14577067261440.0, + "grad_norm": 3.6036146896929764, + "language_loss": 0.73160511, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75776553, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.24841309, + "step": 1268, + "time_per_iteration": 2.7340357303619385 + }, + { + "auxiliary_loss_clip": 0.0155852, + "auxiliary_loss_mlp": 0.01058498, + "balance_loss_clip": 1.32012796, + "balance_loss_mlp": 1.03205752, + "epoch": 0.07629640763565308, + "flos": 14724488900160.0, + "grad_norm": 2.79881081967442, + "language_loss": 0.83170527, + "learning_rate": 3.977559264084269e-06, + "loss": 0.85787547, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.26489258, + "step": 1269, + "time_per_iteration": 2.7653348445892334 + }, + { + "auxiliary_loss_clip": 0.01565681, + "auxiliary_loss_mlp": 0.01061051, + "balance_loss_clip": 1.32952619, + "balance_loss_mlp": 1.03391886, + "epoch": 0.07635653088832106, + "flos": 14907060397440.0, + "grad_norm": 2.1430067301082594, + "language_loss": 0.89013219, + "learning_rate": 3.977501048211088e-06, + "loss": 0.91639948, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.27172852, + "step": 1270, + "time_per_iteration": 2.7616631984710693 + }, + { + "auxiliary_loss_clip": 0.01569934, + "auxiliary_loss_mlp": 0.01065805, + "balance_loss_clip": 1.32849455, + "balance_loss_mlp": 1.03804111, + "epoch": 0.07641665414098903, + "flos": 26657265979080.0, + "grad_norm": 2.482996521751865, + "language_loss": 0.71649349, + "learning_rate": 3.977442757350869e-06, + "loss": 0.7428509, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.27807617, + "step": 1271, + "time_per_iteration": 2.7694101333618164 + }, + { + "auxiliary_loss_clip": 0.015559, + "auxiliary_loss_mlp": 0.01067602, + "balance_loss_clip": 1.32541275, + "balance_loss_mlp": 1.04150689, + "epoch": 0.07647677739365699, + "flos": 25198399551960.0, + "grad_norm": 1.5876816767395403, + "language_loss": 0.8334927, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85972774, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2611084, + "step": 1272, + "time_per_iteration": 2.8215909004211426 + }, + { + "auxiliary_loss_clip": 0.01563143, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.32728708, + "balance_loss_mlp": 1.04042673, + "epoch": 0.07653690064632497, + "flos": 20563000235640.0, + "grad_norm": 1.7390361876436005, + "language_loss": 0.80993432, + "learning_rate": 3.977325950678162e-06, + "loss": 0.83623052, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.26062012, + "step": 1273, + "time_per_iteration": 2.79502272605896 + }, + { + "auxiliary_loss_clip": 0.0157255, + "auxiliary_loss_mlp": 0.01059155, + "balance_loss_clip": 1.33238685, + "balance_loss_mlp": 1.03213, + "epoch": 0.07659702389899294, + "flos": 22273925709960.0, + "grad_norm": 1.772856076880509, + "language_loss": 0.81862473, + "learning_rate": 3.977267434870103e-06, + "loss": 0.84494174, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.2701416, + "step": 1274, + "time_per_iteration": 2.9157958030700684 + }, + { + "auxiliary_loss_clip": 0.01564993, + "auxiliary_loss_mlp": 0.01068395, + "balance_loss_clip": 1.33094203, + "balance_loss_mlp": 1.0410248, + "epoch": 0.0766571471516609, + "flos": 32642711652960.0, + "grad_norm": 1.6922922959559852, + "language_loss": 0.73138154, + "learning_rate": 3.977208844083865e-06, + "loss": 0.7577154, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27392578, + "step": 1275, + "time_per_iteration": 2.9032859802246094 + }, + { + "auxiliary_loss_clip": 0.01565372, + "auxiliary_loss_mlp": 0.01065224, + "balance_loss_clip": 1.32677698, + "balance_loss_mlp": 1.03819931, + "epoch": 0.07671727040432888, + "flos": 15271513049880.0, + "grad_norm": 2.0829971821481714, + "language_loss": 0.81093872, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.83724463, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.2701416, + "step": 1276, + "time_per_iteration": 2.72684383392334 + }, + { + "auxiliary_loss_clip": 0.0157772, + "auxiliary_loss_mlp": 0.01062598, + "balance_loss_clip": 1.33771467, + "balance_loss_mlp": 1.03579926, + "epoch": 0.07677739365699685, + "flos": 28190005659000.0, + "grad_norm": 2.223012822533688, + "language_loss": 0.59995598, + "learning_rate": 3.97709143758574e-06, + "loss": 0.62635911, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.26831055, + "step": 1277, + "time_per_iteration": 2.8236491680145264 + }, + { + "auxiliary_loss_clip": 0.01584357, + "auxiliary_loss_mlp": 0.0106151, + "balance_loss_clip": 1.34022689, + "balance_loss_mlp": 1.03417575, + "epoch": 0.07683751690966481, + "flos": 18300705517080.0, + "grad_norm": 2.422295355367062, + "language_loss": 0.75225306, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77871174, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.27331543, + "step": 1278, + "time_per_iteration": 2.7132551670074463 + }, + { + "auxiliary_loss_clip": 0.0156267, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.32788503, + "balance_loss_mlp": 1.0375309, + "epoch": 0.07689764016233278, + "flos": 21986148287160.0, + "grad_norm": 2.130875954707927, + "language_loss": 0.89252269, + "learning_rate": 3.976973731201596e-06, + "loss": 0.91878772, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26330566, + "step": 1279, + "time_per_iteration": 2.8227415084838867 + }, + { + "auxiliary_loss_clip": 0.01563826, + "auxiliary_loss_mlp": 0.01056919, + "balance_loss_clip": 1.32807922, + "balance_loss_mlp": 1.03015685, + "epoch": 0.07695776341500075, + "flos": 22241009311200.0, + "grad_norm": 2.2664801506481984, + "language_loss": 0.83192027, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85812783, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.26757812, + "step": 1280, + "time_per_iteration": 2.7934494018554688 + }, + { + "auxiliary_loss_clip": 0.01563573, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.33161831, + "balance_loss_mlp": 1.03724742, + "epoch": 0.07701788666766872, + "flos": 16148083643640.0, + "grad_norm": 2.035002647377808, + "language_loss": 0.76443291, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.79069507, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.25415039, + "step": 1281, + "time_per_iteration": 2.780538320541382 + }, + { + "auxiliary_loss_clip": 0.01577629, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_clip": 1.33556652, + "balance_loss_mlp": 1.03359413, + "epoch": 0.07707800992033668, + "flos": 19467693077040.0, + "grad_norm": 1.8568284138406281, + "language_loss": 0.76010764, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.78650838, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.28857422, + "step": 1282, + "time_per_iteration": 2.7264797687530518 + }, + { + "auxiliary_loss_clip": 0.0157191, + "auxiliary_loss_mlp": 0.01065057, + "balance_loss_clip": 1.3366034, + "balance_loss_mlp": 1.03828263, + "epoch": 0.07713813317300466, + "flos": 18995070088800.0, + "grad_norm": 1.922654897387241, + "language_loss": 0.84378016, + "learning_rate": 3.976737418846713e-06, + "loss": 0.87014985, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26806641, + "step": 1283, + "time_per_iteration": 2.7454583644866943 + }, + { + "auxiliary_loss_clip": 0.01574997, + "auxiliary_loss_mlp": 0.0106891, + "balance_loss_clip": 1.3372376, + "balance_loss_mlp": 1.03973985, + "epoch": 0.07719825642567263, + "flos": 18118540103400.0, + "grad_norm": 1.8066169839342059, + "language_loss": 0.75694227, + "learning_rate": 3.976678153357181e-06, + "loss": 0.78338128, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.29162598, + "step": 1284, + "time_per_iteration": 2.7519168853759766 + }, + { + "auxiliary_loss_clip": 0.01561763, + "auxiliary_loss_mlp": 0.01064976, + "balance_loss_clip": 1.32659459, + "balance_loss_mlp": 1.03870249, + "epoch": 0.0772583796783406, + "flos": 42202140483960.0, + "grad_norm": 1.6921356205777243, + "language_loss": 0.76671171, + "learning_rate": 3.976618812911817e-06, + "loss": 0.79297918, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26281738, + "step": 1285, + "time_per_iteration": 2.9712207317352295 + }, + { + "auxiliary_loss_clip": 0.01565311, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.32913208, + "balance_loss_mlp": 1.03694689, + "epoch": 0.07731850293100857, + "flos": 24759058437720.0, + "grad_norm": 1.901357814306102, + "language_loss": 0.84410596, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.87039185, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.26342773, + "step": 1286, + "time_per_iteration": 2.8928651809692383 + }, + { + "auxiliary_loss_clip": 0.01579582, + "auxiliary_loss_mlp": 0.01060429, + "balance_loss_clip": 1.33623767, + "balance_loss_mlp": 1.03341627, + "epoch": 0.07737862618367654, + "flos": 17570297702880.0, + "grad_norm": 2.2735541899515006, + "language_loss": 0.77971911, + "learning_rate": 3.97649990716259e-06, + "loss": 0.8061192, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.26989746, + "step": 1287, + "time_per_iteration": 2.702484607696533 + }, + { + "auxiliary_loss_clip": 0.0156983, + "auxiliary_loss_mlp": 0.01063883, + "balance_loss_clip": 1.33477485, + "balance_loss_mlp": 1.03647721, + "epoch": 0.0774387494363445, + "flos": 25632339754320.0, + "grad_norm": 1.9215362898661232, + "language_loss": 0.84915453, + "learning_rate": 3.976440341863237e-06, + "loss": 0.87549168, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27404785, + "step": 1288, + "time_per_iteration": 2.7929978370666504 + }, + { + "auxiliary_loss_clip": 0.01575482, + "auxiliary_loss_mlp": 0.01059788, + "balance_loss_clip": 1.33429956, + "balance_loss_mlp": 1.03358555, + "epoch": 0.07749887268901248, + "flos": 12243701266920.0, + "grad_norm": 1.9270603601524998, + "language_loss": 0.85768527, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88403797, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.26196289, + "step": 1289, + "time_per_iteration": 2.702481269836426 + }, + { + "auxiliary_loss_clip": 0.01571107, + "auxiliary_loss_mlp": 0.01054248, + "balance_loss_clip": 1.33296287, + "balance_loss_mlp": 1.02700901, + "epoch": 0.07755899594168045, + "flos": 25087102372440.0, + "grad_norm": 1.7507427062278884, + "language_loss": 0.85682797, + "learning_rate": 3.976320986426344e-06, + "loss": 0.88308156, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.27246094, + "step": 1290, + "time_per_iteration": 2.7817764282226562 + }, + { + "auxiliary_loss_clip": 0.01565658, + "auxiliary_loss_mlp": 0.01057347, + "balance_loss_clip": 1.3323195, + "balance_loss_mlp": 1.02938032, + "epoch": 0.07761911919434841, + "flos": 14250485227680.0, + "grad_norm": 2.0216643963488883, + "language_loss": 0.91373289, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93996298, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27966309, + "step": 1291, + "time_per_iteration": 2.76788067817688 + }, + { + "auxiliary_loss_clip": 0.01421275, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.25051296, + "balance_loss_mlp": 1.02676952, + "epoch": 0.07767924244701638, + "flos": 67254719104440.0, + "grad_norm": 0.8889222316685677, + "language_loss": 0.65063751, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67520136, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.08349609, + "step": 1292, + "time_per_iteration": 3.3395566940307617 + }, + { + "auxiliary_loss_clip": 0.01573434, + "auxiliary_loss_mlp": 0.0106597, + "balance_loss_clip": 1.33745742, + "balance_loss_mlp": 1.03782499, + "epoch": 0.07773936569968436, + "flos": 28556813596320.0, + "grad_norm": 1.7556629845719631, + "language_loss": 0.88343155, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.90982556, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.28149414, + "step": 1293, + "time_per_iteration": 2.8411242961883545 + }, + { + "auxiliary_loss_clip": 0.0157914, + "auxiliary_loss_mlp": 0.01063448, + "balance_loss_clip": 1.34135115, + "balance_loss_mlp": 1.0353024, + "epoch": 0.07779948895235232, + "flos": 27496006562520.0, + "grad_norm": 1.978928920425004, + "language_loss": 0.85706526, + "learning_rate": 3.976081376263239e-06, + "loss": 0.88349116, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28149414, + "step": 1294, + "time_per_iteration": 4.2652716636657715 + }, + { + "auxiliary_loss_clip": 0.01585986, + "auxiliary_loss_mlp": 0.01063021, + "balance_loss_clip": 1.34412336, + "balance_loss_mlp": 1.03563833, + "epoch": 0.07785961220502029, + "flos": 18227888081640.0, + "grad_norm": 2.1563900535058367, + "language_loss": 0.80177999, + "learning_rate": 3.976021286383768e-06, + "loss": 0.82827014, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.27404785, + "step": 1295, + "time_per_iteration": 2.7573771476745605 + }, + { + "auxiliary_loss_clip": 0.01577422, + "auxiliary_loss_mlp": 0.01056807, + "balance_loss_clip": 1.34061277, + "balance_loss_mlp": 1.02897203, + "epoch": 0.07791973545768827, + "flos": 24613626608640.0, + "grad_norm": 2.2395077249956405, + "language_loss": 0.88057101, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90691328, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.27819824, + "step": 1296, + "time_per_iteration": 2.8941562175750732 + }, + { + "auxiliary_loss_clip": 0.01582962, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.34368396, + "balance_loss_mlp": 1.04112411, + "epoch": 0.07797985871035623, + "flos": 14286041169840.0, + "grad_norm": 2.4766401497308577, + "language_loss": 0.9723413, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.99885553, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.27331543, + "step": 1297, + "time_per_iteration": 4.483082056045532 + }, + { + "auxiliary_loss_clip": 0.01584016, + "auxiliary_loss_mlp": 0.01063677, + "balance_loss_clip": 1.34285808, + "balance_loss_mlp": 1.03725982, + "epoch": 0.0780399819630242, + "flos": 26615537566200.0, + "grad_norm": 2.2319942234425394, + "language_loss": 0.77380931, + "learning_rate": 3.97584056716893e-06, + "loss": 0.80028623, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.26452637, + "step": 1298, + "time_per_iteration": 4.290179967880249 + }, + { + "auxiliary_loss_clip": 0.01582294, + "auxiliary_loss_mlp": 0.0106834, + "balance_loss_clip": 1.34400821, + "balance_loss_mlp": 1.0418396, + "epoch": 0.07810010521569218, + "flos": 21839457598920.0, + "grad_norm": 1.5843426872895219, + "language_loss": 0.81531864, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.84182495, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.26525879, + "step": 1299, + "time_per_iteration": 2.812760591506958 + }, + { + "auxiliary_loss_clip": 0.01571794, + "auxiliary_loss_mlp": 0.01057866, + "balance_loss_clip": 1.3393681, + "balance_loss_mlp": 1.03131855, + "epoch": 0.07816022846836014, + "flos": 25086533855400.0, + "grad_norm": 1.7509626124768138, + "language_loss": 0.86905336, + "learning_rate": 3.975719713068202e-06, + "loss": 0.89534998, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26538086, + "step": 1300, + "time_per_iteration": 2.920841693878174 + }, + { + "auxiliary_loss_clip": 0.01584464, + "auxiliary_loss_mlp": 0.01054369, + "balance_loss_clip": 1.34514904, + "balance_loss_mlp": 1.02588964, + "epoch": 0.0782203517210281, + "flos": 40925073995280.0, + "grad_norm": 1.7917623832906673, + "language_loss": 0.72399402, + "learning_rate": 3.975659173637458e-06, + "loss": 0.7503823, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.28491211, + "step": 1301, + "time_per_iteration": 2.9230761528015137 + }, + { + "auxiliary_loss_clip": 0.01586047, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_clip": 1.34794569, + "balance_loss_mlp": 1.04656243, + "epoch": 0.07828047497369607, + "flos": 41179488327360.0, + "grad_norm": 1.9792464239064476, + "language_loss": 0.71658075, + "learning_rate": 3.97559855928952e-06, + "loss": 0.74317682, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.26977539, + "step": 1302, + "time_per_iteration": 2.9360744953155518 + }, + { + "auxiliary_loss_clip": 0.01582934, + "auxiliary_loss_mlp": 0.01058856, + "balance_loss_clip": 1.34598303, + "balance_loss_mlp": 1.03102088, + "epoch": 0.07834059822636405, + "flos": 23512674888000.0, + "grad_norm": 2.325959868414412, + "language_loss": 0.82589215, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.85231006, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.27819824, + "step": 1303, + "time_per_iteration": 2.7743542194366455 + }, + { + "auxiliary_loss_clip": 0.01574303, + "auxiliary_loss_mlp": 0.01061931, + "balance_loss_clip": 1.33769631, + "balance_loss_mlp": 1.034024, + "epoch": 0.07840072147903202, + "flos": 20198791233360.0, + "grad_norm": 1.6315483127653918, + "language_loss": 0.75225675, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77861905, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.27905273, + "step": 1304, + "time_per_iteration": 2.7550790309906006 + }, + { + "auxiliary_loss_clip": 0.01578375, + "auxiliary_loss_mlp": 0.01064886, + "balance_loss_clip": 1.33980191, + "balance_loss_mlp": 1.03891075, + "epoch": 0.07846084473169998, + "flos": 21365778793320.0, + "grad_norm": 1.7324081542660084, + "language_loss": 0.76396918, + "learning_rate": 3.975416266765542e-06, + "loss": 0.79040176, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.2598877, + "step": 1305, + "time_per_iteration": 2.7780818939208984 + }, + { + "auxiliary_loss_clip": 0.01596818, + "auxiliary_loss_mlp": 0.01074973, + "balance_loss_clip": 1.35742688, + "balance_loss_mlp": 1.04526591, + "epoch": 0.07852096798436796, + "flos": 25416486383040.0, + "grad_norm": 1.6394789869901856, + "language_loss": 0.85981679, + "learning_rate": 3.975355352771841e-06, + "loss": 0.88653469, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.296875, + "step": 1306, + "time_per_iteration": 2.8051204681396484 + }, + { + "auxiliary_loss_clip": 0.01586995, + "auxiliary_loss_mlp": 0.01052859, + "balance_loss_clip": 1.34997523, + "balance_loss_mlp": 1.0260607, + "epoch": 0.07858109123703592, + "flos": 24576852415680.0, + "grad_norm": 2.66224312975168, + "language_loss": 0.91504663, + "learning_rate": 3.975294363872468e-06, + "loss": 0.94144511, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.26794434, + "step": 1307, + "time_per_iteration": 2.856062412261963 + }, + { + "auxiliary_loss_clip": 0.01584641, + "auxiliary_loss_mlp": 0.01066523, + "balance_loss_clip": 1.34749413, + "balance_loss_mlp": 1.03812706, + "epoch": 0.07864121448970389, + "flos": 20703071761200.0, + "grad_norm": 1.8135564431843825, + "language_loss": 0.83796513, + "learning_rate": 3.975233300069735e-06, + "loss": 0.8644768, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.28417969, + "step": 1308, + "time_per_iteration": 2.8992502689361572 + }, + { + "auxiliary_loss_clip": 0.01577914, + "auxiliary_loss_mlp": 0.01059282, + "balance_loss_clip": 1.34542894, + "balance_loss_mlp": 1.03335369, + "epoch": 0.07870133774237187, + "flos": 22971741992280.0, + "grad_norm": 1.4667140598413693, + "language_loss": 0.77578181, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80215377, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.25927734, + "step": 1309, + "time_per_iteration": 2.8219518661499023 + }, + { + "auxiliary_loss_clip": 0.01593707, + "auxiliary_loss_mlp": 0.01076566, + "balance_loss_clip": 1.35442495, + "balance_loss_mlp": 1.04669261, + "epoch": 0.07876146099503983, + "flos": 18847404799920.0, + "grad_norm": 1.8037805484324105, + "language_loss": 0.81304467, + "learning_rate": 3.975110947763453e-06, + "loss": 0.83974737, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.29882812, + "step": 1310, + "time_per_iteration": 2.745622396469116 + }, + { + "auxiliary_loss_clip": 0.01575316, + "auxiliary_loss_mlp": 0.01055498, + "balance_loss_clip": 1.34728765, + "balance_loss_mlp": 1.02983212, + "epoch": 0.0788215842477078, + "flos": 23811132309480.0, + "grad_norm": 1.6181914983061951, + "language_loss": 0.73353434, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75984251, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25683594, + "step": 1311, + "time_per_iteration": 2.837965965270996 + }, + { + "auxiliary_loss_clip": 0.0158224, + "auxiliary_loss_mlp": 0.01086929, + "balance_loss_clip": 1.34768558, + "balance_loss_mlp": 1.05879545, + "epoch": 0.07888170750037576, + "flos": 21584921441760.0, + "grad_norm": 1.8093124756424643, + "language_loss": 0.86118877, + "learning_rate": 3.974988295871553e-06, + "loss": 0.8878805, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.28149414, + "step": 1312, + "time_per_iteration": 2.7809360027313232 + }, + { + "auxiliary_loss_clip": 0.01582493, + "auxiliary_loss_mlp": 0.01069223, + "balance_loss_clip": 1.35070968, + "balance_loss_mlp": 1.04313982, + "epoch": 0.07894183075304374, + "flos": 19869610264560.0, + "grad_norm": 1.7227703839990245, + "language_loss": 0.82765573, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.85417295, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26074219, + "step": 1313, + "time_per_iteration": 2.8259212970733643 + }, + { + "auxiliary_loss_clip": 0.01599284, + "auxiliary_loss_mlp": 0.01068501, + "balance_loss_clip": 1.35765183, + "balance_loss_mlp": 1.03922343, + "epoch": 0.07900195400571171, + "flos": 16147758776760.0, + "grad_norm": 2.2567876336630657, + "language_loss": 0.73269111, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75936896, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.29296875, + "step": 1314, + "time_per_iteration": 2.7812511920928955 + }, + { + "auxiliary_loss_clip": 0.01584505, + "auxiliary_loss_mlp": 0.01069366, + "balance_loss_clip": 1.35119319, + "balance_loss_mlp": 1.04282975, + "epoch": 0.07906207725837967, + "flos": 23735431680480.0, + "grad_norm": 1.6671167957013544, + "language_loss": 0.80310798, + "learning_rate": 3.974803756351379e-06, + "loss": 0.82964671, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.26550293, + "step": 1315, + "time_per_iteration": 2.818563461303711 + }, + { + "auxiliary_loss_clip": 0.015916, + "auxiliary_loss_mlp": 0.01066519, + "balance_loss_clip": 1.35522592, + "balance_loss_mlp": 1.03939867, + "epoch": 0.07912220051104765, + "flos": 24321219832800.0, + "grad_norm": 1.6275853460287153, + "language_loss": 0.73992109, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76650226, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.2713623, + "step": 1316, + "time_per_iteration": 2.8303449153900146 + }, + { + "auxiliary_loss_clip": 0.01596144, + "auxiliary_loss_mlp": 0.01065665, + "balance_loss_clip": 1.35420012, + "balance_loss_mlp": 1.03869963, + "epoch": 0.07918232376371562, + "flos": 18884706901560.0, + "grad_norm": 2.4934293007950874, + "language_loss": 0.66053975, + "learning_rate": 3.974680355576927e-06, + "loss": 0.68715787, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.26928711, + "step": 1317, + "time_per_iteration": 2.7828047275543213 + }, + { + "auxiliary_loss_clip": 0.0160728, + "auxiliary_loss_mlp": 0.01072006, + "balance_loss_clip": 1.36516809, + "balance_loss_mlp": 1.04153633, + "epoch": 0.07924244701638358, + "flos": 27381420105840.0, + "grad_norm": 2.2391441997129284, + "language_loss": 0.73754668, + "learning_rate": 3.974618542868415e-06, + "loss": 0.76433951, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.3046875, + "step": 1318, + "time_per_iteration": 2.8341400623321533 + }, + { + "auxiliary_loss_clip": 0.01597634, + "auxiliary_loss_mlp": 0.01064583, + "balance_loss_clip": 1.36412966, + "balance_loss_mlp": 1.03749847, + "epoch": 0.07930257026905156, + "flos": 25125988200120.0, + "grad_norm": 1.6008680558084412, + "language_loss": 0.90361917, + "learning_rate": 3.97455665528217e-06, + "loss": 0.93024135, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27099609, + "step": 1319, + "time_per_iteration": 2.8407328128814697 + }, + { + "auxiliary_loss_clip": 0.01588956, + "auxiliary_loss_mlp": 0.01062548, + "balance_loss_clip": 1.35444927, + "balance_loss_mlp": 1.03639352, + "epoch": 0.07936269352171953, + "flos": 21839416990560.0, + "grad_norm": 1.9186124160211226, + "language_loss": 0.80687118, + "learning_rate": 3.974494692820539e-06, + "loss": 0.83338618, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.26159668, + "step": 1320, + "time_per_iteration": 2.7871720790863037 + }, + { + "auxiliary_loss_clip": 0.01596195, + "auxiliary_loss_mlp": 0.01065828, + "balance_loss_clip": 1.35996819, + "balance_loss_mlp": 1.03900611, + "epoch": 0.07942281677438749, + "flos": 16943471479800.0, + "grad_norm": 2.334494138761537, + "language_loss": 0.69491833, + "learning_rate": 3.974432655485872e-06, + "loss": 0.7215386, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.26855469, + "step": 1321, + "time_per_iteration": 2.9021480083465576 + }, + { + "auxiliary_loss_clip": 0.01590476, + "auxiliary_loss_mlp": 0.01065954, + "balance_loss_clip": 1.35819793, + "balance_loss_mlp": 1.03819013, + "epoch": 0.07948294002705546, + "flos": 18991293511320.0, + "grad_norm": 1.9480541503495095, + "language_loss": 0.84560955, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.87217385, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27758789, + "step": 1322, + "time_per_iteration": 2.789069175720215 + }, + { + "auxiliary_loss_clip": 0.01587447, + "auxiliary_loss_mlp": 0.01060068, + "balance_loss_clip": 1.35189021, + "balance_loss_mlp": 1.03400922, + "epoch": 0.07954306327972344, + "flos": 21658551044400.0, + "grad_norm": 1.8976469397982962, + "language_loss": 0.91203606, + "learning_rate": 3.974308356206838e-06, + "loss": 0.93851113, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26049805, + "step": 1323, + "time_per_iteration": 2.791341781616211 + }, + { + "auxiliary_loss_clip": 0.01588172, + "auxiliary_loss_mlp": 0.01064508, + "balance_loss_clip": 1.35638082, + "balance_loss_mlp": 1.03751898, + "epoch": 0.0796031865323914, + "flos": 23225303548800.0, + "grad_norm": 1.5938269151964248, + "language_loss": 0.83059794, + "learning_rate": 3.974246094267187e-06, + "loss": 0.85712481, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26989746, + "step": 1324, + "time_per_iteration": 2.7453811168670654 + }, + { + "auxiliary_loss_clip": 0.01585561, + "auxiliary_loss_mlp": 0.01061736, + "balance_loss_clip": 1.35072625, + "balance_loss_mlp": 1.03465176, + "epoch": 0.07966330978505937, + "flos": 23299745318640.0, + "grad_norm": 2.242674744981, + "language_loss": 0.79699314, + "learning_rate": 3.974183757463925e-06, + "loss": 0.82346606, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27099609, + "step": 1325, + "time_per_iteration": 2.7914464473724365 + }, + { + "auxiliary_loss_clip": 0.01588947, + "auxiliary_loss_mlp": 0.01076368, + "balance_loss_clip": 1.35732269, + "balance_loss_mlp": 1.04927206, + "epoch": 0.07972343303772735, + "flos": 18367675348680.0, + "grad_norm": 2.019164573220263, + "language_loss": 0.88452113, + "learning_rate": 3.974121345799418e-06, + "loss": 0.9111743, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27099609, + "step": 1326, + "time_per_iteration": 2.7494208812713623 + }, + { + "auxiliary_loss_clip": 0.01585547, + "auxiliary_loss_mlp": 0.01059134, + "balance_loss_clip": 1.35314536, + "balance_loss_mlp": 1.03119135, + "epoch": 0.07978355629039531, + "flos": 21767574155760.0, + "grad_norm": 1.9079406941058568, + "language_loss": 0.82952321, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85597003, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27978516, + "step": 1327, + "time_per_iteration": 2.772700786590576 + }, + { + "auxiliary_loss_clip": 0.01595618, + "auxiliary_loss_mlp": 0.01060064, + "balance_loss_clip": 1.35532129, + "balance_loss_mlp": 1.03224063, + "epoch": 0.07984367954306328, + "flos": 18555891408000.0, + "grad_norm": 2.252684789563645, + "language_loss": 0.80007815, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.82663494, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.27819824, + "step": 1328, + "time_per_iteration": 2.725896120071411 + }, + { + "auxiliary_loss_clip": 0.01596297, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.35868216, + "balance_loss_mlp": 1.03247261, + "epoch": 0.07990380279573125, + "flos": 16907712495840.0, + "grad_norm": 2.505697041536486, + "language_loss": 0.74818408, + "learning_rate": 3.973933661662101e-06, + "loss": 0.77475047, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27856445, + "step": 1329, + "time_per_iteration": 2.7564074993133545 + }, + { + "auxiliary_loss_clip": 0.01590122, + "auxiliary_loss_mlp": 0.01071127, + "balance_loss_clip": 1.35828936, + "balance_loss_mlp": 1.04437685, + "epoch": 0.07996392604839922, + "flos": 24103823343840.0, + "grad_norm": 1.6343291305920893, + "language_loss": 0.81662691, + "learning_rate": 3.973870950576305e-06, + "loss": 0.84323931, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26757812, + "step": 1330, + "time_per_iteration": 2.782710552215576 + }, + { + "auxiliary_loss_clip": 0.01587042, + "auxiliary_loss_mlp": 0.0106934, + "balance_loss_clip": 1.35025835, + "balance_loss_mlp": 1.04367447, + "epoch": 0.08002404930106718, + "flos": 14282264592360.0, + "grad_norm": 1.9768240100426966, + "language_loss": 0.89065385, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91721761, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.25646973, + "step": 1331, + "time_per_iteration": 2.7608511447906494 + }, + { + "auxiliary_loss_clip": 0.01603214, + "auxiliary_loss_mlp": 0.0106475, + "balance_loss_clip": 1.3626225, + "balance_loss_mlp": 1.03623545, + "epoch": 0.08008417255373516, + "flos": 40413646396080.0, + "grad_norm": 1.8175829038116922, + "language_loss": 0.73541069, + "learning_rate": 3.973745303858942e-06, + "loss": 0.76209033, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.28564453, + "step": 1332, + "time_per_iteration": 3.0578956604003906 + }, + { + "auxiliary_loss_clip": 0.01583185, + "auxiliary_loss_mlp": 0.0105835, + "balance_loss_clip": 1.35298586, + "balance_loss_mlp": 1.03275561, + "epoch": 0.08014429580640313, + "flos": 18483561272880.0, + "grad_norm": 1.885139600818587, + "language_loss": 0.83095932, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85737473, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.25561523, + "step": 1333, + "time_per_iteration": 4.1616551876068115 + }, + { + "auxiliary_loss_clip": 0.0159118, + "auxiliary_loss_mlp": 0.01060806, + "balance_loss_clip": 1.35812354, + "balance_loss_mlp": 1.03381658, + "epoch": 0.0802044190590711, + "flos": 22058437813920.0, + "grad_norm": 2.5698935439608257, + "language_loss": 0.75973713, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.78625697, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2701416, + "step": 1334, + "time_per_iteration": 4.3288702964782715 + }, + { + "auxiliary_loss_clip": 0.01590317, + "auxiliary_loss_mlp": 0.01064686, + "balance_loss_clip": 1.35740721, + "balance_loss_mlp": 1.03934169, + "epoch": 0.08026454231173906, + "flos": 24577583366160.0, + "grad_norm": 2.1940672101886434, + "language_loss": 0.80712235, + "learning_rate": 3.973556272454221e-06, + "loss": 0.83367234, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25341797, + "step": 1335, + "time_per_iteration": 4.249351739883423 + }, + { + "auxiliary_loss_clip": 0.01414764, + "auxiliary_loss_mlp": 0.01021906, + "balance_loss_clip": 1.2579813, + "balance_loss_mlp": 1.01456261, + "epoch": 0.08032466556440704, + "flos": 52594235625600.0, + "grad_norm": 0.736874218989209, + "language_loss": 0.55928242, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58364916, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.07324219, + "step": 1336, + "time_per_iteration": 3.3725626468658447 + }, + { + "auxiliary_loss_clip": 0.01591521, + "auxiliary_loss_mlp": 0.01059682, + "balance_loss_clip": 1.3606658, + "balance_loss_mlp": 1.03400469, + "epoch": 0.080384788817075, + "flos": 23847987719160.0, + "grad_norm": 3.5896993811089235, + "language_loss": 0.68121564, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70772761, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25695801, + "step": 1337, + "time_per_iteration": 4.303264856338501 + }, + { + "auxiliary_loss_clip": 0.0158807, + "auxiliary_loss_mlp": 0.01074376, + "balance_loss_clip": 1.35743618, + "balance_loss_mlp": 1.04762554, + "epoch": 0.08044491206974297, + "flos": 25306123195800.0, + "grad_norm": 1.717282961114958, + "language_loss": 0.87676859, + "learning_rate": 3.973366567512453e-06, + "loss": 0.90339303, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26721191, + "step": 1338, + "time_per_iteration": 2.872560977935791 + }, + { + "auxiliary_loss_clip": 0.01590062, + "auxiliary_loss_mlp": 0.01074256, + "balance_loss_clip": 1.35378647, + "balance_loss_mlp": 1.0446806, + "epoch": 0.08050503532241095, + "flos": 22380877794960.0, + "grad_norm": 2.2945652992138417, + "language_loss": 0.8755672, + "learning_rate": 3.973303182868147e-06, + "loss": 0.90221035, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.29589844, + "step": 1339, + "time_per_iteration": 2.8450393676757812 + }, + { + "auxiliary_loss_clip": 0.01571413, + "auxiliary_loss_mlp": 0.01055196, + "balance_loss_clip": 1.34557736, + "balance_loss_mlp": 1.03148484, + "epoch": 0.08056515857507891, + "flos": 18374050861200.0, + "grad_norm": 2.3970331359408394, + "language_loss": 0.90048403, + "learning_rate": 3.973239723395988e-06, + "loss": 0.92675006, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.23718262, + "step": 1340, + "time_per_iteration": 2.7683029174804688 + }, + { + "auxiliary_loss_clip": 0.01416135, + "auxiliary_loss_mlp": 0.01009563, + "balance_loss_clip": 1.2505064, + "balance_loss_mlp": 1.00069416, + "epoch": 0.08062528182774688, + "flos": 51361252834680.0, + "grad_norm": 0.8721381328293131, + "language_loss": 0.64858717, + "learning_rate": 3.97317618909838e-06, + "loss": 0.67284417, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.08886719, + "step": 1341, + "time_per_iteration": 3.3433101177215576 + }, + { + "auxiliary_loss_clip": 0.01589828, + "auxiliary_loss_mlp": 0.01074355, + "balance_loss_clip": 1.35017979, + "balance_loss_mlp": 1.0452441, + "epoch": 0.08068540508041486, + "flos": 17603782618680.0, + "grad_norm": 2.2380871840586756, + "language_loss": 0.90176183, + "learning_rate": 3.973112579977733e-06, + "loss": 0.92840368, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.2911377, + "step": 1342, + "time_per_iteration": 2.784313440322876 + }, + { + "auxiliary_loss_clip": 0.01593222, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_clip": 1.36155844, + "balance_loss_mlp": 1.03991771, + "epoch": 0.08074552833308282, + "flos": 10564758199080.0, + "grad_norm": 2.384415080815607, + "language_loss": 0.76650172, + "learning_rate": 3.973048896036459e-06, + "loss": 0.79310805, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27490234, + "step": 1343, + "time_per_iteration": 2.713519811630249 + }, + { + "auxiliary_loss_clip": 0.01412916, + "auxiliary_loss_mlp": 0.01026446, + "balance_loss_clip": 1.24882603, + "balance_loss_mlp": 1.01719499, + "epoch": 0.08080565158575079, + "flos": 60855376776840.0, + "grad_norm": 0.8002739345317464, + "language_loss": 0.57418454, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59857821, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.09228516, + "step": 1344, + "time_per_iteration": 3.169650077819824 + }, + { + "auxiliary_loss_clip": 0.0159721, + "auxiliary_loss_mlp": 0.01067232, + "balance_loss_clip": 1.36285019, + "balance_loss_mlp": 1.04054141, + "epoch": 0.08086577483841875, + "flos": 18336911193000.0, + "grad_norm": 2.1128077525878193, + "language_loss": 0.8728385, + "learning_rate": 3.972921303701695e-06, + "loss": 0.89948297, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.26721191, + "step": 1345, + "time_per_iteration": 2.768602132797241 + }, + { + "auxiliary_loss_clip": 0.01577293, + "auxiliary_loss_mlp": 0.01061769, + "balance_loss_clip": 1.34924126, + "balance_loss_mlp": 1.03245592, + "epoch": 0.08092589809108673, + "flos": 21548634549120.0, + "grad_norm": 1.7346970143388813, + "language_loss": 0.87805361, + "learning_rate": 3.972857395313042e-06, + "loss": 0.90444428, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.29333496, + "step": 1346, + "time_per_iteration": 2.778862714767456 + }, + { + "auxiliary_loss_clip": 0.01579836, + "auxiliary_loss_mlp": 0.01060488, + "balance_loss_clip": 1.35173786, + "balance_loss_mlp": 1.03411937, + "epoch": 0.0809860213437547, + "flos": 22133204450640.0, + "grad_norm": 1.5116372942200362, + "language_loss": 0.93004405, + "learning_rate": 3.972793412113439e-06, + "loss": 0.9564473, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26367188, + "step": 1347, + "time_per_iteration": 2.7920968532562256 + }, + { + "auxiliary_loss_clip": 0.01574959, + "auxiliary_loss_mlp": 0.01067115, + "balance_loss_clip": 1.34582853, + "balance_loss_mlp": 1.03802776, + "epoch": 0.08104614459642266, + "flos": 21730475095920.0, + "grad_norm": 1.764751038212512, + "language_loss": 0.89981961, + "learning_rate": 3.972729354105312e-06, + "loss": 0.92624032, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.29101562, + "step": 1348, + "time_per_iteration": 2.763753652572632 + }, + { + "auxiliary_loss_clip": 0.01576892, + "auxiliary_loss_mlp": 0.01065939, + "balance_loss_clip": 1.3498075, + "balance_loss_mlp": 1.04082131, + "epoch": 0.08110626784909064, + "flos": 23957254480680.0, + "grad_norm": 1.5631669121329343, + "language_loss": 0.76761949, + "learning_rate": 3.97266522129109e-06, + "loss": 0.79404777, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.25109863, + "step": 1349, + "time_per_iteration": 2.8015530109405518 + }, + { + "auxiliary_loss_clip": 0.01586582, + "auxiliary_loss_mlp": 0.01070853, + "balance_loss_clip": 1.35168004, + "balance_loss_mlp": 1.04444814, + "epoch": 0.0811663911017586, + "flos": 19030341772440.0, + "grad_norm": 1.9973445793816205, + "language_loss": 0.89024568, + "learning_rate": 3.972601013673205e-06, + "loss": 0.91682005, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26403809, + "step": 1350, + "time_per_iteration": 2.71225643157959 + }, + { + "auxiliary_loss_clip": 0.01576982, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.35006261, + "balance_loss_mlp": 1.04564691, + "epoch": 0.08122651435442657, + "flos": 15345548736120.0, + "grad_norm": 2.4989185747980445, + "language_loss": 0.82656908, + "learning_rate": 3.972536731254092e-06, + "loss": 0.85306162, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26635742, + "step": 1351, + "time_per_iteration": 2.780240535736084 + }, + { + "auxiliary_loss_clip": 0.01574699, + "auxiliary_loss_mlp": 0.01057933, + "balance_loss_clip": 1.34377217, + "balance_loss_mlp": 1.03037202, + "epoch": 0.08128663760709455, + "flos": 23226765449760.0, + "grad_norm": 1.767056536923252, + "language_loss": 0.75678754, + "learning_rate": 3.972472374036189e-06, + "loss": 0.78311384, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27600098, + "step": 1352, + "time_per_iteration": 2.7849442958831787 + }, + { + "auxiliary_loss_clip": 0.01582712, + "auxiliary_loss_mlp": 0.01060635, + "balance_loss_clip": 1.34862542, + "balance_loss_mlp": 1.03495789, + "epoch": 0.08134676085976252, + "flos": 22970726783280.0, + "grad_norm": 2.0844509096249326, + "language_loss": 0.83472306, + "learning_rate": 3.972407942021935e-06, + "loss": 0.86115658, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.25683594, + "step": 1353, + "time_per_iteration": 2.8348703384399414 + }, + { + "auxiliary_loss_clip": 0.01410961, + "auxiliary_loss_mlp": 0.01024991, + "balance_loss_clip": 1.2444334, + "balance_loss_mlp": 1.01531148, + "epoch": 0.08140688411243048, + "flos": 64334979459360.0, + "grad_norm": 0.8560792505123134, + "language_loss": 0.59772557, + "learning_rate": 3.972343435213775e-06, + "loss": 0.62208509, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.09667969, + "step": 1354, + "time_per_iteration": 3.3303580284118652 + }, + { + "auxiliary_loss_clip": 0.01577646, + "auxiliary_loss_mlp": 0.01059277, + "balance_loss_clip": 1.3517344, + "balance_loss_mlp": 1.03300357, + "epoch": 0.08146700736509845, + "flos": 22496966760960.0, + "grad_norm": 1.6896590136819196, + "language_loss": 0.83351606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85988533, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26306152, + "step": 1355, + "time_per_iteration": 2.814314603805542 + }, + { + "auxiliary_loss_clip": 0.01572137, + "auxiliary_loss_mlp": 0.01065978, + "balance_loss_clip": 1.34439397, + "balance_loss_mlp": 1.03956163, + "epoch": 0.08152713061776642, + "flos": 20452555831680.0, + "grad_norm": 1.7625007404730346, + "language_loss": 0.72201365, + "learning_rate": 3.972214197225521e-06, + "loss": 0.74839479, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26416016, + "step": 1356, + "time_per_iteration": 2.8531198501586914 + }, + { + "auxiliary_loss_clip": 0.01573524, + "auxiliary_loss_mlp": 0.01060065, + "balance_loss_clip": 1.34042025, + "balance_loss_mlp": 1.03234935, + "epoch": 0.08158725387043439, + "flos": 23555499726600.0, + "grad_norm": 2.5836851949319284, + "language_loss": 0.7113682, + "learning_rate": 3.972149466050329e-06, + "loss": 0.73770404, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27722168, + "step": 1357, + "time_per_iteration": 2.8476293087005615 + }, + { + "auxiliary_loss_clip": 0.01585856, + "auxiliary_loss_mlp": 0.01058125, + "balance_loss_clip": 1.35221922, + "balance_loss_mlp": 1.03124309, + "epoch": 0.08164737712310235, + "flos": 22022353963080.0, + "grad_norm": 2.3574631444332295, + "language_loss": 0.84989715, + "learning_rate": 3.97208466009103e-06, + "loss": 0.87633693, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.26879883, + "step": 1358, + "time_per_iteration": 2.788384437561035 + }, + { + "auxiliary_loss_clip": 0.01572297, + "auxiliary_loss_mlp": 0.0106837, + "balance_loss_clip": 1.34081781, + "balance_loss_mlp": 1.03958094, + "epoch": 0.08170750037577033, + "flos": 23373090662760.0, + "grad_norm": 1.822261573714481, + "language_loss": 1.02873158, + "learning_rate": 3.972019779350084e-06, + "loss": 1.05513823, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28771973, + "step": 1359, + "time_per_iteration": 2.790506601333618 + }, + { + "auxiliary_loss_clip": 0.01582804, + "auxiliary_loss_mlp": 0.01069838, + "balance_loss_clip": 1.34706068, + "balance_loss_mlp": 1.04270601, + "epoch": 0.0817676236284383, + "flos": 28403260095240.0, + "grad_norm": 2.7739301993536976, + "language_loss": 0.84454036, + "learning_rate": 3.971954823829951e-06, + "loss": 0.87106669, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.27124023, + "step": 1360, + "time_per_iteration": 2.854642629623413 + }, + { + "auxiliary_loss_clip": 0.01581342, + "auxiliary_loss_mlp": 0.01077949, + "balance_loss_clip": 1.34532785, + "balance_loss_mlp": 1.04870653, + "epoch": 0.08182774688110626, + "flos": 19213684828560.0, + "grad_norm": 2.2755208225040477, + "language_loss": 0.72948676, + "learning_rate": 3.971889793533093e-06, + "loss": 0.75607967, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2923584, + "step": 1361, + "time_per_iteration": 2.7518749237060547 + }, + { + "auxiliary_loss_clip": 0.01568645, + "auxiliary_loss_mlp": 0.01073613, + "balance_loss_clip": 1.33906329, + "balance_loss_mlp": 1.04592037, + "epoch": 0.08188787013377424, + "flos": 22789292320080.0, + "grad_norm": 2.1107986870372173, + "language_loss": 0.76974595, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79616857, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27648926, + "step": 1362, + "time_per_iteration": 2.8030893802642822 + }, + { + "auxiliary_loss_clip": 0.01575519, + "auxiliary_loss_mlp": 0.01076218, + "balance_loss_clip": 1.3454318, + "balance_loss_mlp": 1.05040932, + "epoch": 0.08194799338644221, + "flos": 16471944917280.0, + "grad_norm": 2.10698461301885, + "language_loss": 0.73349285, + "learning_rate": 3.971759508619069e-06, + "loss": 0.7600103, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25842285, + "step": 1363, + "time_per_iteration": 2.7829787731170654 + }, + { + "auxiliary_loss_clip": 0.01586408, + "auxiliary_loss_mlp": 0.01077072, + "balance_loss_clip": 1.35422802, + "balance_loss_mlp": 1.04779375, + "epoch": 0.08200811663911017, + "flos": 23918774736600.0, + "grad_norm": 1.9039839142683352, + "language_loss": 0.78189856, + "learning_rate": 3.971694254006844e-06, + "loss": 0.80853337, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.29272461, + "step": 1364, + "time_per_iteration": 2.7934954166412354 + }, + { + "auxiliary_loss_clip": 0.01582401, + "auxiliary_loss_mlp": 0.0107295, + "balance_loss_clip": 1.34778857, + "balance_loss_mlp": 1.04426873, + "epoch": 0.08206823989177814, + "flos": 17901306047880.0, + "grad_norm": 1.7108257749616507, + "language_loss": 0.82613176, + "learning_rate": 3.971628924627776e-06, + "loss": 0.85268533, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28686523, + "step": 1365, + "time_per_iteration": 2.8766016960144043 + }, + { + "auxiliary_loss_clip": 0.01577563, + "auxiliary_loss_mlp": 0.01067697, + "balance_loss_clip": 1.34837985, + "balance_loss_mlp": 1.0416379, + "epoch": 0.08212836314444612, + "flos": 22092937938720.0, + "grad_norm": 1.6042366204106469, + "language_loss": 0.82488739, + "learning_rate": 3.97156352048434e-06, + "loss": 0.85134006, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26062012, + "step": 1366, + "time_per_iteration": 2.7463018894195557 + }, + { + "auxiliary_loss_clip": 0.01578452, + "auxiliary_loss_mlp": 0.0109067, + "balance_loss_clip": 1.34541094, + "balance_loss_mlp": 1.0644083, + "epoch": 0.08218848639711408, + "flos": 17601264900360.0, + "grad_norm": 1.6651755319185801, + "language_loss": 0.82116574, + "learning_rate": 3.97149804157902e-06, + "loss": 0.847857, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26269531, + "step": 1367, + "time_per_iteration": 2.7366092205047607 + }, + { + "auxiliary_loss_clip": 0.0158917, + "auxiliary_loss_mlp": 0.01081895, + "balance_loss_clip": 1.35234332, + "balance_loss_mlp": 1.05386925, + "epoch": 0.08224860964978205, + "flos": 17862298395120.0, + "grad_norm": 2.639058819343191, + "language_loss": 0.8371377, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86384839, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28039551, + "step": 1368, + "time_per_iteration": 2.738034725189209 + }, + { + "auxiliary_loss_clip": 0.01564099, + "auxiliary_loss_mlp": 0.0106427, + "balance_loss_clip": 1.33966255, + "balance_loss_mlp": 1.03812706, + "epoch": 0.08230873290245003, + "flos": 25232899676760.0, + "grad_norm": 1.6035163020692733, + "language_loss": 0.81768614, + "learning_rate": 3.971366859492653e-06, + "loss": 0.84396982, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26147461, + "step": 1369, + "time_per_iteration": 2.8222439289093018 + }, + { + "auxiliary_loss_clip": 0.01579507, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_clip": 1.3481195, + "balance_loss_mlp": 1.0433495, + "epoch": 0.08236885615511799, + "flos": 31766344101000.0, + "grad_norm": 2.109371570504117, + "language_loss": 0.75515378, + "learning_rate": 3.971301156316582e-06, + "loss": 0.78167051, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28820801, + "step": 1370, + "time_per_iteration": 2.8432464599609375 + }, + { + "auxiliary_loss_clip": 0.0158805, + "auxiliary_loss_mlp": 0.01079177, + "balance_loss_clip": 1.35398197, + "balance_loss_mlp": 1.04937446, + "epoch": 0.08242897940778596, + "flos": 23191128290880.0, + "grad_norm": 1.5288712718507778, + "language_loss": 0.7514509, + "learning_rate": 3.971235378388573e-06, + "loss": 0.77812314, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.29785156, + "step": 1371, + "time_per_iteration": 2.742992401123047 + }, + { + "auxiliary_loss_clip": 0.01580023, + "auxiliary_loss_mlp": 0.01068679, + "balance_loss_clip": 1.34682989, + "balance_loss_mlp": 1.03861415, + "epoch": 0.08248910266045394, + "flos": 34497607055400.0, + "grad_norm": 2.03783839123832, + "language_loss": 0.71048939, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73697644, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.30053711, + "step": 1372, + "time_per_iteration": 4.356487512588501 + }, + { + "auxiliary_loss_clip": 0.01586533, + "auxiliary_loss_mlp": 0.0106351, + "balance_loss_clip": 1.34810662, + "balance_loss_mlp": 1.03066802, + "epoch": 0.0825492259131219, + "flos": 13439219522760.0, + "grad_norm": 2.2963718411272294, + "language_loss": 0.88450754, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.911008, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.32824707, + "step": 1373, + "time_per_iteration": 4.242610454559326 + }, + { + "auxiliary_loss_clip": 0.01583718, + "auxiliary_loss_mlp": 0.01063207, + "balance_loss_clip": 1.34733534, + "balance_loss_mlp": 1.03500199, + "epoch": 0.08260934916578987, + "flos": 25818038095320.0, + "grad_norm": 2.0913077108767752, + "language_loss": 0.82653141, + "learning_rate": 3.971037596117882e-06, + "loss": 0.85300064, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28198242, + "step": 1374, + "time_per_iteration": 4.3589770793914795 + }, + { + "auxiliary_loss_clip": 0.01428356, + "auxiliary_loss_mlp": 0.0101418, + "balance_loss_clip": 1.26427066, + "balance_loss_mlp": 1.00521505, + "epoch": 0.08266947241845783, + "flos": 63474530384520.0, + "grad_norm": 0.8311813996443405, + "language_loss": 0.60624504, + "learning_rate": 3.970971519207095e-06, + "loss": 0.63067037, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.08984375, + "step": 1375, + "time_per_iteration": 4.708900451660156 + }, + { + "auxiliary_loss_clip": 0.0142232, + "auxiliary_loss_mlp": 0.01014967, + "balance_loss_clip": 1.26139319, + "balance_loss_mlp": 1.0048579, + "epoch": 0.08272959567112581, + "flos": 70009436709720.0, + "grad_norm": 0.9184486235975597, + "language_loss": 0.62249899, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64687181, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.10107422, + "step": 1376, + "time_per_iteration": 3.1758036613464355 + }, + { + "auxiliary_loss_clip": 0.0159073, + "auxiliary_loss_mlp": 0.01074067, + "balance_loss_clip": 1.35261893, + "balance_loss_mlp": 1.04480076, + "epoch": 0.08278971892379378, + "flos": 20418177531960.0, + "grad_norm": 1.839275990105905, + "language_loss": 0.82674921, + "learning_rate": 3.970839141169718e-06, + "loss": 0.85339725, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.29272461, + "step": 1377, + "time_per_iteration": 2.7886617183685303 + }, + { + "auxiliary_loss_clip": 0.01576803, + "auxiliary_loss_mlp": 0.01065568, + "balance_loss_clip": 1.34352493, + "balance_loss_mlp": 1.03704143, + "epoch": 0.08284984217646174, + "flos": 26255876700240.0, + "grad_norm": 1.7771978683728522, + "language_loss": 0.8548497, + "learning_rate": 3.970772840048147e-06, + "loss": 0.88127345, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.28503418, + "step": 1378, + "time_per_iteration": 2.775960683822632 + }, + { + "auxiliary_loss_clip": 0.01590671, + "auxiliary_loss_mlp": 0.010729, + "balance_loss_clip": 1.3504951, + "balance_loss_mlp": 1.04295456, + "epoch": 0.08290996542912972, + "flos": 27199904425920.0, + "grad_norm": 1.9053870060553295, + "language_loss": 0.88205659, + "learning_rate": 3.970706464194672e-06, + "loss": 0.9086923, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29968262, + "step": 1379, + "time_per_iteration": 2.957630157470703 + }, + { + "auxiliary_loss_clip": 0.01579527, + "auxiliary_loss_mlp": 0.01059937, + "balance_loss_clip": 1.34940016, + "balance_loss_mlp": 1.03355622, + "epoch": 0.08297008868179769, + "flos": 38625355350000.0, + "grad_norm": 2.659553337628084, + "language_loss": 0.79179865, + "learning_rate": 3.970640013611812e-06, + "loss": 0.81819332, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2635498, + "step": 1380, + "time_per_iteration": 2.929072618484497 + }, + { + "auxiliary_loss_clip": 0.01576413, + "auxiliary_loss_mlp": 0.01067634, + "balance_loss_clip": 1.34553385, + "balance_loss_mlp": 1.03628254, + "epoch": 0.08303021193446565, + "flos": 19979567368200.0, + "grad_norm": 2.1817218548166553, + "language_loss": 0.87061644, + "learning_rate": 3.970573488302083e-06, + "loss": 0.89705694, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.31323242, + "step": 1381, + "time_per_iteration": 2.7355892658233643 + }, + { + "auxiliary_loss_clip": 0.01600028, + "auxiliary_loss_mlp": 0.01068585, + "balance_loss_clip": 1.35986066, + "balance_loss_mlp": 1.03607655, + "epoch": 0.08309033518713363, + "flos": 13666646276640.0, + "grad_norm": 2.752519322130927, + "language_loss": 0.89028168, + "learning_rate": 3.970506888268011e-06, + "loss": 0.91696781, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.32507324, + "step": 1382, + "time_per_iteration": 2.7936878204345703 + }, + { + "auxiliary_loss_clip": 0.01598061, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_clip": 1.361848, + "balance_loss_mlp": 1.03179479, + "epoch": 0.0831504584398016, + "flos": 17973189491040.0, + "grad_norm": 1.8727117513596, + "language_loss": 0.77741551, + "learning_rate": 3.970440213512121e-06, + "loss": 0.80398285, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2689209, + "step": 1383, + "time_per_iteration": 2.7629716396331787 + }, + { + "auxiliary_loss_clip": 0.01598103, + "auxiliary_loss_mlp": 0.01065339, + "balance_loss_clip": 1.36086321, + "balance_loss_mlp": 1.03632379, + "epoch": 0.08321058169246956, + "flos": 22606517781000.0, + "grad_norm": 1.9620283404131933, + "language_loss": 0.83303648, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85967094, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.29003906, + "step": 1384, + "time_per_iteration": 2.7577860355377197 + }, + { + "auxiliary_loss_clip": 0.01609028, + "auxiliary_loss_mlp": 0.01064996, + "balance_loss_clip": 1.36360693, + "balance_loss_mlp": 1.03309596, + "epoch": 0.08327070494513754, + "flos": 22854759642360.0, + "grad_norm": 2.569701629700961, + "language_loss": 0.85758412, + "learning_rate": 3.970306639845e-06, + "loss": 0.88432443, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.3190918, + "step": 1385, + "time_per_iteration": 2.7384350299835205 + }, + { + "auxiliary_loss_clip": 0.0160523, + "auxiliary_loss_mlp": 0.01072009, + "balance_loss_clip": 1.36113048, + "balance_loss_mlp": 1.03801036, + "epoch": 0.0833308281978055, + "flos": 22788033460920.0, + "grad_norm": 1.7291299391645134, + "language_loss": 0.69143844, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71821082, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.34033203, + "step": 1386, + "time_per_iteration": 2.8284401893615723 + }, + { + "auxiliary_loss_clip": 0.01598593, + "auxiliary_loss_mlp": 0.01067821, + "balance_loss_clip": 1.36110806, + "balance_loss_mlp": 1.03855491, + "epoch": 0.08339095145047347, + "flos": 20817373959360.0, + "grad_norm": 1.6086459126691615, + "language_loss": 0.8236782, + "learning_rate": 3.97017276732098e-06, + "loss": 0.85034233, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.2923584, + "step": 1387, + "time_per_iteration": 2.855592727661133 + }, + { + "auxiliary_loss_clip": 0.01604013, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.3633424, + "balance_loss_mlp": 1.05008793, + "epoch": 0.08345107470314143, + "flos": 18519969990600.0, + "grad_norm": 2.0499172891399717, + "language_loss": 0.77332181, + "learning_rate": 3.970105718993978e-06, + "loss": 0.80017966, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.31665039, + "step": 1388, + "time_per_iteration": 2.730926275253296 + }, + { + "auxiliary_loss_clip": 0.01595774, + "auxiliary_loss_mlp": 0.01081779, + "balance_loss_clip": 1.3625114, + "balance_loss_mlp": 1.0489254, + "epoch": 0.08351119795580941, + "flos": 18812295549720.0, + "grad_norm": 1.88720632366906, + "language_loss": 0.79779291, + "learning_rate": 3.970038595960369e-06, + "loss": 0.82456845, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.328125, + "step": 1389, + "time_per_iteration": 2.7720730304718018 + }, + { + "auxiliary_loss_clip": 0.01610733, + "auxiliary_loss_mlp": 0.01076597, + "balance_loss_clip": 1.37287557, + "balance_loss_mlp": 1.04681802, + "epoch": 0.08357132120847738, + "flos": 18446380996320.0, + "grad_norm": 3.7288100366795507, + "language_loss": 0.88296819, + "learning_rate": 3.969971398222699e-06, + "loss": 0.90984142, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.29760742, + "step": 1390, + "time_per_iteration": 2.7091193199157715 + }, + { + "auxiliary_loss_clip": 0.0160294, + "auxiliary_loss_mlp": 0.01065889, + "balance_loss_clip": 1.36468077, + "balance_loss_mlp": 1.03503776, + "epoch": 0.08363144446114534, + "flos": 25927995198960.0, + "grad_norm": 1.628254398660631, + "language_loss": 0.87070358, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89739192, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.30859375, + "step": 1391, + "time_per_iteration": 2.864932060241699 + }, + { + "auxiliary_loss_clip": 0.01625292, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_clip": 1.37894893, + "balance_loss_mlp": 1.04292834, + "epoch": 0.08369156771381332, + "flos": 18045925709760.0, + "grad_norm": 2.1729508198591194, + "language_loss": 0.88171905, + "learning_rate": 3.969836778645371e-06, + "loss": 0.90870667, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.30541992, + "step": 1392, + "time_per_iteration": 2.7523133754730225 + }, + { + "auxiliary_loss_clip": 0.01602536, + "auxiliary_loss_mlp": 0.01080688, + "balance_loss_clip": 1.3630743, + "balance_loss_mlp": 1.0506475, + "epoch": 0.08375169096648129, + "flos": 22680350425440.0, + "grad_norm": 2.2233338567507763, + "language_loss": 0.81090504, + "learning_rate": 3.969769356810819e-06, + "loss": 0.8377372, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.30053711, + "step": 1393, + "time_per_iteration": 2.86067533493042 + }, + { + "auxiliary_loss_clip": 0.01606804, + "auxiliary_loss_mlp": 0.0107622, + "balance_loss_clip": 1.37383974, + "balance_loss_mlp": 1.04708517, + "epoch": 0.08381181421914925, + "flos": 26108698711680.0, + "grad_norm": 1.858255246964361, + "language_loss": 0.85107821, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87790847, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.29125977, + "step": 1394, + "time_per_iteration": 2.78365159034729 + }, + { + "auxiliary_loss_clip": 0.01610978, + "auxiliary_loss_mlp": 0.01075449, + "balance_loss_clip": 1.37274039, + "balance_loss_mlp": 1.04708934, + "epoch": 0.08387193747181723, + "flos": 20634477595200.0, + "grad_norm": 1.7156768969706027, + "language_loss": 0.83223355, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85909784, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.28356934, + "step": 1395, + "time_per_iteration": 2.7853071689605713 + }, + { + "auxiliary_loss_clip": 0.01606239, + "auxiliary_loss_mlp": 0.01073411, + "balance_loss_clip": 1.36681509, + "balance_loss_mlp": 1.03955531, + "epoch": 0.0839320607244852, + "flos": 13447097544600.0, + "grad_norm": 3.490074926055122, + "language_loss": 0.83123094, + "learning_rate": 3.969566643154293e-06, + "loss": 0.8580274, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.33837891, + "step": 1396, + "time_per_iteration": 2.721076726913452 + }, + { + "auxiliary_loss_clip": 0.01605889, + "auxiliary_loss_mlp": 0.01079017, + "balance_loss_clip": 1.37121761, + "balance_loss_mlp": 1.04733086, + "epoch": 0.08399218397715316, + "flos": 23482519857720.0, + "grad_norm": 2.023451457861439, + "language_loss": 0.77191466, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79876375, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.31713867, + "step": 1397, + "time_per_iteration": 2.847717761993408 + }, + { + "auxiliary_loss_clip": 0.01614894, + "auxiliary_loss_mlp": 0.01058241, + "balance_loss_clip": 1.37859643, + "balance_loss_mlp": 1.02882051, + "epoch": 0.08405230722982113, + "flos": 25926167822760.0, + "grad_norm": 1.7638387004825538, + "language_loss": 0.78482991, + "learning_rate": 3.969431127281516e-06, + "loss": 0.81156123, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.29418945, + "step": 1398, + "time_per_iteration": 2.9151391983032227 + }, + { + "auxiliary_loss_clip": 0.01608507, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.37866914, + "balance_loss_mlp": 1.03553832, + "epoch": 0.0841124304824891, + "flos": 17971930631880.0, + "grad_norm": 2.2613080480451977, + "language_loss": 0.94675153, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97348344, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.29162598, + "step": 1399, + "time_per_iteration": 2.780752658843994 + }, + { + "auxiliary_loss_clip": 0.01617645, + "auxiliary_loss_mlp": 0.01070575, + "balance_loss_clip": 1.37300634, + "balance_loss_mlp": 1.03962839, + "epoch": 0.08417255373515707, + "flos": 25635182339520.0, + "grad_norm": 2.158422402906953, + "language_loss": 0.82563633, + "learning_rate": 3.96929531268464e-06, + "loss": 0.85251856, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.30957031, + "step": 1400, + "time_per_iteration": 2.825191020965576 + }, + { + "auxiliary_loss_clip": 0.01613739, + "auxiliary_loss_mlp": 0.01071752, + "balance_loss_clip": 1.37578905, + "balance_loss_mlp": 1.04085267, + "epoch": 0.08423267698782504, + "flos": 26255308183200.0, + "grad_norm": 1.7089529607690155, + "language_loss": 0.86689281, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89374781, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.30883789, + "step": 1401, + "time_per_iteration": 2.7733864784240723 + }, + { + "auxiliary_loss_clip": 0.01616413, + "auxiliary_loss_mlp": 0.0107945, + "balance_loss_clip": 1.37777448, + "balance_loss_mlp": 1.049052, + "epoch": 0.08429280024049302, + "flos": 20124430680240.0, + "grad_norm": 1.8436936656827938, + "language_loss": 0.88009667, + "learning_rate": 3.969159199384263e-06, + "loss": 0.90705538, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.30383301, + "step": 1402, + "time_per_iteration": 2.7383813858032227 + }, + { + "auxiliary_loss_clip": 0.01598563, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_clip": 1.36727762, + "balance_loss_mlp": 1.03550041, + "epoch": 0.08435292349316098, + "flos": 42932385864720.0, + "grad_norm": 2.1063417338778314, + "language_loss": 0.89593947, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.92256409, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28417969, + "step": 1403, + "time_per_iteration": 3.001909017562866 + }, + { + "auxiliary_loss_clip": 0.0162021, + "auxiliary_loss_mlp": 0.01065829, + "balance_loss_clip": 1.38109791, + "balance_loss_mlp": 1.03593159, + "epoch": 0.08441304674582895, + "flos": 22862515839120.0, + "grad_norm": 2.5247057734657052, + "language_loss": 0.80483657, + "learning_rate": 3.969022787401033e-06, + "loss": 0.83169699, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.29907227, + "step": 1404, + "time_per_iteration": 2.783916473388672 + }, + { + "auxiliary_loss_clip": 0.01627895, + "auxiliary_loss_mlp": 0.01087417, + "balance_loss_clip": 1.38361931, + "balance_loss_mlp": 1.05711365, + "epoch": 0.08447316999849692, + "flos": 18702135404280.0, + "grad_norm": 1.8674306259354958, + "language_loss": 0.84235823, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86951137, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.30334473, + "step": 1405, + "time_per_iteration": 2.7351248264312744 + }, + { + "auxiliary_loss_clip": 0.01608691, + "auxiliary_loss_mlp": 0.01064664, + "balance_loss_clip": 1.37498665, + "balance_loss_mlp": 1.03561234, + "epoch": 0.08453329325116489, + "flos": 25489506860280.0, + "grad_norm": 1.778451993936768, + "language_loss": 0.81017852, + "learning_rate": 3.968886076755639e-06, + "loss": 0.8369121, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.29040527, + "step": 1406, + "time_per_iteration": 2.805734634399414 + }, + { + "auxiliary_loss_clip": 0.0162179, + "auxiliary_loss_mlp": 0.01078008, + "balance_loss_clip": 1.38063145, + "balance_loss_mlp": 1.04825377, + "epoch": 0.08459341650383286, + "flos": 20924569694520.0, + "grad_norm": 1.7961761678311259, + "language_loss": 0.79859906, + "learning_rate": 3.96881760944111e-06, + "loss": 0.82559705, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.29760742, + "step": 1407, + "time_per_iteration": 2.748882532119751 + }, + { + "auxiliary_loss_clip": 0.01617688, + "auxiliary_loss_mlp": 0.01067716, + "balance_loss_clip": 1.38183641, + "balance_loss_mlp": 1.03960633, + "epoch": 0.08465353975650082, + "flos": 13047088950000.0, + "grad_norm": 2.5666524535088326, + "language_loss": 0.92585361, + "learning_rate": 3.968749067468819e-06, + "loss": 0.95270771, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28125, + "step": 1408, + "time_per_iteration": 2.7672903537750244 + }, + { + "auxiliary_loss_clip": 0.01449447, + "auxiliary_loss_mlp": 0.01010626, + "balance_loss_clip": 1.30253577, + "balance_loss_mlp": 1.00290167, + "epoch": 0.0847136630091688, + "flos": 60892638270120.0, + "grad_norm": 0.8876595784372489, + "language_loss": 0.61838275, + "learning_rate": 3.968680450841368e-06, + "loss": 0.6429835, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07714844, + "step": 1409, + "time_per_iteration": 3.4064435958862305 + }, + { + "auxiliary_loss_clip": 0.01596282, + "auxiliary_loss_mlp": 0.0107509, + "balance_loss_clip": 1.37037337, + "balance_loss_mlp": 1.04577684, + "epoch": 0.08477378626183676, + "flos": 22050600400440.0, + "grad_norm": 2.009856240547952, + "language_loss": 0.87296438, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89967811, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.29284668, + "step": 1410, + "time_per_iteration": 4.357713937759399 + }, + { + "auxiliary_loss_clip": 0.01609348, + "auxiliary_loss_mlp": 0.01073739, + "balance_loss_clip": 1.37454283, + "balance_loss_mlp": 1.04254162, + "epoch": 0.08483390951450473, + "flos": 16694336234520.0, + "grad_norm": 2.339136498506493, + "language_loss": 0.74624455, + "learning_rate": 3.968542993631388e-06, + "loss": 0.77307546, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.31176758, + "step": 1411, + "time_per_iteration": 2.876882791519165 + }, + { + "auxiliary_loss_clip": 0.01442197, + "auxiliary_loss_mlp": 0.01010544, + "balance_loss_clip": 1.29673004, + "balance_loss_mlp": 1.0018177, + "epoch": 0.08489403276717271, + "flos": 51600090537720.0, + "grad_norm": 0.9035080462660087, + "language_loss": 0.5679549, + "learning_rate": 3.968474153054073e-06, + "loss": 0.59248233, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.08740234, + "step": 1412, + "time_per_iteration": 4.67578387260437 + }, + { + "auxiliary_loss_clip": 0.01606188, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_clip": 1.37575364, + "balance_loss_mlp": 1.03586137, + "epoch": 0.08495415601984067, + "flos": 17096781330720.0, + "grad_norm": 2.4591262320938925, + "language_loss": 0.89271754, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91942143, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28308105, + "step": 1413, + "time_per_iteration": 4.284907579421997 + }, + { + "auxiliary_loss_clip": 0.0160452, + "auxiliary_loss_mlp": 0.01058865, + "balance_loss_clip": 1.37443006, + "balance_loss_mlp": 1.02839494, + "epoch": 0.08501427927250864, + "flos": 23153582539080.0, + "grad_norm": 1.955193892875317, + "language_loss": 0.88692975, + "learning_rate": 3.968336247967844e-06, + "loss": 0.91356361, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.30493164, + "step": 1414, + "time_per_iteration": 2.7716989517211914 + }, + { + "auxiliary_loss_clip": 0.01614876, + "auxiliary_loss_mlp": 0.0107057, + "balance_loss_clip": 1.38444257, + "balance_loss_mlp": 1.04330635, + "epoch": 0.08507440252517662, + "flos": 19068293607840.0, + "grad_norm": 1.7178338926887737, + "language_loss": 0.77855408, + "learning_rate": 3.96826718346416e-06, + "loss": 0.80540848, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27282715, + "step": 1415, + "time_per_iteration": 4.320003986358643 + }, + { + "auxiliary_loss_clip": 0.01597317, + "auxiliary_loss_mlp": 0.01062247, + "balance_loss_clip": 1.36859524, + "balance_loss_mlp": 1.03420854, + "epoch": 0.08513452577784458, + "flos": 60192409113360.0, + "grad_norm": 1.985083754090771, + "language_loss": 0.70812166, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73471725, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28027344, + "step": 1416, + "time_per_iteration": 3.164186477661133 + }, + { + "auxiliary_loss_clip": 0.0160954, + "auxiliary_loss_mlp": 0.01075284, + "balance_loss_clip": 1.37295198, + "balance_loss_mlp": 1.0438602, + "epoch": 0.08519464903051255, + "flos": 27314287840800.0, + "grad_norm": 1.7503355938835587, + "language_loss": 0.75967324, + "learning_rate": 3.968128830548748e-06, + "loss": 0.78652149, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.31445312, + "step": 1417, + "time_per_iteration": 2.8411366939544678 + }, + { + "auxiliary_loss_clip": 0.01595325, + "auxiliary_loss_mlp": 0.01059617, + "balance_loss_clip": 1.36566401, + "balance_loss_mlp": 1.03203201, + "epoch": 0.08525477228318051, + "flos": 20271283801920.0, + "grad_norm": 2.138428154502657, + "language_loss": 0.82924867, + "learning_rate": 3.968059542142265e-06, + "loss": 0.85579813, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27587891, + "step": 1418, + "time_per_iteration": 2.73305082321167 + }, + { + "auxiliary_loss_clip": 0.01438949, + "auxiliary_loss_mlp": 0.01011552, + "balance_loss_clip": 1.29526949, + "balance_loss_mlp": 1.0033983, + "epoch": 0.08531489553584849, + "flos": 67629509907480.0, + "grad_norm": 0.8524634243041185, + "language_loss": 0.56609547, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.59060049, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.08154297, + "step": 1419, + "time_per_iteration": 3.212524652481079 + }, + { + "auxiliary_loss_clip": 0.01601296, + "auxiliary_loss_mlp": 0.01069539, + "balance_loss_clip": 1.36914873, + "balance_loss_mlp": 1.04038095, + "epoch": 0.08537501878851646, + "flos": 27532740147120.0, + "grad_norm": 2.287577754183956, + "language_loss": 0.70768535, + "learning_rate": 3.967920741444886e-06, + "loss": 0.73439366, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.29162598, + "step": 1420, + "time_per_iteration": 2.7880496978759766 + }, + { + "auxiliary_loss_clip": 0.01596455, + "auxiliary_loss_mlp": 0.01059871, + "balance_loss_clip": 1.36761332, + "balance_loss_mlp": 1.03114104, + "epoch": 0.08543514204118442, + "flos": 22789454753520.0, + "grad_norm": 1.4904801063430235, + "language_loss": 0.88481867, + "learning_rate": 3.967851229159252e-06, + "loss": 0.9113819, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28735352, + "step": 1421, + "time_per_iteration": 2.813218593597412 + }, + { + "auxiliary_loss_clip": 0.01431327, + "auxiliary_loss_mlp": 0.01008631, + "balance_loss_clip": 1.2903831, + "balance_loss_mlp": 1.00090647, + "epoch": 0.0854952652938524, + "flos": 61006453167960.0, + "grad_norm": 0.7937311132718664, + "language_loss": 0.6345064, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65890598, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07714844, + "step": 1422, + "time_per_iteration": 3.2307217121124268 + }, + { + "auxiliary_loss_clip": 0.01597458, + "auxiliary_loss_mlp": 0.01073641, + "balance_loss_clip": 1.37017405, + "balance_loss_mlp": 1.04479194, + "epoch": 0.08555538854652037, + "flos": 28043924096160.0, + "grad_norm": 1.8129400945540692, + "language_loss": 0.83821124, + "learning_rate": 3.967711980727276e-06, + "loss": 0.86492223, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.28857422, + "step": 1423, + "time_per_iteration": 2.9486868381500244 + }, + { + "auxiliary_loss_clip": 0.0160427, + "auxiliary_loss_mlp": 0.01067382, + "balance_loss_clip": 1.3758136, + "balance_loss_mlp": 1.04042935, + "epoch": 0.08561551179918833, + "flos": 23513974355520.0, + "grad_norm": 1.7416731106249264, + "language_loss": 0.74802089, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77473748, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26965332, + "step": 1424, + "time_per_iteration": 2.8117306232452393 + }, + { + "auxiliary_loss_clip": 0.0159971, + "auxiliary_loss_mlp": 0.01064696, + "balance_loss_clip": 1.37127209, + "balance_loss_mlp": 1.03647888, + "epoch": 0.08567563505185631, + "flos": 17930892561120.0, + "grad_norm": 1.770258355313686, + "language_loss": 0.76416636, + "learning_rate": 3.96757243383196e-06, + "loss": 0.79081041, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28186035, + "step": 1425, + "time_per_iteration": 2.7383241653442383 + }, + { + "auxiliary_loss_clip": 0.01595634, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_clip": 1.36881876, + "balance_loss_mlp": 1.03233457, + "epoch": 0.08573575830452428, + "flos": 19724340868920.0, + "grad_norm": 2.723979085876524, + "language_loss": 0.93460143, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.96116602, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.28515625, + "step": 1426, + "time_per_iteration": 2.8004848957061768 + }, + { + "auxiliary_loss_clip": 0.01615591, + "auxiliary_loss_mlp": 0.01080799, + "balance_loss_clip": 1.3815968, + "balance_loss_mlp": 1.04944682, + "epoch": 0.08579588155719224, + "flos": 17936131039560.0, + "grad_norm": 2.348397955248732, + "language_loss": 0.76559389, + "learning_rate": 3.967432588494471e-06, + "loss": 0.79255784, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.31323242, + "step": 1427, + "time_per_iteration": 2.825587272644043 + }, + { + "auxiliary_loss_clip": 0.01598008, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_clip": 1.36967111, + "balance_loss_mlp": 1.04265046, + "epoch": 0.08585600480986022, + "flos": 16037517414600.0, + "grad_norm": 2.676673061477468, + "language_loss": 0.8274883, + "learning_rate": 3.96736255391654e-06, + "loss": 0.85416746, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27258301, + "step": 1428, + "time_per_iteration": 2.7356228828430176 + }, + { + "auxiliary_loss_clip": 0.01609191, + "auxiliary_loss_mlp": 0.01074241, + "balance_loss_clip": 1.3736186, + "balance_loss_mlp": 1.04510593, + "epoch": 0.08591612806252819, + "flos": 28663278381000.0, + "grad_norm": 1.7951344708411634, + "language_loss": 0.80790448, + "learning_rate": 3.967292444736023e-06, + "loss": 0.83473879, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29101562, + "step": 1429, + "time_per_iteration": 2.812656879425049 + }, + { + "auxiliary_loss_clip": 0.01603495, + "auxiliary_loss_mlp": 0.01068109, + "balance_loss_clip": 1.3724668, + "balance_loss_mlp": 1.03924823, + "epoch": 0.08597625131519615, + "flos": 20963983430880.0, + "grad_norm": 2.338987604898923, + "language_loss": 0.88106263, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90777862, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28894043, + "step": 1430, + "time_per_iteration": 2.8321635723114014 + }, + { + "auxiliary_loss_clip": 0.01596453, + "auxiliary_loss_mlp": 0.01080499, + "balance_loss_clip": 1.37292814, + "balance_loss_mlp": 1.05241287, + "epoch": 0.08603637456786412, + "flos": 23261346791280.0, + "grad_norm": 1.9857137264682823, + "language_loss": 0.82497066, + "learning_rate": 3.96715200257787e-06, + "loss": 0.85174012, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.28051758, + "step": 1431, + "time_per_iteration": 2.7919507026672363 + }, + { + "auxiliary_loss_clip": 0.01598133, + "auxiliary_loss_mlp": 0.01082418, + "balance_loss_clip": 1.36940145, + "balance_loss_mlp": 1.05340266, + "epoch": 0.0860964978205321, + "flos": 28700052573960.0, + "grad_norm": 1.5909673194576477, + "language_loss": 0.78096437, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80776989, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.29003906, + "step": 1432, + "time_per_iteration": 2.814639091491699 + }, + { + "auxiliary_loss_clip": 0.01600537, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_clip": 1.37136304, + "balance_loss_mlp": 1.0559386, + "epoch": 0.08615662107320006, + "flos": 19323195240240.0, + "grad_norm": 2.3441708211294237, + "language_loss": 0.73730975, + "learning_rate": 3.967011262041315e-06, + "loss": 0.76417851, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.30371094, + "step": 1433, + "time_per_iteration": 2.7604598999023438 + }, + { + "auxiliary_loss_clip": 0.01608761, + "auxiliary_loss_mlp": 0.01075815, + "balance_loss_clip": 1.3753643, + "balance_loss_mlp": 1.04408109, + "epoch": 0.08621674432586802, + "flos": 15855961126320.0, + "grad_norm": 2.754729103487467, + "language_loss": 0.85858572, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.88543147, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.31713867, + "step": 1434, + "time_per_iteration": 2.7777843475341797 + }, + { + "auxiliary_loss_clip": 0.01598321, + "auxiliary_loss_mlp": 0.01082323, + "balance_loss_clip": 1.36893189, + "balance_loss_mlp": 1.05302191, + "epoch": 0.086276867578536, + "flos": 14104728531720.0, + "grad_norm": 2.0998591808203453, + "language_loss": 0.7881695, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81497598, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.29321289, + "step": 1435, + "time_per_iteration": 2.7453505992889404 + }, + { + "auxiliary_loss_clip": 0.01430889, + "auxiliary_loss_mlp": 0.01018021, + "balance_loss_clip": 1.29420936, + "balance_loss_mlp": 1.01010549, + "epoch": 0.08633699083120397, + "flos": 70201754213400.0, + "grad_norm": 0.8969754471723288, + "language_loss": 0.57920575, + "learning_rate": 3.96679959182369e-06, + "loss": 0.6036948, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.07910156, + "step": 1436, + "time_per_iteration": 3.3748040199279785 + }, + { + "auxiliary_loss_clip": 0.01603514, + "auxiliary_loss_mlp": 0.01074132, + "balance_loss_clip": 1.37270927, + "balance_loss_mlp": 1.04436541, + "epoch": 0.08639711408387193, + "flos": 30304878738840.0, + "grad_norm": 2.6748891121773752, + "language_loss": 0.70467132, + "learning_rate": 3.966728885918437e-06, + "loss": 0.73144782, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29748535, + "step": 1437, + "time_per_iteration": 2.9149515628814697 + }, + { + "auxiliary_loss_clip": 0.01597939, + "auxiliary_loss_mlp": 0.0107303, + "balance_loss_clip": 1.37019432, + "balance_loss_mlp": 1.04463458, + "epoch": 0.08645723733653991, + "flos": 20301844915800.0, + "grad_norm": 2.0028719191779905, + "language_loss": 0.73437059, + "learning_rate": 3.966658105434627e-06, + "loss": 0.76108027, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.28417969, + "step": 1438, + "time_per_iteration": 2.7740976810455322 + }, + { + "auxiliary_loss_clip": 0.01591089, + "auxiliary_loss_mlp": 0.01075416, + "balance_loss_clip": 1.36287785, + "balance_loss_mlp": 1.04547071, + "epoch": 0.08651736058920788, + "flos": 32897085376680.0, + "grad_norm": 1.5459146674692725, + "language_loss": 0.64674175, + "learning_rate": 3.966587250374945e-06, + "loss": 0.67340684, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.29931641, + "step": 1439, + "time_per_iteration": 2.9427919387817383 + }, + { + "auxiliary_loss_clip": 0.01587178, + "auxiliary_loss_mlp": 0.01067759, + "balance_loss_clip": 1.36001837, + "balance_loss_mlp": 1.03874326, + "epoch": 0.08657748384187584, + "flos": 22642439198400.0, + "grad_norm": 1.7746810993211115, + "language_loss": 0.87555647, + "learning_rate": 3.966516320742077e-06, + "loss": 0.90210587, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.29003906, + "step": 1440, + "time_per_iteration": 2.842419385910034 + }, + { + "auxiliary_loss_clip": 0.01617775, + "auxiliary_loss_mlp": 0.01076728, + "balance_loss_clip": 1.38111591, + "balance_loss_mlp": 1.0461148, + "epoch": 0.08663760709454381, + "flos": 23663507628960.0, + "grad_norm": 2.082725445538214, + "language_loss": 0.84505111, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.87199616, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.30615234, + "step": 1441, + "time_per_iteration": 2.7726476192474365 + }, + { + "auxiliary_loss_clip": 0.01408531, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.26935267, + "balance_loss_mlp": 1.00493705, + "epoch": 0.08669773034721179, + "flos": 62700038872560.0, + "grad_norm": 1.0754202893437304, + "language_loss": 0.60433167, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62854928, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08300781, + "step": 1442, + "time_per_iteration": 3.4103662967681885 + }, + { + "auxiliary_loss_clip": 0.016065, + "auxiliary_loss_mlp": 0.01072299, + "balance_loss_clip": 1.37231779, + "balance_loss_mlp": 1.04101801, + "epoch": 0.08675785359987975, + "flos": 20672348213880.0, + "grad_norm": 2.0647885535854895, + "language_loss": 0.79496431, + "learning_rate": 3.96630308443127e-06, + "loss": 0.82175231, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.31323242, + "step": 1443, + "time_per_iteration": 2.7859675884246826 + }, + { + "auxiliary_loss_clip": 0.01600848, + "auxiliary_loss_mlp": 0.01062805, + "balance_loss_clip": 1.37145185, + "balance_loss_mlp": 1.03394449, + "epoch": 0.08681797685254772, + "flos": 26946545911200.0, + "grad_norm": 1.6290047755584292, + "language_loss": 0.82840025, + "learning_rate": 3.966231856532584e-06, + "loss": 0.85503674, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28833008, + "step": 1444, + "time_per_iteration": 2.8191299438476562 + }, + { + "auxiliary_loss_clip": 0.01610507, + "auxiliary_loss_mlp": 0.01057284, + "balance_loss_clip": 1.37828064, + "balance_loss_mlp": 1.02932942, + "epoch": 0.0868781001052157, + "flos": 17717394474720.0, + "grad_norm": 2.0614245314645916, + "language_loss": 0.87975287, + "learning_rate": 3.966160554074189e-06, + "loss": 0.90643078, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.27966309, + "step": 1445, + "time_per_iteration": 2.7575039863586426 + }, + { + "auxiliary_loss_clip": 0.01603338, + "auxiliary_loss_mlp": 0.0106636, + "balance_loss_clip": 1.37473893, + "balance_loss_mlp": 1.03860772, + "epoch": 0.08693822335788366, + "flos": 19900658678760.0, + "grad_norm": 3.942685937275886, + "language_loss": 0.82608432, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8527813, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.27770996, + "step": 1446, + "time_per_iteration": 2.7626349925994873 + }, + { + "auxiliary_loss_clip": 0.01403439, + "auxiliary_loss_mlp": 0.01013718, + "balance_loss_clip": 1.26536274, + "balance_loss_mlp": 1.00556362, + "epoch": 0.08699834661055163, + "flos": 67037183809200.0, + "grad_norm": 0.7279007246730458, + "language_loss": 0.54817581, + "learning_rate": 3.966017725489091e-06, + "loss": 0.5723474, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.08154297, + "step": 1447, + "time_per_iteration": 3.3082613945007324 + }, + { + "auxiliary_loss_clip": 0.01588679, + "auxiliary_loss_mlp": 0.010683, + "balance_loss_clip": 1.36397624, + "balance_loss_mlp": 1.03828287, + "epoch": 0.0870584698632196, + "flos": 13484521471320.0, + "grad_norm": 2.0807899959063336, + "language_loss": 0.84976923, + "learning_rate": 3.965946199367804e-06, + "loss": 0.87633908, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.30029297, + "step": 1448, + "time_per_iteration": 2.8054773807525635 + }, + { + "auxiliary_loss_clip": 0.0160638, + "auxiliary_loss_mlp": 0.01067779, + "balance_loss_clip": 1.37462997, + "balance_loss_mlp": 1.03850162, + "epoch": 0.08711859311588757, + "flos": 16111309450680.0, + "grad_norm": 2.70281899244881, + "language_loss": 0.81017232, + "learning_rate": 3.965874598697638e-06, + "loss": 0.83691388, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29272461, + "step": 1449, + "time_per_iteration": 4.195694446563721 + }, + { + "auxiliary_loss_clip": 0.01609512, + "auxiliary_loss_mlp": 0.01070298, + "balance_loss_clip": 1.38490844, + "balance_loss_mlp": 1.03792059, + "epoch": 0.08717871636855554, + "flos": 38479517437320.0, + "grad_norm": 1.5018087590682214, + "language_loss": 0.70868444, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73548251, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.32373047, + "step": 1450, + "time_per_iteration": 4.580091953277588 + }, + { + "auxiliary_loss_clip": 0.01598398, + "auxiliary_loss_mlp": 0.01065098, + "balance_loss_clip": 1.37169588, + "balance_loss_mlp": 1.03353179, + "epoch": 0.0872388396212235, + "flos": 17604757219320.0, + "grad_norm": 1.7015432956895875, + "language_loss": 0.83904755, + "learning_rate": 3.965731173721542e-06, + "loss": 0.86568254, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.31567383, + "step": 1451, + "time_per_iteration": 4.293182849884033 + }, + { + "auxiliary_loss_clip": 0.01595876, + "auxiliary_loss_mlp": 0.01068292, + "balance_loss_clip": 1.37164581, + "balance_loss_mlp": 1.04080248, + "epoch": 0.08729896287389148, + "flos": 25264151132760.0, + "grad_norm": 2.323370806062668, + "language_loss": 0.74999493, + "learning_rate": 3.965659349421049e-06, + "loss": 0.7766366, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27490234, + "step": 1452, + "time_per_iteration": 2.8075380325317383 + }, + { + "auxiliary_loss_clip": 0.01603504, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.37429428, + "balance_loss_mlp": 1.03517318, + "epoch": 0.08735908612655945, + "flos": 15636656044440.0, + "grad_norm": 3.099938154610627, + "language_loss": 0.82031542, + "learning_rate": 3.965587450582556e-06, + "loss": 0.84699446, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.29223633, + "step": 1453, + "time_per_iteration": 4.249650001525879 + }, + { + "auxiliary_loss_clip": 0.01594224, + "auxiliary_loss_mlp": 0.01075911, + "balance_loss_clip": 1.37026405, + "balance_loss_mlp": 1.04608536, + "epoch": 0.08741920937922741, + "flos": 20344547929320.0, + "grad_norm": 1.7947555952533587, + "language_loss": 0.71404874, + "learning_rate": 3.96551547720879e-06, + "loss": 0.74075007, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2980957, + "step": 1454, + "time_per_iteration": 2.7653064727783203 + }, + { + "auxiliary_loss_clip": 0.01386086, + "auxiliary_loss_mlp": 0.01008124, + "balance_loss_clip": 1.24974298, + "balance_loss_mlp": 1.00078094, + "epoch": 0.08747933263189539, + "flos": 62836391410200.0, + "grad_norm": 0.77882131921693, + "language_loss": 0.58630008, + "learning_rate": 3.96544342930248e-06, + "loss": 0.61024219, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07324219, + "step": 1455, + "time_per_iteration": 3.2979965209960938 + }, + { + "auxiliary_loss_clip": 0.01602977, + "auxiliary_loss_mlp": 0.01074796, + "balance_loss_clip": 1.37551689, + "balance_loss_mlp": 1.04237127, + "epoch": 0.08753945588456336, + "flos": 33042314163960.0, + "grad_norm": 1.5500862213507463, + "language_loss": 0.77794814, + "learning_rate": 3.965371306866359e-06, + "loss": 0.80472589, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.32397461, + "step": 1456, + "time_per_iteration": 2.8769986629486084 + }, + { + "auxiliary_loss_clip": 0.01606468, + "auxiliary_loss_mlp": 0.01064879, + "balance_loss_clip": 1.37816143, + "balance_loss_mlp": 1.03717434, + "epoch": 0.08759957913723132, + "flos": 35553135002400.0, + "grad_norm": 2.1277077660113926, + "language_loss": 0.72993433, + "learning_rate": 3.96529910990316e-06, + "loss": 0.75664771, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27709961, + "step": 1457, + "time_per_iteration": 2.8764760494232178 + }, + { + "auxiliary_loss_clip": 0.01594511, + "auxiliary_loss_mlp": 0.01063705, + "balance_loss_clip": 1.3730011, + "balance_loss_mlp": 1.03445125, + "epoch": 0.0876597023898993, + "flos": 23915932151400.0, + "grad_norm": 1.5111448318214398, + "language_loss": 0.87014085, + "learning_rate": 3.965226838415622e-06, + "loss": 0.89672303, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.29248047, + "step": 1458, + "time_per_iteration": 2.8078935146331787 + }, + { + "auxiliary_loss_clip": 0.01613047, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.38584304, + "balance_loss_mlp": 1.04693151, + "epoch": 0.08771982564256726, + "flos": 18118621320120.0, + "grad_norm": 1.6366511952023448, + "language_loss": 0.8078348, + "learning_rate": 3.965154492406486e-06, + "loss": 0.83476049, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.32592773, + "step": 1459, + "time_per_iteration": 2.745971202850342 + }, + { + "auxiliary_loss_clip": 0.01606348, + "auxiliary_loss_mlp": 0.01064803, + "balance_loss_clip": 1.37564683, + "balance_loss_mlp": 1.03435707, + "epoch": 0.08777994889523523, + "flos": 17716460482440.0, + "grad_norm": 2.0812962844075344, + "language_loss": 0.84176552, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86847699, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.3046875, + "step": 1460, + "time_per_iteration": 2.7411584854125977 + }, + { + "auxiliary_loss_clip": 0.0159531, + "auxiliary_loss_mlp": 0.01060728, + "balance_loss_clip": 1.36945593, + "balance_loss_mlp": 1.03285658, + "epoch": 0.0878400721479032, + "flos": 12823357556880.0, + "grad_norm": 3.0165037191685844, + "language_loss": 0.81333649, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83989686, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27868652, + "step": 1461, + "time_per_iteration": 2.6938350200653076 + }, + { + "auxiliary_loss_clip": 0.0160657, + "auxiliary_loss_mlp": 0.01071628, + "balance_loss_clip": 1.37757754, + "balance_loss_mlp": 1.04373288, + "epoch": 0.08790019540057117, + "flos": 26397897427080.0, + "grad_norm": 1.6155746241797013, + "language_loss": 0.76632416, + "learning_rate": 3.964937007276932e-06, + "loss": 0.79310614, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27880859, + "step": 1462, + "time_per_iteration": 2.9339704513549805 + }, + { + "auxiliary_loss_clip": 0.01614345, + "auxiliary_loss_mlp": 0.01086216, + "balance_loss_clip": 1.38319135, + "balance_loss_mlp": 1.05321872, + "epoch": 0.08796031865323914, + "flos": 19138552716600.0, + "grad_norm": 1.740915135836099, + "language_loss": 0.74654448, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.77355003, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.32983398, + "step": 1463, + "time_per_iteration": 2.7568411827087402 + }, + { + "auxiliary_loss_clip": 0.016119, + "auxiliary_loss_mlp": 0.0107646, + "balance_loss_clip": 1.37782061, + "balance_loss_mlp": 1.04439294, + "epoch": 0.0880204419059071, + "flos": 26069447408760.0, + "grad_norm": 1.8751056251857963, + "language_loss": 0.83881998, + "learning_rate": 3.964791644632941e-06, + "loss": 0.86570358, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.32055664, + "step": 1464, + "time_per_iteration": 2.8246049880981445 + }, + { + "auxiliary_loss_clip": 0.0160135, + "auxiliary_loss_mlp": 0.01078031, + "balance_loss_clip": 1.37394023, + "balance_loss_mlp": 1.04980206, + "epoch": 0.08808056515857508, + "flos": 22382339695920.0, + "grad_norm": 1.7952185052681058, + "language_loss": 0.78876328, + "learning_rate": 3.964718851551923e-06, + "loss": 0.81555712, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.28234863, + "step": 1465, + "time_per_iteration": 2.748040199279785 + }, + { + "auxiliary_loss_clip": 0.01613975, + "auxiliary_loss_mlp": 0.01076711, + "balance_loss_clip": 1.38342178, + "balance_loss_mlp": 1.04611027, + "epoch": 0.08814068841124305, + "flos": 23190519165480.0, + "grad_norm": 1.9072563992755136, + "language_loss": 0.85849571, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.88540256, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.3059082, + "step": 1466, + "time_per_iteration": 2.798898696899414 + }, + { + "auxiliary_loss_clip": 0.01608917, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_clip": 1.38106203, + "balance_loss_mlp": 1.04984534, + "epoch": 0.08820081166391101, + "flos": 25160731975080.0, + "grad_norm": 2.0373012066818164, + "language_loss": 0.83981556, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86670434, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.30078125, + "step": 1467, + "time_per_iteration": 3.149204730987549 + }, + { + "auxiliary_loss_clip": 0.0160892, + "auxiliary_loss_mlp": 0.01073946, + "balance_loss_clip": 1.38346159, + "balance_loss_mlp": 1.04314196, + "epoch": 0.08826093491657899, + "flos": 22236136308000.0, + "grad_norm": 1.6216096010394954, + "language_loss": 0.75683248, + "learning_rate": 3.964500025305907e-06, + "loss": 0.78366113, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.30810547, + "step": 1468, + "time_per_iteration": 2.7599034309387207 + }, + { + "auxiliary_loss_clip": 0.01601532, + "auxiliary_loss_mlp": 0.01072981, + "balance_loss_clip": 1.37679708, + "balance_loss_mlp": 1.04341698, + "epoch": 0.08832105816924696, + "flos": 22131864374760.0, + "grad_norm": 1.562071089804194, + "language_loss": 0.80723333, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.83397841, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.29553223, + "step": 1469, + "time_per_iteration": 2.8082640171051025 + }, + { + "auxiliary_loss_clip": 0.01604379, + "auxiliary_loss_mlp": 0.01074745, + "balance_loss_clip": 1.37473965, + "balance_loss_mlp": 1.04599142, + "epoch": 0.08838118142191492, + "flos": 17570988045000.0, + "grad_norm": 2.809125045195181, + "language_loss": 0.77990448, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.8066957, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28771973, + "step": 1470, + "time_per_iteration": 2.805654287338257 + }, + { + "auxiliary_loss_clip": 0.01600796, + "auxiliary_loss_mlp": 0.01069976, + "balance_loss_clip": 1.37691379, + "balance_loss_mlp": 1.04096031, + "epoch": 0.0884413046745829, + "flos": 20782102275720.0, + "grad_norm": 1.7234244975670208, + "language_loss": 0.84637415, + "learning_rate": 3.964280528613569e-06, + "loss": 0.8730818, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.29016113, + "step": 1471, + "time_per_iteration": 2.7835652828216553 + }, + { + "auxiliary_loss_clip": 0.01586405, + "auxiliary_loss_mlp": 0.01064963, + "balance_loss_clip": 1.36752295, + "balance_loss_mlp": 1.03928506, + "epoch": 0.08850142792725087, + "flos": 22130321257080.0, + "grad_norm": 1.5045839609304301, + "language_loss": 0.83755887, + "learning_rate": 3.964207214074324e-06, + "loss": 0.86407256, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2565918, + "step": 1472, + "time_per_iteration": 2.765523910522461 + }, + { + "auxiliary_loss_clip": 0.01595595, + "auxiliary_loss_mlp": 0.01064626, + "balance_loss_clip": 1.37017286, + "balance_loss_mlp": 1.03413177, + "epoch": 0.08856155117991883, + "flos": 22423824458640.0, + "grad_norm": 2.596311330250045, + "language_loss": 0.83539468, + "learning_rate": 3.964133825052146e-06, + "loss": 0.86199689, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.30517578, + "step": 1473, + "time_per_iteration": 2.806995153427124 + }, + { + "auxiliary_loss_clip": 0.01596955, + "auxiliary_loss_mlp": 0.01067606, + "balance_loss_clip": 1.37092459, + "balance_loss_mlp": 1.0403074, + "epoch": 0.0886216744325868, + "flos": 29943349888320.0, + "grad_norm": 1.4662567810533427, + "language_loss": 0.78999865, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81664431, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27355957, + "step": 1474, + "time_per_iteration": 2.90370512008667 + }, + { + "auxiliary_loss_clip": 0.01592989, + "auxiliary_loss_mlp": 0.0106567, + "balance_loss_clip": 1.36922276, + "balance_loss_mlp": 1.03567743, + "epoch": 0.08868179768525478, + "flos": 23987490727680.0, + "grad_norm": 1.8675617484698253, + "language_loss": 0.79177129, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81835788, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.29968262, + "step": 1475, + "time_per_iteration": 2.8613548278808594 + }, + { + "auxiliary_loss_clip": 0.01595974, + "auxiliary_loss_mlp": 0.01066834, + "balance_loss_clip": 1.37013578, + "balance_loss_mlp": 1.03445709, + "epoch": 0.08874192093792274, + "flos": 43185135254040.0, + "grad_norm": 1.6058544357273734, + "language_loss": 0.74393415, + "learning_rate": 3.963913211115848e-06, + "loss": 0.77056223, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.32397461, + "step": 1476, + "time_per_iteration": 3.00693941116333 + }, + { + "auxiliary_loss_clip": 0.015989, + "auxiliary_loss_mlp": 0.01064434, + "balance_loss_clip": 1.37234807, + "balance_loss_mlp": 1.03329599, + "epoch": 0.0888020441905907, + "flos": 32858727457680.0, + "grad_norm": 1.550206729241853, + "language_loss": 0.75072998, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.7773633, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.3112793, + "step": 1477, + "time_per_iteration": 2.848391056060791 + }, + { + "auxiliary_loss_clip": 0.01593543, + "auxiliary_loss_mlp": 0.01061964, + "balance_loss_clip": 1.36832905, + "balance_loss_mlp": 1.03387809, + "epoch": 0.08886216744325869, + "flos": 23154638356440.0, + "grad_norm": 2.2997979290663673, + "language_loss": 0.8733694, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89992446, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.28051758, + "step": 1478, + "time_per_iteration": 2.777871608734131 + }, + { + "auxiliary_loss_clip": 0.01597426, + "auxiliary_loss_mlp": 0.01069536, + "balance_loss_clip": 1.37038851, + "balance_loss_mlp": 1.04342937, + "epoch": 0.08892229069592665, + "flos": 23336884986840.0, + "grad_norm": 1.572632568203243, + "language_loss": 0.77569795, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80236757, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2611084, + "step": 1479, + "time_per_iteration": 2.788530111312866 + }, + { + "auxiliary_loss_clip": 0.01597555, + "auxiliary_loss_mlp": 0.01064741, + "balance_loss_clip": 1.37280071, + "balance_loss_mlp": 1.03255439, + "epoch": 0.08898241394859462, + "flos": 26219427374160.0, + "grad_norm": 2.994926711788402, + "language_loss": 0.78649867, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.81312156, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.32202148, + "step": 1480, + "time_per_iteration": 2.829756498336792 + }, + { + "auxiliary_loss_clip": 0.01622867, + "auxiliary_loss_mlp": 0.01081295, + "balance_loss_clip": 1.39161301, + "balance_loss_mlp": 1.05068183, + "epoch": 0.0890425372012626, + "flos": 23555946418560.0, + "grad_norm": 1.5392437279245117, + "language_loss": 0.67199826, + "learning_rate": 3.963544031823624e-06, + "loss": 0.69903994, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.30615234, + "step": 1481, + "time_per_iteration": 2.8376922607421875 + }, + { + "auxiliary_loss_clip": 0.01603342, + "auxiliary_loss_mlp": 0.0106352, + "balance_loss_clip": 1.37990141, + "balance_loss_mlp": 1.03702033, + "epoch": 0.08910266045393056, + "flos": 23007988276560.0, + "grad_norm": 2.803964684832586, + "language_loss": 0.96506459, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.99173325, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26525879, + "step": 1482, + "time_per_iteration": 2.7386574745178223 + }, + { + "auxiliary_loss_clip": 0.01611198, + "auxiliary_loss_mlp": 0.01066959, + "balance_loss_clip": 1.3808198, + "balance_loss_mlp": 1.03861094, + "epoch": 0.08916278370659853, + "flos": 31942174610520.0, + "grad_norm": 1.7692661875768074, + "language_loss": 0.78522712, + "learning_rate": 3.96339583888261e-06, + "loss": 0.81200874, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.2833252, + "step": 1483, + "time_per_iteration": 2.872187376022339 + }, + { + "auxiliary_loss_clip": 0.01615019, + "auxiliary_loss_mlp": 0.01091065, + "balance_loss_clip": 1.38835001, + "balance_loss_mlp": 1.05939078, + "epoch": 0.08922290695926649, + "flos": 17534944802520.0, + "grad_norm": 2.3500885183399975, + "language_loss": 0.85867399, + "learning_rate": 3.963321630732448e-06, + "loss": 0.8857348, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.31652832, + "step": 1484, + "time_per_iteration": 2.7305996417999268 + }, + { + "auxiliary_loss_clip": 0.01624125, + "auxiliary_loss_mlp": 0.01074695, + "balance_loss_clip": 1.39295983, + "balance_loss_mlp": 1.04389095, + "epoch": 0.08928303021193447, + "flos": 32131568312280.0, + "grad_norm": 1.5896664723161942, + "language_loss": 0.80624914, + "learning_rate": 3.963247348132932e-06, + "loss": 0.83323735, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.30810547, + "step": 1485, + "time_per_iteration": 2.968259572982788 + }, + { + "auxiliary_loss_clip": 0.01611841, + "auxiliary_loss_mlp": 0.01070294, + "balance_loss_clip": 1.3848393, + "balance_loss_mlp": 1.04092026, + "epoch": 0.08934315346460243, + "flos": 22130036998560.0, + "grad_norm": 1.7994762017090098, + "language_loss": 0.83509672, + "learning_rate": 3.96317299108688e-06, + "loss": 0.86191809, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.29370117, + "step": 1486, + "time_per_iteration": 2.7775652408599854 + }, + { + "auxiliary_loss_clip": 0.01605614, + "auxiliary_loss_mlp": 0.01071078, + "balance_loss_clip": 1.38101459, + "balance_loss_mlp": 1.04165745, + "epoch": 0.0894032767172704, + "flos": 22570799405400.0, + "grad_norm": 1.7955144261829423, + "language_loss": 0.76937228, + "learning_rate": 3.963098559597111e-06, + "loss": 0.79613924, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.29406738, + "step": 1487, + "time_per_iteration": 2.7971577644348145 + }, + { + "auxiliary_loss_clip": 0.01610946, + "auxiliary_loss_mlp": 0.01069273, + "balance_loss_clip": 1.38513505, + "balance_loss_mlp": 1.03827906, + "epoch": 0.08946339996993838, + "flos": 20198222716320.0, + "grad_norm": 2.0970751178373734, + "language_loss": 0.83785784, + "learning_rate": 3.963024053666449e-06, + "loss": 0.86466002, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.31005859, + "step": 1488, + "time_per_iteration": 4.1976563930511475 + }, + { + "auxiliary_loss_clip": 0.01609636, + "auxiliary_loss_mlp": 0.01066725, + "balance_loss_clip": 1.38490152, + "balance_loss_mlp": 1.03846037, + "epoch": 0.08952352322260634, + "flos": 48368655145800.0, + "grad_norm": 1.7660134779512853, + "language_loss": 0.72269702, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74946058, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2824707, + "step": 1489, + "time_per_iteration": 4.51870584487915 + }, + { + "auxiliary_loss_clip": 0.01605085, + "auxiliary_loss_mlp": 0.0106082, + "balance_loss_clip": 1.38005674, + "balance_loss_mlp": 1.03364003, + "epoch": 0.08958364647527431, + "flos": 31799057457960.0, + "grad_norm": 1.7166270071623813, + "language_loss": 0.89975214, + "learning_rate": 3.962874818493745e-06, + "loss": 0.92641115, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27172852, + "step": 1490, + "time_per_iteration": 4.288602113723755 + }, + { + "auxiliary_loss_clip": 0.01622729, + "auxiliary_loss_mlp": 0.01085789, + "balance_loss_clip": 1.391433, + "balance_loss_mlp": 1.05689263, + "epoch": 0.08964376972794229, + "flos": 23373496746360.0, + "grad_norm": 3.2505898724990296, + "language_loss": 0.74542141, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.77250659, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.28881836, + "step": 1491, + "time_per_iteration": 4.27495551109314 + }, + { + "auxiliary_loss_clip": 0.01610578, + "auxiliary_loss_mlp": 0.01070404, + "balance_loss_clip": 1.38580132, + "balance_loss_mlp": 1.04384422, + "epoch": 0.08970389298061025, + "flos": 23300029577160.0, + "grad_norm": 1.6427537917100279, + "language_loss": 0.7706567, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79746652, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26550293, + "step": 1492, + "time_per_iteration": 2.810063600540161 + }, + { + "auxiliary_loss_clip": 0.01610527, + "auxiliary_loss_mlp": 0.01062934, + "balance_loss_clip": 1.38664329, + "balance_loss_mlp": 1.03569508, + "epoch": 0.08976401623327822, + "flos": 33767727149880.0, + "grad_norm": 1.8744014138579304, + "language_loss": 0.71140742, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73814195, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27233887, + "step": 1493, + "time_per_iteration": 2.8972578048706055 + }, + { + "auxiliary_loss_clip": 0.01613569, + "auxiliary_loss_mlp": 0.0107412, + "balance_loss_clip": 1.38660586, + "balance_loss_mlp": 1.04547369, + "epoch": 0.08982413948594618, + "flos": 23916175801560.0, + "grad_norm": 1.7827479971171762, + "language_loss": 0.87113762, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89801449, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.28637695, + "step": 1494, + "time_per_iteration": 2.84999418258667 + }, + { + "auxiliary_loss_clip": 0.01605479, + "auxiliary_loss_mlp": 0.01068545, + "balance_loss_clip": 1.38134515, + "balance_loss_mlp": 1.03960061, + "epoch": 0.08988426273861416, + "flos": 16842042131760.0, + "grad_norm": 1.6439390963756983, + "language_loss": 0.83139694, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85813725, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.28942871, + "step": 1495, + "time_per_iteration": 2.9208412170410156 + }, + { + "auxiliary_loss_clip": 0.01633919, + "auxiliary_loss_mlp": 0.01071591, + "balance_loss_clip": 1.40390563, + "balance_loss_mlp": 1.04529333, + "epoch": 0.08994438599128213, + "flos": 14797468769040.0, + "grad_norm": 2.0127148575339597, + "language_loss": 0.70699191, + "learning_rate": 3.962425326688585e-06, + "loss": 0.73404694, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26330566, + "step": 1496, + "time_per_iteration": 2.728452205657959 + }, + { + "auxiliary_loss_clip": 0.01615752, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_clip": 1.39117336, + "balance_loss_mlp": 1.04249668, + "epoch": 0.09000450924395009, + "flos": 17388822631320.0, + "grad_norm": 1.5368160525700718, + "language_loss": 0.80074823, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82758975, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25891113, + "step": 1497, + "time_per_iteration": 2.773092746734619 + }, + { + "auxiliary_loss_clip": 0.01619857, + "auxiliary_loss_mlp": 0.01067567, + "balance_loss_clip": 1.38846564, + "balance_loss_mlp": 1.04054189, + "epoch": 0.09006463249661807, + "flos": 24285745107360.0, + "grad_norm": 6.269352721284522, + "language_loss": 0.8328352, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.85970944, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27038574, + "step": 1498, + "time_per_iteration": 2.7907233238220215 + }, + { + "auxiliary_loss_clip": 0.0162888, + "auxiliary_loss_mlp": 0.0107733, + "balance_loss_clip": 1.39947343, + "balance_loss_mlp": 1.05041313, + "epoch": 0.09012475574928604, + "flos": 13665834109440.0, + "grad_norm": 2.123927229988375, + "language_loss": 0.79612064, + "learning_rate": 3.962199576140195e-06, + "loss": 0.8231827, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26928711, + "step": 1499, + "time_per_iteration": 2.743387222290039 + }, + { + "auxiliary_loss_clip": 0.01607455, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_clip": 1.38551617, + "balance_loss_mlp": 1.05026889, + "epoch": 0.090184879001954, + "flos": 23332539892320.0, + "grad_norm": 1.575259147032666, + "language_loss": 0.93124479, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95809042, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26867676, + "step": 1500, + "time_per_iteration": 2.754915237426758 + }, + { + "auxiliary_loss_clip": 0.01628415, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_clip": 1.39529061, + "balance_loss_mlp": 1.03261924, + "epoch": 0.09024500225462198, + "flos": 23007866451480.0, + "grad_norm": 2.1251361763633545, + "language_loss": 0.74167842, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76859087, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.30200195, + "step": 1501, + "time_per_iteration": 2.8088929653167725 + }, + { + "auxiliary_loss_clip": 0.01389372, + "auxiliary_loss_mlp": 0.0102315, + "balance_loss_clip": 1.24966419, + "balance_loss_mlp": 1.01304102, + "epoch": 0.09030512550728995, + "flos": 62204367317040.0, + "grad_norm": 0.7405706298783455, + "language_loss": 0.58324003, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60736525, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.10107422, + "step": 1502, + "time_per_iteration": 3.237403631210327 + }, + { + "auxiliary_loss_clip": 0.01610482, + "auxiliary_loss_mlp": 0.01067568, + "balance_loss_clip": 1.38470447, + "balance_loss_mlp": 1.04113948, + "epoch": 0.09036524875995791, + "flos": 38807886238920.0, + "grad_norm": 2.208798258208047, + "language_loss": 0.69559205, + "learning_rate": 3.961897533727119e-06, + "loss": 0.72237253, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26464844, + "step": 1503, + "time_per_iteration": 2.8977463245391846 + }, + { + "auxiliary_loss_clip": 0.01624142, + "auxiliary_loss_mlp": 0.01076923, + "balance_loss_clip": 1.39427948, + "balance_loss_mlp": 1.04948103, + "epoch": 0.09042537201262588, + "flos": 21695203412280.0, + "grad_norm": 1.8870640837081911, + "language_loss": 0.86232507, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88933575, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27441406, + "step": 1504, + "time_per_iteration": 2.8099327087402344 + }, + { + "auxiliary_loss_clip": 0.01627159, + "auxiliary_loss_mlp": 0.01075986, + "balance_loss_clip": 1.39278054, + "balance_loss_mlp": 1.04587364, + "epoch": 0.09048549526529386, + "flos": 22271326774920.0, + "grad_norm": 1.8256452357572885, + "language_loss": 0.73003286, + "learning_rate": 3.961746066137014e-06, + "loss": 0.75706434, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.30102539, + "step": 1505, + "time_per_iteration": 2.8313679695129395 + }, + { + "auxiliary_loss_clip": 0.01603592, + "auxiliary_loss_mlp": 0.01073097, + "balance_loss_clip": 1.38061881, + "balance_loss_mlp": 1.04460573, + "epoch": 0.09054561851796182, + "flos": 14615019096840.0, + "grad_norm": 3.9547804750569266, + "language_loss": 0.8116802, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83844709, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.28466797, + "step": 1506, + "time_per_iteration": 2.846331834793091 + }, + { + "auxiliary_loss_clip": 0.01606588, + "auxiliary_loss_mlp": 0.01064021, + "balance_loss_clip": 1.38494313, + "balance_loss_mlp": 1.03870082, + "epoch": 0.09060574177062979, + "flos": 27641925691920.0, + "grad_norm": 1.9168102197573467, + "language_loss": 0.7624054, + "learning_rate": 3.961594300988482e-06, + "loss": 0.7891115, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25341797, + "step": 1507, + "time_per_iteration": 2.873072385787964 + }, + { + "auxiliary_loss_clip": 0.01381751, + "auxiliary_loss_mlp": 0.01016347, + "balance_loss_clip": 1.24359298, + "balance_loss_mlp": 1.00638068, + "epoch": 0.09066586502329776, + "flos": 66100506196680.0, + "grad_norm": 0.7326563436422034, + "language_loss": 0.57680666, + "learning_rate": 3.961518306836998e-06, + "loss": 0.60078764, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09960938, + "step": 1508, + "time_per_iteration": 3.142634391784668 + }, + { + "auxiliary_loss_clip": 0.01612336, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_clip": 1.38751078, + "balance_loss_mlp": 1.04197645, + "epoch": 0.09072598827596573, + "flos": 18921115619280.0, + "grad_norm": 1.8843678458025064, + "language_loss": 0.85812712, + "learning_rate": 3.961442238304543e-06, + "loss": 0.8849436, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27319336, + "step": 1509, + "time_per_iteration": 2.879164934158325 + }, + { + "auxiliary_loss_clip": 0.01624529, + "auxiliary_loss_mlp": 0.01098249, + "balance_loss_clip": 1.39154577, + "balance_loss_mlp": 1.06719458, + "epoch": 0.0907861115286337, + "flos": 24826474961280.0, + "grad_norm": 2.47228897683051, + "language_loss": 0.84343058, + "learning_rate": 3.961366095394002e-06, + "loss": 0.87065834, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.3104248, + "step": 1510, + "time_per_iteration": 2.8415231704711914 + }, + { + "auxiliary_loss_clip": 0.016112, + "auxiliary_loss_mlp": 0.01065765, + "balance_loss_clip": 1.38413882, + "balance_loss_mlp": 1.03913331, + "epoch": 0.09084623478130167, + "flos": 21657860702280.0, + "grad_norm": 2.0342103752701943, + "language_loss": 0.86736923, + "learning_rate": 3.961289878108262e-06, + "loss": 0.89413881, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26660156, + "step": 1511, + "time_per_iteration": 2.891733407974243 + }, + { + "auxiliary_loss_clip": 0.01605298, + "auxiliary_loss_mlp": 0.01058018, + "balance_loss_clip": 1.38299561, + "balance_loss_mlp": 1.03315091, + "epoch": 0.09090635803396964, + "flos": 27645418010880.0, + "grad_norm": 1.490133249915304, + "language_loss": 0.85516077, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.88179398, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24865723, + "step": 1512, + "time_per_iteration": 2.8693408966064453 + }, + { + "auxiliary_loss_clip": 0.0160135, + "auxiliary_loss_mlp": 0.01062805, + "balance_loss_clip": 1.38040543, + "balance_loss_mlp": 1.0362457, + "epoch": 0.0909664812866376, + "flos": 17672742259920.0, + "grad_norm": 2.3061160482120155, + "language_loss": 0.87746078, + "learning_rate": 3.961137220422749e-06, + "loss": 0.90410233, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26550293, + "step": 1513, + "time_per_iteration": 2.7508716583251953 + }, + { + "auxiliary_loss_clip": 0.01599409, + "auxiliary_loss_mlp": 0.01053394, + "balance_loss_clip": 1.37420988, + "balance_loss_mlp": 1.02715611, + "epoch": 0.09102660453930557, + "flos": 23956767180360.0, + "grad_norm": 1.7280312099254793, + "language_loss": 0.87092203, + "learning_rate": 3.961060780028764e-06, + "loss": 0.89745003, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26220703, + "step": 1514, + "time_per_iteration": 2.8732352256774902 + }, + { + "auxiliary_loss_clip": 0.01596495, + "auxiliary_loss_mlp": 0.01061449, + "balance_loss_clip": 1.37469888, + "balance_loss_mlp": 1.0354141, + "epoch": 0.09108672779197355, + "flos": 25818606612360.0, + "grad_norm": 1.7360310030763786, + "language_loss": 0.90619028, + "learning_rate": 3.960984265271159e-06, + "loss": 0.93276972, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26000977, + "step": 1515, + "time_per_iteration": 2.800304412841797 + }, + { + "auxiliary_loss_clip": 0.0160215, + "auxiliary_loss_mlp": 0.01063832, + "balance_loss_clip": 1.37803173, + "balance_loss_mlp": 1.03567529, + "epoch": 0.09114685104464151, + "flos": 29645054900280.0, + "grad_norm": 3.0176487180316327, + "language_loss": 0.85801411, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.88467395, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.28149414, + "step": 1516, + "time_per_iteration": 2.8890044689178467 + }, + { + "auxiliary_loss_clip": 0.01605021, + "auxiliary_loss_mlp": 0.01068102, + "balance_loss_clip": 1.37672997, + "balance_loss_mlp": 1.04193592, + "epoch": 0.09120697429730948, + "flos": 33736232043720.0, + "grad_norm": 1.557344321743336, + "language_loss": 0.81216282, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83889413, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26171875, + "step": 1517, + "time_per_iteration": 2.8665342330932617 + }, + { + "auxiliary_loss_clip": 0.01607376, + "auxiliary_loss_mlp": 0.01075736, + "balance_loss_clip": 1.37904191, + "balance_loss_mlp": 1.04815078, + "epoch": 0.09126709754997746, + "flos": 18405424142280.0, + "grad_norm": 1.5670745995236008, + "language_loss": 0.78261155, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80944264, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.27636719, + "step": 1518, + "time_per_iteration": 2.900693416595459 + }, + { + "auxiliary_loss_clip": 0.0159874, + "auxiliary_loss_mlp": 0.01063657, + "balance_loss_clip": 1.37403846, + "balance_loss_mlp": 1.03806269, + "epoch": 0.09132722080264542, + "flos": 22097201816520.0, + "grad_norm": 1.6197023403105797, + "language_loss": 0.86783588, + "learning_rate": 3.960677462662594e-06, + "loss": 0.89445984, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25610352, + "step": 1519, + "time_per_iteration": 2.739736318588257 + }, + { + "auxiliary_loss_clip": 0.0160732, + "auxiliary_loss_mlp": 0.01066195, + "balance_loss_clip": 1.38167977, + "balance_loss_mlp": 1.03764391, + "epoch": 0.09138734405531339, + "flos": 21038140942200.0, + "grad_norm": 2.4217721349465795, + "language_loss": 0.73639119, + "learning_rate": 3.96060057613046e-06, + "loss": 0.76312631, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.28540039, + "step": 1520, + "time_per_iteration": 2.762087106704712 + }, + { + "auxiliary_loss_clip": 0.01604479, + "auxiliary_loss_mlp": 0.01064267, + "balance_loss_clip": 1.37604499, + "balance_loss_mlp": 1.03855371, + "epoch": 0.09144746730798137, + "flos": 20088752913000.0, + "grad_norm": 10.32054383860983, + "language_loss": 0.86146909, + "learning_rate": 3.960523615252156e-06, + "loss": 0.88815659, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25695801, + "step": 1521, + "time_per_iteration": 2.7602076530456543 + }, + { + "auxiliary_loss_clip": 0.01605427, + "auxiliary_loss_mlp": 0.01068296, + "balance_loss_clip": 1.37729037, + "balance_loss_mlp": 1.04243994, + "epoch": 0.09150759056064933, + "flos": 22782632549040.0, + "grad_norm": 2.4096970501834116, + "language_loss": 0.84977865, + "learning_rate": 3.960446580030599e-06, + "loss": 0.87651587, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.25878906, + "step": 1522, + "time_per_iteration": 2.786376953125 + }, + { + "auxiliary_loss_clip": 0.01582595, + "auxiliary_loss_mlp": 0.0107549, + "balance_loss_clip": 1.36279643, + "balance_loss_mlp": 1.04760683, + "epoch": 0.0915677138133173, + "flos": 27570082857120.0, + "grad_norm": 1.609587325256098, + "language_loss": 0.81024605, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83682692, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.27868652, + "step": 1523, + "time_per_iteration": 2.8569271564483643 + }, + { + "auxiliary_loss_clip": 0.01607237, + "auxiliary_loss_mlp": 0.01077044, + "balance_loss_clip": 1.3794446, + "balance_loss_mlp": 1.05007839, + "epoch": 0.09162783706598528, + "flos": 17679117772440.0, + "grad_norm": 1.9464592620299865, + "language_loss": 0.7502259, + "learning_rate": 3.960292286569418e-06, + "loss": 0.77706861, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27026367, + "step": 1524, + "time_per_iteration": 2.706591844558716 + }, + { + "auxiliary_loss_clip": 0.01605905, + "auxiliary_loss_mlp": 0.01055156, + "balance_loss_clip": 1.38019776, + "balance_loss_mlp": 1.02976465, + "epoch": 0.09168796031865324, + "flos": 18482749105680.0, + "grad_norm": 2.084336068662979, + "language_loss": 0.86647058, + "learning_rate": 3.960215028335644e-06, + "loss": 0.89308119, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25402832, + "step": 1525, + "time_per_iteration": 2.724026679992676 + }, + { + "auxiliary_loss_clip": 0.01603375, + "auxiliary_loss_mlp": 0.01052677, + "balance_loss_clip": 1.37549901, + "balance_loss_mlp": 1.02451968, + "epoch": 0.0917480835713212, + "flos": 29393604978480.0, + "grad_norm": 2.032589501020662, + "language_loss": 0.75164813, + "learning_rate": 3.96013769577032e-06, + "loss": 0.77820861, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28173828, + "step": 1526, + "time_per_iteration": 4.34856653213501 + }, + { + "auxiliary_loss_clip": 0.01587411, + "auxiliary_loss_mlp": 0.01068487, + "balance_loss_clip": 1.36651897, + "balance_loss_mlp": 1.03978181, + "epoch": 0.09180820682398917, + "flos": 19834135539120.0, + "grad_norm": 2.059245765547236, + "language_loss": 0.77158505, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79814404, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.28710938, + "step": 1527, + "time_per_iteration": 2.7474186420440674 + }, + { + "auxiliary_loss_clip": 0.0159263, + "auxiliary_loss_mlp": 0.01060083, + "balance_loss_clip": 1.36870718, + "balance_loss_mlp": 1.03085327, + "epoch": 0.09186833007665715, + "flos": 23847134943600.0, + "grad_norm": 1.7839835737809229, + "language_loss": 0.79124922, + "learning_rate": 3.959982807656753e-06, + "loss": 0.81777632, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.29272461, + "step": 1528, + "time_per_iteration": 5.757329702377319 + }, + { + "auxiliary_loss_clip": 0.01598942, + "auxiliary_loss_mlp": 0.0105787, + "balance_loss_clip": 1.37192023, + "balance_loss_mlp": 1.0307622, + "epoch": 0.09192845332932512, + "flos": 12936603937680.0, + "grad_norm": 2.4805240969149507, + "language_loss": 0.77707303, + "learning_rate": 3.959905252114384e-06, + "loss": 0.80364114, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27099609, + "step": 1529, + "time_per_iteration": 2.76609468460083 + }, + { + "auxiliary_loss_clip": 0.01601687, + "auxiliary_loss_mlp": 0.01058391, + "balance_loss_clip": 1.37409186, + "balance_loss_mlp": 1.02999496, + "epoch": 0.09198857658199308, + "flos": 24573116446560.0, + "grad_norm": 1.6876549741060538, + "language_loss": 0.83373594, + "learning_rate": 3.959827622252211e-06, + "loss": 0.86033666, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.28417969, + "step": 1530, + "time_per_iteration": 4.238658666610718 + }, + { + "auxiliary_loss_clip": 0.01598001, + "auxiliary_loss_mlp": 0.01067674, + "balance_loss_clip": 1.37661433, + "balance_loss_mlp": 1.03927803, + "epoch": 0.09204869983466106, + "flos": 20271852318960.0, + "grad_norm": 1.9054472867175987, + "language_loss": 0.84476852, + "learning_rate": 3.959749918073179e-06, + "loss": 0.87142527, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.28417969, + "step": 1531, + "time_per_iteration": 2.7359063625335693 + }, + { + "auxiliary_loss_clip": 0.0159677, + "auxiliary_loss_mlp": 0.01058856, + "balance_loss_clip": 1.37441945, + "balance_loss_mlp": 1.03118801, + "epoch": 0.09210882308732903, + "flos": 20890435044960.0, + "grad_norm": 1.7763879371369693, + "language_loss": 0.81206816, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83862442, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27685547, + "step": 1532, + "time_per_iteration": 2.798586845397949 + }, + { + "auxiliary_loss_clip": 0.01596726, + "auxiliary_loss_mlp": 0.01055796, + "balance_loss_clip": 1.37295139, + "balance_loss_mlp": 1.02819872, + "epoch": 0.09216894633999699, + "flos": 30962509725960.0, + "grad_norm": 2.4309457552388514, + "language_loss": 0.83885866, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.8653838, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27587891, + "step": 1533, + "time_per_iteration": 2.8364977836608887 + }, + { + "auxiliary_loss_clip": 0.01598888, + "auxiliary_loss_mlp": 0.01060037, + "balance_loss_clip": 1.37438786, + "balance_loss_mlp": 1.03358495, + "epoch": 0.09222906959266497, + "flos": 13155462327600.0, + "grad_norm": 1.9725179011164289, + "language_loss": 0.90840977, + "learning_rate": 3.959516359664402e-06, + "loss": 0.93499911, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26464844, + "step": 1534, + "time_per_iteration": 2.8703436851501465 + }, + { + "auxiliary_loss_clip": 0.01595734, + "auxiliary_loss_mlp": 0.01067678, + "balance_loss_clip": 1.37123251, + "balance_loss_mlp": 1.04000914, + "epoch": 0.09228919284533293, + "flos": 26000041075560.0, + "grad_norm": 2.291573540250368, + "language_loss": 0.7604239, + "learning_rate": 3.959438358247424e-06, + "loss": 0.787058, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27661133, + "step": 1535, + "time_per_iteration": 2.7990455627441406 + }, + { + "auxiliary_loss_clip": 0.01589138, + "auxiliary_loss_mlp": 0.01056602, + "balance_loss_clip": 1.36928821, + "balance_loss_mlp": 1.0309602, + "epoch": 0.0923493160980009, + "flos": 18665401819680.0, + "grad_norm": 1.6421198028988069, + "language_loss": 0.82230753, + "learning_rate": 3.959360282528346e-06, + "loss": 0.84876496, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25646973, + "step": 1536, + "time_per_iteration": 2.7525763511657715 + }, + { + "auxiliary_loss_clip": 0.01588954, + "auxiliary_loss_mlp": 0.01064563, + "balance_loss_clip": 1.36893129, + "balance_loss_mlp": 1.03811049, + "epoch": 0.09240943935066886, + "flos": 21145336677360.0, + "grad_norm": 2.694282441148595, + "language_loss": 0.9005444, + "learning_rate": 3.959282132510131e-06, + "loss": 0.92707956, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26477051, + "step": 1537, + "time_per_iteration": 2.7914280891418457 + }, + { + "auxiliary_loss_clip": 0.01598982, + "auxiliary_loss_mlp": 0.01070623, + "balance_loss_clip": 1.3753258, + "balance_loss_mlp": 1.0442059, + "epoch": 0.09246956260333684, + "flos": 20597013060120.0, + "grad_norm": 2.107166672362697, + "language_loss": 0.81083244, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83752847, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2644043, + "step": 1538, + "time_per_iteration": 2.73881196975708 + }, + { + "auxiliary_loss_clip": 0.01377785, + "auxiliary_loss_mlp": 0.01021927, + "balance_loss_clip": 1.23664689, + "balance_loss_mlp": 1.01110303, + "epoch": 0.09252968585600481, + "flos": 67575372317640.0, + "grad_norm": 0.7482290712234246, + "language_loss": 0.57380754, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59780467, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10839844, + "step": 1539, + "time_per_iteration": 3.425865888595581 + }, + { + "auxiliary_loss_clip": 0.01600995, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_clip": 1.37629008, + "balance_loss_mlp": 1.03737295, + "epoch": 0.09258980910867277, + "flos": 17388294722640.0, + "grad_norm": 2.3770831899322, + "language_loss": 0.68156856, + "learning_rate": 3.959047236690304e-06, + "loss": 0.70823789, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.28564453, + "step": 1540, + "time_per_iteration": 2.7174184322357178 + }, + { + "auxiliary_loss_clip": 0.01597343, + "auxiliary_loss_mlp": 0.0106143, + "balance_loss_clip": 1.37507677, + "balance_loss_mlp": 1.03401184, + "epoch": 0.09264993236134075, + "flos": 19870828515360.0, + "grad_norm": 1.7571122294146277, + "language_loss": 0.84047878, + "learning_rate": 3.958968789505198e-06, + "loss": 0.8670665, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27392578, + "step": 1541, + "time_per_iteration": 2.768453359603882 + }, + { + "auxiliary_loss_clip": 0.01374359, + "auxiliary_loss_mlp": 0.01012528, + "balance_loss_clip": 1.23688841, + "balance_loss_mlp": 1.00284803, + "epoch": 0.09271005561400872, + "flos": 62297715601440.0, + "grad_norm": 0.8895757850286724, + "language_loss": 0.61912119, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64299011, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09667969, + "step": 1542, + "time_per_iteration": 3.220348834991455 + }, + { + "auxiliary_loss_clip": 0.01601107, + "auxiliary_loss_mlp": 0.01074996, + "balance_loss_clip": 1.3783716, + "balance_loss_mlp": 1.04890084, + "epoch": 0.09277017886667668, + "flos": 23334935785560.0, + "grad_norm": 1.5538127009294576, + "language_loss": 0.82964325, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85640424, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26074219, + "step": 1543, + "time_per_iteration": 2.7811431884765625 + }, + { + "auxiliary_loss_clip": 0.01588222, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_clip": 1.36830771, + "balance_loss_mlp": 1.04853559, + "epoch": 0.09283030211934466, + "flos": 54754474889520.0, + "grad_norm": 2.178777551864135, + "language_loss": 0.7314468, + "learning_rate": 3.958733002256038e-06, + "loss": 0.75807822, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26379395, + "step": 1544, + "time_per_iteration": 3.067840576171875 + }, + { + "auxiliary_loss_clip": 0.01605361, + "auxiliary_loss_mlp": 0.01074071, + "balance_loss_clip": 1.37808084, + "balance_loss_mlp": 1.04510283, + "epoch": 0.09289042537201263, + "flos": 30340718939520.0, + "grad_norm": 1.609299016326488, + "language_loss": 0.77973711, + "learning_rate": 3.958654257951637e-06, + "loss": 0.80653149, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.29003906, + "step": 1545, + "time_per_iteration": 2.8558430671691895 + }, + { + "auxiliary_loss_clip": 0.01593588, + "auxiliary_loss_mlp": 0.0106298, + "balance_loss_clip": 1.37502217, + "balance_loss_mlp": 1.03659964, + "epoch": 0.09295054862468059, + "flos": 17751082432320.0, + "grad_norm": 2.5090794883820515, + "language_loss": 0.75364703, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.78021276, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26391602, + "step": 1546, + "time_per_iteration": 2.8140454292297363 + }, + { + "auxiliary_loss_clip": 0.01602988, + "auxiliary_loss_mlp": 0.01063916, + "balance_loss_clip": 1.37930954, + "balance_loss_mlp": 1.0352819, + "epoch": 0.09301067187734856, + "flos": 23663101545360.0, + "grad_norm": 2.0162942461970053, + "language_loss": 0.84261608, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.28649902, + "step": 1547, + "time_per_iteration": 2.7802984714508057 + }, + { + "auxiliary_loss_clip": 0.01596455, + "auxiliary_loss_mlp": 0.01074229, + "balance_loss_clip": 1.37489343, + "balance_loss_mlp": 1.04806209, + "epoch": 0.09307079513001654, + "flos": 27533592922680.0, + "grad_norm": 2.0630766359751527, + "language_loss": 0.68436593, + "learning_rate": 3.958417579416199e-06, + "loss": 0.7110728, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26159668, + "step": 1548, + "time_per_iteration": 2.7843737602233887 + }, + { + "auxiliary_loss_clip": 0.01607001, + "auxiliary_loss_mlp": 0.01074251, + "balance_loss_clip": 1.3803184, + "balance_loss_mlp": 1.04765546, + "epoch": 0.0931309183826845, + "flos": 20631716226720.0, + "grad_norm": 2.309454499557049, + "language_loss": 0.84143162, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.86824417, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26623535, + "step": 1549, + "time_per_iteration": 2.7925634384155273 + }, + { + "auxiliary_loss_clip": 0.01600348, + "auxiliary_loss_mlp": 0.01064073, + "balance_loss_clip": 1.37724435, + "balance_loss_mlp": 1.0369525, + "epoch": 0.09319104163535247, + "flos": 29026634607720.0, + "grad_norm": 1.7540506208606983, + "language_loss": 0.76687574, + "learning_rate": 3.958259422403966e-06, + "loss": 0.79351997, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.2713623, + "step": 1550, + "time_per_iteration": 2.8993473052978516 + }, + { + "auxiliary_loss_clip": 0.0159657, + "auxiliary_loss_mlp": 0.01081937, + "balance_loss_clip": 1.37044811, + "balance_loss_mlp": 1.05343437, + "epoch": 0.09325116488802045, + "flos": 25307057188080.0, + "grad_norm": 2.024508192621669, + "language_loss": 0.83544624, + "learning_rate": 3.95818023251026e-06, + "loss": 0.86223131, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28479004, + "step": 1551, + "time_per_iteration": 2.7985143661499023 + }, + { + "auxiliary_loss_clip": 0.01370279, + "auxiliary_loss_mlp": 0.01014062, + "balance_loss_clip": 1.23358202, + "balance_loss_mlp": 1.00495434, + "epoch": 0.09331128814068841, + "flos": 61551771766560.0, + "grad_norm": 0.7478967650876983, + "language_loss": 0.6185509, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6423943, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.09130859, + "step": 1552, + "time_per_iteration": 3.374713182449341 + }, + { + "auxiliary_loss_clip": 0.01369351, + "auxiliary_loss_mlp": 0.01021095, + "balance_loss_clip": 1.23319101, + "balance_loss_mlp": 1.0106051, + "epoch": 0.09337141139335638, + "flos": 53308620118800.0, + "grad_norm": 0.8303560734138226, + "language_loss": 0.58960807, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61351252, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.10498047, + "step": 1553, + "time_per_iteration": 3.3717310428619385 + }, + { + "auxiliary_loss_clip": 0.01600578, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_clip": 1.37189412, + "balance_loss_mlp": 1.04032803, + "epoch": 0.09343153464602436, + "flos": 23482073165760.0, + "grad_norm": 1.7060897417268337, + "language_loss": 0.88249326, + "learning_rate": 3.957942217314823e-06, + "loss": 0.90918583, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28381348, + "step": 1554, + "time_per_iteration": 2.802440643310547 + }, + { + "auxiliary_loss_clip": 0.01588611, + "auxiliary_loss_mlp": 0.01067974, + "balance_loss_clip": 1.36711061, + "balance_loss_mlp": 1.04087806, + "epoch": 0.09349165789869232, + "flos": 19358101448640.0, + "grad_norm": 1.6916277890967444, + "language_loss": 0.81542522, + "learning_rate": 3.957862730421599e-06, + "loss": 0.84199107, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.27111816, + "step": 1555, + "time_per_iteration": 2.7624905109405518 + }, + { + "auxiliary_loss_clip": 0.01370876, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.23751509, + "balance_loss_mlp": 1.01409173, + "epoch": 0.09355178115136029, + "flos": 67516669610280.0, + "grad_norm": 0.8713521401924962, + "language_loss": 0.59557188, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61952645, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.10498047, + "step": 1556, + "time_per_iteration": 3.3102400302886963 + }, + { + "auxiliary_loss_clip": 0.01595339, + "auxiliary_loss_mlp": 0.01068343, + "balance_loss_clip": 1.37293315, + "balance_loss_mlp": 1.04133034, + "epoch": 0.09361190440402825, + "flos": 37348694944920.0, + "grad_norm": 1.5914444042174423, + "language_loss": 0.84517181, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.87180859, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.27026367, + "step": 1557, + "time_per_iteration": 2.91011118888855 + }, + { + "auxiliary_loss_clip": 0.01586496, + "auxiliary_loss_mlp": 0.01067333, + "balance_loss_clip": 1.36426938, + "balance_loss_mlp": 1.03991532, + "epoch": 0.09367202765669623, + "flos": 24905099392200.0, + "grad_norm": 1.67216572381052, + "language_loss": 0.78248787, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80902612, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27404785, + "step": 1558, + "time_per_iteration": 2.8928844928741455 + }, + { + "auxiliary_loss_clip": 0.01603655, + "auxiliary_loss_mlp": 0.01062625, + "balance_loss_clip": 1.37604618, + "balance_loss_mlp": 1.03512311, + "epoch": 0.0937321509093642, + "flos": 15709798346760.0, + "grad_norm": 1.8747459955275383, + "language_loss": 0.80724978, + "learning_rate": 3.957544040455379e-06, + "loss": 0.83391261, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27514648, + "step": 1559, + "time_per_iteration": 2.792203903198242 + }, + { + "auxiliary_loss_clip": 0.01585856, + "auxiliary_loss_mlp": 0.01058871, + "balance_loss_clip": 1.3634789, + "balance_loss_mlp": 1.03155994, + "epoch": 0.09379227416203216, + "flos": 20488355424000.0, + "grad_norm": 1.8388478358997944, + "language_loss": 0.76670378, + "learning_rate": 3.957464182380599e-06, + "loss": 0.79315102, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27307129, + "step": 1560, + "time_per_iteration": 2.726881265640259 + }, + { + "auxiliary_loss_clip": 0.01599403, + "auxiliary_loss_mlp": 0.01064068, + "balance_loss_clip": 1.37246656, + "balance_loss_mlp": 1.03769898, + "epoch": 0.09385239741470014, + "flos": 24357831592320.0, + "grad_norm": 1.6333302876481994, + "language_loss": 0.81239969, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83903444, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26379395, + "step": 1561, + "time_per_iteration": 2.884208917617798 + }, + { + "auxiliary_loss_clip": 0.01587323, + "auxiliary_loss_mlp": 0.01058662, + "balance_loss_clip": 1.36229777, + "balance_loss_mlp": 1.02851415, + "epoch": 0.0939125206673681, + "flos": 33297987355200.0, + "grad_norm": 2.524484003784873, + "language_loss": 0.62218535, + "learning_rate": 3.957304243552354e-06, + "loss": 0.64864528, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.30175781, + "step": 1562, + "time_per_iteration": 2.8691630363464355 + }, + { + "auxiliary_loss_clip": 0.01585411, + "auxiliary_loss_mlp": 0.0106607, + "balance_loss_clip": 1.36671615, + "balance_loss_mlp": 1.03939068, + "epoch": 0.09397264392003607, + "flos": 19249646854320.0, + "grad_norm": 1.8118011005209052, + "language_loss": 0.85450268, + "learning_rate": 3.957224162804956e-06, + "loss": 0.88101745, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26696777, + "step": 1563, + "time_per_iteration": 2.7740156650543213 + }, + { + "auxiliary_loss_clip": 0.01578731, + "auxiliary_loss_mlp": 0.01053087, + "balance_loss_clip": 1.3598721, + "balance_loss_mlp": 1.02810121, + "epoch": 0.09403276717270405, + "flos": 19322667331560.0, + "grad_norm": 1.9050327119142088, + "language_loss": 0.76863527, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.79495347, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25012207, + "step": 1564, + "time_per_iteration": 2.7824628353118896 + }, + { + "auxiliary_loss_clip": 0.01581918, + "auxiliary_loss_mlp": 0.01062963, + "balance_loss_clip": 1.36149859, + "balance_loss_mlp": 1.0378809, + "epoch": 0.09409289042537201, + "flos": 23588334908640.0, + "grad_norm": 1.926664145359603, + "language_loss": 0.80630195, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.8327508, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25073242, + "step": 1565, + "time_per_iteration": 4.225371599197388 + }, + { + "auxiliary_loss_clip": 0.01585988, + "auxiliary_loss_mlp": 0.01069631, + "balance_loss_clip": 1.36606932, + "balance_loss_mlp": 1.04031706, + "epoch": 0.09415301367803998, + "flos": 20082539833920.0, + "grad_norm": 1.7114129698502911, + "language_loss": 0.75612831, + "learning_rate": 3.956983475266103e-06, + "loss": 0.78268456, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.29345703, + "step": 1566, + "time_per_iteration": 4.392319440841675 + }, + { + "auxiliary_loss_clip": 0.01580136, + "auxiliary_loss_mlp": 0.01066218, + "balance_loss_clip": 1.35920954, + "balance_loss_mlp": 1.04030156, + "epoch": 0.09421313693070796, + "flos": 21066103121040.0, + "grad_norm": 1.8023018883917794, + "language_loss": 0.7841441, + "learning_rate": 3.956903097664407e-06, + "loss": 0.81060761, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25927734, + "step": 1567, + "time_per_iteration": 4.203685998916626 + }, + { + "auxiliary_loss_clip": 0.01586697, + "auxiliary_loss_mlp": 0.01055844, + "balance_loss_clip": 1.3634007, + "balance_loss_mlp": 1.03081012, + "epoch": 0.09427326018337592, + "flos": 24321504091320.0, + "grad_norm": 1.6047005976919566, + "language_loss": 0.82919866, + "learning_rate": 3.956822645856749e-06, + "loss": 0.85562408, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25024414, + "step": 1568, + "time_per_iteration": 4.318426132202148 + }, + { + "auxiliary_loss_clip": 0.01590247, + "auxiliary_loss_mlp": 0.01056231, + "balance_loss_clip": 1.36701012, + "balance_loss_mlp": 1.02875328, + "epoch": 0.09433338343604389, + "flos": 20268400608360.0, + "grad_norm": 1.6405797427754485, + "language_loss": 0.77003103, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.7964958, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27478027, + "step": 1569, + "time_per_iteration": 2.8230466842651367 + }, + { + "auxiliary_loss_clip": 0.0157831, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_clip": 1.35920024, + "balance_loss_mlp": 1.0355221, + "epoch": 0.09439350668871185, + "flos": 12745870160040.0, + "grad_norm": 2.1239828656566515, + "language_loss": 0.8598758, + "learning_rate": 3.956661519635756e-06, + "loss": 0.8862747, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26049805, + "step": 1570, + "time_per_iteration": 2.78044056892395 + }, + { + "auxiliary_loss_clip": 0.01586247, + "auxiliary_loss_mlp": 0.01057821, + "balance_loss_clip": 1.36254585, + "balance_loss_mlp": 1.03083169, + "epoch": 0.09445362994137983, + "flos": 25968383535960.0, + "grad_norm": 1.5093975002561126, + "language_loss": 0.76547956, + "learning_rate": 3.95658084522853e-06, + "loss": 0.79192024, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27001953, + "step": 1571, + "time_per_iteration": 2.912950277328491 + }, + { + "auxiliary_loss_clip": 0.01570285, + "auxiliary_loss_mlp": 0.0106027, + "balance_loss_clip": 1.35546517, + "balance_loss_mlp": 1.0365591, + "epoch": 0.0945137531940478, + "flos": 19719630299160.0, + "grad_norm": 1.5876442430603799, + "language_loss": 0.79962665, + "learning_rate": 3.956500096627561e-06, + "loss": 0.82593215, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23706055, + "step": 1572, + "time_per_iteration": 2.76324462890625 + }, + { + "auxiliary_loss_clip": 0.01571934, + "auxiliary_loss_mlp": 0.0106594, + "balance_loss_clip": 1.35431623, + "balance_loss_mlp": 1.04060781, + "epoch": 0.09457387644671576, + "flos": 23621535565920.0, + "grad_norm": 1.5944164892110575, + "language_loss": 0.87834918, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90472794, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25317383, + "step": 1573, + "time_per_iteration": 2.8275198936462402 + }, + { + "auxiliary_loss_clip": 0.01598717, + "auxiliary_loss_mlp": 0.01079377, + "balance_loss_clip": 1.37657535, + "balance_loss_mlp": 1.05117238, + "epoch": 0.09463399969938374, + "flos": 26913060995400.0, + "grad_norm": 2.4070131822893375, + "language_loss": 0.81568521, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84246618, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.28210449, + "step": 1574, + "time_per_iteration": 2.804041624069214 + }, + { + "auxiliary_loss_clip": 0.01582015, + "auxiliary_loss_mlp": 0.01067006, + "balance_loss_clip": 1.36083579, + "balance_loss_mlp": 1.0433073, + "epoch": 0.0946941229520517, + "flos": 23664847704840.0, + "grad_norm": 1.8864251068192812, + "language_loss": 0.81658453, + "learning_rate": 3.95625740569284e-06, + "loss": 0.8430748, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.23693848, + "step": 1575, + "time_per_iteration": 2.813930034637451 + }, + { + "auxiliary_loss_clip": 0.01571175, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_clip": 1.35264421, + "balance_loss_mlp": 1.04563367, + "epoch": 0.09475424620471967, + "flos": 24139419894360.0, + "grad_norm": 1.7580294028331387, + "language_loss": 0.86968672, + "learning_rate": 3.956176360347553e-06, + "loss": 0.89611399, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2590332, + "step": 1576, + "time_per_iteration": 2.812238931655884 + }, + { + "auxiliary_loss_clip": 0.01374451, + "auxiliary_loss_mlp": 0.01008417, + "balance_loss_clip": 1.24372709, + "balance_loss_mlp": 1.00116909, + "epoch": 0.09481436945738765, + "flos": 68441872038120.0, + "grad_norm": 0.9711635425968066, + "language_loss": 0.65854996, + "learning_rate": 3.956095240823862e-06, + "loss": 0.68237865, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.07226562, + "step": 1577, + "time_per_iteration": 3.2223217487335205 + }, + { + "auxiliary_loss_clip": 0.01576803, + "auxiliary_loss_mlp": 0.01057101, + "balance_loss_clip": 1.35710812, + "balance_loss_mlp": 1.03299713, + "epoch": 0.09487449271005562, + "flos": 16658414817120.0, + "grad_norm": 2.010356347522523, + "language_loss": 0.79870641, + "learning_rate": 3.956014047124844e-06, + "loss": 0.82504547, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24108887, + "step": 1578, + "time_per_iteration": 2.7520949840545654 + }, + { + "auxiliary_loss_clip": 0.01576271, + "auxiliary_loss_mlp": 0.01076439, + "balance_loss_clip": 1.35757756, + "balance_loss_mlp": 1.05055833, + "epoch": 0.09493461596272358, + "flos": 24280506628920.0, + "grad_norm": 1.5015359205617882, + "language_loss": 0.78249949, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80902654, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2590332, + "step": 1579, + "time_per_iteration": 2.775850296020508 + }, + { + "auxiliary_loss_clip": 0.01571378, + "auxiliary_loss_mlp": 0.01070979, + "balance_loss_clip": 1.35311604, + "balance_loss_mlp": 1.04531372, + "epoch": 0.09499473921539155, + "flos": 21875013541080.0, + "grad_norm": 1.8986069442975466, + "language_loss": 0.73738229, + "learning_rate": 3.955851437213144e-06, + "loss": 0.76380587, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25683594, + "step": 1580, + "time_per_iteration": 2.8168840408325195 + }, + { + "auxiliary_loss_clip": 0.01561108, + "auxiliary_loss_mlp": 0.01056852, + "balance_loss_clip": 1.34561825, + "balance_loss_mlp": 1.03200877, + "epoch": 0.09505486246805953, + "flos": 33553701154800.0, + "grad_norm": 1.8469128429166286, + "language_loss": 0.77860659, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80478621, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24841309, + "step": 1581, + "time_per_iteration": 2.872359037399292 + }, + { + "auxiliary_loss_clip": 0.015686, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_clip": 1.35201466, + "balance_loss_mlp": 1.02608716, + "epoch": 0.09511498572072749, + "flos": 21220346964240.0, + "grad_norm": 1.893840177345435, + "language_loss": 0.87874687, + "learning_rate": 3.955688530637116e-06, + "loss": 0.90493977, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24597168, + "step": 1582, + "time_per_iteration": 2.8828017711639404 + }, + { + "auxiliary_loss_clip": 0.01576397, + "auxiliary_loss_mlp": 0.01071946, + "balance_loss_clip": 1.35580206, + "balance_loss_mlp": 1.04581523, + "epoch": 0.09517510897339546, + "flos": 14615384572080.0, + "grad_norm": 1.7184399642500676, + "language_loss": 0.67104745, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69753093, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26171875, + "step": 1583, + "time_per_iteration": 2.7268991470336914 + }, + { + "auxiliary_loss_clip": 0.01579834, + "auxiliary_loss_mlp": 0.01067991, + "balance_loss_clip": 1.3614229, + "balance_loss_mlp": 1.04162216, + "epoch": 0.09523523222606343, + "flos": 27822954071520.0, + "grad_norm": 1.6740680849712455, + "language_loss": 0.70703733, + "learning_rate": 3.95552532742147e-06, + "loss": 0.73351562, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26330566, + "step": 1584, + "time_per_iteration": 2.8173165321350098 + }, + { + "auxiliary_loss_clip": 0.01571004, + "auxiliary_loss_mlp": 0.01064299, + "balance_loss_clip": 1.35427427, + "balance_loss_mlp": 1.04042184, + "epoch": 0.0952953554787314, + "flos": 20711437083360.0, + "grad_norm": 1.434871502687267, + "language_loss": 0.81046522, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.2388916, + "step": 1585, + "time_per_iteration": 2.8469653129577637 + }, + { + "auxiliary_loss_clip": 0.01582148, + "auxiliary_loss_mlp": 0.01069558, + "balance_loss_clip": 1.35839319, + "balance_loss_mlp": 1.04336798, + "epoch": 0.09535547873139937, + "flos": 24792665178600.0, + "grad_norm": 1.6647267126499092, + "language_loss": 0.72705173, + "learning_rate": 3.955361827590961e-06, + "loss": 0.75356877, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26208496, + "step": 1586, + "time_per_iteration": 2.820462465286255 + }, + { + "auxiliary_loss_clip": 0.01370898, + "auxiliary_loss_mlp": 0.01013528, + "balance_loss_clip": 1.23863792, + "balance_loss_mlp": 1.00570762, + "epoch": 0.09541560198406734, + "flos": 71926185290400.0, + "grad_norm": 0.812309705844255, + "language_loss": 0.55443019, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57827449, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.078125, + "step": 1587, + "time_per_iteration": 3.1540586948394775 + }, + { + "auxiliary_loss_clip": 0.01573954, + "auxiliary_loss_mlp": 0.01065666, + "balance_loss_clip": 1.35082924, + "balance_loss_mlp": 1.04120409, + "epoch": 0.09547572523673531, + "flos": 28988073646920.0, + "grad_norm": 1.6562314425265907, + "language_loss": 0.81303203, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83942825, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24487305, + "step": 1588, + "time_per_iteration": 2.8185524940490723 + }, + { + "auxiliary_loss_clip": 0.01573648, + "auxiliary_loss_mlp": 0.01074531, + "balance_loss_clip": 1.35368085, + "balance_loss_mlp": 1.0506413, + "epoch": 0.09553584848940327, + "flos": 24139257460920.0, + "grad_norm": 1.4143608948015012, + "language_loss": 0.81792855, + "learning_rate": 3.955116021746594e-06, + "loss": 0.8444103, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.23876953, + "step": 1589, + "time_per_iteration": 2.8803958892822266 + }, + { + "auxiliary_loss_clip": 0.0156157, + "auxiliary_loss_mlp": 0.01075186, + "balance_loss_clip": 1.3444798, + "balance_loss_mlp": 1.0485189, + "epoch": 0.09559597174207124, + "flos": 42859202954040.0, + "grad_norm": 1.3714908272385333, + "language_loss": 0.65049195, + "learning_rate": 3.955033938184601e-06, + "loss": 0.6768595, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2668457, + "step": 1590, + "time_per_iteration": 2.9902758598327637 + }, + { + "auxiliary_loss_clip": 0.01569972, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_clip": 1.35233927, + "balance_loss_mlp": 1.04574203, + "epoch": 0.09565609499473922, + "flos": 32677211777760.0, + "grad_norm": 1.598685770098712, + "language_loss": 0.83620894, + "learning_rate": 3.954951780487526e-06, + "loss": 0.86262882, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.26269531, + "step": 1591, + "time_per_iteration": 2.948244333267212 + }, + { + "auxiliary_loss_clip": 0.01579935, + "auxiliary_loss_mlp": 0.01067654, + "balance_loss_clip": 1.35848141, + "balance_loss_mlp": 1.04126108, + "epoch": 0.09571621824740718, + "flos": 18482992755840.0, + "grad_norm": 2.251012939580581, + "language_loss": 0.75661588, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.78309178, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26428223, + "step": 1592, + "time_per_iteration": 2.833787441253662 + }, + { + "auxiliary_loss_clip": 0.01559517, + "auxiliary_loss_mlp": 0.01066391, + "balance_loss_clip": 1.34090698, + "balance_loss_mlp": 1.0413568, + "epoch": 0.09577634150007515, + "flos": 29393483153400.0, + "grad_norm": 1.7695894300529773, + "language_loss": 0.73744226, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76370132, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25012207, + "step": 1593, + "time_per_iteration": 2.944990634918213 + }, + { + "auxiliary_loss_clip": 0.01565354, + "auxiliary_loss_mlp": 0.01063885, + "balance_loss_clip": 1.34739792, + "balance_loss_mlp": 1.04003084, + "epoch": 0.09583646475274313, + "flos": 22753046035800.0, + "grad_norm": 2.157798001260932, + "language_loss": 0.70742434, + "learning_rate": 3.954704862616971e-06, + "loss": 0.73371673, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23864746, + "step": 1594, + "time_per_iteration": 2.897256374359131 + }, + { + "auxiliary_loss_clip": 0.01569892, + "auxiliary_loss_mlp": 0.01062082, + "balance_loss_clip": 1.34764278, + "balance_loss_mlp": 1.03851426, + "epoch": 0.0958965880054111, + "flos": 23223151305720.0, + "grad_norm": 2.031063015712869, + "language_loss": 0.83353949, + "learning_rate": 3.954622408410747e-06, + "loss": 0.85985917, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.23583984, + "step": 1595, + "time_per_iteration": 2.7736129760742188 + }, + { + "auxiliary_loss_clip": 0.01564825, + "auxiliary_loss_mlp": 0.01053465, + "balance_loss_clip": 1.3460927, + "balance_loss_mlp": 1.02748966, + "epoch": 0.09595671125807906, + "flos": 21329410683960.0, + "grad_norm": 2.008312141621429, + "language_loss": 0.85397857, + "learning_rate": 3.954539880085045e-06, + "loss": 0.88016152, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2598877, + "step": 1596, + "time_per_iteration": 2.8062658309936523 + }, + { + "auxiliary_loss_clip": 0.01576929, + "auxiliary_loss_mlp": 0.01065492, + "balance_loss_clip": 1.35481465, + "balance_loss_mlp": 1.03560662, + "epoch": 0.09601683451074704, + "flos": 39610786621680.0, + "grad_norm": 1.838448666868288, + "language_loss": 0.69411719, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.72054136, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.29882812, + "step": 1597, + "time_per_iteration": 2.901090621948242 + }, + { + "auxiliary_loss_clip": 0.01572158, + "auxiliary_loss_mlp": 0.01055911, + "balance_loss_clip": 1.34742904, + "balance_loss_mlp": 1.03061485, + "epoch": 0.096076957763415, + "flos": 23738030615520.0, + "grad_norm": 2.443567725509633, + "language_loss": 0.75670475, + "learning_rate": 3.954374601087729e-06, + "loss": 0.78298545, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.2532959, + "step": 1598, + "time_per_iteration": 2.7956552505493164 + }, + { + "auxiliary_loss_clip": 0.01583878, + "auxiliary_loss_mlp": 0.01065708, + "balance_loss_clip": 1.359146, + "balance_loss_mlp": 1.0382061, + "epoch": 0.09613708101608297, + "flos": 34684807905720.0, + "grad_norm": 1.6858193571275912, + "language_loss": 0.69747257, + "learning_rate": 3.954291850422382e-06, + "loss": 0.72396839, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27502441, + "step": 1599, + "time_per_iteration": 2.873473882675171 + }, + { + "auxiliary_loss_clip": 0.01576712, + "auxiliary_loss_mlp": 0.01061563, + "balance_loss_clip": 1.35806608, + "balance_loss_mlp": 1.03636217, + "epoch": 0.09619720426875093, + "flos": 20744840782440.0, + "grad_norm": 2.126151258450172, + "language_loss": 0.84530336, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8716861, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25219727, + "step": 1600, + "time_per_iteration": 2.7688283920288086 + }, + { + "auxiliary_loss_clip": 0.01576801, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.3562212, + "balance_loss_mlp": 1.03679144, + "epoch": 0.09625732752141891, + "flos": 13046682866400.0, + "grad_norm": 2.017388805637066, + "language_loss": 0.80948353, + "learning_rate": 3.954126126774001e-06, + "loss": 0.83588111, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26184082, + "step": 1601, + "time_per_iteration": 2.815619468688965 + }, + { + "auxiliary_loss_clip": 0.01575901, + "auxiliary_loss_mlp": 0.01063106, + "balance_loss_clip": 1.35081732, + "balance_loss_mlp": 1.03651094, + "epoch": 0.09631745077408688, + "flos": 22278798713160.0, + "grad_norm": 2.2101498353484903, + "language_loss": 0.83259875, + "learning_rate": 3.954043153797251e-06, + "loss": 0.85898882, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26611328, + "step": 1602, + "time_per_iteration": 2.844972848892212 + }, + { + "auxiliary_loss_clip": 0.01570499, + "auxiliary_loss_mlp": 0.01051242, + "balance_loss_clip": 1.35285878, + "balance_loss_mlp": 1.02353752, + "epoch": 0.09637757402675484, + "flos": 24759708171480.0, + "grad_norm": 1.8905412156187327, + "language_loss": 0.62880844, + "learning_rate": 3.953960106722989e-06, + "loss": 0.65502584, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.27734375, + "step": 1603, + "time_per_iteration": 4.269877910614014 + }, + { + "auxiliary_loss_clip": 0.01587502, + "auxiliary_loss_mlp": 0.01062303, + "balance_loss_clip": 1.36418629, + "balance_loss_mlp": 1.03408599, + "epoch": 0.09643769727942282, + "flos": 22530532893480.0, + "grad_norm": 2.3163982191510843, + "language_loss": 0.71886873, + "learning_rate": 3.953876985554364e-06, + "loss": 0.74536681, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.28234863, + "step": 1604, + "time_per_iteration": 2.881410837173462 + }, + { + "auxiliary_loss_clip": 0.01577046, + "auxiliary_loss_mlp": 0.01059411, + "balance_loss_clip": 1.35837567, + "balance_loss_mlp": 1.03394806, + "epoch": 0.09649782053209079, + "flos": 30927359867400.0, + "grad_norm": 1.8261067655976917, + "language_loss": 0.79595256, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82231712, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25476074, + "step": 1605, + "time_per_iteration": 4.558587074279785 + }, + { + "auxiliary_loss_clip": 0.01582096, + "auxiliary_loss_mlp": 0.01055375, + "balance_loss_clip": 1.35744822, + "balance_loss_mlp": 1.02935159, + "epoch": 0.09655794378475875, + "flos": 25343100430560.0, + "grad_norm": 2.856042452467293, + "language_loss": 0.75984949, + "learning_rate": 3.953710520946634e-06, + "loss": 0.78622413, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26013184, + "step": 1606, + "time_per_iteration": 4.2957282066345215 + }, + { + "auxiliary_loss_clip": 0.01574, + "auxiliary_loss_mlp": 0.01055582, + "balance_loss_clip": 1.35422802, + "balance_loss_mlp": 1.03053617, + "epoch": 0.09661806703742673, + "flos": 22351169456640.0, + "grad_norm": 1.90068111695488, + "language_loss": 0.76370096, + "learning_rate": 3.953627177513843e-06, + "loss": 0.78999674, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25048828, + "step": 1607, + "time_per_iteration": 4.268249988555908 + }, + { + "auxiliary_loss_clip": 0.01583058, + "auxiliary_loss_mlp": 0.01057225, + "balance_loss_clip": 1.36001444, + "balance_loss_mlp": 1.03161907, + "epoch": 0.0966781902900947, + "flos": 17462127367080.0, + "grad_norm": 1.9549227773153968, + "language_loss": 0.87306809, + "learning_rate": 3.953543759999312e-06, + "loss": 0.89947093, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25598145, + "step": 1608, + "time_per_iteration": 2.754387855529785 + }, + { + "auxiliary_loss_clip": 0.01592978, + "auxiliary_loss_mlp": 0.01066084, + "balance_loss_clip": 1.36665976, + "balance_loss_mlp": 1.03965509, + "epoch": 0.09673831354276266, + "flos": 36911018773440.0, + "grad_norm": 2.0133801728678127, + "language_loss": 0.71558672, + "learning_rate": 3.953460268406207e-06, + "loss": 0.74217737, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26428223, + "step": 1609, + "time_per_iteration": 2.8774142265319824 + }, + { + "auxiliary_loss_clip": 0.01568628, + "auxiliary_loss_mlp": 0.01060786, + "balance_loss_clip": 1.3487401, + "balance_loss_mlp": 1.03514373, + "epoch": 0.09679843679543064, + "flos": 20705914346400.0, + "grad_norm": 3.1007766417102602, + "language_loss": 0.85428721, + "learning_rate": 3.953376702737693e-06, + "loss": 0.88058138, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25646973, + "step": 1610, + "time_per_iteration": 2.7485523223876953 + }, + { + "auxiliary_loss_clip": 0.01582169, + "auxiliary_loss_mlp": 0.01058748, + "balance_loss_clip": 1.36445808, + "balance_loss_mlp": 1.03323758, + "epoch": 0.0968585600480986, + "flos": 23519781351000.0, + "grad_norm": 1.9659246983410967, + "language_loss": 0.67794001, + "learning_rate": 3.953293062996939e-06, + "loss": 0.70434916, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25537109, + "step": 1611, + "time_per_iteration": 2.778954267501831 + }, + { + "auxiliary_loss_clip": 0.01578653, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_clip": 1.35960007, + "balance_loss_mlp": 1.02809846, + "epoch": 0.09691868330076657, + "flos": 20125973797920.0, + "grad_norm": 1.668063735342282, + "language_loss": 0.81183994, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83815312, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24597168, + "step": 1612, + "time_per_iteration": 2.7432868480682373 + }, + { + "auxiliary_loss_clip": 0.01587417, + "auxiliary_loss_mlp": 0.01071647, + "balance_loss_clip": 1.36814523, + "balance_loss_mlp": 1.04641044, + "epoch": 0.09697880655343454, + "flos": 16549026230520.0, + "grad_norm": 3.562194097965503, + "language_loss": 0.81377268, + "learning_rate": 3.953125561311398e-06, + "loss": 0.84036338, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25280762, + "step": 1613, + "time_per_iteration": 2.746497869491577 + }, + { + "auxiliary_loss_clip": 0.01577524, + "auxiliary_loss_mlp": 0.0105869, + "balance_loss_clip": 1.35766649, + "balance_loss_mlp": 1.03043723, + "epoch": 0.09703892980610251, + "flos": 26109795137400.0, + "grad_norm": 1.8776254779645365, + "language_loss": 0.84783304, + "learning_rate": 3.953041699372964e-06, + "loss": 0.87419522, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.2824707, + "step": 1614, + "time_per_iteration": 2.8417036533355713 + }, + { + "auxiliary_loss_clip": 0.0135622, + "auxiliary_loss_mlp": 0.01011936, + "balance_loss_clip": 1.22054887, + "balance_loss_mlp": 1.00301874, + "epoch": 0.09709905305877048, + "flos": 60459307193160.0, + "grad_norm": 0.7088860553953443, + "language_loss": 0.54687476, + "learning_rate": 3.952957763374992e-06, + "loss": 0.57055628, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.08935547, + "step": 1615, + "time_per_iteration": 3.2534799575805664 + }, + { + "auxiliary_loss_clip": 0.01357335, + "auxiliary_loss_mlp": 0.01010875, + "balance_loss_clip": 1.22346783, + "balance_loss_mlp": 1.00224388, + "epoch": 0.09715917631143844, + "flos": 57655998362160.0, + "grad_norm": 0.776277175077229, + "language_loss": 0.58271545, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60639757, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.08642578, + "step": 1616, + "time_per_iteration": 3.4219653606414795 + }, + { + "auxiliary_loss_clip": 0.01579402, + "auxiliary_loss_mlp": 0.0107806, + "balance_loss_clip": 1.35824859, + "balance_loss_mlp": 1.04950964, + "epoch": 0.09721929956410642, + "flos": 20563081452360.0, + "grad_norm": 1.7786687379938966, + "language_loss": 0.68670785, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71328253, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.28564453, + "step": 1617, + "time_per_iteration": 2.9524765014648438 + }, + { + "auxiliary_loss_clip": 0.01578202, + "auxiliary_loss_mlp": 0.01069341, + "balance_loss_clip": 1.35494924, + "balance_loss_mlp": 1.04011083, + "epoch": 0.09727942281677439, + "flos": 27350087433120.0, + "grad_norm": 1.7514342566157197, + "language_loss": 0.81153125, + "learning_rate": 3.952705511055698e-06, + "loss": 0.83800668, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.29248047, + "step": 1618, + "time_per_iteration": 2.8953495025634766 + }, + { + "auxiliary_loss_clip": 0.01569292, + "auxiliary_loss_mlp": 0.01066293, + "balance_loss_clip": 1.35213685, + "balance_loss_mlp": 1.04204607, + "epoch": 0.09733954606944235, + "flos": 24905464867440.0, + "grad_norm": 1.61258065205336, + "language_loss": 0.93118095, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95753682, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24267578, + "step": 1619, + "time_per_iteration": 2.8452770709991455 + }, + { + "auxiliary_loss_clip": 0.01571862, + "auxiliary_loss_mlp": 0.01065695, + "balance_loss_clip": 1.35642171, + "balance_loss_mlp": 1.03973114, + "epoch": 0.09739966932211033, + "flos": 31510224217800.0, + "grad_norm": 2.0465117797233683, + "language_loss": 0.88932741, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.915703, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.2598877, + "step": 1620, + "time_per_iteration": 2.8542134761810303 + }, + { + "auxiliary_loss_clip": 0.01578744, + "auxiliary_loss_mlp": 0.01080648, + "balance_loss_clip": 1.35880542, + "balance_loss_mlp": 1.05377865, + "epoch": 0.0974597925747783, + "flos": 23884112178360.0, + "grad_norm": 1.8009240873320125, + "language_loss": 0.77305746, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79965138, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26855469, + "step": 1621, + "time_per_iteration": 2.8225760459899902 + }, + { + "auxiliary_loss_clip": 0.01569684, + "auxiliary_loss_mlp": 0.01082838, + "balance_loss_clip": 1.34878314, + "balance_loss_mlp": 1.05608702, + "epoch": 0.09751991582744626, + "flos": 17024045112000.0, + "grad_norm": 1.8626219373729416, + "language_loss": 0.77915299, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80567813, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26745605, + "step": 1622, + "time_per_iteration": 2.7893192768096924 + }, + { + "auxiliary_loss_clip": 0.01591838, + "auxiliary_loss_mlp": 0.01068497, + "balance_loss_clip": 1.36642647, + "balance_loss_mlp": 1.04252124, + "epoch": 0.09758003908011423, + "flos": 28408742223840.0, + "grad_norm": 2.7571099072860794, + "language_loss": 0.86014861, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88675201, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26000977, + "step": 1623, + "time_per_iteration": 2.889219284057617 + }, + { + "auxiliary_loss_clip": 0.01570366, + "auxiliary_loss_mlp": 0.01081108, + "balance_loss_clip": 1.35108137, + "balance_loss_mlp": 1.05507326, + "epoch": 0.09764016233278221, + "flos": 18148086008280.0, + "grad_norm": 4.6875696244860485, + "language_loss": 0.80675846, + "learning_rate": 3.952199007240184e-06, + "loss": 0.83327323, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26025391, + "step": 1624, + "time_per_iteration": 2.77964448928833 + }, + { + "auxiliary_loss_clip": 0.0157079, + "auxiliary_loss_mlp": 0.01066852, + "balance_loss_clip": 1.35127223, + "balance_loss_mlp": 1.04177058, + "epoch": 0.09770028558545017, + "flos": 15269766890400.0, + "grad_norm": 2.1132507314521467, + "language_loss": 0.85845619, + "learning_rate": 3.952114330822364e-06, + "loss": 0.88483262, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25085449, + "step": 1625, + "time_per_iteration": 2.8687844276428223 + }, + { + "auxiliary_loss_clip": 0.01583764, + "auxiliary_loss_mlp": 0.01082072, + "balance_loss_clip": 1.36048591, + "balance_loss_mlp": 1.04995728, + "epoch": 0.09776040883811814, + "flos": 23477078337480.0, + "grad_norm": 1.9870364833909484, + "language_loss": 0.85546279, + "learning_rate": 3.952029580380172e-06, + "loss": 0.88212109, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.32141113, + "step": 1626, + "time_per_iteration": 2.8604750633239746 + }, + { + "auxiliary_loss_clip": 0.01591107, + "auxiliary_loss_mlp": 0.01079125, + "balance_loss_clip": 1.36533427, + "balance_loss_mlp": 1.05241036, + "epoch": 0.09782053209078612, + "flos": 24504968972520.0, + "grad_norm": 1.8002192942834074, + "language_loss": 0.83601749, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.86271989, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26708984, + "step": 1627, + "time_per_iteration": 2.8895559310913086 + }, + { + "auxiliary_loss_clip": 0.01580996, + "auxiliary_loss_mlp": 0.01079511, + "balance_loss_clip": 1.36282587, + "balance_loss_mlp": 1.05392861, + "epoch": 0.09788065534345408, + "flos": 21585205700280.0, + "grad_norm": 1.8670312711622943, + "language_loss": 0.8458823, + "learning_rate": 3.951859857435534e-06, + "loss": 0.87248743, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25610352, + "step": 1628, + "time_per_iteration": 2.868354082107544 + }, + { + "auxiliary_loss_clip": 0.01574164, + "auxiliary_loss_mlp": 0.01061551, + "balance_loss_clip": 1.3571167, + "balance_loss_mlp": 1.03658891, + "epoch": 0.09794077859612205, + "flos": 23847825285720.0, + "grad_norm": 1.6344931647274046, + "language_loss": 0.76217401, + "learning_rate": 3.951774884939523e-06, + "loss": 0.78853112, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24938965, + "step": 1629, + "time_per_iteration": 2.9382805824279785 + }, + { + "auxiliary_loss_clip": 0.01581519, + "auxiliary_loss_mlp": 0.010715, + "balance_loss_clip": 1.36205578, + "balance_loss_mlp": 1.04439187, + "epoch": 0.09800090184879003, + "flos": 23665213180080.0, + "grad_norm": 1.7474052584593176, + "language_loss": 0.78381908, + "learning_rate": 3.951689838432013e-06, + "loss": 0.81034923, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.27099609, + "step": 1630, + "time_per_iteration": 2.9037742614746094 + }, + { + "auxiliary_loss_clip": 0.01579576, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.35798383, + "balance_loss_mlp": 1.04092407, + "epoch": 0.09806102510145799, + "flos": 17060210179560.0, + "grad_norm": 1.804033014767534, + "language_loss": 0.8706944, + "learning_rate": 3.951604717916228e-06, + "loss": 0.89716244, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26318359, + "step": 1631, + "time_per_iteration": 2.728428602218628 + }, + { + "auxiliary_loss_clip": 0.01574919, + "auxiliary_loss_mlp": 0.01066625, + "balance_loss_clip": 1.35801542, + "balance_loss_mlp": 1.0420444, + "epoch": 0.09812114835412596, + "flos": 23883787311480.0, + "grad_norm": 2.6407553818086567, + "language_loss": 0.83027172, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85668719, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24621582, + "step": 1632, + "time_per_iteration": 2.8029985427856445 + }, + { + "auxiliary_loss_clip": 0.01571616, + "auxiliary_loss_mlp": 0.01070424, + "balance_loss_clip": 1.35371995, + "balance_loss_mlp": 1.04525912, + "epoch": 0.09818127160679392, + "flos": 20600464770720.0, + "grad_norm": 1.5727413564387467, + "language_loss": 0.79586989, + "learning_rate": 3.951434254872751e-06, + "loss": 0.82229024, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25170898, + "step": 1633, + "time_per_iteration": 2.75394606590271 + }, + { + "auxiliary_loss_clip": 0.01565211, + "auxiliary_loss_mlp": 0.01062193, + "balance_loss_clip": 1.35111713, + "balance_loss_mlp": 1.03700399, + "epoch": 0.0982413948594619, + "flos": 15491630298960.0, + "grad_norm": 2.0607434711800763, + "language_loss": 0.73337591, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75964987, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25219727, + "step": 1634, + "time_per_iteration": 2.793186664581299 + }, + { + "auxiliary_loss_clip": 0.0158208, + "auxiliary_loss_mlp": 0.01073441, + "balance_loss_clip": 1.35954618, + "balance_loss_mlp": 1.04616642, + "epoch": 0.09830151811212987, + "flos": 24213455580600.0, + "grad_norm": 3.7429869441973898, + "language_loss": 0.73353922, + "learning_rate": 3.951263495834947e-06, + "loss": 0.7600944, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27307129, + "step": 1635, + "time_per_iteration": 2.8071517944335938 + }, + { + "auxiliary_loss_clip": 0.01582929, + "auxiliary_loss_mlp": 0.01060076, + "balance_loss_clip": 1.36174464, + "balance_loss_mlp": 1.03201413, + "epoch": 0.09836164136479783, + "flos": 20599449561720.0, + "grad_norm": 1.826172899518193, + "language_loss": 0.7871964, + "learning_rate": 3.951178005326264e-06, + "loss": 0.81362641, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.28100586, + "step": 1636, + "time_per_iteration": 2.862832546234131 + }, + { + "auxiliary_loss_clip": 0.01583114, + "auxiliary_loss_mlp": 0.0106236, + "balance_loss_clip": 1.36409163, + "balance_loss_mlp": 1.03593183, + "epoch": 0.09842176461746581, + "flos": 19938448080720.0, + "grad_norm": 1.9181284074046783, + "language_loss": 0.70530289, + "learning_rate": 3.951092440828715e-06, + "loss": 0.73175764, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26452637, + "step": 1637, + "time_per_iteration": 2.8899898529052734 + }, + { + "auxiliary_loss_clip": 0.01574502, + "auxiliary_loss_mlp": 0.01068421, + "balance_loss_clip": 1.35807967, + "balance_loss_mlp": 1.04166996, + "epoch": 0.09848188787013377, + "flos": 21219575405400.0, + "grad_norm": 1.9350281060906358, + "language_loss": 0.7740649, + "learning_rate": 3.951006802345545e-06, + "loss": 0.80049419, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26757812, + "step": 1638, + "time_per_iteration": 2.8159892559051514 + }, + { + "auxiliary_loss_clip": 0.01564632, + "auxiliary_loss_mlp": 0.01055934, + "balance_loss_clip": 1.35106635, + "balance_loss_mlp": 1.03026819, + "epoch": 0.09854201112280174, + "flos": 30160380902040.0, + "grad_norm": 2.08597493470448, + "language_loss": 0.72851658, + "learning_rate": 3.950921089880003e-06, + "loss": 0.75472224, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25646973, + "step": 1639, + "time_per_iteration": 2.8845174312591553 + }, + { + "auxiliary_loss_clip": 0.01578151, + "auxiliary_loss_mlp": 0.01050615, + "balance_loss_clip": 1.35811591, + "balance_loss_mlp": 1.02370906, + "epoch": 0.09860213437546972, + "flos": 21800368729440.0, + "grad_norm": 3.295891559103621, + "language_loss": 0.88817841, + "learning_rate": 3.950835303435337e-06, + "loss": 0.91446602, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26916504, + "step": 1640, + "time_per_iteration": 2.763914108276367 + }, + { + "auxiliary_loss_clip": 0.0157609, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.3584137, + "balance_loss_mlp": 1.02216721, + "epoch": 0.09866225762813768, + "flos": 21840635241360.0, + "grad_norm": 1.7913961853771845, + "language_loss": 0.81741405, + "learning_rate": 3.950749443014801e-06, + "loss": 0.84363925, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24267578, + "step": 1641, + "time_per_iteration": 2.8420639038085938 + }, + { + "auxiliary_loss_clip": 0.01579812, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.36131692, + "balance_loss_mlp": 1.04222441, + "epoch": 0.09872238088080565, + "flos": 17603945052120.0, + "grad_norm": 2.582614484958422, + "language_loss": 0.87158096, + "learning_rate": 3.95066350862165e-06, + "loss": 0.89808244, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.28149414, + "step": 1642, + "time_per_iteration": 4.243187189102173 + }, + { + "auxiliary_loss_clip": 0.01584621, + "auxiliary_loss_mlp": 0.01062697, + "balance_loss_clip": 1.36658502, + "balance_loss_mlp": 1.03680503, + "epoch": 0.09878250413347361, + "flos": 27641885083560.0, + "grad_norm": 1.5739494148351525, + "language_loss": 0.81125498, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83772814, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25891113, + "step": 1643, + "time_per_iteration": 2.8739471435546875 + }, + { + "auxiliary_loss_clip": 0.01577287, + "auxiliary_loss_mlp": 0.01070007, + "balance_loss_clip": 1.36048687, + "balance_loss_mlp": 1.04426944, + "epoch": 0.0988426273861416, + "flos": 16549066838880.0, + "grad_norm": 1.9647541641341133, + "language_loss": 0.8279649, + "learning_rate": 3.950491417930543e-06, + "loss": 0.85443783, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25756836, + "step": 1644, + "time_per_iteration": 5.821784019470215 + }, + { + "auxiliary_loss_clip": 0.01571121, + "auxiliary_loss_mlp": 0.01057933, + "balance_loss_clip": 1.35854483, + "balance_loss_mlp": 1.03219581, + "epoch": 0.09890275063880956, + "flos": 21220265747520.0, + "grad_norm": 1.6575521590312126, + "language_loss": 0.68934596, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.71563649, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25744629, + "step": 1645, + "time_per_iteration": 2.786687135696411 + }, + { + "auxiliary_loss_clip": 0.01356802, + "auxiliary_loss_mlp": 0.01008383, + "balance_loss_clip": 1.22769976, + "balance_loss_mlp": 0.99989527, + "epoch": 0.09896287389147752, + "flos": 59394114456480.0, + "grad_norm": 0.9111823589372002, + "language_loss": 0.60792434, + "learning_rate": 3.950319031388119e-06, + "loss": 0.6315763, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.08496094, + "step": 1646, + "time_per_iteration": 4.637892246246338 + }, + { + "auxiliary_loss_clip": 0.01574309, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.35676908, + "balance_loss_mlp": 1.0339365, + "epoch": 0.0990229971441455, + "flos": 29648669044320.0, + "grad_norm": 1.6082418846103814, + "language_loss": 0.73712158, + "learning_rate": 3.950232727180833e-06, + "loss": 0.76350832, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.30395508, + "step": 1647, + "time_per_iteration": 2.958031415939331 + }, + { + "auxiliary_loss_clip": 0.0157695, + "auxiliary_loss_mlp": 0.01065858, + "balance_loss_clip": 1.35931945, + "balance_loss_mlp": 1.04275501, + "epoch": 0.09908312039681347, + "flos": 21839863682520.0, + "grad_norm": 2.7001269355288073, + "language_loss": 0.85074329, + "learning_rate": 3.950146349020525e-06, + "loss": 0.87717134, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2310791, + "step": 1648, + "time_per_iteration": 2.7823991775512695 + }, + { + "auxiliary_loss_clip": 0.0135728, + "auxiliary_loss_mlp": 0.01020027, + "balance_loss_clip": 1.2274344, + "balance_loss_mlp": 1.01134872, + "epoch": 0.09914324364948143, + "flos": 57580013474640.0, + "grad_norm": 0.7433738850692025, + "language_loss": 0.55741179, + "learning_rate": 3.950059896910473e-06, + "loss": 0.58118486, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08691406, + "step": 1649, + "time_per_iteration": 3.146235227584839 + }, + { + "auxiliary_loss_clip": 0.01571979, + "auxiliary_loss_mlp": 0.01055146, + "balance_loss_clip": 1.35717297, + "balance_loss_mlp": 1.03088677, + "epoch": 0.09920336690214941, + "flos": 34129662084000.0, + "grad_norm": 2.0056144452563793, + "language_loss": 0.90926844, + "learning_rate": 3.949973370853954e-06, + "loss": 0.93553972, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24243164, + "step": 1650, + "time_per_iteration": 2.8161487579345703 + }, + { + "auxiliary_loss_clip": 0.01356601, + "auxiliary_loss_mlp": 0.01018352, + "balance_loss_clip": 1.22731996, + "balance_loss_mlp": 1.00986445, + "epoch": 0.09926349015481738, + "flos": 71234761501800.0, + "grad_norm": 0.7971001544291422, + "language_loss": 0.6378504, + "learning_rate": 3.94988677085425e-06, + "loss": 0.66159993, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.08496094, + "step": 1651, + "time_per_iteration": 3.3836452960968018 + }, + { + "auxiliary_loss_clip": 0.0157042, + "auxiliary_loss_mlp": 0.01064318, + "balance_loss_clip": 1.35675502, + "balance_loss_mlp": 1.03813982, + "epoch": 0.09932361340748534, + "flos": 23153907405960.0, + "grad_norm": 1.9697567576919994, + "language_loss": 0.88260972, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90895712, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.26196289, + "step": 1652, + "time_per_iteration": 2.7900166511535645 + }, + { + "auxiliary_loss_clip": 0.01572994, + "auxiliary_loss_mlp": 0.01063813, + "balance_loss_clip": 1.35675168, + "balance_loss_mlp": 1.03737259, + "epoch": 0.09938373666015332, + "flos": 19833445197000.0, + "grad_norm": 1.8257649417703887, + "language_loss": 0.82294858, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84931672, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26452637, + "step": 1653, + "time_per_iteration": 2.7775092124938965 + }, + { + "auxiliary_loss_clip": 0.01581816, + "auxiliary_loss_mlp": 0.0105733, + "balance_loss_clip": 1.3615557, + "balance_loss_mlp": 1.02984071, + "epoch": 0.09944385991282129, + "flos": 22095739915560.0, + "grad_norm": 1.6833454024358012, + "language_loss": 0.80215514, + "learning_rate": 3.949626527228875e-06, + "loss": 0.82854664, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.27502441, + "step": 1654, + "time_per_iteration": 2.8071658611297607 + }, + { + "auxiliary_loss_clip": 0.01564452, + "auxiliary_loss_mlp": 0.01064543, + "balance_loss_clip": 1.35433125, + "balance_loss_mlp": 1.04056978, + "epoch": 0.09950398316548925, + "flos": 19833688847160.0, + "grad_norm": 1.5952165766995834, + "language_loss": 0.81848162, + "learning_rate": 3.949539631489295e-06, + "loss": 0.84477156, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23986816, + "step": 1655, + "time_per_iteration": 2.715294599533081 + }, + { + "auxiliary_loss_clip": 0.01569039, + "auxiliary_loss_mlp": 0.01059151, + "balance_loss_clip": 1.35548997, + "balance_loss_mlp": 1.03154206, + "epoch": 0.09956410641815722, + "flos": 25008396724800.0, + "grad_norm": 1.6722878087417727, + "language_loss": 0.8115983, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83788025, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.27600098, + "step": 1656, + "time_per_iteration": 2.870276927947998 + }, + { + "auxiliary_loss_clip": 0.01581607, + "auxiliary_loss_mlp": 0.01065567, + "balance_loss_clip": 1.36653399, + "balance_loss_mlp": 1.03767204, + "epoch": 0.0996242296708252, + "flos": 19322423681400.0, + "grad_norm": 1.5098669913808267, + "language_loss": 0.89043295, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91690463, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27905273, + "step": 1657, + "time_per_iteration": 2.7314887046813965 + }, + { + "auxiliary_loss_clip": 0.0158895, + "auxiliary_loss_mlp": 0.01064961, + "balance_loss_clip": 1.36666322, + "balance_loss_mlp": 1.03588593, + "epoch": 0.09968435292349316, + "flos": 21876637875480.0, + "grad_norm": 2.377685202579523, + "language_loss": 0.85554403, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.88208318, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.29089355, + "step": 1658, + "time_per_iteration": 2.949211835861206 + }, + { + "auxiliary_loss_clip": 0.01352029, + "auxiliary_loss_mlp": 0.01009971, + "balance_loss_clip": 1.22604728, + "balance_loss_mlp": 1.00153148, + "epoch": 0.09974447617616113, + "flos": 65398101169680.0, + "grad_norm": 0.9000617403288429, + "language_loss": 0.60871458, + "learning_rate": 3.949191309296585e-06, + "loss": 0.63233459, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.08447266, + "step": 1659, + "time_per_iteration": 3.2897918224334717 + }, + { + "auxiliary_loss_clip": 0.01577195, + "auxiliary_loss_mlp": 0.01061675, + "balance_loss_clip": 1.36247659, + "balance_loss_mlp": 1.03574741, + "epoch": 0.0998045994288291, + "flos": 23665131963360.0, + "grad_norm": 1.7780026360941494, + "language_loss": 0.85809362, + "learning_rate": 3.949104043956321e-06, + "loss": 0.88448238, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.25927734, + "step": 1660, + "time_per_iteration": 2.8096494674682617 + }, + { + "auxiliary_loss_clip": 0.01584193, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_clip": 1.3716197, + "balance_loss_mlp": 1.04054606, + "epoch": 0.09986472268149707, + "flos": 19614464982000.0, + "grad_norm": 1.9322225440567853, + "language_loss": 0.80171573, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82823813, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.27526855, + "step": 1661, + "time_per_iteration": 2.799997568130493 + }, + { + "auxiliary_loss_clip": 0.01589291, + "auxiliary_loss_mlp": 0.01069127, + "balance_loss_clip": 1.36703289, + "balance_loss_mlp": 1.04048109, + "epoch": 0.09992484593416504, + "flos": 26218899465480.0, + "grad_norm": 1.7542567145672021, + "language_loss": 0.84018242, + "learning_rate": 3.948929291548443e-06, + "loss": 0.86676657, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.28613281, + "step": 1662, + "time_per_iteration": 2.8079891204833984 + }, + { + "auxiliary_loss_clip": 0.01583248, + "auxiliary_loss_mlp": 0.01083958, + "balance_loss_clip": 1.36671865, + "balance_loss_mlp": 1.05487084, + "epoch": 0.09998496918683301, + "flos": 17498130001200.0, + "grad_norm": 1.8669108947306354, + "language_loss": 0.89422047, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.9208926, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.29052734, + "step": 1663, + "time_per_iteration": 2.7953784465789795 + }, + { + "auxiliary_loss_clip": 0.01597301, + "auxiliary_loss_mlp": 0.01063252, + "balance_loss_clip": 1.37619948, + "balance_loss_mlp": 1.03635883, + "epoch": 0.10004509243950098, + "flos": 22790226312360.0, + "grad_norm": 1.6293444200638167, + "language_loss": 0.70754117, + "learning_rate": 3.948754243526191e-06, + "loss": 0.73414671, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26904297, + "step": 1664, + "time_per_iteration": 2.8923447132110596 + }, + { + "auxiliary_loss_clip": 0.01581836, + "auxiliary_loss_mlp": 0.01063916, + "balance_loss_clip": 1.36511266, + "balance_loss_mlp": 1.03711724, + "epoch": 0.10010521569216894, + "flos": 16257756488760.0, + "grad_norm": 2.1222881417383435, + "language_loss": 0.79055667, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81701422, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26806641, + "step": 1665, + "time_per_iteration": 2.736898422241211 + }, + { + "auxiliary_loss_clip": 0.01601198, + "auxiliary_loss_mlp": 0.01073687, + "balance_loss_clip": 1.38315892, + "balance_loss_mlp": 1.04668581, + "epoch": 0.10016533894483691, + "flos": 23407631395920.0, + "grad_norm": 1.717260594624834, + "language_loss": 0.69973046, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72647929, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2701416, + "step": 1666, + "time_per_iteration": 2.7765233516693115 + }, + { + "auxiliary_loss_clip": 0.01594386, + "auxiliary_loss_mlp": 0.01070983, + "balance_loss_clip": 1.37374878, + "balance_loss_mlp": 1.04381514, + "epoch": 0.10022546219750489, + "flos": 19358873007480.0, + "grad_norm": 2.1689856387932, + "language_loss": 0.79523647, + "learning_rate": 3.948491117273956e-06, + "loss": 0.82189012, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27185059, + "step": 1667, + "time_per_iteration": 2.7369282245635986 + }, + { + "auxiliary_loss_clip": 0.01593728, + "auxiliary_loss_mlp": 0.01063737, + "balance_loss_clip": 1.3744607, + "balance_loss_mlp": 1.03586602, + "epoch": 0.10028558545017285, + "flos": 27091043748000.0, + "grad_norm": 2.5507530878776787, + "language_loss": 0.77663004, + "learning_rate": 3.948403260744817e-06, + "loss": 0.80320466, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27880859, + "step": 1668, + "time_per_iteration": 2.8630621433258057 + }, + { + "auxiliary_loss_clip": 0.01587467, + "auxiliary_loss_mlp": 0.01066639, + "balance_loss_clip": 1.37029886, + "balance_loss_mlp": 1.03914976, + "epoch": 0.10034570870284082, + "flos": 25852497611760.0, + "grad_norm": 1.6520760420286935, + "language_loss": 0.78225446, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80879551, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.27539062, + "step": 1669, + "time_per_iteration": 3.0596392154693604 + }, + { + "auxiliary_loss_clip": 0.01598916, + "auxiliary_loss_mlp": 0.01075884, + "balance_loss_clip": 1.37589407, + "balance_loss_mlp": 1.04825175, + "epoch": 0.1004058319555088, + "flos": 26255145749760.0, + "grad_norm": 2.1885766386995282, + "language_loss": 0.8582778, + "learning_rate": 3.948227326038933e-06, + "loss": 0.8850258, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27648926, + "step": 1670, + "time_per_iteration": 2.7909257411956787 + }, + { + "auxiliary_loss_clip": 0.01575608, + "auxiliary_loss_mlp": 0.01070507, + "balance_loss_clip": 1.36415458, + "balance_loss_mlp": 1.04383993, + "epoch": 0.10046595520817676, + "flos": 25379996448600.0, + "grad_norm": 1.7311340379058788, + "language_loss": 0.77324748, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79970866, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26660156, + "step": 1671, + "time_per_iteration": 2.7744133472442627 + }, + { + "auxiliary_loss_clip": 0.0136159, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.23328602, + "balance_loss_mlp": 1.0237906, + "epoch": 0.10052607846084473, + "flos": 67475347281000.0, + "grad_norm": 0.7819147474335394, + "language_loss": 0.6079722, + "learning_rate": 3.948051095825149e-06, + "loss": 0.63190281, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.07666016, + "step": 1672, + "time_per_iteration": 3.232455015182495 + }, + { + "auxiliary_loss_clip": 0.01593196, + "auxiliary_loss_mlp": 0.01065772, + "balance_loss_clip": 1.37553716, + "balance_loss_mlp": 1.03781724, + "epoch": 0.10058620171351271, + "flos": 21365413318080.0, + "grad_norm": 2.376748955339235, + "language_loss": 0.7749747, + "learning_rate": 3.947962869911147e-06, + "loss": 0.8015644, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.27966309, + "step": 1673, + "time_per_iteration": 2.743802309036255 + }, + { + "auxiliary_loss_clip": 0.0158985, + "auxiliary_loss_mlp": 0.01063261, + "balance_loss_clip": 1.37188816, + "balance_loss_mlp": 1.03627253, + "epoch": 0.10064632496618067, + "flos": 16804496379960.0, + "grad_norm": 1.9481772649856175, + "language_loss": 0.73595405, + "learning_rate": 3.947874570130197e-06, + "loss": 0.76248515, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27026367, + "step": 1674, + "time_per_iteration": 2.7626187801361084 + }, + { + "auxiliary_loss_clip": 0.0158602, + "auxiliary_loss_mlp": 0.01068263, + "balance_loss_clip": 1.36615777, + "balance_loss_mlp": 1.04128575, + "epoch": 0.10070644821884864, + "flos": 23629900888080.0, + "grad_norm": 1.8656420950473105, + "language_loss": 0.79862416, + "learning_rate": 3.947786196485649e-06, + "loss": 0.825167, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26965332, + "step": 1675, + "time_per_iteration": 2.864818811416626 + }, + { + "auxiliary_loss_clip": 0.01585347, + "auxiliary_loss_mlp": 0.01073077, + "balance_loss_clip": 1.36725056, + "balance_loss_mlp": 1.04691076, + "epoch": 0.1007665714715166, + "flos": 24467991737760.0, + "grad_norm": 1.867682055957892, + "language_loss": 0.8176083, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8441925, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26171875, + "step": 1676, + "time_per_iteration": 2.8091516494750977 + }, + { + "auxiliary_loss_clip": 0.01601455, + "auxiliary_loss_mlp": 0.01065376, + "balance_loss_clip": 1.38088572, + "balance_loss_mlp": 1.03863764, + "epoch": 0.10082669472418458, + "flos": 16803359345880.0, + "grad_norm": 1.89562268332031, + "language_loss": 0.85914332, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88581163, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26733398, + "step": 1677, + "time_per_iteration": 2.7902703285217285 + }, + { + "auxiliary_loss_clip": 0.01593493, + "auxiliary_loss_mlp": 0.01061654, + "balance_loss_clip": 1.3740685, + "balance_loss_mlp": 1.03368711, + "epoch": 0.10088681797685255, + "flos": 13557541948560.0, + "grad_norm": 1.8990245630825995, + "language_loss": 0.86534864, + "learning_rate": 3.947520632403936e-06, + "loss": 0.89190018, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.27978516, + "step": 1678, + "time_per_iteration": 2.695068836212158 + }, + { + "auxiliary_loss_clip": 0.01596686, + "auxiliary_loss_mlp": 0.01070601, + "balance_loss_clip": 1.37927544, + "balance_loss_mlp": 1.04277754, + "epoch": 0.10094694122952051, + "flos": 25271298204120.0, + "grad_norm": 1.8881922914970042, + "language_loss": 0.90582633, + "learning_rate": 3.947431963338532e-06, + "loss": 0.93249923, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27832031, + "step": 1679, + "time_per_iteration": 2.8118011951446533 + }, + { + "auxiliary_loss_clip": 0.01361303, + "auxiliary_loss_mlp": 0.01013662, + "balance_loss_clip": 1.23811698, + "balance_loss_mlp": 1.00498343, + "epoch": 0.10100706448218849, + "flos": 69870867693480.0, + "grad_norm": 0.7727186429538108, + "language_loss": 0.52986246, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55361211, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.08691406, + "step": 1680, + "time_per_iteration": 4.759153842926025 + }, + { + "auxiliary_loss_clip": 0.01585777, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.37013543, + "balance_loss_mlp": 1.03327942, + "epoch": 0.10106718773485646, + "flos": 20011468557960.0, + "grad_norm": 1.5577903400327744, + "language_loss": 0.77194816, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79841292, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.27380371, + "step": 1681, + "time_per_iteration": 2.738539457321167 + }, + { + "auxiliary_loss_clip": 0.01598736, + "auxiliary_loss_mlp": 0.01062635, + "balance_loss_clip": 1.37624097, + "balance_loss_mlp": 1.03316617, + "epoch": 0.10112731098752442, + "flos": 13483749912480.0, + "grad_norm": 2.4982069172782144, + "language_loss": 0.93626034, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96287411, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.29492188, + "step": 1682, + "time_per_iteration": 2.7516307830810547 + }, + { + "auxiliary_loss_clip": 0.01583788, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.36738336, + "balance_loss_mlp": 1.0299921, + "epoch": 0.1011874342401924, + "flos": 18520619724360.0, + "grad_norm": 1.8945739356823812, + "language_loss": 0.87704992, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90344727, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25964355, + "step": 1683, + "time_per_iteration": 4.266565799713135 + }, + { + "auxiliary_loss_clip": 0.01583555, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_clip": 1.37042522, + "balance_loss_mlp": 1.03579617, + "epoch": 0.10124755749286037, + "flos": 20707579289160.0, + "grad_norm": 1.6533170636056087, + "language_loss": 0.74956083, + "learning_rate": 3.946987510376624e-06, + "loss": 0.77602082, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.26647949, + "step": 1684, + "time_per_iteration": 5.760923862457275 + }, + { + "auxiliary_loss_clip": 0.01355649, + "auxiliary_loss_mlp": 0.01012863, + "balance_loss_clip": 1.23322058, + "balance_loss_mlp": 1.00404155, + "epoch": 0.10130768074552833, + "flos": 56125573358760.0, + "grad_norm": 0.7721943217756683, + "language_loss": 0.61084163, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63452673, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.08837891, + "step": 1685, + "time_per_iteration": 3.3786773681640625 + }, + { + "auxiliary_loss_clip": 0.01590193, + "auxiliary_loss_mlp": 0.01064719, + "balance_loss_clip": 1.37336016, + "balance_loss_mlp": 1.03591835, + "epoch": 0.1013678039981963, + "flos": 33409040884560.0, + "grad_norm": 2.035235336490799, + "language_loss": 0.62030303, + "learning_rate": 3.946809212358516e-06, + "loss": 0.64685214, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.28796387, + "step": 1686, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01586992, + "auxiliary_loss_mlp": 0.01071595, + "balance_loss_clip": 1.37295675, + "balance_loss_mlp": 1.04272246, + "epoch": 0.10142792725086427, + "flos": 31911694713360.0, + "grad_norm": 3.214855279917228, + "language_loss": 0.81813794, + "learning_rate": 3.946719952612972e-06, + "loss": 0.84472382, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.28845215, + "step": 1687, + "time_per_iteration": 2.91774845123291 + }, + { + "auxiliary_loss_clip": 0.01598893, + "auxiliary_loss_mlp": 0.01062226, + "balance_loss_clip": 1.38011038, + "balance_loss_mlp": 1.03516567, + "epoch": 0.10148805050353224, + "flos": 28481843917800.0, + "grad_norm": 1.7098034444938062, + "language_loss": 0.72699791, + "learning_rate": 3.94663061904761e-06, + "loss": 0.75360912, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.27075195, + "step": 1688, + "time_per_iteration": 2.7978932857513428 + }, + { + "auxiliary_loss_clip": 0.01586335, + "auxiliary_loss_mlp": 0.01081645, + "balance_loss_clip": 1.37017155, + "balance_loss_mlp": 1.0543108, + "epoch": 0.1015481737562002, + "flos": 25153503687000.0, + "grad_norm": 1.979543668378134, + "language_loss": 0.8675552, + "learning_rate": 3.94654121166582e-06, + "loss": 0.89423496, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.27331543, + "step": 1689, + "time_per_iteration": 2.7830910682678223 + }, + { + "auxiliary_loss_clip": 0.01578093, + "auxiliary_loss_mlp": 0.01072741, + "balance_loss_clip": 1.36281729, + "balance_loss_mlp": 1.04706371, + "epoch": 0.10160829700886818, + "flos": 30888880123320.0, + "grad_norm": 1.8030748269123384, + "language_loss": 0.88646567, + "learning_rate": 3.946451730470993e-06, + "loss": 0.912974, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25683594, + "step": 1690, + "time_per_iteration": 2.883986711502075 + }, + { + "auxiliary_loss_clip": 0.01589304, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.37142193, + "balance_loss_mlp": 1.03792286, + "epoch": 0.10166842026153615, + "flos": 20416837456080.0, + "grad_norm": 2.1835622085670736, + "language_loss": 0.83857632, + "learning_rate": 3.946362175466521e-06, + "loss": 0.86513376, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.28515625, + "step": 1691, + "time_per_iteration": 2.7515242099761963 + }, + { + "auxiliary_loss_clip": 0.01593663, + "auxiliary_loss_mlp": 0.01062069, + "balance_loss_clip": 1.37260091, + "balance_loss_mlp": 1.03577149, + "epoch": 0.10172854351420411, + "flos": 33483888738000.0, + "grad_norm": 1.4613026359540584, + "language_loss": 0.66700435, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69356167, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26281738, + "step": 1692, + "time_per_iteration": 2.9010236263275146 + }, + { + "auxiliary_loss_clip": 0.01582721, + "auxiliary_loss_mlp": 0.01075583, + "balance_loss_clip": 1.36767697, + "balance_loss_mlp": 1.04904747, + "epoch": 0.1017886667668721, + "flos": 23555621551680.0, + "grad_norm": 1.6206843636821182, + "language_loss": 0.76143599, + "learning_rate": 3.94618284404223e-06, + "loss": 0.78801894, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.26525879, + "step": 1693, + "time_per_iteration": 2.774142265319824 + }, + { + "auxiliary_loss_clip": 0.01589062, + "auxiliary_loss_mlp": 0.01069486, + "balance_loss_clip": 1.36978936, + "balance_loss_mlp": 1.03982711, + "epoch": 0.10184879001954006, + "flos": 23301491478120.0, + "grad_norm": 1.8566787420058344, + "language_loss": 0.87721413, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.90379965, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.29626465, + "step": 1694, + "time_per_iteration": 2.812561511993408 + }, + { + "auxiliary_loss_clip": 0.01599072, + "auxiliary_loss_mlp": 0.01078693, + "balance_loss_clip": 1.37666094, + "balance_loss_mlp": 1.04945123, + "epoch": 0.10190891327220802, + "flos": 18337763968560.0, + "grad_norm": 1.8057375741196837, + "language_loss": 0.79977036, + "learning_rate": 3.946003217420147e-06, + "loss": 0.82654804, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.29223633, + "step": 1695, + "time_per_iteration": 2.7257156372070312 + }, + { + "auxiliary_loss_clip": 0.01590683, + "auxiliary_loss_mlp": 0.01078825, + "balance_loss_clip": 1.37220168, + "balance_loss_mlp": 1.05023861, + "epoch": 0.10196903652487599, + "flos": 26470552429080.0, + "grad_norm": 1.6440135394237034, + "language_loss": 0.8677001, + "learning_rate": 3.945913293418447e-06, + "loss": 0.89439517, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.28588867, + "step": 1696, + "time_per_iteration": 2.8044281005859375 + }, + { + "auxiliary_loss_clip": 0.01574185, + "auxiliary_loss_mlp": 0.01068219, + "balance_loss_clip": 1.36164463, + "balance_loss_mlp": 1.04192173, + "epoch": 0.10202915977754397, + "flos": 21874282590600.0, + "grad_norm": 1.8775882428409751, + "language_loss": 0.8238821, + "learning_rate": 3.945823295627519e-06, + "loss": 0.85030615, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.26293945, + "step": 1697, + "time_per_iteration": 2.7776763439178467 + }, + { + "auxiliary_loss_clip": 0.01590089, + "auxiliary_loss_mlp": 0.01065956, + "balance_loss_clip": 1.37180841, + "balance_loss_mlp": 1.03813267, + "epoch": 0.10208928303021193, + "flos": 22314882564000.0, + "grad_norm": 1.8055112330114356, + "language_loss": 0.81079113, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83735156, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.27807617, + "step": 1698, + "time_per_iteration": 2.811981439590454 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01064578, + "balance_loss_clip": 1.37200606, + "balance_loss_mlp": 1.03766036, + "epoch": 0.1021494062828799, + "flos": 22130443082160.0, + "grad_norm": 2.447435973278374, + "language_loss": 0.7625972, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78914267, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2689209, + "step": 1699, + "time_per_iteration": 2.8266191482543945 + }, + { + "auxiliary_loss_clip": 0.01585728, + "auxiliary_loss_mlp": 0.01059231, + "balance_loss_clip": 1.36939979, + "balance_loss_mlp": 1.03206277, + "epoch": 0.10220952953554788, + "flos": 19651564041840.0, + "grad_norm": 1.5894956025430802, + "language_loss": 0.8032459, + "learning_rate": 3.945552859553516e-06, + "loss": 0.82969552, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.27160645, + "step": 1700, + "time_per_iteration": 2.8032350540161133 + }, + { + "auxiliary_loss_clip": 0.01581446, + "auxiliary_loss_mlp": 0.01065852, + "balance_loss_clip": 1.36513829, + "balance_loss_mlp": 1.03911376, + "epoch": 0.10226965278821584, + "flos": 29793126272760.0, + "grad_norm": 1.824813423452594, + "language_loss": 0.76786929, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79434228, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26721191, + "step": 1701, + "time_per_iteration": 2.8523194789886475 + }, + { + "auxiliary_loss_clip": 0.01587987, + "auxiliary_loss_mlp": 0.01056441, + "balance_loss_clip": 1.36899555, + "balance_loss_mlp": 1.029917, + "epoch": 0.10232977604088381, + "flos": 27022612015440.0, + "grad_norm": 1.7864438851381488, + "language_loss": 0.78138006, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80782431, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26538086, + "step": 1702, + "time_per_iteration": 2.8414318561553955 + }, + { + "auxiliary_loss_clip": 0.01580163, + "auxiliary_loss_mlp": 0.01058327, + "balance_loss_clip": 1.3655808, + "balance_loss_mlp": 1.03192222, + "epoch": 0.10238989929355179, + "flos": 20782467750960.0, + "grad_norm": 2.078467896195053, + "language_loss": 0.95133501, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97771996, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.26428223, + "step": 1703, + "time_per_iteration": 2.764190435409546 + }, + { + "auxiliary_loss_clip": 0.01346327, + "auxiliary_loss_mlp": 0.01011401, + "balance_loss_clip": 1.22157192, + "balance_loss_mlp": 1.0039624, + "epoch": 0.10245002254621975, + "flos": 57712491236880.0, + "grad_norm": 0.8862938840886134, + "language_loss": 0.55187178, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57544899, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.07421875, + "step": 1704, + "time_per_iteration": 3.1781420707702637 + }, + { + "auxiliary_loss_clip": 0.01583238, + "auxiliary_loss_mlp": 0.01063419, + "balance_loss_clip": 1.36765099, + "balance_loss_mlp": 1.03578663, + "epoch": 0.10251014579888772, + "flos": 16804496379960.0, + "grad_norm": 1.966424390819959, + "language_loss": 0.84318483, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86965144, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.27636719, + "step": 1705, + "time_per_iteration": 2.7904748916625977 + }, + { + "auxiliary_loss_clip": 0.01344628, + "auxiliary_loss_mlp": 0.01010322, + "balance_loss_clip": 1.21976376, + "balance_loss_mlp": 1.00288379, + "epoch": 0.1025702690515557, + "flos": 68580359837640.0, + "grad_norm": 0.7642162218842066, + "language_loss": 0.60445571, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62800527, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.07421875, + "step": 1706, + "time_per_iteration": 3.2993626594543457 + }, + { + "auxiliary_loss_clip": 0.01586733, + "auxiliary_loss_mlp": 0.01063402, + "balance_loss_clip": 1.36796737, + "balance_loss_mlp": 1.03409994, + "epoch": 0.10263039230422366, + "flos": 14870773504800.0, + "grad_norm": 2.1125034697724585, + "language_loss": 0.86943692, + "learning_rate": 3.94491926006294e-06, + "loss": 0.89593828, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.29296875, + "step": 1707, + "time_per_iteration": 2.7383663654327393 + }, + { + "auxiliary_loss_clip": 0.01575229, + "auxiliary_loss_mlp": 0.01068352, + "balance_loss_clip": 1.35988116, + "balance_loss_mlp": 1.03826344, + "epoch": 0.10269051555689163, + "flos": 25343059822200.0, + "grad_norm": 1.4790560578752587, + "language_loss": 0.73073465, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75717044, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.30102539, + "step": 1708, + "time_per_iteration": 2.8390369415283203 + }, + { + "auxiliary_loss_clip": 0.01579473, + "auxiliary_loss_mlp": 0.01062479, + "balance_loss_clip": 1.36613941, + "balance_loss_mlp": 1.03403604, + "epoch": 0.10275063880955959, + "flos": 21073737492720.0, + "grad_norm": 1.8376859128147802, + "language_loss": 0.9122799, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93869936, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.28430176, + "step": 1709, + "time_per_iteration": 2.7892565727233887 + }, + { + "auxiliary_loss_clip": 0.01572665, + "auxiliary_loss_mlp": 0.01063971, + "balance_loss_clip": 1.35880852, + "balance_loss_mlp": 1.03644562, + "epoch": 0.10281076206222757, + "flos": 30372132828960.0, + "grad_norm": 2.4849483138567687, + "language_loss": 0.88316554, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90953183, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.27539062, + "step": 1710, + "time_per_iteration": 2.8240199089050293 + }, + { + "auxiliary_loss_clip": 0.01567628, + "auxiliary_loss_mlp": 0.01065205, + "balance_loss_clip": 1.35372519, + "balance_loss_mlp": 1.03791785, + "epoch": 0.10287088531489554, + "flos": 22423337158320.0, + "grad_norm": 1.7166293402620365, + "language_loss": 0.794438, + "learning_rate": 3.944555580601908e-06, + "loss": 0.82076633, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.27258301, + "step": 1711, + "time_per_iteration": 2.792659044265747 + }, + { + "auxiliary_loss_clip": 0.01578776, + "auxiliary_loss_mlp": 0.01062161, + "balance_loss_clip": 1.36275244, + "balance_loss_mlp": 1.03350294, + "epoch": 0.1029310085675635, + "flos": 25121074588560.0, + "grad_norm": 2.5798184404531463, + "language_loss": 0.74263084, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76904023, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.28662109, + "step": 1712, + "time_per_iteration": 2.7875845432281494 + }, + { + "auxiliary_loss_clip": 0.01571778, + "auxiliary_loss_mlp": 0.01060504, + "balance_loss_clip": 1.36374366, + "balance_loss_mlp": 1.03386092, + "epoch": 0.10299113182023148, + "flos": 19870503648480.0, + "grad_norm": 1.7016022023366297, + "language_loss": 0.87103248, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89735532, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.26635742, + "step": 1713, + "time_per_iteration": 2.854759931564331 + }, + { + "auxiliary_loss_clip": 0.01571776, + "auxiliary_loss_mlp": 0.01059896, + "balance_loss_clip": 1.36040211, + "balance_loss_mlp": 1.0341351, + "epoch": 0.10305125507289944, + "flos": 20452312181520.0, + "grad_norm": 1.5905200192914135, + "language_loss": 0.72290456, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74922127, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.2578125, + "step": 1714, + "time_per_iteration": 2.7696316242218018 + }, + { + "auxiliary_loss_clip": 0.0158119, + "auxiliary_loss_mlp": 0.01067482, + "balance_loss_clip": 1.36514771, + "balance_loss_mlp": 1.03987312, + "epoch": 0.10311137832556741, + "flos": 26256485825640.0, + "grad_norm": 1.9499593888280427, + "language_loss": 0.90973634, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93622303, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.27612305, + "step": 1715, + "time_per_iteration": 2.813095808029175 + }, + { + "auxiliary_loss_clip": 0.01568724, + "auxiliary_loss_mlp": 0.01062266, + "balance_loss_clip": 1.3572681, + "balance_loss_mlp": 1.03628993, + "epoch": 0.10317150157823539, + "flos": 35305786524960.0, + "grad_norm": 1.8024400560345688, + "language_loss": 0.75789237, + "learning_rate": 3.944099322202418e-06, + "loss": 0.78420234, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.26000977, + "step": 1716, + "time_per_iteration": 2.8886260986328125 + }, + { + "auxiliary_loss_clip": 0.01578126, + "auxiliary_loss_mlp": 0.01071864, + "balance_loss_clip": 1.36238849, + "balance_loss_mlp": 1.04524469, + "epoch": 0.10323162483090335, + "flos": 25745586135120.0, + "grad_norm": 1.8426486070817865, + "language_loss": 0.85778427, + "learning_rate": 3.944007849347342e-06, + "loss": 0.88428426, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26599121, + "step": 1717, + "time_per_iteration": 2.8285253047943115 + }, + { + "auxiliary_loss_clip": 0.01572435, + "auxiliary_loss_mlp": 0.01063006, + "balance_loss_clip": 1.357687, + "balance_loss_mlp": 1.03695869, + "epoch": 0.10329174808357132, + "flos": 16294246423200.0, + "grad_norm": 1.8595266522408043, + "language_loss": 0.83363211, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85998648, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26037598, + "step": 1718, + "time_per_iteration": 2.736605644226074 + }, + { + "auxiliary_loss_clip": 0.01573379, + "auxiliary_loss_mlp": 0.01065127, + "balance_loss_clip": 1.36177373, + "balance_loss_mlp": 1.03730392, + "epoch": 0.10335187133623928, + "flos": 36694434451680.0, + "grad_norm": 1.6446752466300387, + "language_loss": 0.73352444, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75990951, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.27844238, + "step": 1719, + "time_per_iteration": 4.421666383743286 + }, + { + "auxiliary_loss_clip": 0.01565932, + "auxiliary_loss_mlp": 0.01065488, + "balance_loss_clip": 1.35384464, + "balance_loss_mlp": 1.03940511, + "epoch": 0.10341199458890726, + "flos": 14979756007800.0, + "grad_norm": 1.7542621582834252, + "language_loss": 0.93120533, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.95751953, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.26074219, + "step": 1720, + "time_per_iteration": 2.779062032699585 + }, + { + "auxiliary_loss_clip": 0.01571739, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_clip": 1.35773301, + "balance_loss_mlp": 1.0396173, + "epoch": 0.10347211784157523, + "flos": 21036394782720.0, + "grad_norm": 1.7016318084635527, + "language_loss": 0.7947064, + "learning_rate": 3.943641220792039e-06, + "loss": 0.82112038, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.30029297, + "step": 1721, + "time_per_iteration": 2.7775492668151855 + }, + { + "auxiliary_loss_clip": 0.01585762, + "auxiliary_loss_mlp": 0.01075571, + "balance_loss_clip": 1.36841381, + "balance_loss_mlp": 1.04595959, + "epoch": 0.1035322410942432, + "flos": 19796874045840.0, + "grad_norm": 1.74291026674143, + "language_loss": 0.80782747, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.83444083, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.29589844, + "step": 1722, + "time_per_iteration": 5.823894500732422 + }, + { + "auxiliary_loss_clip": 0.01352139, + "auxiliary_loss_mlp": 0.01018984, + "balance_loss_clip": 1.23103309, + "balance_loss_mlp": 1.01335764, + "epoch": 0.10359236434691117, + "flos": 52712273792880.0, + "grad_norm": 0.9545517023137868, + "language_loss": 0.67262542, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69633663, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.05615234, + "step": 1723, + "time_per_iteration": 3.125263214111328 + }, + { + "auxiliary_loss_clip": 0.0157875, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_clip": 1.36254883, + "balance_loss_mlp": 1.04575765, + "epoch": 0.10365248759957914, + "flos": 18409972278600.0, + "grad_norm": 3.330721529455615, + "language_loss": 0.77806699, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.8045879, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.27563477, + "step": 1724, + "time_per_iteration": 4.20580267906189 + }, + { + "auxiliary_loss_clip": 0.01588048, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_clip": 1.36947632, + "balance_loss_mlp": 1.03804398, + "epoch": 0.1037126108522471, + "flos": 47561409668520.0, + "grad_norm": 1.6461936612701247, + "language_loss": 0.75379127, + "learning_rate": 3.943273412987676e-06, + "loss": 0.78032148, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26928711, + "step": 1725, + "time_per_iteration": 2.9891529083251953 + }, + { + "auxiliary_loss_clip": 0.01578443, + "auxiliary_loss_mlp": 0.01063825, + "balance_loss_clip": 1.3679564, + "balance_loss_mlp": 1.03758681, + "epoch": 0.10377273410491508, + "flos": 22821355943280.0, + "grad_norm": 1.8722374172462402, + "language_loss": 0.75203931, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77846205, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.26257324, + "step": 1726, + "time_per_iteration": 2.7464599609375 + }, + { + "auxiliary_loss_clip": 0.0158718, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_clip": 1.37261021, + "balance_loss_mlp": 1.04647171, + "epoch": 0.10383285735758305, + "flos": 26144051612040.0, + "grad_norm": 1.964335342419605, + "language_loss": 0.74438071, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.77099454, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.27758789, + "step": 1727, + "time_per_iteration": 2.809248208999634 + }, + { + "auxiliary_loss_clip": 0.01576597, + "auxiliary_loss_mlp": 0.01060453, + "balance_loss_clip": 1.36448741, + "balance_loss_mlp": 1.03370225, + "epoch": 0.10389298061025101, + "flos": 17096172205320.0, + "grad_norm": 2.0422925022149796, + "language_loss": 0.84541291, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87178338, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.26721191, + "step": 1728, + "time_per_iteration": 2.7190022468566895 + }, + { + "auxiliary_loss_clip": 0.01574595, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_clip": 1.36348009, + "balance_loss_mlp": 1.0255028, + "epoch": 0.10395310386291898, + "flos": 20781330716880.0, + "grad_norm": 2.4212773981452482, + "language_loss": 0.70939636, + "learning_rate": 3.942904426157406e-06, + "loss": 0.735654, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.25671387, + "step": 1729, + "time_per_iteration": 2.938080310821533 + }, + { + "auxiliary_loss_clip": 0.01584564, + "auxiliary_loss_mlp": 0.01070652, + "balance_loss_clip": 1.37181807, + "balance_loss_mlp": 1.04198217, + "epoch": 0.10401322711558696, + "flos": 12824047899000.0, + "grad_norm": 2.436581108064968, + "language_loss": 0.81111836, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83767056, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.28674316, + "step": 1730, + "time_per_iteration": 2.740882396697998 + }, + { + "auxiliary_loss_clip": 0.01575292, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.36678243, + "balance_loss_mlp": 1.03059053, + "epoch": 0.10407335036825492, + "flos": 23189828823360.0, + "grad_norm": 2.2435940032149717, + "language_loss": 0.75995219, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78625703, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24621582, + "step": 1731, + "time_per_iteration": 2.784618854522705 + }, + { + "auxiliary_loss_clip": 0.01568531, + "auxiliary_loss_mlp": 0.01056734, + "balance_loss_clip": 1.36034238, + "balance_loss_mlp": 1.03172374, + "epoch": 0.10413347362092289, + "flos": 26109795137400.0, + "grad_norm": 1.8642541577756586, + "language_loss": 0.83310449, + "learning_rate": 3.9426269124336e-06, + "loss": 0.85935712, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.25024414, + "step": 1732, + "time_per_iteration": 2.7808430194854736 + }, + { + "auxiliary_loss_clip": 0.0158517, + "auxiliary_loss_mlp": 0.01060918, + "balance_loss_clip": 1.37399268, + "balance_loss_mlp": 1.03593159, + "epoch": 0.10419359687359087, + "flos": 12644928112320.0, + "grad_norm": 2.0693571041346757, + "language_loss": 0.84021795, + "learning_rate": 3.942534260525104e-06, + "loss": 0.86667883, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.25, + "step": 1733, + "time_per_iteration": 2.789777994155884 + }, + { + "auxiliary_loss_clip": 0.01587623, + "auxiliary_loss_mlp": 0.01057733, + "balance_loss_clip": 1.37371063, + "balance_loss_mlp": 1.03129244, + "epoch": 0.10425372012625883, + "flos": 12128221426320.0, + "grad_norm": 2.089095223275057, + "language_loss": 0.76831496, + "learning_rate": 3.942441534955514e-06, + "loss": 0.79476851, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.2644043, + "step": 1734, + "time_per_iteration": 2.8746116161346436 + }, + { + "auxiliary_loss_clip": 0.0157098, + "auxiliary_loss_mlp": 0.01053125, + "balance_loss_clip": 1.36379099, + "balance_loss_mlp": 1.02797174, + "epoch": 0.1043138433789268, + "flos": 25343019213840.0, + "grad_norm": 1.7622033315530998, + "language_loss": 0.75043327, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77667433, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.25158691, + "step": 1735, + "time_per_iteration": 2.8282034397125244 + }, + { + "auxiliary_loss_clip": 0.01584897, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.37263083, + "balance_loss_mlp": 1.03380084, + "epoch": 0.10437396663159478, + "flos": 29172919212360.0, + "grad_norm": 1.6499504205375812, + "language_loss": 0.79579258, + "learning_rate": 3.94225586284712e-06, + "loss": 0.82225096, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.27185059, + "step": 1736, + "time_per_iteration": 2.8736941814422607 + }, + { + "auxiliary_loss_clip": 0.01574931, + "auxiliary_loss_mlp": 0.01064272, + "balance_loss_clip": 1.3681128, + "balance_loss_mlp": 1.0382961, + "epoch": 0.10443408988426274, + "flos": 25086208988520.0, + "grad_norm": 1.682953441336372, + "language_loss": 0.70994437, + "learning_rate": 3.942162916315356e-06, + "loss": 0.73633647, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.2598877, + "step": 1737, + "time_per_iteration": 2.8141942024230957 + }, + { + "auxiliary_loss_clip": 0.01588743, + "auxiliary_loss_mlp": 0.01064901, + "balance_loss_clip": 1.37411618, + "balance_loss_mlp": 1.03490806, + "epoch": 0.1044942131369307, + "flos": 26765192664720.0, + "grad_norm": 1.8141783786028758, + "language_loss": 0.81781292, + "learning_rate": 3.942069896136581e-06, + "loss": 0.84434938, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.29980469, + "step": 1738, + "time_per_iteration": 2.84773325920105 + }, + { + "auxiliary_loss_clip": 0.01589879, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.3764559, + "balance_loss_mlp": 1.03655362, + "epoch": 0.10455433638959867, + "flos": 18447233771880.0, + "grad_norm": 2.8038295715604846, + "language_loss": 0.75274897, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77929056, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.27734375, + "step": 1739, + "time_per_iteration": 2.735412120819092 + }, + { + "auxiliary_loss_clip": 0.01576906, + "auxiliary_loss_mlp": 0.01066345, + "balance_loss_clip": 1.36737347, + "balance_loss_mlp": 1.0379138, + "epoch": 0.10461445964226665, + "flos": 23223841647840.0, + "grad_norm": 1.6799811500616169, + "language_loss": 0.77733934, + "learning_rate": 3.941883634852104e-06, + "loss": 0.80377179, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.28417969, + "step": 1740, + "time_per_iteration": 2.8093421459198 + }, + { + "auxiliary_loss_clip": 0.01586972, + "auxiliary_loss_mlp": 0.01059434, + "balance_loss_clip": 1.37754643, + "balance_loss_mlp": 1.03326774, + "epoch": 0.10467458289493461, + "flos": 24349628703600.0, + "grad_norm": 1.9234728712705922, + "language_loss": 0.86611396, + "learning_rate": 3.941790393753467e-06, + "loss": 0.89257801, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.26184082, + "step": 1741, + "time_per_iteration": 2.818331718444824 + }, + { + "auxiliary_loss_clip": 0.01595407, + "auxiliary_loss_mlp": 0.0105825, + "balance_loss_clip": 1.38109946, + "balance_loss_mlp": 1.0306654, + "epoch": 0.10473470614760258, + "flos": 21292920749520.0, + "grad_norm": 3.257490480006389, + "language_loss": 0.75996387, + "learning_rate": 3.941697079021942e-06, + "loss": 0.78650045, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.27600098, + "step": 1742, + "time_per_iteration": 2.7539525032043457 + }, + { + "auxiliary_loss_clip": 0.01578307, + "auxiliary_loss_mlp": 0.01070406, + "balance_loss_clip": 1.37166643, + "balance_loss_mlp": 1.04667115, + "epoch": 0.10479482940027056, + "flos": 21691954743480.0, + "grad_norm": 1.9113382433344999, + "language_loss": 0.87504309, + "learning_rate": 3.94160369066107e-06, + "loss": 0.90153027, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23742676, + "step": 1743, + "time_per_iteration": 2.7858190536499023 + }, + { + "auxiliary_loss_clip": 0.0157101, + "auxiliary_loss_mlp": 0.01058565, + "balance_loss_clip": 1.36332834, + "balance_loss_mlp": 1.03196955, + "epoch": 0.10485495265293852, + "flos": 21577896195480.0, + "grad_norm": 1.9683996182852312, + "language_loss": 0.75886822, + "learning_rate": 3.941510228674391e-06, + "loss": 0.785164, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.26611328, + "step": 1744, + "time_per_iteration": 2.772934913635254 + }, + { + "auxiliary_loss_clip": 0.01574408, + "auxiliary_loss_mlp": 0.01063345, + "balance_loss_clip": 1.36587834, + "balance_loss_mlp": 1.03754854, + "epoch": 0.10491507590560649, + "flos": 37969064438760.0, + "grad_norm": 1.8817920592024688, + "language_loss": 0.8016355, + "learning_rate": 3.941416693065451e-06, + "loss": 0.82801306, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.25793457, + "step": 1745, + "time_per_iteration": 2.940800189971924 + }, + { + "auxiliary_loss_clip": 0.01573821, + "auxiliary_loss_mlp": 0.01068867, + "balance_loss_clip": 1.36528432, + "balance_loss_mlp": 1.04338026, + "epoch": 0.10497519915827447, + "flos": 26401958263080.0, + "grad_norm": 1.9658714373487203, + "language_loss": 0.83541828, + "learning_rate": 3.941323083837794e-06, + "loss": 0.86184514, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.25500488, + "step": 1746, + "time_per_iteration": 2.882267475128174 + }, + { + "auxiliary_loss_clip": 0.01579279, + "auxiliary_loss_mlp": 0.0106393, + "balance_loss_clip": 1.37090492, + "balance_loss_mlp": 1.03863394, + "epoch": 0.10503532241094243, + "flos": 40669725670920.0, + "grad_norm": 1.588471555883731, + "language_loss": 0.70657134, + "learning_rate": 3.941229400994971e-06, + "loss": 0.73300344, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.25305176, + "step": 1747, + "time_per_iteration": 2.9378037452697754 + }, + { + "auxiliary_loss_clip": 0.01591817, + "auxiliary_loss_mlp": 0.01063061, + "balance_loss_clip": 1.37608874, + "balance_loss_mlp": 1.03522587, + "epoch": 0.1050954456636104, + "flos": 29795522166000.0, + "grad_norm": 2.3916225550386323, + "language_loss": 0.85044539, + "learning_rate": 3.941135644540535e-06, + "loss": 0.87699419, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.27832031, + "step": 1748, + "time_per_iteration": 2.8196496963500977 + }, + { + "auxiliary_loss_clip": 0.01566039, + "auxiliary_loss_mlp": 0.01052786, + "balance_loss_clip": 1.35764647, + "balance_loss_mlp": 1.02658343, + "epoch": 0.10515556891627838, + "flos": 23953721553360.0, + "grad_norm": 1.6475100445856041, + "language_loss": 0.72301239, + "learning_rate": 3.941041814478041e-06, + "loss": 0.74920064, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.26196289, + "step": 1749, + "time_per_iteration": 2.758483409881592 + }, + { + "auxiliary_loss_clip": 0.01566666, + "auxiliary_loss_mlp": 0.01061139, + "balance_loss_clip": 1.360641, + "balance_loss_mlp": 1.03529501, + "epoch": 0.10521569216894634, + "flos": 18264174974280.0, + "grad_norm": 1.8791676616034405, + "language_loss": 0.82071328, + "learning_rate": 3.940947910811047e-06, + "loss": 0.8469913, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.25830078, + "step": 1750, + "time_per_iteration": 2.7905333042144775 + }, + { + "auxiliary_loss_clip": 0.01578287, + "auxiliary_loss_mlp": 0.01067408, + "balance_loss_clip": 1.36666036, + "balance_loss_mlp": 1.03839278, + "epoch": 0.10527581542161431, + "flos": 15634909884960.0, + "grad_norm": 2.672619733214924, + "language_loss": 0.92813057, + "learning_rate": 3.940853933543114e-06, + "loss": 0.95458752, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.28979492, + "step": 1751, + "time_per_iteration": 2.7062487602233887 + }, + { + "auxiliary_loss_clip": 0.01575994, + "auxiliary_loss_mlp": 0.01067687, + "balance_loss_clip": 1.36932766, + "balance_loss_mlp": 1.0423553, + "epoch": 0.10533593867428227, + "flos": 18301192817400.0, + "grad_norm": 3.79585175974411, + "language_loss": 0.78981441, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81625122, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.25354004, + "step": 1752, + "time_per_iteration": 2.7985379695892334 + }, + { + "auxiliary_loss_clip": 0.01569761, + "auxiliary_loss_mlp": 0.01060915, + "balance_loss_clip": 1.36419106, + "balance_loss_mlp": 1.03470087, + "epoch": 0.10539606192695025, + "flos": 29029233542760.0, + "grad_norm": 1.8411080663184995, + "language_loss": 0.76285273, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78915954, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.2623291, + "step": 1753, + "time_per_iteration": 2.8130924701690674 + }, + { + "auxiliary_loss_clip": 0.015757, + "auxiliary_loss_mlp": 0.01065374, + "balance_loss_clip": 1.3619293, + "balance_loss_mlp": 1.03869545, + "epoch": 0.10545618517961822, + "flos": 19973435505840.0, + "grad_norm": 2.337206718031489, + "language_loss": 0.84365362, + "learning_rate": 3.940571560169328e-06, + "loss": 0.87006438, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.26672363, + "step": 1754, + "time_per_iteration": 2.7738196849823 + }, + { + "auxiliary_loss_clip": 0.0157835, + "auxiliary_loss_mlp": 0.01063277, + "balance_loss_clip": 1.36931944, + "balance_loss_mlp": 1.03548896, + "epoch": 0.10551630843228618, + "flos": 16147718168400.0, + "grad_norm": 2.7697952378770316, + "language_loss": 0.70336044, + "learning_rate": 3.940477288533302e-06, + "loss": 0.72977668, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.2779541, + "step": 1755, + "time_per_iteration": 2.717212200164795 + }, + { + "auxiliary_loss_clip": 0.01576526, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_clip": 1.36232662, + "balance_loss_mlp": 1.05360913, + "epoch": 0.10557643168495416, + "flos": 23445298972800.0, + "grad_norm": 2.014320553016333, + "language_loss": 0.76590931, + "learning_rate": 3.940382943314182e-06, + "loss": 0.79247272, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26208496, + "step": 1756, + "time_per_iteration": 2.80759859085083 + }, + { + "auxiliary_loss_clip": 0.01574171, + "auxiliary_loss_mlp": 0.01076583, + "balance_loss_clip": 1.36229241, + "balance_loss_mlp": 1.05032134, + "epoch": 0.10563655493762213, + "flos": 21804104698560.0, + "grad_norm": 1.5580005814825215, + "language_loss": 0.79815394, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82466149, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26281738, + "step": 1757, + "time_per_iteration": 2.792797803878784 + }, + { + "auxiliary_loss_clip": 0.01561491, + "auxiliary_loss_mlp": 0.01066618, + "balance_loss_clip": 1.35287213, + "balance_loss_mlp": 1.04200089, + "epoch": 0.10569667819029009, + "flos": 53810609597280.0, + "grad_norm": 1.508443664940626, + "language_loss": 0.79332614, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81960726, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.24621582, + "step": 1758, + "time_per_iteration": 3.1625516414642334 + }, + { + "auxiliary_loss_clip": 0.01575414, + "auxiliary_loss_mlp": 0.0106517, + "balance_loss_clip": 1.36279225, + "balance_loss_mlp": 1.03844297, + "epoch": 0.10575680144295807, + "flos": 22930135404480.0, + "grad_norm": 1.8546802863892606, + "language_loss": 0.9212538, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94765961, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.26745605, + "step": 1759, + "time_per_iteration": 4.249853134155273 + }, + { + "auxiliary_loss_clip": 0.01565181, + "auxiliary_loss_mlp": 0.01061818, + "balance_loss_clip": 1.35287547, + "balance_loss_mlp": 1.03424501, + "epoch": 0.10581692469562604, + "flos": 14140690557480.0, + "grad_norm": 1.969056605072553, + "language_loss": 0.77481395, + "learning_rate": 3.940004826678365e-06, + "loss": 0.80108392, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.27563477, + "step": 1760, + "time_per_iteration": 4.374704122543335 + }, + { + "auxiliary_loss_clip": 0.01563987, + "auxiliary_loss_mlp": 0.01065474, + "balance_loss_clip": 1.3511641, + "balance_loss_mlp": 1.03802037, + "epoch": 0.105877047948294, + "flos": 25964322699960.0, + "grad_norm": 2.1735858890162283, + "language_loss": 0.8945598, + "learning_rate": 3.939910113597498e-06, + "loss": 0.92085439, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.27441406, + "step": 1761, + "time_per_iteration": 2.776671886444092 + }, + { + "auxiliary_loss_clip": 0.01564956, + "auxiliary_loss_mlp": 0.01065953, + "balance_loss_clip": 1.35534143, + "balance_loss_mlp": 1.03972745, + "epoch": 0.10593717120096197, + "flos": 30670955725680.0, + "grad_norm": 2.2049790890654184, + "language_loss": 0.78340673, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80971587, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.2623291, + "step": 1762, + "time_per_iteration": 4.280065536499023 + }, + { + "auxiliary_loss_clip": 0.01340244, + "auxiliary_loss_mlp": 0.0101677, + "balance_loss_clip": 1.21936941, + "balance_loss_mlp": 1.00813949, + "epoch": 0.10599729445362994, + "flos": 66454377048360.0, + "grad_norm": 0.8253890273721628, + "language_loss": 0.60530084, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62887102, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.08642578, + "step": 1763, + "time_per_iteration": 3.404726505279541 + }, + { + "auxiliary_loss_clip": 0.01555076, + "auxiliary_loss_mlp": 0.01060641, + "balance_loss_clip": 1.34359574, + "balance_loss_mlp": 1.03423619, + "epoch": 0.10605741770629791, + "flos": 23953071819600.0, + "grad_norm": 1.511194667089071, + "language_loss": 0.80367124, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82982838, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26403809, + "step": 1764, + "time_per_iteration": 2.7904326915740967 + }, + { + "auxiliary_loss_clip": 0.01555737, + "auxiliary_loss_mlp": 0.01060879, + "balance_loss_clip": 1.3469888, + "balance_loss_mlp": 1.03518927, + "epoch": 0.10611754095896588, + "flos": 19391830014600.0, + "grad_norm": 1.714179349002624, + "language_loss": 0.80082572, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82699192, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.25683594, + "step": 1765, + "time_per_iteration": 2.8153693675994873 + }, + { + "auxiliary_loss_clip": 0.01548509, + "auxiliary_loss_mlp": 0.01061863, + "balance_loss_clip": 1.3419745, + "balance_loss_mlp": 1.03643608, + "epoch": 0.10617766421163385, + "flos": 22242999120840.0, + "grad_norm": 1.6879593723159405, + "language_loss": 0.77185136, + "learning_rate": 3.939435444841306e-06, + "loss": 0.7979551, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.25427246, + "step": 1766, + "time_per_iteration": 2.788763999938965 + }, + { + "auxiliary_loss_clip": 0.01560343, + "auxiliary_loss_mlp": 0.01064213, + "balance_loss_clip": 1.35099292, + "balance_loss_mlp": 1.03700948, + "epoch": 0.10623778746430182, + "flos": 28410528991680.0, + "grad_norm": 1.5917784480241113, + "language_loss": 0.77628922, + "learning_rate": 3.939340290444895e-06, + "loss": 0.80253482, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.27197266, + "step": 1767, + "time_per_iteration": 2.9183881282806396 + }, + { + "auxiliary_loss_clip": 0.01338071, + "auxiliary_loss_mlp": 0.01009987, + "balance_loss_clip": 1.21455467, + "balance_loss_mlp": 1.0010705, + "epoch": 0.10629791071696978, + "flos": 64250361972360.0, + "grad_norm": 0.715538010292982, + "language_loss": 0.57944882, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60292947, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.08935547, + "step": 1768, + "time_per_iteration": 3.3708314895629883 + }, + { + "auxiliary_loss_clip": 0.01557758, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_clip": 1.34780288, + "balance_loss_mlp": 1.02572203, + "epoch": 0.10635803396963776, + "flos": 22752761777280.0, + "grad_norm": 1.3783375638008706, + "language_loss": 0.86548615, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89156258, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24182129, + "step": 1769, + "time_per_iteration": 2.797881841659546 + }, + { + "auxiliary_loss_clip": 0.01562715, + "auxiliary_loss_mlp": 0.01062532, + "balance_loss_clip": 1.35081625, + "balance_loss_mlp": 1.03642523, + "epoch": 0.10641815722230573, + "flos": 31402013273640.0, + "grad_norm": 1.7567213884817503, + "language_loss": 0.61882532, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64507782, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2611084, + "step": 1770, + "time_per_iteration": 2.8983945846557617 + }, + { + "auxiliary_loss_clip": 0.01338588, + "auxiliary_loss_mlp": 0.01009639, + "balance_loss_clip": 1.2147305, + "balance_loss_mlp": 1.00162816, + "epoch": 0.1064782804749737, + "flos": 58564006472520.0, + "grad_norm": 0.9029603515264776, + "language_loss": 0.57101393, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59449619, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.08007812, + "step": 1771, + "time_per_iteration": 3.126342535018921 + }, + { + "auxiliary_loss_clip": 0.01561682, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_clip": 1.35290051, + "balance_loss_mlp": 1.04579902, + "epoch": 0.10653840372764166, + "flos": 23993135289720.0, + "grad_norm": 1.5557248317355992, + "language_loss": 0.88595128, + "learning_rate": 3.938863415435429e-06, + "loss": 0.91228431, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.25805664, + "step": 1772, + "time_per_iteration": 2.772942066192627 + }, + { + "auxiliary_loss_clip": 0.01562377, + "auxiliary_loss_mlp": 0.01064802, + "balance_loss_clip": 1.34774947, + "balance_loss_mlp": 1.03806329, + "epoch": 0.10659852698030964, + "flos": 18299040574320.0, + "grad_norm": 2.48239255010471, + "language_loss": 0.76761103, + "learning_rate": 3.93876781985337e-06, + "loss": 0.79388279, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.26733398, + "step": 1773, + "time_per_iteration": 2.7567880153656006 + }, + { + "auxiliary_loss_clip": 0.0156094, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_clip": 1.352126, + "balance_loss_mlp": 1.04172826, + "epoch": 0.1066586502329776, + "flos": 32166961821000.0, + "grad_norm": 2.04401837939584, + "language_loss": 0.83912909, + "learning_rate": 3.938672150753041e-06, + "loss": 0.86541605, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.26025391, + "step": 1774, + "time_per_iteration": 2.88228702545166 + }, + { + "auxiliary_loss_clip": 0.01561048, + "auxiliary_loss_mlp": 0.01066473, + "balance_loss_clip": 1.34903157, + "balance_loss_mlp": 1.04070044, + "epoch": 0.10671877348564557, + "flos": 17789643393120.0, + "grad_norm": 2.505608383094685, + "language_loss": 0.77116817, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.79744339, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.25756836, + "step": 1775, + "time_per_iteration": 2.764728307723999 + }, + { + "auxiliary_loss_clip": 0.01330295, + "auxiliary_loss_mlp": 0.01019534, + "balance_loss_clip": 1.20570827, + "balance_loss_mlp": 1.01157045, + "epoch": 0.10677889673831355, + "flos": 63526492104120.0, + "grad_norm": 0.8414091755014055, + "language_loss": 0.57493621, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59843451, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.07958984, + "step": 1776, + "time_per_iteration": 3.3120224475860596 + }, + { + "auxiliary_loss_clip": 0.01550312, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_clip": 1.34102762, + "balance_loss_mlp": 1.0423336, + "epoch": 0.10683901999098151, + "flos": 22022760046680.0, + "grad_norm": 1.5648972850001555, + "language_loss": 0.83198535, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85819006, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.27832031, + "step": 1777, + "time_per_iteration": 2.7740349769592285 + }, + { + "auxiliary_loss_clip": 0.0155281, + "auxiliary_loss_mlp": 0.01065763, + "balance_loss_clip": 1.34742773, + "balance_loss_mlp": 1.03964448, + "epoch": 0.10689914324364948, + "flos": 25048013502960.0, + "grad_norm": 1.6554660600955609, + "language_loss": 0.87389719, + "learning_rate": 3.938288739241625e-06, + "loss": 0.90008295, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.2611084, + "step": 1778, + "time_per_iteration": 2.9370932579040527 + }, + { + "auxiliary_loss_clip": 0.01556194, + "auxiliary_loss_mlp": 0.01066897, + "balance_loss_clip": 1.3474561, + "balance_loss_mlp": 1.04131436, + "epoch": 0.10695926649631746, + "flos": 16439434602120.0, + "grad_norm": 1.8398910982465704, + "language_loss": 0.84362185, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86985278, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.25561523, + "step": 1779, + "time_per_iteration": 2.717313528060913 + }, + { + "auxiliary_loss_clip": 0.01550954, + "auxiliary_loss_mlp": 0.01061341, + "balance_loss_clip": 1.34402764, + "balance_loss_mlp": 1.03622329, + "epoch": 0.10701938974898542, + "flos": 16983169474680.0, + "grad_norm": 2.011526574574452, + "language_loss": 0.67486453, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.70098746, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.25097656, + "step": 1780, + "time_per_iteration": 2.7547242641448975 + }, + { + "auxiliary_loss_clip": 0.01550445, + "auxiliary_loss_mlp": 0.01062224, + "balance_loss_clip": 1.34358883, + "balance_loss_mlp": 1.03509223, + "epoch": 0.10707951300165339, + "flos": 15892369844040.0, + "grad_norm": 4.816063648478814, + "language_loss": 0.92448103, + "learning_rate": 3.938000408844265e-06, + "loss": 0.95060766, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.2713623, + "step": 1781, + "time_per_iteration": 2.7492868900299072 + }, + { + "auxiliary_loss_clip": 0.01553823, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.34531367, + "balance_loss_mlp": 1.03430688, + "epoch": 0.10713963625432135, + "flos": 14251338003240.0, + "grad_norm": 2.073915769667547, + "language_loss": 0.80348837, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.82961804, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24853516, + "step": 1782, + "time_per_iteration": 2.7976202964782715 + }, + { + "auxiliary_loss_clip": 0.01559818, + "auxiliary_loss_mlp": 0.01059823, + "balance_loss_clip": 1.3506403, + "balance_loss_mlp": 1.03348935, + "epoch": 0.10719975950698933, + "flos": 16760250248760.0, + "grad_norm": 2.6697872085974095, + "language_loss": 0.79457629, + "learning_rate": 3.937807821127436e-06, + "loss": 0.82077277, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.26342773, + "step": 1783, + "time_per_iteration": 2.749556064605713 + }, + { + "auxiliary_loss_clip": 0.01556512, + "auxiliary_loss_mlp": 0.01063956, + "balance_loss_clip": 1.3471384, + "balance_loss_mlp": 1.03823066, + "epoch": 0.1072598827596573, + "flos": 22715743934160.0, + "grad_norm": 1.9307628221644906, + "language_loss": 0.87094378, + "learning_rate": 3.937711417044395e-06, + "loss": 0.89714861, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.25732422, + "step": 1784, + "time_per_iteration": 2.8131513595581055 + }, + { + "auxiliary_loss_clip": 0.0155401, + "auxiliary_loss_mlp": 0.01063992, + "balance_loss_clip": 1.34350502, + "balance_loss_mlp": 1.03830194, + "epoch": 0.10732000601232526, + "flos": 23263499034360.0, + "grad_norm": 2.7100246739714984, + "language_loss": 1.01198983, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03816986, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.25720215, + "step": 1785, + "time_per_iteration": 2.790498733520508 + }, + { + "auxiliary_loss_clip": 0.01545365, + "auxiliary_loss_mlp": 0.01061732, + "balance_loss_clip": 1.34229374, + "balance_loss_mlp": 1.03674543, + "epoch": 0.10738012926499324, + "flos": 24212440371600.0, + "grad_norm": 1.3285331689709847, + "language_loss": 0.85262227, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87869322, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.24963379, + "step": 1786, + "time_per_iteration": 2.79244327545166 + }, + { + "auxiliary_loss_clip": 0.01557558, + "auxiliary_loss_mlp": 0.01073169, + "balance_loss_clip": 1.34707153, + "balance_loss_mlp": 1.0416739, + "epoch": 0.1074402525176612, + "flos": 20927980796760.0, + "grad_norm": 1.9193776028501424, + "language_loss": 0.79261708, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81892431, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.31530762, + "step": 1787, + "time_per_iteration": 2.757612943649292 + }, + { + "auxiliary_loss_clip": 0.01558132, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_clip": 1.34663188, + "balance_loss_mlp": 1.02787256, + "epoch": 0.10750037577032917, + "flos": 16951430718360.0, + "grad_norm": 1.830906923413608, + "language_loss": 0.8369779, + "learning_rate": 3.937325065966719e-06, + "loss": 0.86310321, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.26525879, + "step": 1788, + "time_per_iteration": 2.7199177742004395 + }, + { + "auxiliary_loss_clip": 0.01554862, + "auxiliary_loss_mlp": 0.0106665, + "balance_loss_clip": 1.34794199, + "balance_loss_mlp": 1.0408057, + "epoch": 0.10756049902299715, + "flos": 20271405627000.0, + "grad_norm": 1.914574605949817, + "language_loss": 0.78772789, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.81394297, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.25817871, + "step": 1789, + "time_per_iteration": 2.7601211071014404 + }, + { + "auxiliary_loss_clip": 0.0154711, + "auxiliary_loss_mlp": 0.01056597, + "balance_loss_clip": 1.3409791, + "balance_loss_mlp": 1.02890468, + "epoch": 0.10762062227566511, + "flos": 23591786619240.0, + "grad_norm": 3.14852847158791, + "language_loss": 0.75288767, + "learning_rate": 3.937131449631859e-06, + "loss": 0.7789247, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.27722168, + "step": 1790, + "time_per_iteration": 2.8378372192382812 + }, + { + "auxiliary_loss_clip": 0.01559241, + "auxiliary_loss_mlp": 0.01071764, + "balance_loss_clip": 1.34848046, + "balance_loss_mlp": 1.04161596, + "epoch": 0.10768074552833308, + "flos": 24315250403880.0, + "grad_norm": 3.1526090767091857, + "language_loss": 0.78976816, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.81607819, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.30175781, + "step": 1791, + "time_per_iteration": 2.776804208755493 + }, + { + "auxiliary_loss_clip": 0.01544378, + "auxiliary_loss_mlp": 0.01057185, + "balance_loss_clip": 1.34129071, + "balance_loss_mlp": 1.03151906, + "epoch": 0.10774086878100106, + "flos": 25305270420240.0, + "grad_norm": 1.5787908042622358, + "language_loss": 0.70852244, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73453796, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.25671387, + "step": 1792, + "time_per_iteration": 2.860922336578369 + }, + { + "auxiliary_loss_clip": 0.01537482, + "auxiliary_loss_mlp": 0.01055682, + "balance_loss_clip": 1.32819939, + "balance_loss_mlp": 1.02950346, + "epoch": 0.10780099203366902, + "flos": 22059087547680.0, + "grad_norm": 11.831568581635395, + "language_loss": 0.76728183, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.79321349, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.26184082, + "step": 1793, + "time_per_iteration": 2.7503128051757812 + }, + { + "auxiliary_loss_clip": 0.01544688, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.34153295, + "balance_loss_mlp": 1.0282836, + "epoch": 0.10786111528633699, + "flos": 22752639952200.0, + "grad_norm": 1.4791818975300826, + "language_loss": 0.85490572, + "learning_rate": 3.936743335516936e-06, + "loss": 0.88089114, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.25561523, + "step": 1794, + "time_per_iteration": 2.805837869644165 + }, + { + "auxiliary_loss_clip": 0.01554404, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_clip": 1.34049499, + "balance_loss_mlp": 1.03916132, + "epoch": 0.10792123853900495, + "flos": 20856097353600.0, + "grad_norm": 1.5744791688597475, + "language_loss": 0.75576913, + "learning_rate": 3.936646123375246e-06, + "loss": 0.78199732, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.29284668, + "step": 1795, + "time_per_iteration": 2.749037981033325 + }, + { + "auxiliary_loss_clip": 0.01547159, + "auxiliary_loss_mlp": 0.01065448, + "balance_loss_clip": 1.33610344, + "balance_loss_mlp": 1.04095089, + "epoch": 0.10798136179167293, + "flos": 17753194067040.0, + "grad_norm": 2.5286972497218034, + "language_loss": 0.82567549, + "learning_rate": 3.936548837795741e-06, + "loss": 0.85180151, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24487305, + "step": 1796, + "time_per_iteration": 2.7916717529296875 + }, + { + "auxiliary_loss_clip": 0.01557192, + "auxiliary_loss_mlp": 0.01072976, + "balance_loss_clip": 1.34745479, + "balance_loss_mlp": 1.0404315, + "epoch": 0.1080414850443409, + "flos": 13593625799400.0, + "grad_norm": 2.2179886509698146, + "language_loss": 0.74578029, + "learning_rate": 3.936451478782111e-06, + "loss": 0.77208191, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.32568359, + "step": 1797, + "time_per_iteration": 4.336418390274048 + }, + { + "auxiliary_loss_clip": 0.0153168, + "auxiliary_loss_mlp": 0.01055106, + "balance_loss_clip": 1.32766843, + "balance_loss_mlp": 1.02909446, + "epoch": 0.10810160829700886, + "flos": 16257797097120.0, + "grad_norm": 2.1105657218985763, + "language_loss": 0.82154596, + "learning_rate": 3.936354046338046e-06, + "loss": 0.84741378, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.26000977, + "step": 1798, + "time_per_iteration": 2.729609489440918 + }, + { + "auxiliary_loss_clip": 0.01539233, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_clip": 1.3354696, + "balance_loss_mlp": 1.02985704, + "epoch": 0.10816173154967684, + "flos": 15162124463280.0, + "grad_norm": 2.221026969179519, + "language_loss": 0.86520129, + "learning_rate": 3.936256540467242e-06, + "loss": 0.89113903, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.2467041, + "step": 1799, + "time_per_iteration": 4.302363872528076 + }, + { + "auxiliary_loss_clip": 0.01530754, + "auxiliary_loss_mlp": 0.01054012, + "balance_loss_clip": 1.32796025, + "balance_loss_mlp": 1.03192258, + "epoch": 0.10822185480234481, + "flos": 17789846434920.0, + "grad_norm": 1.7637190633464668, + "language_loss": 0.78120786, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.80705547, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.22094727, + "step": 1800, + "time_per_iteration": 4.385709285736084 + }, + { + "auxiliary_loss_clip": 0.01531068, + "auxiliary_loss_mlp": 0.01053865, + "balance_loss_clip": 1.32729077, + "balance_loss_mlp": 1.03063083, + "epoch": 0.10828197805501277, + "flos": 25562161862280.0, + "grad_norm": 1.5401016552864093, + "language_loss": 0.72915041, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75499976, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23254395, + "step": 1801, + "time_per_iteration": 4.249311447143555 + }, + { + "auxiliary_loss_clip": 0.01546406, + "auxiliary_loss_mlp": 0.01056629, + "balance_loss_clip": 1.33542132, + "balance_loss_mlp": 1.03402722, + "epoch": 0.10834210130768075, + "flos": 28990144673280.0, + "grad_norm": 1.8081631685141708, + "language_loss": 0.6620481, + "learning_rate": 3.935963582331381e-06, + "loss": 0.6880784, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.22595215, + "step": 1802, + "time_per_iteration": 2.7833101749420166 + }, + { + "auxiliary_loss_clip": 0.01544569, + "auxiliary_loss_mlp": 0.01064779, + "balance_loss_clip": 1.33925414, + "balance_loss_mlp": 1.04038858, + "epoch": 0.10840222456034872, + "flos": 20268766083600.0, + "grad_norm": 1.8657537561083202, + "language_loss": 0.81446064, + "learning_rate": 3.935865782790621e-06, + "loss": 0.84055406, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.24401855, + "step": 1803, + "time_per_iteration": 2.805908441543579 + }, + { + "auxiliary_loss_clip": 0.01525836, + "auxiliary_loss_mlp": 0.01060476, + "balance_loss_clip": 1.32298827, + "balance_loss_mlp": 1.03609812, + "epoch": 0.10846234781301668, + "flos": 19867579846560.0, + "grad_norm": 1.5805755678497981, + "language_loss": 0.91650927, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.94237238, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.24401855, + "step": 1804, + "time_per_iteration": 2.745802879333496 + }, + { + "auxiliary_loss_clip": 0.01543035, + "auxiliary_loss_mlp": 0.01053054, + "balance_loss_clip": 1.33566272, + "balance_loss_mlp": 1.02738774, + "epoch": 0.10852247106568465, + "flos": 26474694481800.0, + "grad_norm": 1.9314249367134295, + "language_loss": 0.76910543, + "learning_rate": 3.935669963488139e-06, + "loss": 0.79506636, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.25695801, + "step": 1805, + "time_per_iteration": 2.820681571960449 + }, + { + "auxiliary_loss_clip": 0.01537215, + "auxiliary_loss_mlp": 0.01054458, + "balance_loss_clip": 1.33309436, + "balance_loss_mlp": 1.03177226, + "epoch": 0.10858259431835263, + "flos": 30087685291680.0, + "grad_norm": 1.7202786957003104, + "language_loss": 0.86227822, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88819492, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.22680664, + "step": 1806, + "time_per_iteration": 2.8690345287323 + }, + { + "auxiliary_loss_clip": 0.01542601, + "auxiliary_loss_mlp": 0.01048995, + "balance_loss_clip": 1.33851027, + "balance_loss_mlp": 1.02477217, + "epoch": 0.10864271757102059, + "flos": 19067968740960.0, + "grad_norm": 3.006567978710348, + "language_loss": 0.81648481, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.84240079, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.24255371, + "step": 1807, + "time_per_iteration": 2.786648750305176 + }, + { + "auxiliary_loss_clip": 0.0154346, + "auxiliary_loss_mlp": 0.01057998, + "balance_loss_clip": 1.33898103, + "balance_loss_mlp": 1.03592026, + "epoch": 0.10870284082368856, + "flos": 24720132001680.0, + "grad_norm": 1.9148244777461696, + "language_loss": 0.79806674, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.82408136, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.22106934, + "step": 1808, + "time_per_iteration": 2.8931236267089844 + }, + { + "auxiliary_loss_clip": 0.01537244, + "auxiliary_loss_mlp": 0.01050137, + "balance_loss_clip": 1.33148217, + "balance_loss_mlp": 1.02655697, + "epoch": 0.10876296407635654, + "flos": 20632081701960.0, + "grad_norm": 1.6439000948340492, + "language_loss": 0.79539567, + "learning_rate": 3.935277444103342e-06, + "loss": 0.82126951, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.23583984, + "step": 1809, + "time_per_iteration": 2.7818310260772705 + }, + { + "auxiliary_loss_clip": 0.01534326, + "auxiliary_loss_mlp": 0.01058564, + "balance_loss_clip": 1.32864809, + "balance_loss_mlp": 1.03440034, + "epoch": 0.1088230873290245, + "flos": 21584840225040.0, + "grad_norm": 1.857337998471608, + "language_loss": 0.85086286, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87679172, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.24182129, + "step": 1810, + "time_per_iteration": 2.783491611480713 + }, + { + "auxiliary_loss_clip": 0.01557999, + "auxiliary_loss_mlp": 0.01059871, + "balance_loss_clip": 1.34721136, + "balance_loss_mlp": 1.03431225, + "epoch": 0.10888321058169247, + "flos": 26474694481800.0, + "grad_norm": 1.5986318210714585, + "language_loss": 0.63605613, + "learning_rate": 3.935080744080564e-06, + "loss": 0.66223484, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25549316, + "step": 1811, + "time_per_iteration": 2.8885068893432617 + }, + { + "auxiliary_loss_clip": 0.01543042, + "auxiliary_loss_mlp": 0.01051574, + "balance_loss_clip": 1.3367914, + "balance_loss_mlp": 1.02702832, + "epoch": 0.10894333383436045, + "flos": 25854162554520.0, + "grad_norm": 1.9870951411033315, + "language_loss": 0.74708092, + "learning_rate": 3.934982283999626e-06, + "loss": 0.77302712, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.2454834, + "step": 1812, + "time_per_iteration": 2.816478967666626 + }, + { + "auxiliary_loss_clip": 0.01537778, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.33178079, + "balance_loss_mlp": 1.02842021, + "epoch": 0.10900345708702841, + "flos": 19541891196720.0, + "grad_norm": 1.5882505386840418, + "language_loss": 0.73570251, + "learning_rate": 3.934883750543966e-06, + "loss": 0.76160342, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.2388916, + "step": 1813, + "time_per_iteration": 2.799379348754883 + }, + { + "auxiliary_loss_clip": 0.01533877, + "auxiliary_loss_mlp": 0.01056403, + "balance_loss_clip": 1.33217692, + "balance_loss_mlp": 1.03251338, + "epoch": 0.10906358033969638, + "flos": 23628601420560.0, + "grad_norm": 1.663756689792137, + "language_loss": 0.83187079, + "learning_rate": 3.93478514371732e-06, + "loss": 0.8577736, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.23876953, + "step": 1814, + "time_per_iteration": 2.8118979930877686 + }, + { + "auxiliary_loss_clip": 0.0154754, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.34058774, + "balance_loss_mlp": 1.02757096, + "epoch": 0.10912370359236434, + "flos": 21219656622120.0, + "grad_norm": 2.0272160390894776, + "language_loss": 0.85411632, + "learning_rate": 3.934686463523429e-06, + "loss": 0.88010085, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23352051, + "step": 1815, + "time_per_iteration": 2.8236453533172607 + }, + { + "auxiliary_loss_clip": 0.01539283, + "auxiliary_loss_mlp": 0.01057484, + "balance_loss_clip": 1.33785176, + "balance_loss_mlp": 1.03328419, + "epoch": 0.10918382684503232, + "flos": 13557176473320.0, + "grad_norm": 2.3211651666254163, + "language_loss": 0.71864116, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.74460876, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.24230957, + "step": 1816, + "time_per_iteration": 2.808866262435913 + }, + { + "auxiliary_loss_clip": 0.01542235, + "auxiliary_loss_mlp": 0.0105689, + "balance_loss_clip": 1.33307004, + "balance_loss_mlp": 1.03142643, + "epoch": 0.10924395009770028, + "flos": 27970091451720.0, + "grad_norm": 2.0811838826957683, + "language_loss": 0.73662615, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.76261735, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.25463867, + "step": 1817, + "time_per_iteration": 2.8839826583862305 + }, + { + "auxiliary_loss_clip": 0.01538787, + "auxiliary_loss_mlp": 0.01060498, + "balance_loss_clip": 1.33291078, + "balance_loss_mlp": 1.03626287, + "epoch": 0.10930407335036825, + "flos": 25599342138840.0, + "grad_norm": 1.5674611847745963, + "language_loss": 0.67656058, + "learning_rate": 3.934389982775706e-06, + "loss": 0.70255345, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.24255371, + "step": 1818, + "time_per_iteration": 2.9287097454071045 + }, + { + "auxiliary_loss_clip": 0.01550416, + "auxiliary_loss_mlp": 0.01073605, + "balance_loss_clip": 1.34249878, + "balance_loss_mlp": 1.04903603, + "epoch": 0.10936419660303623, + "flos": 18410865662520.0, + "grad_norm": 2.0395619976943453, + "language_loss": 0.73545742, + "learning_rate": 3.934291009150275e-06, + "loss": 0.76169765, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.24572754, + "step": 1819, + "time_per_iteration": 2.756845235824585 + }, + { + "auxiliary_loss_clip": 0.01540215, + "auxiliary_loss_mlp": 0.01060461, + "balance_loss_clip": 1.33519816, + "balance_loss_mlp": 1.03610659, + "epoch": 0.1094243198557042, + "flos": 23845104525600.0, + "grad_norm": 2.782338650416258, + "language_loss": 0.74591184, + "learning_rate": 3.934191962176335e-06, + "loss": 0.77191865, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.24389648, + "step": 1820, + "time_per_iteration": 2.788320779800415 + }, + { + "auxiliary_loss_clip": 0.01543292, + "auxiliary_loss_mlp": 0.0105598, + "balance_loss_clip": 1.33709288, + "balance_loss_mlp": 1.02998066, + "epoch": 0.10948444310837216, + "flos": 14647570020360.0, + "grad_norm": 3.9216636703003984, + "language_loss": 0.82870501, + "learning_rate": 3.934092841857642e-06, + "loss": 0.8546977, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.26025391, + "step": 1821, + "time_per_iteration": 2.766648292541504 + }, + { + "auxiliary_loss_clip": 0.01529655, + "auxiliary_loss_mlp": 0.01052292, + "balance_loss_clip": 1.32634723, + "balance_loss_mlp": 1.02885497, + "epoch": 0.10954456636104014, + "flos": 27824619014280.0, + "grad_norm": 2.0758255251060644, + "language_loss": 0.77089441, + "learning_rate": 3.933993648197955e-06, + "loss": 0.79671383, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.23461914, + "step": 1822, + "time_per_iteration": 2.8225162029266357 + }, + { + "auxiliary_loss_clip": 0.01534098, + "auxiliary_loss_mlp": 0.01058958, + "balance_loss_clip": 1.32917356, + "balance_loss_mlp": 1.03389978, + "epoch": 0.1096046896137081, + "flos": 33627615015960.0, + "grad_norm": 1.5822458851054748, + "language_loss": 0.79725134, + "learning_rate": 3.933894381201034e-06, + "loss": 0.82318193, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.25061035, + "step": 1823, + "time_per_iteration": 2.959327459335327 + }, + { + "auxiliary_loss_clip": 0.0153999, + "auxiliary_loss_mlp": 0.01048829, + "balance_loss_clip": 1.33651721, + "balance_loss_mlp": 1.0247128, + "epoch": 0.10966481286637607, + "flos": 26985756605760.0, + "grad_norm": 1.3987110227738784, + "language_loss": 0.80102515, + "learning_rate": 3.933795040870645e-06, + "loss": 0.82691336, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.24121094, + "step": 1824, + "time_per_iteration": 2.8163938522338867 + }, + { + "auxiliary_loss_clip": 0.01538442, + "auxiliary_loss_mlp": 0.0105788, + "balance_loss_clip": 1.33227158, + "balance_loss_mlp": 1.03456247, + "epoch": 0.10972493611904403, + "flos": 23041391975640.0, + "grad_norm": 1.8032840014840579, + "language_loss": 0.88493443, + "learning_rate": 3.933695627210554e-06, + "loss": 0.91089767, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.23339844, + "step": 1825, + "time_per_iteration": 2.7478737831115723 + }, + { + "auxiliary_loss_clip": 0.0152658, + "auxiliary_loss_mlp": 0.01058664, + "balance_loss_clip": 1.3208406, + "balance_loss_mlp": 1.03408313, + "epoch": 0.10978505937171201, + "flos": 38111369424120.0, + "grad_norm": 1.7096736707228182, + "language_loss": 0.76835656, + "learning_rate": 3.933596140224532e-06, + "loss": 0.794209, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.24572754, + "step": 1826, + "time_per_iteration": 2.899378776550293 + }, + { + "auxiliary_loss_clip": 0.01368335, + "auxiliary_loss_mlp": 0.01023433, + "balance_loss_clip": 1.2438482, + "balance_loss_mlp": 1.01566017, + "epoch": 0.10984518262437998, + "flos": 59863976076240.0, + "grad_norm": 0.8764132894701688, + "language_loss": 0.5506202, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57453787, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.07763672, + "step": 1827, + "time_per_iteration": 3.2697744369506836 + }, + { + "auxiliary_loss_clip": 0.01369826, + "auxiliary_loss_mlp": 0.01020929, + "balance_loss_clip": 1.24571586, + "balance_loss_mlp": 1.01358533, + "epoch": 0.10990530587704794, + "flos": 66736428692400.0, + "grad_norm": 0.7435783449948352, + "language_loss": 0.55409628, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57800382, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.07324219, + "step": 1828, + "time_per_iteration": 3.23091197013855 + }, + { + "auxiliary_loss_clip": 0.01549112, + "auxiliary_loss_mlp": 0.01063613, + "balance_loss_clip": 1.3379643, + "balance_loss_mlp": 1.03805447, + "epoch": 0.10996542912971592, + "flos": 25452692058960.0, + "grad_norm": 2.4175701688853506, + "language_loss": 0.84753847, + "learning_rate": 3.933297239348612e-06, + "loss": 0.87366569, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.2557373, + "step": 1829, + "time_per_iteration": 2.7996277809143066 + }, + { + "auxiliary_loss_clip": 0.01544554, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_clip": 1.33607793, + "balance_loss_mlp": 1.04447103, + "epoch": 0.11002555238238389, + "flos": 44026068688920.0, + "grad_norm": 1.7523501784759914, + "language_loss": 0.89221752, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91837436, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.26660156, + "step": 1830, + "time_per_iteration": 2.954387903213501 + }, + { + "auxiliary_loss_clip": 0.01365854, + "auxiliary_loss_mlp": 0.01007478, + "balance_loss_clip": 1.23893428, + "balance_loss_mlp": 1.00042081, + "epoch": 0.11008567563505185, + "flos": 54080837562600.0, + "grad_norm": 0.6893088173251793, + "language_loss": 0.55607355, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.5798068, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.07080078, + "step": 1831, + "time_per_iteration": 3.2111170291900635 + }, + { + "auxiliary_loss_clip": 0.01548088, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_clip": 1.33646595, + "balance_loss_mlp": 1.06026483, + "epoch": 0.11014579888771983, + "flos": 24248483614080.0, + "grad_norm": 2.1040110775847936, + "language_loss": 0.91254056, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93888658, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26245117, + "step": 1832, + "time_per_iteration": 2.792724370956421 + }, + { + "auxiliary_loss_clip": 0.01356421, + "auxiliary_loss_mlp": 0.01012541, + "balance_loss_clip": 1.23042619, + "balance_loss_mlp": 1.00472116, + "epoch": 0.1102059221403878, + "flos": 57759336303120.0, + "grad_norm": 0.7189280176951579, + "language_loss": 0.59947252, + "learning_rate": 3.932897678513523e-06, + "loss": 0.62316209, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.078125, + "step": 1833, + "time_per_iteration": 3.2443478107452393 + }, + { + "auxiliary_loss_clip": 0.01535956, + "auxiliary_loss_mlp": 0.01062532, + "balance_loss_clip": 1.32732391, + "balance_loss_mlp": 1.03780842, + "epoch": 0.11026604539305576, + "flos": 16799745201840.0, + "grad_norm": 2.6065675273448714, + "language_loss": 0.80809033, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83407521, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24719238, + "step": 1834, + "time_per_iteration": 2.8609602451324463 + }, + { + "auxiliary_loss_clip": 0.0154703, + "auxiliary_loss_mlp": 0.01071781, + "balance_loss_clip": 1.33890009, + "balance_loss_mlp": 1.04557824, + "epoch": 0.11032616864572373, + "flos": 23993419548240.0, + "grad_norm": 2.0969137595141416, + "language_loss": 0.90542436, + "learning_rate": 3.932697458306779e-06, + "loss": 0.93161249, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.26208496, + "step": 1835, + "time_per_iteration": 4.219309091567993 + }, + { + "auxiliary_loss_clip": 0.01538961, + "auxiliary_loss_mlp": 0.01084688, + "balance_loss_clip": 1.33379674, + "balance_loss_mlp": 1.05804503, + "epoch": 0.1103862918983917, + "flos": 19687972759560.0, + "grad_norm": 2.3256846733128262, + "language_loss": 0.64346617, + "learning_rate": 3.932597238269386e-06, + "loss": 0.66970265, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.26647949, + "step": 1836, + "time_per_iteration": 2.7511825561523438 + }, + { + "auxiliary_loss_clip": 0.01535715, + "auxiliary_loss_mlp": 0.01075438, + "balance_loss_clip": 1.32961833, + "balance_loss_mlp": 1.05055916, + "epoch": 0.11044641515105967, + "flos": 32167286687880.0, + "grad_norm": 1.8345119949483337, + "language_loss": 0.73404014, + "learning_rate": 3.932496944947711e-06, + "loss": 0.76015162, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.24865723, + "step": 1837, + "time_per_iteration": 2.892127275466919 + }, + { + "auxiliary_loss_clip": 0.01540134, + "auxiliary_loss_mlp": 0.01066786, + "balance_loss_clip": 1.33386469, + "balance_loss_mlp": 1.04126334, + "epoch": 0.11050653840372764, + "flos": 16693402242240.0, + "grad_norm": 2.0389158008874304, + "language_loss": 0.78802735, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81409651, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.25549316, + "step": 1838, + "time_per_iteration": 4.415621280670166 + }, + { + "auxiliary_loss_clip": 0.01531445, + "auxiliary_loss_mlp": 0.01076887, + "balance_loss_clip": 1.32653642, + "balance_loss_mlp": 1.0515312, + "epoch": 0.11056666165639562, + "flos": 21213037459440.0, + "grad_norm": 2.024052429127594, + "language_loss": 0.7215637, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74764705, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25354004, + "step": 1839, + "time_per_iteration": 2.80631422996521 + }, + { + "auxiliary_loss_clip": 0.01543232, + "auxiliary_loss_mlp": 0.01061472, + "balance_loss_clip": 1.33335364, + "balance_loss_mlp": 1.03605688, + "epoch": 0.11062678490906358, + "flos": 19169357480640.0, + "grad_norm": 2.7876325787614618, + "language_loss": 0.79245448, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81850159, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.25427246, + "step": 1840, + "time_per_iteration": 4.290524005889893 + }, + { + "auxiliary_loss_clip": 0.0153027, + "auxiliary_loss_mlp": 0.01063741, + "balance_loss_clip": 1.3266747, + "balance_loss_mlp": 1.03843307, + "epoch": 0.11068690816173155, + "flos": 24900104563920.0, + "grad_norm": 1.6820750413270067, + "language_loss": 0.88351941, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90945947, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.25305176, + "step": 1841, + "time_per_iteration": 2.8972392082214355 + }, + { + "auxiliary_loss_clip": 0.0153024, + "auxiliary_loss_mlp": 0.01058764, + "balance_loss_clip": 1.32532036, + "balance_loss_mlp": 1.03456414, + "epoch": 0.11074703141439952, + "flos": 16476777312120.0, + "grad_norm": 1.7326084789042793, + "language_loss": 0.90795249, + "learning_rate": 3.931994379208334e-06, + "loss": 0.93384254, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.2421875, + "step": 1842, + "time_per_iteration": 2.9505417346954346 + }, + { + "auxiliary_loss_clip": 0.01539252, + "auxiliary_loss_mlp": 0.01069021, + "balance_loss_clip": 1.33205807, + "balance_loss_mlp": 1.04596579, + "epoch": 0.11080715466706749, + "flos": 19177194894120.0, + "grad_norm": 2.0146582505500215, + "language_loss": 0.86558247, + "learning_rate": 3.931893646260937e-06, + "loss": 0.89166522, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.23071289, + "step": 1843, + "time_per_iteration": 2.7565972805023193 + }, + { + "auxiliary_loss_clip": 0.01538035, + "auxiliary_loss_mlp": 0.01061174, + "balance_loss_clip": 1.33243346, + "balance_loss_mlp": 1.03522205, + "epoch": 0.11086727791973545, + "flos": 27709870124160.0, + "grad_norm": 1.4878038476579614, + "language_loss": 0.74863958, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77463162, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.25976562, + "step": 1844, + "time_per_iteration": 2.9229815006256104 + }, + { + "auxiliary_loss_clip": 0.0153415, + "auxiliary_loss_mlp": 0.01054393, + "balance_loss_clip": 1.32632077, + "balance_loss_mlp": 1.02854824, + "epoch": 0.11092740117240343, + "flos": 18519563907000.0, + "grad_norm": 1.8147799998125476, + "language_loss": 0.75909412, + "learning_rate": 3.931691960597165e-06, + "loss": 0.78497958, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.25817871, + "step": 1845, + "time_per_iteration": 2.729137659072876 + }, + { + "auxiliary_loss_clip": 0.01525238, + "auxiliary_loss_mlp": 0.01055045, + "balance_loss_clip": 1.32170701, + "balance_loss_mlp": 1.03133464, + "epoch": 0.1109875244250714, + "flos": 20527403685120.0, + "grad_norm": 1.5948364571303022, + "language_loss": 0.76427644, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.79007924, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.23693848, + "step": 1846, + "time_per_iteration": 2.7439939975738525 + }, + { + "auxiliary_loss_clip": 0.01540932, + "auxiliary_loss_mlp": 0.01057548, + "balance_loss_clip": 1.3308692, + "balance_loss_mlp": 1.03326535, + "epoch": 0.11104764767773936, + "flos": 14102413855200.0, + "grad_norm": 2.3105969980401575, + "language_loss": 0.86874473, + "learning_rate": 3.931489981933584e-06, + "loss": 0.8947295, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24291992, + "step": 1847, + "time_per_iteration": 2.8383102416992188 + }, + { + "auxiliary_loss_clip": 0.01536916, + "auxiliary_loss_mlp": 0.01055605, + "balance_loss_clip": 1.32872677, + "balance_loss_mlp": 1.02958202, + "epoch": 0.11110777093040733, + "flos": 20599368345000.0, + "grad_norm": 2.41347976907528, + "language_loss": 0.7727828, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79870796, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.26049805, + "step": 1848, + "time_per_iteration": 2.782564401626587 + }, + { + "auxiliary_loss_clip": 0.01526577, + "auxiliary_loss_mlp": 0.01058161, + "balance_loss_clip": 1.3254472, + "balance_loss_mlp": 1.03390193, + "epoch": 0.11116789418307531, + "flos": 21874891716000.0, + "grad_norm": 1.6452811817437358, + "language_loss": 0.78086972, + "learning_rate": 3.931287710300832e-06, + "loss": 0.80671716, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.24243164, + "step": 1849, + "time_per_iteration": 2.7201690673828125 + }, + { + "auxiliary_loss_clip": 0.01540151, + "auxiliary_loss_mlp": 0.01060847, + "balance_loss_clip": 1.32832634, + "balance_loss_mlp": 1.03569412, + "epoch": 0.11122801743574327, + "flos": 15527308066200.0, + "grad_norm": 2.414248806620635, + "language_loss": 0.72431433, + "learning_rate": 3.931186464630601e-06, + "loss": 0.75032437, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25170898, + "step": 1850, + "time_per_iteration": 2.7335758209228516 + }, + { + "auxiliary_loss_clip": 0.0153813, + "auxiliary_loss_mlp": 0.01061258, + "balance_loss_clip": 1.32976282, + "balance_loss_mlp": 1.03635526, + "epoch": 0.11128814068841124, + "flos": 14396363748720.0, + "grad_norm": 2.193312532840476, + "language_loss": 0.82468665, + "learning_rate": 3.931085145729588e-06, + "loss": 0.85068047, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24890137, + "step": 1851, + "time_per_iteration": 2.7066824436187744 + }, + { + "auxiliary_loss_clip": 0.01534948, + "auxiliary_loss_mlp": 0.01052511, + "balance_loss_clip": 1.330585, + "balance_loss_mlp": 1.02796531, + "epoch": 0.11134826394107922, + "flos": 16658252383680.0, + "grad_norm": 2.1726852696396555, + "language_loss": 0.88204873, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90792334, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.2454834, + "step": 1852, + "time_per_iteration": 2.7539727687835693 + }, + { + "auxiliary_loss_clip": 0.0153774, + "auxiliary_loss_mlp": 0.01062605, + "balance_loss_clip": 1.33260214, + "balance_loss_mlp": 1.03640246, + "epoch": 0.11140838719374718, + "flos": 16695392051880.0, + "grad_norm": 1.8858689757332008, + "language_loss": 0.73408562, + "learning_rate": 3.930882288250578e-06, + "loss": 0.76008904, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.26208496, + "step": 1853, + "time_per_iteration": 2.797091007232666 + }, + { + "auxiliary_loss_clip": 0.0133282, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.21451283, + "balance_loss_mlp": 1.02042127, + "epoch": 0.11146851044641515, + "flos": 60990209823960.0, + "grad_norm": 0.7893844138719702, + "language_loss": 0.53758132, + "learning_rate": 3.930780749680273e-06, + "loss": 0.56119144, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.07763672, + "step": 1854, + "time_per_iteration": 3.231361150741577 + }, + { + "auxiliary_loss_clip": 0.01555876, + "auxiliary_loss_mlp": 0.01054042, + "balance_loss_clip": 1.33942997, + "balance_loss_mlp": 1.02652836, + "epoch": 0.11152863369908313, + "flos": 22198103255880.0, + "grad_norm": 1.9138808519287918, + "language_loss": 0.85029227, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.87639147, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.27514648, + "step": 1855, + "time_per_iteration": 2.82623028755188 + }, + { + "auxiliary_loss_clip": 0.01541892, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_clip": 1.33413708, + "balance_loss_mlp": 1.04648709, + "epoch": 0.11158875695175109, + "flos": 19542378497040.0, + "grad_norm": 2.4699717673451644, + "language_loss": 0.82255948, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84869027, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.24719238, + "step": 1856, + "time_per_iteration": 2.8455679416656494 + }, + { + "auxiliary_loss_clip": 0.01543065, + "auxiliary_loss_mlp": 0.01053325, + "balance_loss_clip": 1.33939302, + "balance_loss_mlp": 1.02805233, + "epoch": 0.11164888020441906, + "flos": 25447616013960.0, + "grad_norm": 1.6499516273420798, + "language_loss": 0.83294892, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85891289, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.25280762, + "step": 1857, + "time_per_iteration": 2.832118034362793 + }, + { + "auxiliary_loss_clip": 0.0154753, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_clip": 1.33748209, + "balance_loss_mlp": 1.02516341, + "epoch": 0.11170900345708702, + "flos": 15636980911320.0, + "grad_norm": 2.99252385448186, + "language_loss": 0.83694226, + "learning_rate": 3.930373863283608e-06, + "loss": 0.8629126, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24328613, + "step": 1858, + "time_per_iteration": 2.8281466960906982 + }, + { + "auxiliary_loss_clip": 0.0154797, + "auxiliary_loss_mlp": 0.01054523, + "balance_loss_clip": 1.3423835, + "balance_loss_mlp": 1.03064549, + "epoch": 0.111769126709755, + "flos": 23044640644440.0, + "grad_norm": 2.2023786539870387, + "language_loss": 0.9209944, + "learning_rate": 3.930271958674866e-06, + "loss": 0.94701934, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.2388916, + "step": 1859, + "time_per_iteration": 2.8208305835723877 + }, + { + "auxiliary_loss_clip": 0.01547379, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.33774745, + "balance_loss_mlp": 1.03788114, + "epoch": 0.11182924996242297, + "flos": 20855691270000.0, + "grad_norm": 2.0963636048021708, + "language_loss": 0.82284069, + "learning_rate": 3.930169980870018e-06, + "loss": 0.84893692, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24377441, + "step": 1860, + "time_per_iteration": 2.8046319484710693 + }, + { + "auxiliary_loss_clip": 0.01547525, + "auxiliary_loss_mlp": 0.01060754, + "balance_loss_clip": 1.34216452, + "balance_loss_mlp": 1.03673303, + "epoch": 0.11188937321509093, + "flos": 17459569040400.0, + "grad_norm": 1.8709031447856563, + "language_loss": 0.75557256, + "learning_rate": 3.930067929872931e-06, + "loss": 0.78165531, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.24047852, + "step": 1861, + "time_per_iteration": 2.7912654876708984 + }, + { + "auxiliary_loss_clip": 0.01535946, + "auxiliary_loss_mlp": 0.01056397, + "balance_loss_clip": 1.33290553, + "balance_loss_mlp": 1.03268647, + "epoch": 0.11194949646775891, + "flos": 24101061975360.0, + "grad_norm": 1.9300131545713826, + "language_loss": 0.89212966, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91805303, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.23718262, + "step": 1862, + "time_per_iteration": 2.9366791248321533 + }, + { + "auxiliary_loss_clip": 0.01543707, + "auxiliary_loss_mlp": 0.01059944, + "balance_loss_clip": 1.33817029, + "balance_loss_mlp": 1.03544641, + "epoch": 0.11200961972042688, + "flos": 25159148249040.0, + "grad_norm": 2.341806206703925, + "language_loss": 0.87461627, + "learning_rate": 3.92986360831752e-06, + "loss": 0.90065277, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.24523926, + "step": 1863, + "time_per_iteration": 2.796154260635376 + }, + { + "auxiliary_loss_clip": 0.01548292, + "auxiliary_loss_mlp": 0.01061404, + "balance_loss_clip": 1.34358335, + "balance_loss_mlp": 1.03487992, + "epoch": 0.11206974297309484, + "flos": 21293326833120.0, + "grad_norm": 1.7318110752350904, + "language_loss": 0.64530855, + "learning_rate": 3.929761337766945e-06, + "loss": 0.67140549, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26501465, + "step": 1864, + "time_per_iteration": 2.830669641494751 + }, + { + "auxiliary_loss_clip": 0.01543861, + "auxiliary_loss_mlp": 0.01049514, + "balance_loss_clip": 1.33997357, + "balance_loss_mlp": 1.02743626, + "epoch": 0.11212986622576282, + "flos": 18920587710600.0, + "grad_norm": 1.8805213248063162, + "language_loss": 0.74412978, + "learning_rate": 3.929658994039627e-06, + "loss": 0.77006352, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.22058105, + "step": 1865, + "time_per_iteration": 2.717872142791748 + }, + { + "auxiliary_loss_clip": 0.01547336, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_clip": 1.34089804, + "balance_loss_mlp": 1.03272486, + "epoch": 0.11218998947843078, + "flos": 22059899714880.0, + "grad_norm": 2.2129262757402497, + "language_loss": 0.85036218, + "learning_rate": 3.929556577139446e-06, + "loss": 0.87641871, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.25598145, + "step": 1866, + "time_per_iteration": 2.7813022136688232 + }, + { + "auxiliary_loss_clip": 0.01552795, + "auxiliary_loss_mlp": 0.01049598, + "balance_loss_clip": 1.3445797, + "balance_loss_mlp": 1.02592301, + "epoch": 0.11225011273109875, + "flos": 24577096065840.0, + "grad_norm": 1.5396322086450345, + "language_loss": 0.82133406, + "learning_rate": 3.929454087070286e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.23681641, + "step": 1867, + "time_per_iteration": 2.808570146560669 + }, + { + "auxiliary_loss_clip": 0.01543108, + "auxiliary_loss_mlp": 0.01051213, + "balance_loss_clip": 1.33825779, + "balance_loss_mlp": 1.02776456, + "epoch": 0.11231023598376672, + "flos": 28444501207800.0, + "grad_norm": 2.0815150281965558, + "language_loss": 0.88217616, + "learning_rate": 3.929351523836035e-06, + "loss": 0.90811944, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.23449707, + "step": 1868, + "time_per_iteration": 2.905174970626831 + }, + { + "auxiliary_loss_clip": 0.01538174, + "auxiliary_loss_mlp": 0.01047682, + "balance_loss_clip": 1.33895409, + "balance_loss_mlp": 1.02485371, + "epoch": 0.1123703592364347, + "flos": 14430254748120.0, + "grad_norm": 2.1026150110654336, + "language_loss": 0.68683743, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.71269596, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.22827148, + "step": 1869, + "time_per_iteration": 2.708987236022949 + }, + { + "auxiliary_loss_clip": 0.01547576, + "auxiliary_loss_mlp": 0.01050121, + "balance_loss_clip": 1.33972168, + "balance_loss_mlp": 1.02551615, + "epoch": 0.11243048248910266, + "flos": 22241334178080.0, + "grad_norm": 1.5261400869858401, + "language_loss": 0.77466023, + "learning_rate": 3.929146177887814e-06, + "loss": 0.80063719, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.24621582, + "step": 1870, + "time_per_iteration": 2.759882926940918 + }, + { + "auxiliary_loss_clip": 0.01547176, + "auxiliary_loss_mlp": 0.01049522, + "balance_loss_clip": 1.33861113, + "balance_loss_mlp": 1.02436888, + "epoch": 0.11249060574177062, + "flos": 18588279898080.0, + "grad_norm": 2.4561123883186657, + "language_loss": 0.76534569, + "learning_rate": 3.929043395181631e-06, + "loss": 0.79131269, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.25170898, + "step": 1871, + "time_per_iteration": 2.7391629219055176 + }, + { + "auxiliary_loss_clip": 0.01540579, + "auxiliary_loss_mlp": 0.01049291, + "balance_loss_clip": 1.33623922, + "balance_loss_mlp": 1.02507997, + "epoch": 0.1125507289944386, + "flos": 22861703671920.0, + "grad_norm": 1.7544838042248576, + "language_loss": 0.82297367, + "learning_rate": 3.928940539325929e-06, + "loss": 0.8488723, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.24230957, + "step": 1872, + "time_per_iteration": 2.7435569763183594 + }, + { + "auxiliary_loss_clip": 0.01550423, + "auxiliary_loss_mlp": 0.01052744, + "balance_loss_clip": 1.34189129, + "balance_loss_mlp": 1.02909327, + "epoch": 0.11261085224710657, + "flos": 19680338387880.0, + "grad_norm": 2.135728865696742, + "language_loss": 0.83619183, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.86222351, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.2364502, + "step": 1873, + "time_per_iteration": 2.738492012023926 + }, + { + "auxiliary_loss_clip": 0.01553582, + "auxiliary_loss_mlp": 0.01055638, + "balance_loss_clip": 1.34298873, + "balance_loss_mlp": 1.03142631, + "epoch": 0.11267097549977453, + "flos": 26068269766320.0, + "grad_norm": 2.13773363636584, + "language_loss": 0.9230026, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94909477, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.24206543, + "step": 1874, + "time_per_iteration": 2.789548873901367 + }, + { + "auxiliary_loss_clip": 0.01541338, + "auxiliary_loss_mlp": 0.0105533, + "balance_loss_clip": 1.33783078, + "balance_loss_mlp": 1.03175044, + "epoch": 0.11273109875244251, + "flos": 21072844108800.0, + "grad_norm": 1.4276759177297187, + "language_loss": 0.75330698, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77927363, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.23583984, + "step": 1875, + "time_per_iteration": 4.14522123336792 + }, + { + "auxiliary_loss_clip": 0.01547226, + "auxiliary_loss_mlp": 0.01051711, + "balance_loss_clip": 1.34690762, + "balance_loss_mlp": 1.02896619, + "epoch": 0.11279122200511048, + "flos": 27094698500400.0, + "grad_norm": 1.7607939456536195, + "language_loss": 0.72189748, + "learning_rate": 3.928528384485984e-06, + "loss": 0.7478869, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.22741699, + "step": 1876, + "time_per_iteration": 2.868840456008911 + }, + { + "auxiliary_loss_clip": 0.01542418, + "auxiliary_loss_mlp": 0.01050514, + "balance_loss_clip": 1.3412267, + "balance_loss_mlp": 1.02691078, + "epoch": 0.11285134525777844, + "flos": 20192009637240.0, + "grad_norm": 1.7907597097820642, + "language_loss": 0.76955342, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79548281, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23583984, + "step": 1877, + "time_per_iteration": 5.8902387619018555 + }, + { + "auxiliary_loss_clip": 0.01545007, + "auxiliary_loss_mlp": 0.01054585, + "balance_loss_clip": 1.33841372, + "balance_loss_mlp": 1.03070748, + "epoch": 0.11291146851044641, + "flos": 12462275398320.0, + "grad_norm": 2.231871972366795, + "language_loss": 0.88317168, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90916759, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.23876953, + "step": 1878, + "time_per_iteration": 4.3595969676971436 + }, + { + "auxiliary_loss_clip": 0.01542603, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.33759832, + "balance_loss_mlp": 1.0232935, + "epoch": 0.11297159176311439, + "flos": 23847662852280.0, + "grad_norm": 1.9364834900647931, + "language_loss": 0.81562614, + "learning_rate": 3.928218500477466e-06, + "loss": 0.84151137, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.22644043, + "step": 1879, + "time_per_iteration": 2.808269739151001 + }, + { + "auxiliary_loss_clip": 0.01543423, + "auxiliary_loss_mlp": 0.010527, + "balance_loss_clip": 1.33950686, + "balance_loss_mlp": 1.02840495, + "epoch": 0.11303171501578235, + "flos": 29936040383520.0, + "grad_norm": 1.8609102929738832, + "language_loss": 0.71038508, + "learning_rate": 3.928115059566259e-06, + "loss": 0.73634636, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.24304199, + "step": 1880, + "time_per_iteration": 2.8524527549743652 + }, + { + "auxiliary_loss_clip": 0.0153934, + "auxiliary_loss_mlp": 0.01045814, + "balance_loss_clip": 1.34022522, + "balance_loss_mlp": 1.02340269, + "epoch": 0.11309183826845032, + "flos": 16184939053320.0, + "grad_norm": 1.5906887241510514, + "language_loss": 0.72708875, + "learning_rate": 3.928011545540734e-06, + "loss": 0.7529403, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.22412109, + "step": 1881, + "time_per_iteration": 2.7316222190856934 + }, + { + "auxiliary_loss_clip": 0.0154117, + "auxiliary_loss_mlp": 0.01051156, + "balance_loss_clip": 1.33599806, + "balance_loss_mlp": 1.02696824, + "epoch": 0.1131519615211183, + "flos": 12024477401760.0, + "grad_norm": 2.094020198059566, + "language_loss": 0.75236607, + "learning_rate": 3.927907958404819e-06, + "loss": 0.77828932, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.24206543, + "step": 1882, + "time_per_iteration": 2.7332308292388916 + }, + { + "auxiliary_loss_clip": 0.0154959, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.34782887, + "balance_loss_mlp": 1.03213525, + "epoch": 0.11321208477378626, + "flos": 26255754875160.0, + "grad_norm": 1.963404206771846, + "language_loss": 0.79691952, + "learning_rate": 3.92780429816244e-06, + "loss": 0.82297832, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.24157715, + "step": 1883, + "time_per_iteration": 2.833131790161133 + }, + { + "auxiliary_loss_clip": 0.0154322, + "auxiliary_loss_mlp": 0.01056009, + "balance_loss_clip": 1.34022903, + "balance_loss_mlp": 1.03225017, + "epoch": 0.11327220802645423, + "flos": 13630399992360.0, + "grad_norm": 1.7992913139513422, + "language_loss": 0.771321, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79731327, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.2376709, + "step": 1884, + "time_per_iteration": 2.8923444747924805 + }, + { + "auxiliary_loss_clip": 0.01369081, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.25528073, + "balance_loss_mlp": 1.01906013, + "epoch": 0.1133323312791222, + "flos": 57206464549560.0, + "grad_norm": 0.7956773839720491, + "language_loss": 0.55239493, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57635313, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.07666016, + "step": 1885, + "time_per_iteration": 3.17130184173584 + }, + { + "auxiliary_loss_clip": 0.01530782, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.33401823, + "balance_loss_mlp": 1.02205157, + "epoch": 0.11339245453179017, + "flos": 24356938208400.0, + "grad_norm": 1.8096871706263054, + "language_loss": 0.90986502, + "learning_rate": 3.927492878835848e-06, + "loss": 0.93561673, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.22338867, + "step": 1886, + "time_per_iteration": 2.775373935699463 + }, + { + "auxiliary_loss_clip": 0.01544261, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.34466982, + "balance_loss_mlp": 1.03038812, + "epoch": 0.11345257778445814, + "flos": 22675355597160.0, + "grad_norm": 1.7395326559565016, + "language_loss": 0.85377413, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87974441, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.22375488, + "step": 1887, + "time_per_iteration": 2.7879860401153564 + }, + { + "auxiliary_loss_clip": 0.01537957, + "auxiliary_loss_mlp": 0.01056713, + "balance_loss_clip": 1.33785343, + "balance_loss_mlp": 1.03382516, + "epoch": 0.11351270103712612, + "flos": 20992148651520.0, + "grad_norm": 2.8938134047504303, + "language_loss": 0.77493465, + "learning_rate": 3.927284900491277e-06, + "loss": 0.80088133, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.22888184, + "step": 1888, + "time_per_iteration": 2.752887487411499 + }, + { + "auxiliary_loss_clip": 0.01558968, + "auxiliary_loss_mlp": 0.01058135, + "balance_loss_clip": 1.35355163, + "balance_loss_mlp": 1.03289807, + "epoch": 0.11357282428979408, + "flos": 37355801407920.0, + "grad_norm": 1.7305793316485643, + "language_loss": 0.68768334, + "learning_rate": 3.927180801692764e-06, + "loss": 0.71385437, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.25244141, + "step": 1889, + "time_per_iteration": 3.082475423812866 + }, + { + "auxiliary_loss_clip": 0.01541198, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.3419801, + "balance_loss_mlp": 1.02086186, + "epoch": 0.11363294754246205, + "flos": 21761361076680.0, + "grad_norm": 1.8694671163552732, + "language_loss": 0.84734297, + "learning_rate": 3.927076629815362e-06, + "loss": 0.87319624, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.23278809, + "step": 1890, + "time_per_iteration": 2.8830296993255615 + }, + { + "auxiliary_loss_clip": 0.0153823, + "auxiliary_loss_mlp": 0.01052782, + "balance_loss_clip": 1.33857012, + "balance_loss_mlp": 1.03029895, + "epoch": 0.11369307079513001, + "flos": 22606802039520.0, + "grad_norm": 2.0775438539575837, + "language_loss": 0.65531904, + "learning_rate": 3.926972384863022e-06, + "loss": 0.68122911, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.22485352, + "step": 1891, + "time_per_iteration": 2.7550246715545654 + }, + { + "auxiliary_loss_clip": 0.01551468, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.34750795, + "balance_loss_mlp": 1.02093828, + "epoch": 0.11375319404779799, + "flos": 21949252269120.0, + "grad_norm": 2.6253646316241515, + "language_loss": 0.89145792, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.91740113, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.21899414, + "step": 1892, + "time_per_iteration": 2.811317205429077 + }, + { + "auxiliary_loss_clip": 0.01548536, + "auxiliary_loss_mlp": 0.01057056, + "balance_loss_clip": 1.34463823, + "balance_loss_mlp": 1.03392959, + "epoch": 0.11381331730046595, + "flos": 26401024270800.0, + "grad_norm": 3.721680695533674, + "language_loss": 0.73727351, + "learning_rate": 3.926763675749339e-06, + "loss": 0.76332939, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.23144531, + "step": 1893, + "time_per_iteration": 2.839477777481079 + }, + { + "auxiliary_loss_clip": 0.01545025, + "auxiliary_loss_mlp": 0.01050992, + "balance_loss_clip": 1.34452844, + "balance_loss_mlp": 1.02741253, + "epoch": 0.11387344055313392, + "flos": 23809914058680.0, + "grad_norm": 1.7637130104325567, + "language_loss": 0.79938173, + "learning_rate": 3.92665921159591e-06, + "loss": 0.82534194, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.23571777, + "step": 1894, + "time_per_iteration": 2.8854997158050537 + }, + { + "auxiliary_loss_clip": 0.01554614, + "auxiliary_loss_mlp": 0.01052792, + "balance_loss_clip": 1.34978271, + "balance_loss_mlp": 1.02852058, + "epoch": 0.1139335638058019, + "flos": 34528330602720.0, + "grad_norm": 3.3525682776799983, + "language_loss": 0.80408776, + "learning_rate": 3.926554674383371e-06, + "loss": 0.83016181, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.24304199, + "step": 1895, + "time_per_iteration": 2.9595565795898438 + }, + { + "auxiliary_loss_clip": 0.01363486, + "auxiliary_loss_mlp": 0.01007183, + "balance_loss_clip": 1.25065231, + "balance_loss_mlp": 1.00031614, + "epoch": 0.11399368705846986, + "flos": 70604970868440.0, + "grad_norm": 0.801481779010251, + "language_loss": 0.63364005, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65734673, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.06884766, + "step": 1896, + "time_per_iteration": 3.3608551025390625 + }, + { + "auxiliary_loss_clip": 0.01548343, + "auxiliary_loss_mlp": 0.01056422, + "balance_loss_clip": 1.34999251, + "balance_loss_mlp": 1.03168607, + "epoch": 0.11405381031113783, + "flos": 21329207642160.0, + "grad_norm": 1.5506684458027247, + "language_loss": 0.85353899, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87958664, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.24755859, + "step": 1897, + "time_per_iteration": 2.8415966033935547 + }, + { + "auxiliary_loss_clip": 0.01554239, + "auxiliary_loss_mlp": 0.01057551, + "balance_loss_clip": 1.35395813, + "balance_loss_mlp": 1.03436518, + "epoch": 0.11411393356380581, + "flos": 19724462694000.0, + "grad_norm": 2.1755997078658487, + "language_loss": 0.79792714, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.824045, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.23181152, + "step": 1898, + "time_per_iteration": 2.7829363346099854 + }, + { + "auxiliary_loss_clip": 0.01554093, + "auxiliary_loss_mlp": 0.01051778, + "balance_loss_clip": 1.35091352, + "balance_loss_mlp": 1.02745938, + "epoch": 0.11417405681647377, + "flos": 17534985410880.0, + "grad_norm": 4.649795604825157, + "language_loss": 0.73651129, + "learning_rate": 3.926135795021435e-06, + "loss": 0.76257002, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.2434082, + "step": 1899, + "time_per_iteration": 2.766960859298706 + }, + { + "auxiliary_loss_clip": 0.01371696, + "auxiliary_loss_mlp": 0.01013449, + "balance_loss_clip": 1.25949144, + "balance_loss_mlp": 1.00653458, + "epoch": 0.11423418006914174, + "flos": 59689891726200.0, + "grad_norm": 0.9118626526009717, + "language_loss": 0.63469934, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65855074, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.06933594, + "step": 1900, + "time_per_iteration": 3.2140047550201416 + }, + { + "auxiliary_loss_clip": 0.01549775, + "auxiliary_loss_mlp": 0.01057198, + "balance_loss_clip": 1.35032213, + "balance_loss_mlp": 1.0332011, + "epoch": 0.1142943033218097, + "flos": 22967884198080.0, + "grad_norm": 1.5860756105824703, + "language_loss": 0.78435534, + "learning_rate": 3.925925917089001e-06, + "loss": 0.81042504, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.23999023, + "step": 1901, + "time_per_iteration": 2.8472819328308105 + }, + { + "auxiliary_loss_clip": 0.01559866, + "auxiliary_loss_mlp": 0.01057178, + "balance_loss_clip": 1.36041868, + "balance_loss_mlp": 1.03486228, + "epoch": 0.11435442657447768, + "flos": 18260398396800.0, + "grad_norm": 1.9495650263898525, + "language_loss": 0.84905934, + "learning_rate": 3.925820868573839e-06, + "loss": 0.87522978, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.2232666, + "step": 1902, + "time_per_iteration": 2.756539821624756 + }, + { + "auxiliary_loss_clip": 0.01553346, + "auxiliary_loss_mlp": 0.01052238, + "balance_loss_clip": 1.35317838, + "balance_loss_mlp": 1.02837276, + "epoch": 0.11441454982714565, + "flos": 24066561850560.0, + "grad_norm": 1.5710933021351037, + "language_loss": 0.78284204, + "learning_rate": 3.925715747031356e-06, + "loss": 0.80889785, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.23864746, + "step": 1903, + "time_per_iteration": 2.7955751419067383 + }, + { + "auxiliary_loss_clip": 0.01553476, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.35430527, + "balance_loss_mlp": 1.0217557, + "epoch": 0.11447467307981361, + "flos": 25343181647280.0, + "grad_norm": 1.7552236292662404, + "language_loss": 0.75910956, + "learning_rate": 3.925610552465539e-06, + "loss": 0.7850703, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.20849609, + "step": 1904, + "time_per_iteration": 2.779575824737549 + }, + { + "auxiliary_loss_clip": 0.01552731, + "auxiliary_loss_mlp": 0.010542, + "balance_loss_clip": 1.3551563, + "balance_loss_mlp": 1.02979839, + "epoch": 0.11453479633248159, + "flos": 21731124829680.0, + "grad_norm": 2.205505586813019, + "language_loss": 0.92903817, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.95510757, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.24401855, + "step": 1905, + "time_per_iteration": 2.795449733734131 + }, + { + "auxiliary_loss_clip": 0.01568756, + "auxiliary_loss_mlp": 0.01046004, + "balance_loss_clip": 1.35943675, + "balance_loss_mlp": 1.02175677, + "epoch": 0.11459491958514956, + "flos": 12973662389160.0, + "grad_norm": 2.24600251515493, + "language_loss": 0.78819466, + "learning_rate": 3.925399944279861e-06, + "loss": 0.81434226, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.24243164, + "step": 1906, + "time_per_iteration": 2.8098087310791016 + }, + { + "auxiliary_loss_clip": 0.01554316, + "auxiliary_loss_mlp": 0.01060444, + "balance_loss_clip": 1.35406971, + "balance_loss_mlp": 1.03648305, + "epoch": 0.11465504283781752, + "flos": 22716474884640.0, + "grad_norm": 2.0933363891123333, + "language_loss": 0.82284808, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84899569, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.23962402, + "step": 1907, + "time_per_iteration": 2.8496241569519043 + }, + { + "auxiliary_loss_clip": 0.01551726, + "auxiliary_loss_mlp": 0.01054672, + "balance_loss_clip": 1.3508538, + "balance_loss_mlp": 1.03267813, + "epoch": 0.1147151660904855, + "flos": 23402880217800.0, + "grad_norm": 2.0666198104592834, + "language_loss": 0.85327446, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87933844, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.2199707, + "step": 1908, + "time_per_iteration": 2.7550463676452637 + }, + { + "auxiliary_loss_clip": 0.01379261, + "auxiliary_loss_mlp": 0.01022247, + "balance_loss_clip": 1.26880157, + "balance_loss_mlp": 1.01495099, + "epoch": 0.11477528934315347, + "flos": 63024955963560.0, + "grad_norm": 0.9203580050519572, + "language_loss": 0.61012965, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63414478, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.07275391, + "step": 1909, + "time_per_iteration": 2.990347146987915 + }, + { + "auxiliary_loss_clip": 0.01549082, + "auxiliary_loss_mlp": 0.01048924, + "balance_loss_clip": 1.35067189, + "balance_loss_mlp": 1.02685869, + "epoch": 0.11483541259582143, + "flos": 16329883582080.0, + "grad_norm": 1.8377468703292406, + "language_loss": 0.79520679, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8211869, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.22058105, + "step": 1910, + "time_per_iteration": 2.7635772228240967 + }, + { + "auxiliary_loss_clip": 0.01558682, + "auxiliary_loss_mlp": 0.01052133, + "balance_loss_clip": 1.35691357, + "balance_loss_mlp": 1.02845788, + "epoch": 0.1148955358484894, + "flos": 21585489958800.0, + "grad_norm": 1.942542040623799, + "language_loss": 0.77661228, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.80272043, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.23669434, + "step": 1911, + "time_per_iteration": 2.7690188884735107 + }, + { + "auxiliary_loss_clip": 0.01540112, + "auxiliary_loss_mlp": 0.01049489, + "balance_loss_clip": 1.34714246, + "balance_loss_mlp": 1.02631438, + "epoch": 0.11495565910115738, + "flos": 27679593268800.0, + "grad_norm": 1.5802782219846412, + "language_loss": 0.7995283, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.82542431, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.23168945, + "step": 1912, + "time_per_iteration": 2.90262508392334 + }, + { + "auxiliary_loss_clip": 0.0155129, + "auxiliary_loss_mlp": 0.01057603, + "balance_loss_clip": 1.35347199, + "balance_loss_mlp": 1.03276002, + "epoch": 0.11501578235382534, + "flos": 20637401397120.0, + "grad_norm": 1.9721301243625573, + "language_loss": 0.78962559, + "learning_rate": 3.924660515982246e-06, + "loss": 0.81571454, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.24853516, + "step": 1913, + "time_per_iteration": 4.314927816390991 + }, + { + "auxiliary_loss_clip": 0.01564836, + "auxiliary_loss_mlp": 0.01052125, + "balance_loss_clip": 1.3656348, + "balance_loss_mlp": 1.0280683, + "epoch": 0.1150759056064933, + "flos": 19833891888960.0, + "grad_norm": 1.6963714134573598, + "language_loss": 0.70813191, + "learning_rate": 3.924554591402939e-06, + "loss": 0.73430151, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.24084473, + "step": 1914, + "time_per_iteration": 2.8773860931396484 + }, + { + "auxiliary_loss_clip": 0.01383744, + "auxiliary_loss_mlp": 0.01008108, + "balance_loss_clip": 1.27450919, + "balance_loss_mlp": 1.00095522, + "epoch": 0.11513602885916129, + "flos": 70063550672400.0, + "grad_norm": 0.74678866857644, + "language_loss": 0.61151206, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63543057, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.07128906, + "step": 1915, + "time_per_iteration": 4.913774490356445 + }, + { + "auxiliary_loss_clip": 0.01553312, + "auxiliary_loss_mlp": 0.01058266, + "balance_loss_clip": 1.35421026, + "balance_loss_mlp": 1.03418612, + "epoch": 0.11519615211182925, + "flos": 15745922805960.0, + "grad_norm": 1.9777027505566827, + "language_loss": 0.93726361, + "learning_rate": 3.924342523310436e-06, + "loss": 0.96337938, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.24072266, + "step": 1916, + "time_per_iteration": 2.7971303462982178 + }, + { + "auxiliary_loss_clip": 0.01561108, + "auxiliary_loss_mlp": 0.01057985, + "balance_loss_clip": 1.36084735, + "balance_loss_mlp": 1.03338003, + "epoch": 0.11525627536449722, + "flos": 20672510647320.0, + "grad_norm": 1.715479714728479, + "language_loss": 0.72874296, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.75493389, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.24572754, + "step": 1917, + "time_per_iteration": 5.64441990852356 + }, + { + "auxiliary_loss_clip": 0.01557151, + "auxiliary_loss_mlp": 0.01055968, + "balance_loss_clip": 1.35969841, + "balance_loss_mlp": 1.03051686, + "epoch": 0.1153163986171652, + "flos": 20308179819960.0, + "grad_norm": 1.769845573058344, + "language_loss": 0.75291133, + "learning_rate": 3.92413016333289e-06, + "loss": 0.77904248, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.2545166, + "step": 1918, + "time_per_iteration": 2.8146896362304688 + }, + { + "auxiliary_loss_clip": 0.01562223, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.35898447, + "balance_loss_mlp": 1.02467251, + "epoch": 0.11537652186983316, + "flos": 17643805480440.0, + "grad_norm": 1.9251297978776987, + "language_loss": 0.87128031, + "learning_rate": 3.92402387389729e-06, + "loss": 0.89738357, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.23474121, + "step": 1919, + "time_per_iteration": 2.733933210372925 + }, + { + "auxiliary_loss_clip": 0.01549078, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_clip": 1.35142517, + "balance_loss_mlp": 1.03550005, + "epoch": 0.11543664512250112, + "flos": 21074387226480.0, + "grad_norm": 1.8493031008060086, + "language_loss": 0.8668164, + "learning_rate": 3.923917511502512e-06, + "loss": 0.89290726, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.24499512, + "step": 1920, + "time_per_iteration": 2.7457046508789062 + }, + { + "auxiliary_loss_clip": 0.01557273, + "auxiliary_loss_mlp": 0.01047295, + "balance_loss_clip": 1.36133647, + "balance_loss_mlp": 1.02459764, + "epoch": 0.11549676837516909, + "flos": 22752721168920.0, + "grad_norm": 2.0592901599179116, + "language_loss": 0.79892695, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82497257, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.22717285, + "step": 1921, + "time_per_iteration": 2.8311901092529297 + }, + { + "auxiliary_loss_clip": 0.01562234, + "auxiliary_loss_mlp": 0.01061642, + "balance_loss_clip": 1.35788846, + "balance_loss_mlp": 1.03676271, + "epoch": 0.11555689162783707, + "flos": 19173296491560.0, + "grad_norm": 1.8617770450913558, + "language_loss": 0.78581393, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81205273, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.24865723, + "step": 1922, + "time_per_iteration": 2.738797903060913 + }, + { + "auxiliary_loss_clip": 0.01558899, + "auxiliary_loss_mlp": 0.01057871, + "balance_loss_clip": 1.35686815, + "balance_loss_mlp": 1.03480434, + "epoch": 0.11561701488050503, + "flos": 24577502149440.0, + "grad_norm": 2.2456071622170284, + "language_loss": 0.84957236, + "learning_rate": 3.923597986603456e-06, + "loss": 0.87574005, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.23059082, + "step": 1923, + "time_per_iteration": 2.791323184967041 + }, + { + "auxiliary_loss_clip": 0.01564673, + "auxiliary_loss_mlp": 0.01061416, + "balance_loss_clip": 1.36338222, + "balance_loss_mlp": 1.0356189, + "epoch": 0.115677138133173, + "flos": 17097024980880.0, + "grad_norm": 1.8613842635517643, + "language_loss": 0.8083123, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83457321, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.25830078, + "step": 1924, + "time_per_iteration": 2.795527935028076 + }, + { + "auxiliary_loss_clip": 0.01379932, + "auxiliary_loss_mlp": 0.01025082, + "balance_loss_clip": 1.27015686, + "balance_loss_mlp": 1.01883495, + "epoch": 0.11573726138584098, + "flos": 62719612102080.0, + "grad_norm": 0.8153570680686247, + "language_loss": 0.61229366, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63634378, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.0625, + "step": 1925, + "time_per_iteration": 3.357940196990967 + }, + { + "auxiliary_loss_clip": 0.01551981, + "auxiliary_loss_mlp": 0.01076696, + "balance_loss_clip": 1.35161567, + "balance_loss_mlp": 1.05181754, + "epoch": 0.11579738463850894, + "flos": 22606233522480.0, + "grad_norm": 1.6067409264440218, + "language_loss": 0.7563321, + "learning_rate": 3.923277805217161e-06, + "loss": 0.78261888, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.2487793, + "step": 1926, + "time_per_iteration": 2.7552599906921387 + }, + { + "auxiliary_loss_clip": 0.01558928, + "auxiliary_loss_mlp": 0.01064281, + "balance_loss_clip": 1.35289085, + "balance_loss_mlp": 1.03817439, + "epoch": 0.11585750789117691, + "flos": 21731206046400.0, + "grad_norm": 2.942286200115588, + "language_loss": 0.73413181, + "learning_rate": 3.923170932221222e-06, + "loss": 0.76036388, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.26135254, + "step": 1927, + "time_per_iteration": 2.8290560245513916 + }, + { + "auxiliary_loss_clip": 0.01547308, + "auxiliary_loss_mlp": 0.01056105, + "balance_loss_clip": 1.34771967, + "balance_loss_mlp": 1.0330143, + "epoch": 0.11591763114384489, + "flos": 26293300626960.0, + "grad_norm": 1.8402792900396585, + "language_loss": 0.87017393, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89620805, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.23083496, + "step": 1928, + "time_per_iteration": 2.7993814945220947 + }, + { + "auxiliary_loss_clip": 0.01547534, + "auxiliary_loss_mlp": 0.01058372, + "balance_loss_clip": 1.34573698, + "balance_loss_mlp": 1.03357613, + "epoch": 0.11597775439651285, + "flos": 23005348733160.0, + "grad_norm": 1.8454559276402527, + "language_loss": 0.77883488, + "learning_rate": 3.922956967452898e-06, + "loss": 0.80489397, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.24816895, + "step": 1929, + "time_per_iteration": 2.7775402069091797 + }, + { + "auxiliary_loss_clip": 0.01541923, + "auxiliary_loss_mlp": 0.01062956, + "balance_loss_clip": 1.34401512, + "balance_loss_mlp": 1.0395906, + "epoch": 0.11603787764918082, + "flos": 31948062822720.0, + "grad_norm": 1.6305952062846154, + "language_loss": 0.77239275, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79844159, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.23388672, + "step": 1930, + "time_per_iteration": 2.881626844406128 + }, + { + "auxiliary_loss_clip": 0.01543376, + "auxiliary_loss_mlp": 0.01061162, + "balance_loss_clip": 1.34438813, + "balance_loss_mlp": 1.0363425, + "epoch": 0.1160980009018488, + "flos": 22276565253360.0, + "grad_norm": 1.9083817658320832, + "language_loss": 0.72601604, + "learning_rate": 3.922742711009693e-06, + "loss": 0.75206137, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.24841309, + "step": 1931, + "time_per_iteration": 2.781510829925537 + }, + { + "auxiliary_loss_clip": 0.01545125, + "auxiliary_loss_mlp": 0.0105266, + "balance_loss_clip": 1.3441937, + "balance_loss_mlp": 1.0272212, + "epoch": 0.11615812415451676, + "flos": 22788764411400.0, + "grad_norm": 1.5090079224916702, + "language_loss": 0.82413018, + "learning_rate": 3.922635473420164e-06, + "loss": 0.85010803, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.25476074, + "step": 1932, + "time_per_iteration": 2.852281332015991 + }, + { + "auxiliary_loss_clip": 0.0135244, + "auxiliary_loss_mlp": 0.0100896, + "balance_loss_clip": 1.24346352, + "balance_loss_mlp": 1.0031426, + "epoch": 0.11621824740718473, + "flos": 67161232013760.0, + "grad_norm": 0.7778511052519451, + "language_loss": 0.61135316, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63496715, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05810547, + "step": 1933, + "time_per_iteration": 3.184882164001465 + }, + { + "auxiliary_loss_clip": 0.01550103, + "auxiliary_loss_mlp": 0.01060402, + "balance_loss_clip": 1.34672892, + "balance_loss_mlp": 1.03464103, + "epoch": 0.11627837065985269, + "flos": 20380794213600.0, + "grad_norm": 1.9798262764984396, + "language_loss": 0.85948759, + "learning_rate": 3.922420779525586e-06, + "loss": 0.88559264, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.2578125, + "step": 1934, + "time_per_iteration": 2.802147150039673 + }, + { + "auxiliary_loss_clip": 0.01558363, + "auxiliary_loss_mlp": 0.0106067, + "balance_loss_clip": 1.34945583, + "balance_loss_mlp": 1.03439641, + "epoch": 0.11633849391252067, + "flos": 21730840571160.0, + "grad_norm": 2.1743715921782103, + "language_loss": 0.667858, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.6940484, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.26293945, + "step": 1935, + "time_per_iteration": 2.8088693618774414 + }, + { + "auxiliary_loss_clip": 0.01544401, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_clip": 1.34146595, + "balance_loss_mlp": 1.02732873, + "epoch": 0.11639861716518864, + "flos": 18809777831400.0, + "grad_norm": 1.9608284581561468, + "language_loss": 0.76241791, + "learning_rate": 3.922205794037456e-06, + "loss": 0.78835613, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.2208252, + "step": 1936, + "time_per_iteration": 2.829631805419922 + }, + { + "auxiliary_loss_clip": 0.01539384, + "auxiliary_loss_mlp": 0.01061391, + "balance_loss_clip": 1.33534741, + "balance_loss_mlp": 1.0349741, + "epoch": 0.1164587404178566, + "flos": 21219940880640.0, + "grad_norm": 1.7259401590014456, + "language_loss": 0.84816211, + "learning_rate": 3.922098191955998e-06, + "loss": 0.87416995, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.2644043, + "step": 1937, + "time_per_iteration": 2.875401020050049 + }, + { + "auxiliary_loss_clip": 0.01524015, + "auxiliary_loss_mlp": 0.0105677, + "balance_loss_clip": 1.32727909, + "balance_loss_mlp": 1.03339338, + "epoch": 0.11651886367052458, + "flos": 27824415972480.0, + "grad_norm": 2.7793592481363105, + "language_loss": 0.76598942, + "learning_rate": 3.921990516988384e-06, + "loss": 0.79179734, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.23364258, + "step": 1938, + "time_per_iteration": 2.7941033840179443 + }, + { + "auxiliary_loss_clip": 0.01535334, + "auxiliary_loss_mlp": 0.01056882, + "balance_loss_clip": 1.33214581, + "balance_loss_mlp": 1.03323078, + "epoch": 0.11657898692319255, + "flos": 22894254595440.0, + "grad_norm": 1.8663419900150253, + "language_loss": 0.79963046, + "learning_rate": 3.921882769138696e-06, + "loss": 0.82555264, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.23669434, + "step": 1939, + "time_per_iteration": 2.8112785816192627 + }, + { + "auxiliary_loss_clip": 0.0153334, + "auxiliary_loss_mlp": 0.01062221, + "balance_loss_clip": 1.33253658, + "balance_loss_mlp": 1.03849792, + "epoch": 0.11663911017586051, + "flos": 24321219832800.0, + "grad_norm": 2.3963107540960493, + "language_loss": 0.86938179, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.89533734, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23718262, + "step": 1940, + "time_per_iteration": 2.7605271339416504 + }, + { + "auxiliary_loss_clip": 0.01520733, + "auxiliary_loss_mlp": 0.01054537, + "balance_loss_clip": 1.32589006, + "balance_loss_mlp": 1.03217375, + "epoch": 0.11669923342852849, + "flos": 42347856571560.0, + "grad_norm": 1.3664515332442155, + "language_loss": 0.76046801, + "learning_rate": 3.921667054809449e-06, + "loss": 0.78622067, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.22363281, + "step": 1941, + "time_per_iteration": 2.9742860794067383 + }, + { + "auxiliary_loss_clip": 0.01522142, + "auxiliary_loss_mlp": 0.01067833, + "balance_loss_clip": 1.32245708, + "balance_loss_mlp": 1.04444432, + "epoch": 0.11675935668119646, + "flos": 14645702035800.0, + "grad_norm": 1.9842090201431102, + "language_loss": 0.8872081, + "learning_rate": 3.921559088338068e-06, + "loss": 0.91310787, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.23364258, + "step": 1942, + "time_per_iteration": 2.7297825813293457 + }, + { + "auxiliary_loss_clip": 0.01513707, + "auxiliary_loss_mlp": 0.01051196, + "balance_loss_clip": 1.31898999, + "balance_loss_mlp": 1.02990484, + "epoch": 0.11681947993386442, + "flos": 35123296244400.0, + "grad_norm": 1.6057964637380677, + "language_loss": 0.68198246, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70763147, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.21289062, + "step": 1943, + "time_per_iteration": 2.940678358078003 + }, + { + "auxiliary_loss_clip": 0.01510265, + "auxiliary_loss_mlp": 0.01050079, + "balance_loss_clip": 1.31346619, + "balance_loss_mlp": 1.02646387, + "epoch": 0.11687960318653239, + "flos": 38990498344560.0, + "grad_norm": 1.8079774350986426, + "language_loss": 0.70197821, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72758168, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.23620605, + "step": 1944, + "time_per_iteration": 2.9040145874023438 + }, + { + "auxiliary_loss_clip": 0.01511937, + "auxiliary_loss_mlp": 0.01054277, + "balance_loss_clip": 1.31602335, + "balance_loss_mlp": 1.03277218, + "epoch": 0.11693972643920036, + "flos": 26000609592600.0, + "grad_norm": 1.4863860242497258, + "language_loss": 0.82561576, + "learning_rate": 3.921234751746038e-06, + "loss": 0.85127783, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.21508789, + "step": 1945, + "time_per_iteration": 2.826138734817505 + }, + { + "auxiliary_loss_clip": 0.01510694, + "auxiliary_loss_mlp": 0.01050368, + "balance_loss_clip": 1.31382084, + "balance_loss_mlp": 1.02830267, + "epoch": 0.11699984969186833, + "flos": 27277919731440.0, + "grad_norm": 1.8726834950706441, + "language_loss": 0.77444077, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.80005133, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.22070312, + "step": 1946, + "time_per_iteration": 2.8517425060272217 + }, + { + "auxiliary_loss_clip": 0.01497682, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.30249095, + "balance_loss_mlp": 1.02830207, + "epoch": 0.1170599729445363, + "flos": 15272040958560.0, + "grad_norm": 1.924236094819162, + "language_loss": 0.68954223, + "learning_rate": 3.921018163077448e-06, + "loss": 0.7150144, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.21240234, + "step": 1947, + "time_per_iteration": 2.6991665363311768 + }, + { + "auxiliary_loss_clip": 0.01510789, + "auxiliary_loss_mlp": 0.01063316, + "balance_loss_clip": 1.31310165, + "balance_loss_mlp": 1.03971219, + "epoch": 0.11712009619720427, + "flos": 17168786598960.0, + "grad_norm": 1.6291828012762712, + "language_loss": 0.85804319, + "learning_rate": 3.920909759473295e-06, + "loss": 0.88378417, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.23608398, + "step": 1948, + "time_per_iteration": 2.764704465866089 + }, + { + "auxiliary_loss_clip": 0.01344544, + "auxiliary_loss_mlp": 0.01043281, + "balance_loss_clip": 1.22910929, + "balance_loss_mlp": 1.03727269, + "epoch": 0.11718021944987224, + "flos": 70956672495840.0, + "grad_norm": 0.8313432936431651, + "language_loss": 0.65154326, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67542148, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.06005859, + "step": 1949, + "time_per_iteration": 3.285517930984497 + }, + { + "auxiliary_loss_clip": 0.01504436, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_clip": 1.31001306, + "balance_loss_mlp": 1.02791357, + "epoch": 0.1172403427025402, + "flos": 27459151152840.0, + "grad_norm": 1.5089255762716358, + "language_loss": 0.72167802, + "learning_rate": 3.920692733745835e-06, + "loss": 0.74722052, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.21911621, + "step": 1950, + "time_per_iteration": 2.818044662475586 + }, + { + "auxiliary_loss_clip": 0.01507796, + "auxiliary_loss_mlp": 0.01060024, + "balance_loss_clip": 1.30874705, + "balance_loss_mlp": 1.03760123, + "epoch": 0.11730046595520818, + "flos": 15672902328720.0, + "grad_norm": 2.2888957672902492, + "language_loss": 0.76233923, + "learning_rate": 3.920584111630755e-06, + "loss": 0.78801751, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.22436523, + "step": 1951, + "time_per_iteration": 2.896768569946289 + }, + { + "auxiliary_loss_clip": 0.01506722, + "auxiliary_loss_mlp": 0.0106446, + "balance_loss_clip": 1.3092072, + "balance_loss_mlp": 1.04247761, + "epoch": 0.11736058920787615, + "flos": 25635750856560.0, + "grad_norm": 1.6774769748972727, + "language_loss": 0.76568723, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.79139906, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.22009277, + "step": 1952, + "time_per_iteration": 4.350861549377441 + }, + { + "auxiliary_loss_clip": 0.01512622, + "auxiliary_loss_mlp": 0.01059857, + "balance_loss_clip": 1.3155508, + "balance_loss_mlp": 1.03810096, + "epoch": 0.11742071246054411, + "flos": 21439327179240.0, + "grad_norm": 1.8815660978912023, + "language_loss": 0.72718412, + "learning_rate": 3.920366648918491e-06, + "loss": 0.75290889, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.21765137, + "step": 1953, + "time_per_iteration": 2.766913890838623 + }, + { + "auxiliary_loss_clip": 0.0151309, + "auxiliary_loss_mlp": 0.01070756, + "balance_loss_clip": 1.30905437, + "balance_loss_mlp": 1.04683089, + "epoch": 0.11748083571321208, + "flos": 16002164514240.0, + "grad_norm": 2.454730710156023, + "language_loss": 0.80025423, + "learning_rate": 3.920257808329552e-06, + "loss": 0.82609272, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.23913574, + "step": 1954, + "time_per_iteration": 2.812528371810913 + }, + { + "auxiliary_loss_clip": 0.01510586, + "auxiliary_loss_mlp": 0.01057903, + "balance_loss_clip": 1.30968177, + "balance_loss_mlp": 1.03526497, + "epoch": 0.11754095896588006, + "flos": 16184370536280.0, + "grad_norm": 1.8544113088283087, + "language_loss": 0.86053967, + "learning_rate": 3.920148894924246e-06, + "loss": 0.88622457, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.22644043, + "step": 1955, + "time_per_iteration": 5.883108377456665 + }, + { + "auxiliary_loss_clip": 0.015101, + "auxiliary_loss_mlp": 0.01056177, + "balance_loss_clip": 1.31128502, + "balance_loss_mlp": 1.03425407, + "epoch": 0.11760108221854802, + "flos": 13265581864680.0, + "grad_norm": 4.152785323901935, + "language_loss": 0.78654766, + "learning_rate": 3.920039908706701e-06, + "loss": 0.81221038, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.21911621, + "step": 1956, + "time_per_iteration": 4.203261375427246 + }, + { + "auxiliary_loss_clip": 0.01497245, + "auxiliary_loss_mlp": 0.01060027, + "balance_loss_clip": 1.30496895, + "balance_loss_mlp": 1.03836679, + "epoch": 0.11766120547121599, + "flos": 24503791330080.0, + "grad_norm": 1.9336680668796307, + "language_loss": 0.80878735, + "learning_rate": 3.91993084968105e-06, + "loss": 0.83436006, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.21655273, + "step": 1957, + "time_per_iteration": 2.770474672317505 + }, + { + "auxiliary_loss_clip": 0.01511274, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_clip": 1.31239378, + "balance_loss_mlp": 1.03634667, + "epoch": 0.11772132872388397, + "flos": 17788343925600.0, + "grad_norm": 2.3048578147558643, + "language_loss": 0.78490186, + "learning_rate": 3.919821717851428e-06, + "loss": 0.81059778, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.21948242, + "step": 1958, + "time_per_iteration": 2.809156656265259 + }, + { + "auxiliary_loss_clip": 0.01509413, + "auxiliary_loss_mlp": 0.01063971, + "balance_loss_clip": 1.31084728, + "balance_loss_mlp": 1.0413208, + "epoch": 0.11778145197655193, + "flos": 13219955049240.0, + "grad_norm": 1.7917010229113806, + "language_loss": 0.77129388, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79702771, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.22668457, + "step": 1959, + "time_per_iteration": 2.7435195446014404 + }, + { + "auxiliary_loss_clip": 0.01508144, + "auxiliary_loss_mlp": 0.0105745, + "balance_loss_clip": 1.31001461, + "balance_loss_mlp": 1.03683889, + "epoch": 0.1178415752292199, + "flos": 20235078126000.0, + "grad_norm": 1.882022433645838, + "language_loss": 0.70219201, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72784799, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.20617676, + "step": 1960, + "time_per_iteration": 2.805155038833618 + }, + { + "auxiliary_loss_clip": 0.01518456, + "auxiliary_loss_mlp": 0.01054638, + "balance_loss_clip": 1.3167398, + "balance_loss_mlp": 1.03238201, + "epoch": 0.11790169848188788, + "flos": 13043759064480.0, + "grad_norm": 2.2821466857440798, + "language_loss": 0.8206346, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.84636557, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.22265625, + "step": 1961, + "time_per_iteration": 2.7266058921813965 + }, + { + "auxiliary_loss_clip": 0.01497791, + "auxiliary_loss_mlp": 0.01061567, + "balance_loss_clip": 1.30484998, + "balance_loss_mlp": 1.04035974, + "epoch": 0.11796182173455584, + "flos": 22269864873960.0, + "grad_norm": 1.7216169045489023, + "language_loss": 0.92869365, + "learning_rate": 3.919384462576049e-06, + "loss": 0.95428729, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.21191406, + "step": 1962, + "time_per_iteration": 2.7737512588500977 + }, + { + "auxiliary_loss_clip": 0.01509483, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.3105371, + "balance_loss_mlp": 1.0301677, + "epoch": 0.1180219449872238, + "flos": 10638712668600.0, + "grad_norm": 1.9787564999694092, + "language_loss": 0.88424981, + "learning_rate": 3.919274966788707e-06, + "loss": 0.90986526, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.21887207, + "step": 1963, + "time_per_iteration": 2.757111072540283 + }, + { + "auxiliary_loss_clip": 0.01515961, + "auxiliary_loss_mlp": 0.01062, + "balance_loss_clip": 1.31585383, + "balance_loss_mlp": 1.04048288, + "epoch": 0.11808206823989177, + "flos": 20928508705440.0, + "grad_norm": 2.082690313813425, + "language_loss": 0.84457588, + "learning_rate": 3.919165398222265e-06, + "loss": 0.87035549, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.21520996, + "step": 1964, + "time_per_iteration": 2.7660303115844727 + }, + { + "auxiliary_loss_clip": 0.01518833, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.32314014, + "balance_loss_mlp": 1.04109287, + "epoch": 0.11814219149255975, + "flos": 20782752009480.0, + "grad_norm": 1.8357161104309936, + "language_loss": 0.83461595, + "learning_rate": 3.919055756880879e-06, + "loss": 0.86042774, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.21240234, + "step": 1965, + "time_per_iteration": 2.768777847290039 + }, + { + "auxiliary_loss_clip": 0.01511496, + "auxiliary_loss_mlp": 0.01062407, + "balance_loss_clip": 1.31182981, + "balance_loss_mlp": 1.04031765, + "epoch": 0.11820231474522772, + "flos": 48768257656800.0, + "grad_norm": 2.4505321709172545, + "language_loss": 0.75241339, + "learning_rate": 3.918946042768707e-06, + "loss": 0.77815247, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.22094727, + "step": 1966, + "time_per_iteration": 3.0738766193389893 + }, + { + "auxiliary_loss_clip": 0.01519945, + "auxiliary_loss_mlp": 0.01059417, + "balance_loss_clip": 1.3193692, + "balance_loss_mlp": 1.03601646, + "epoch": 0.11826243799789568, + "flos": 16694945359920.0, + "grad_norm": 2.0652234711731423, + "language_loss": 0.73027575, + "learning_rate": 3.918836255889908e-06, + "loss": 0.7560693, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.23413086, + "step": 1967, + "time_per_iteration": 2.733491897583008 + }, + { + "auxiliary_loss_clip": 0.015106, + "auxiliary_loss_mlp": 0.01053751, + "balance_loss_clip": 1.31227922, + "balance_loss_mlp": 1.03136373, + "epoch": 0.11832256125056366, + "flos": 16914291050160.0, + "grad_norm": 2.030392638177889, + "language_loss": 0.89342928, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.91907281, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.22387695, + "step": 1968, + "time_per_iteration": 2.7618143558502197 + }, + { + "auxiliary_loss_clip": 0.01508738, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.31352913, + "balance_loss_mlp": 1.02739477, + "epoch": 0.11838268450323162, + "flos": 22825579212720.0, + "grad_norm": 2.115217220521631, + "language_loss": 0.66978055, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69536102, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.21899414, + "step": 1969, + "time_per_iteration": 2.794020175933838 + }, + { + "auxiliary_loss_clip": 0.01513729, + "auxiliary_loss_mlp": 0.01050836, + "balance_loss_clip": 1.31886053, + "balance_loss_mlp": 1.02885377, + "epoch": 0.11844280775589959, + "flos": 33552239253840.0, + "grad_norm": 1.96444332031504, + "language_loss": 0.81103837, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83668411, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.21972656, + "step": 1970, + "time_per_iteration": 2.885481834411621 + }, + { + "auxiliary_loss_clip": 0.01442096, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_clip": 1.32410121, + "balance_loss_mlp": 1.040519, + "epoch": 0.11850293100856757, + "flos": 66367021820040.0, + "grad_norm": 0.8095436064610138, + "language_loss": 0.66143453, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68633556, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.07470703, + "step": 1971, + "time_per_iteration": 3.2664272785186768 + }, + { + "auxiliary_loss_clip": 0.01515169, + "auxiliary_loss_mlp": 0.01047692, + "balance_loss_clip": 1.3177048, + "balance_loss_mlp": 1.02611554, + "epoch": 0.11856305426123553, + "flos": 24686444044080.0, + "grad_norm": 1.862977813759849, + "language_loss": 0.8029654, + "learning_rate": 3.918286230142327e-06, + "loss": 0.82859397, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.21557617, + "step": 1972, + "time_per_iteration": 2.8684136867523193 + }, + { + "auxiliary_loss_clip": 0.01507005, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.31322587, + "balance_loss_mlp": 1.02282512, + "epoch": 0.1186231775139035, + "flos": 24285501457200.0, + "grad_norm": 2.01062246624341, + "language_loss": 0.73181915, + "learning_rate": 3.918176006751292e-06, + "loss": 0.75734234, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.22473145, + "step": 1973, + "time_per_iteration": 2.817808151245117 + }, + { + "auxiliary_loss_clip": 0.0150393, + "auxiliary_loss_mlp": 0.01043055, + "balance_loss_clip": 1.3106699, + "balance_loss_mlp": 1.02208614, + "epoch": 0.11868330076657148, + "flos": 21761807768640.0, + "grad_norm": 1.7304004546938387, + "language_loss": 0.72652984, + "learning_rate": 3.918065710622832e-06, + "loss": 0.75199974, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.20959473, + "step": 1974, + "time_per_iteration": 2.823235273361206 + }, + { + "auxiliary_loss_clip": 0.01499891, + "auxiliary_loss_mlp": 0.01045471, + "balance_loss_clip": 1.30630898, + "balance_loss_mlp": 1.02329803, + "epoch": 0.11874342401923944, + "flos": 17196626952720.0, + "grad_norm": 2.038495856027569, + "language_loss": 0.78460443, + "learning_rate": 3.917955341761128e-06, + "loss": 0.81005812, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.22155762, + "step": 1975, + "time_per_iteration": 2.757338762283325 + }, + { + "auxiliary_loss_clip": 0.01500898, + "auxiliary_loss_mlp": 0.01053448, + "balance_loss_clip": 1.31077862, + "balance_loss_mlp": 1.03200209, + "epoch": 0.11880354727190741, + "flos": 15233520606120.0, + "grad_norm": 2.2598242958755015, + "language_loss": 0.76191735, + "learning_rate": 3.917844900170364e-06, + "loss": 0.7874608, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.21447754, + "step": 1976, + "time_per_iteration": 2.863053798675537 + }, + { + "auxiliary_loss_clip": 0.01505397, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.31164408, + "balance_loss_mlp": 1.02248645, + "epoch": 0.11886367052457537, + "flos": 27315668525040.0, + "grad_norm": 1.7670529366937766, + "language_loss": 0.75479746, + "learning_rate": 3.91773438585473e-06, + "loss": 0.78029412, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.21765137, + "step": 1977, + "time_per_iteration": 2.953868865966797 + }, + { + "auxiliary_loss_clip": 0.01503065, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_clip": 1.30689287, + "balance_loss_mlp": 1.02654779, + "epoch": 0.11892379377724335, + "flos": 21803251923000.0, + "grad_norm": 2.8618545928252557, + "language_loss": 0.73984903, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76537013, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.22509766, + "step": 1978, + "time_per_iteration": 2.742680788040161 + }, + { + "auxiliary_loss_clip": 0.01510902, + "auxiliary_loss_mlp": 0.01053494, + "balance_loss_clip": 1.31730056, + "balance_loss_mlp": 1.03179789, + "epoch": 0.11898391702991132, + "flos": 13995055686600.0, + "grad_norm": 1.6583561513014378, + "language_loss": 0.73738647, + "learning_rate": 3.917513139065616e-06, + "loss": 0.76303041, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.21691895, + "step": 1979, + "time_per_iteration": 2.787149667739868 + }, + { + "auxiliary_loss_clip": 0.01504779, + "auxiliary_loss_mlp": 0.01048036, + "balance_loss_clip": 1.31143117, + "balance_loss_mlp": 1.02738881, + "epoch": 0.11904404028257928, + "flos": 32241078723960.0, + "grad_norm": 2.0344610477174547, + "language_loss": 0.99110687, + "learning_rate": 3.917402406600525e-06, + "loss": 1.01663518, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.20654297, + "step": 1980, + "time_per_iteration": 2.809521436691284 + }, + { + "auxiliary_loss_clip": 0.01516271, + "auxiliary_loss_mlp": 0.0104898, + "balance_loss_clip": 1.32020783, + "balance_loss_mlp": 1.02603245, + "epoch": 0.11910416353524726, + "flos": 23591502360720.0, + "grad_norm": 1.682615424447516, + "language_loss": 0.86131299, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88696551, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.22937012, + "step": 1981, + "time_per_iteration": 2.7858262062072754 + }, + { + "auxiliary_loss_clip": 0.01513466, + "auxiliary_loss_mlp": 0.01059345, + "balance_loss_clip": 1.3182199, + "balance_loss_mlp": 1.03589666, + "epoch": 0.11916428678791523, + "flos": 25337902560480.0, + "grad_norm": 1.7116677821451676, + "language_loss": 0.85009193, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87582004, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.23461914, + "step": 1982, + "time_per_iteration": 2.780348777770996 + }, + { + "auxiliary_loss_clip": 0.01505435, + "auxiliary_loss_mlp": 0.01045791, + "balance_loss_clip": 1.31344056, + "balance_loss_mlp": 1.02483368, + "epoch": 0.11922441004058319, + "flos": 19792731993120.0, + "grad_norm": 1.943708138549581, + "language_loss": 0.85969847, + "learning_rate": 3.917069772973513e-06, + "loss": 0.88521075, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.20947266, + "step": 1983, + "time_per_iteration": 2.739304542541504 + }, + { + "auxiliary_loss_clip": 0.01515917, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.31951606, + "balance_loss_mlp": 1.0230993, + "epoch": 0.11928453329325117, + "flos": 21540431660400.0, + "grad_norm": 2.8823283914692395, + "language_loss": 0.77835453, + "learning_rate": 3.916958749701277e-06, + "loss": 0.80396193, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.21740723, + "step": 1984, + "time_per_iteration": 2.75704026222229 + }, + { + "auxiliary_loss_clip": 0.0151679, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_clip": 1.3236469, + "balance_loss_mlp": 1.02851999, + "epoch": 0.11934465654591914, + "flos": 20820094719480.0, + "grad_norm": 1.6326156216336865, + "language_loss": 0.83694661, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.86260593, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.20605469, + "step": 1985, + "time_per_iteration": 2.772310972213745 + }, + { + "auxiliary_loss_clip": 0.01516843, + "auxiliary_loss_mlp": 0.01044451, + "balance_loss_clip": 1.32609653, + "balance_loss_mlp": 1.02333939, + "epoch": 0.1194047797985871, + "flos": 19064923113960.0, + "grad_norm": 2.252553490944795, + "language_loss": 0.74800217, + "learning_rate": 3.916736485087216e-06, + "loss": 0.77361512, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.2109375, + "step": 1986, + "time_per_iteration": 2.7399682998657227 + }, + { + "auxiliary_loss_clip": 0.01519285, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.3256402, + "balance_loss_mlp": 1.02825737, + "epoch": 0.11946490305125507, + "flos": 27195518723040.0, + "grad_norm": 2.0253647394058167, + "language_loss": 0.72287858, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74856764, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.21350098, + "step": 1987, + "time_per_iteration": 2.8096256256103516 + }, + { + "auxiliary_loss_clip": 0.01519771, + "auxiliary_loss_mlp": 0.01056767, + "balance_loss_clip": 1.32328653, + "balance_loss_mlp": 1.03502297, + "epoch": 0.11952502630392305, + "flos": 21145580327520.0, + "grad_norm": 2.1679622060269406, + "language_loss": 0.72774774, + "learning_rate": 3.916513929741799e-06, + "loss": 0.7535131, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.2175293, + "step": 1988, + "time_per_iteration": 2.7682788372039795 + }, + { + "auxiliary_loss_clip": 0.01512717, + "auxiliary_loss_mlp": 0.01051839, + "balance_loss_clip": 1.3217032, + "balance_loss_mlp": 1.03013074, + "epoch": 0.11958514955659101, + "flos": 22128940572840.0, + "grad_norm": 1.7849330577877283, + "language_loss": 0.81195748, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83760309, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.21704102, + "step": 1989, + "time_per_iteration": 2.8416314125061035 + }, + { + "auxiliary_loss_clip": 0.01520396, + "auxiliary_loss_mlp": 0.01054547, + "balance_loss_clip": 1.32457125, + "balance_loss_mlp": 1.03213549, + "epoch": 0.11964527280925898, + "flos": 17426043516240.0, + "grad_norm": 2.8585186588316103, + "language_loss": 0.76203001, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78777945, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.22399902, + "step": 1990, + "time_per_iteration": 4.191422939300537 + }, + { + "auxiliary_loss_clip": 0.01451493, + "auxiliary_loss_mlp": 0.011102, + "balance_loss_clip": 1.33272362, + "balance_loss_mlp": 1.10371494, + "epoch": 0.11970539606192696, + "flos": 70694623792080.0, + "grad_norm": 0.9207791789403659, + "language_loss": 0.5523625, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57797945, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.06494141, + "step": 1991, + "time_per_iteration": 3.292675495147705 + }, + { + "auxiliary_loss_clip": 0.01511275, + "auxiliary_loss_mlp": 0.01053126, + "balance_loss_clip": 1.32192421, + "balance_loss_mlp": 1.03179979, + "epoch": 0.11976551931459492, + "flos": 21219981489000.0, + "grad_norm": 4.52841787948732, + "language_loss": 0.78515542, + "learning_rate": 3.916067946991971e-06, + "loss": 0.81079942, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.2130127, + "step": 1992, + "time_per_iteration": 2.748400926589966 + }, + { + "auxiliary_loss_clip": 0.01515925, + "auxiliary_loss_mlp": 0.01050065, + "balance_loss_clip": 1.319628, + "balance_loss_mlp": 1.02814221, + "epoch": 0.11982564256726289, + "flos": 25994193471720.0, + "grad_norm": 2.142499495422262, + "language_loss": 0.79311198, + "learning_rate": 3.915956269650216e-06, + "loss": 0.8187719, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.21899414, + "step": 1993, + "time_per_iteration": 4.36981463432312 + }, + { + "auxiliary_loss_clip": 0.01517296, + "auxiliary_loss_mlp": 0.01057724, + "balance_loss_clip": 1.32350993, + "balance_loss_mlp": 1.03534865, + "epoch": 0.11988576581993086, + "flos": 21655627242480.0, + "grad_norm": 1.7886687493269247, + "language_loss": 0.82657677, + "learning_rate": 3.915844519655208e-06, + "loss": 0.85232699, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.22375488, + "step": 1994, + "time_per_iteration": 4.3265321254730225 + }, + { + "auxiliary_loss_clip": 0.01518531, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_clip": 1.32693207, + "balance_loss_mlp": 1.04362535, + "epoch": 0.11994588907259883, + "flos": 17861811094800.0, + "grad_norm": 2.337963543920911, + "language_loss": 0.89007401, + "learning_rate": 3.915732697011183e-06, + "loss": 0.91589838, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.20288086, + "step": 1995, + "time_per_iteration": 4.345260858535767 + }, + { + "auxiliary_loss_clip": 0.01519988, + "auxiliary_loss_mlp": 0.01062007, + "balance_loss_clip": 1.32483685, + "balance_loss_mlp": 1.04019177, + "epoch": 0.1200060123252668, + "flos": 24468194779560.0, + "grad_norm": 1.8862034877134672, + "language_loss": 0.74441028, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.77023029, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.21813965, + "step": 1996, + "time_per_iteration": 2.9510996341705322 + }, + { + "auxiliary_loss_clip": 0.01518344, + "auxiliary_loss_mlp": 0.01058399, + "balance_loss_clip": 1.32587171, + "balance_loss_mlp": 1.03609478, + "epoch": 0.12006613557793476, + "flos": 18736594920720.0, + "grad_norm": 1.777352422925208, + "language_loss": 0.88505018, + "learning_rate": 3.915508833793048e-06, + "loss": 0.91081762, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.22290039, + "step": 1997, + "time_per_iteration": 2.8136870861053467 + }, + { + "auxiliary_loss_clip": 0.01520801, + "auxiliary_loss_mlp": 0.01073129, + "balance_loss_clip": 1.32808459, + "balance_loss_mlp": 1.04984784, + "epoch": 0.12012625883060274, + "flos": 22272138942120.0, + "grad_norm": 1.8726059705437839, + "language_loss": 0.79302001, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81895936, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.23278809, + "step": 1998, + "time_per_iteration": 2.906921148300171 + }, + { + "auxiliary_loss_clip": 0.01519925, + "auxiliary_loss_mlp": 0.0106469, + "balance_loss_clip": 1.32992768, + "balance_loss_mlp": 1.04277921, + "epoch": 0.1201863820832707, + "flos": 21763472711400.0, + "grad_norm": 1.7270037709917962, + "language_loss": 0.73971486, + "learning_rate": 3.915284680029769e-06, + "loss": 0.76556098, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.21911621, + "step": 1999, + "time_per_iteration": 2.782494306564331 + }, + { + "auxiliary_loss_clip": 0.01523655, + "auxiliary_loss_mlp": 0.01067993, + "balance_loss_clip": 1.33041286, + "balance_loss_mlp": 1.04672635, + "epoch": 0.12024650533593867, + "flos": 21913005984840.0, + "grad_norm": 2.5775317989473923, + "language_loss": 0.75744259, + "learning_rate": 3.915172494204323e-06, + "loss": 0.78335905, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.21276855, + "step": 2000, + "time_per_iteration": 2.7900073528289795 + }, + { + "auxiliary_loss_clip": 0.01523333, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_clip": 1.32932806, + "balance_loss_mlp": 1.03386343, + "epoch": 0.12030662858860665, + "flos": 21694228811640.0, + "grad_norm": 1.5286981619232558, + "language_loss": 0.85314828, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87894726, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.22692871, + "step": 2001, + "time_per_iteration": 2.7530364990234375 + }, + { + "auxiliary_loss_clip": 0.01530048, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_clip": 1.33859801, + "balance_loss_mlp": 1.03428078, + "epoch": 0.12036675184127461, + "flos": 12937172454720.0, + "grad_norm": 2.0122026590233717, + "language_loss": 0.7485221, + "learning_rate": 3.91494790468709e-06, + "loss": 0.77438068, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.21533203, + "step": 2002, + "time_per_iteration": 2.739044427871704 + }, + { + "auxiliary_loss_clip": 0.01540723, + "auxiliary_loss_mlp": 0.01058884, + "balance_loss_clip": 1.34224737, + "balance_loss_mlp": 1.03522074, + "epoch": 0.12042687509394258, + "flos": 20856056745240.0, + "grad_norm": 2.0517205397543568, + "language_loss": 0.78141081, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80740696, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.23669434, + "step": 2003, + "time_per_iteration": 2.74222993850708 + }, + { + "auxiliary_loss_clip": 0.01524429, + "auxiliary_loss_mlp": 0.01061889, + "balance_loss_clip": 1.33402455, + "balance_loss_mlp": 1.03898859, + "epoch": 0.12048699834661056, + "flos": 23883827919840.0, + "grad_norm": 1.5477881945694876, + "language_loss": 0.72561812, + "learning_rate": 3.914723024709793e-06, + "loss": 0.75148129, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.22888184, + "step": 2004, + "time_per_iteration": 2.800509452819824 + }, + { + "auxiliary_loss_clip": 0.01535061, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.3403933, + "balance_loss_mlp": 1.03476536, + "epoch": 0.12054712159927852, + "flos": 19761236886960.0, + "grad_norm": 1.660753584494264, + "language_loss": 0.78736711, + "learning_rate": 3.914610475809279e-06, + "loss": 0.81329525, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.22998047, + "step": 2005, + "time_per_iteration": 2.7724125385284424 + }, + { + "auxiliary_loss_clip": 0.01436792, + "auxiliary_loss_mlp": 0.0101082, + "balance_loss_clip": 1.32645726, + "balance_loss_mlp": 1.00555122, + "epoch": 0.12060724485194649, + "flos": 51685438975200.0, + "grad_norm": 0.9589939653929075, + "language_loss": 0.5804311, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60490727, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05273438, + "step": 2006, + "time_per_iteration": 3.060147523880005 + }, + { + "auxiliary_loss_clip": 0.01516666, + "auxiliary_loss_mlp": 0.0104996, + "balance_loss_clip": 1.3290906, + "balance_loss_mlp": 1.02847838, + "epoch": 0.12066736810461445, + "flos": 18995232522240.0, + "grad_norm": 1.70481631943371, + "language_loss": 0.77070564, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.79637194, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.21508789, + "step": 2007, + "time_per_iteration": 2.861069679260254 + }, + { + "auxiliary_loss_clip": 0.01523294, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_clip": 1.33053899, + "balance_loss_mlp": 1.0273186, + "epoch": 0.12072749135728243, + "flos": 16476330620160.0, + "grad_norm": 3.1059380819410043, + "language_loss": 0.83260441, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85834873, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.23828125, + "step": 2008, + "time_per_iteration": 2.753894805908203 + }, + { + "auxiliary_loss_clip": 0.01521689, + "auxiliary_loss_mlp": 0.01045532, + "balance_loss_clip": 1.32879186, + "balance_loss_mlp": 1.02339518, + "epoch": 0.1207876146099504, + "flos": 18081644085360.0, + "grad_norm": 1.8820955479237769, + "language_loss": 0.84393549, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86960769, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.22131348, + "step": 2009, + "time_per_iteration": 2.7773585319519043 + }, + { + "auxiliary_loss_clip": 0.0151126, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.32185578, + "balance_loss_mlp": 1.0239774, + "epoch": 0.12084773786261836, + "flos": 21876597267120.0, + "grad_norm": 1.6980902796208965, + "language_loss": 0.8444612, + "learning_rate": 3.914046642358844e-06, + "loss": 0.87004131, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.2277832, + "step": 2010, + "time_per_iteration": 2.891343832015991 + }, + { + "auxiliary_loss_clip": 0.01524939, + "auxiliary_loss_mlp": 0.01052221, + "balance_loss_clip": 1.33309281, + "balance_loss_mlp": 1.03000057, + "epoch": 0.12090786111528634, + "flos": 18337682751840.0, + "grad_norm": 1.578191058480928, + "language_loss": 0.84168363, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86745524, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.22216797, + "step": 2011, + "time_per_iteration": 2.791405439376831 + }, + { + "auxiliary_loss_clip": 0.01532706, + "auxiliary_loss_mlp": 0.01049155, + "balance_loss_clip": 1.33947635, + "balance_loss_mlp": 1.027578, + "epoch": 0.1209679843679543, + "flos": 21110836552560.0, + "grad_norm": 1.792537839342763, + "language_loss": 0.96916521, + "learning_rate": 3.913820600882834e-06, + "loss": 0.99498379, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.21569824, + "step": 2012, + "time_per_iteration": 2.777273416519165 + }, + { + "auxiliary_loss_clip": 0.01512941, + "auxiliary_loss_mlp": 0.0104322, + "balance_loss_clip": 1.32496428, + "balance_loss_mlp": 1.02170312, + "epoch": 0.12102810762062227, + "flos": 29246954898600.0, + "grad_norm": 1.7826905496815286, + "language_loss": 0.80441153, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82997322, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.21508789, + "step": 2013, + "time_per_iteration": 2.8360400199890137 + }, + { + "auxiliary_loss_clip": 0.01529329, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.33409667, + "balance_loss_mlp": 1.02509284, + "epoch": 0.12108823087329025, + "flos": 17935156438920.0, + "grad_norm": 2.485144833619645, + "language_loss": 0.77702636, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.80278969, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.21899414, + "step": 2014, + "time_per_iteration": 2.7501018047332764 + }, + { + "auxiliary_loss_clip": 0.01530553, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_clip": 1.34103203, + "balance_loss_mlp": 1.02879262, + "epoch": 0.12114835412595822, + "flos": 22097079991440.0, + "grad_norm": 2.1358623572235103, + "language_loss": 0.87835836, + "learning_rate": 3.913480994387535e-06, + "loss": 0.90416181, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.20983887, + "step": 2015, + "time_per_iteration": 2.770540714263916 + }, + { + "auxiliary_loss_clip": 0.0151582, + "auxiliary_loss_mlp": 0.01047958, + "balance_loss_clip": 1.3270247, + "balance_loss_mlp": 1.02663183, + "epoch": 0.12120847737862618, + "flos": 20417202931320.0, + "grad_norm": 1.8558980376323264, + "language_loss": 0.69777429, + "learning_rate": 3.913367647097926e-06, + "loss": 0.72341204, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.2130127, + "step": 2016, + "time_per_iteration": 2.8061985969543457 + }, + { + "auxiliary_loss_clip": 0.01521049, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_clip": 1.32878995, + "balance_loss_mlp": 1.02846515, + "epoch": 0.12126860063129415, + "flos": 22314070396800.0, + "grad_norm": 2.336722893922419, + "language_loss": 0.80640548, + "learning_rate": 3.913254227253225e-06, + "loss": 0.83212471, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.22424316, + "step": 2017, + "time_per_iteration": 2.812175989151001 + }, + { + "auxiliary_loss_clip": 0.01527936, + "auxiliary_loss_mlp": 0.01049036, + "balance_loss_clip": 1.33625996, + "balance_loss_mlp": 1.02508688, + "epoch": 0.12132872388396213, + "flos": 13703298644520.0, + "grad_norm": 2.0696439804238977, + "language_loss": 0.69236124, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71813095, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.23937988, + "step": 2018, + "time_per_iteration": 2.8212645053863525 + }, + { + "auxiliary_loss_clip": 0.01521808, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.3312006, + "balance_loss_mlp": 1.03041995, + "epoch": 0.12138884713663009, + "flos": 26472339196920.0, + "grad_norm": 1.7965653761919336, + "language_loss": 0.72809464, + "learning_rate": 3.91302716991575e-06, + "loss": 0.75381947, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.20263672, + "step": 2019, + "time_per_iteration": 2.8122000694274902 + }, + { + "auxiliary_loss_clip": 0.0152472, + "auxiliary_loss_mlp": 0.01050154, + "balance_loss_clip": 1.33215189, + "balance_loss_mlp": 1.02801657, + "epoch": 0.12144897038929806, + "flos": 26147340889200.0, + "grad_norm": 1.568716159999822, + "language_loss": 0.92523324, + "learning_rate": 3.912913532431586e-06, + "loss": 0.95098191, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.22143555, + "step": 2020, + "time_per_iteration": 2.843050479888916 + }, + { + "auxiliary_loss_clip": 0.01526012, + "auxiliary_loss_mlp": 0.01045044, + "balance_loss_clip": 1.33337259, + "balance_loss_mlp": 1.02361012, + "epoch": 0.12150909364196603, + "flos": 24722933978520.0, + "grad_norm": 2.2781838498513753, + "language_loss": 0.78562689, + "learning_rate": 3.912799822409549e-06, + "loss": 0.81133741, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.2142334, + "step": 2021, + "time_per_iteration": 2.841996669769287 + }, + { + "auxiliary_loss_clip": 0.01521396, + "auxiliary_loss_mlp": 0.0104452, + "balance_loss_clip": 1.33286929, + "balance_loss_mlp": 1.02233481, + "epoch": 0.121569216894634, + "flos": 25191820997640.0, + "grad_norm": 1.858512102612036, + "language_loss": 0.81162971, + "learning_rate": 3.912686039853952e-06, + "loss": 0.83728898, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.22180176, + "step": 2022, + "time_per_iteration": 2.8237898349761963 + }, + { + "auxiliary_loss_clip": 0.01533829, + "auxiliary_loss_mlp": 0.01053691, + "balance_loss_clip": 1.34064913, + "balance_loss_mlp": 1.03224576, + "epoch": 0.12162934014730196, + "flos": 13448112753600.0, + "grad_norm": 1.6352173814152995, + "language_loss": 0.85163921, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87751448, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.21435547, + "step": 2023, + "time_per_iteration": 2.715925693511963 + }, + { + "auxiliary_loss_clip": 0.01536154, + "auxiliary_loss_mlp": 0.01049312, + "balance_loss_clip": 1.34063005, + "balance_loss_mlp": 1.0263164, + "epoch": 0.12168946339996994, + "flos": 16950537334440.0, + "grad_norm": 2.160067092318385, + "language_loss": 0.86199105, + "learning_rate": 3.912458257159335e-06, + "loss": 0.88784575, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.23010254, + "step": 2024, + "time_per_iteration": 2.8013532161712646 + }, + { + "auxiliary_loss_clip": 0.01527914, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.33567548, + "balance_loss_mlp": 1.02960777, + "epoch": 0.12174958665263791, + "flos": 29827139097240.0, + "grad_norm": 1.7513076316112128, + "language_loss": 0.72132903, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74712121, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.21716309, + "step": 2025, + "time_per_iteration": 2.849040985107422 + }, + { + "auxiliary_loss_clip": 0.01534218, + "auxiliary_loss_mlp": 0.0104646, + "balance_loss_clip": 1.34254861, + "balance_loss_mlp": 1.02397668, + "epoch": 0.12180970990530587, + "flos": 24646949091000.0, + "grad_norm": 1.4803227414361773, + "language_loss": 0.76378167, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78958839, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.22460938, + "step": 2026, + "time_per_iteration": 2.790930986404419 + }, + { + "auxiliary_loss_clip": 0.01534573, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_clip": 1.34118664, + "balance_loss_mlp": 1.02562082, + "epoch": 0.12186983315797385, + "flos": 20526550909560.0, + "grad_norm": 1.942886550558167, + "language_loss": 0.89439356, + "learning_rate": 3.912116039223659e-06, + "loss": 0.92020202, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.2064209, + "step": 2027, + "time_per_iteration": 2.7399864196777344 + }, + { + "auxiliary_loss_clip": 0.01523296, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_clip": 1.33430827, + "balance_loss_mlp": 1.02490425, + "epoch": 0.12192995641064182, + "flos": 27824091105600.0, + "grad_norm": 1.5113943207327452, + "language_loss": 0.75839007, + "learning_rate": 3.912001821557399e-06, + "loss": 0.78407323, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.20129395, + "step": 2028, + "time_per_iteration": 2.8015778064727783 + }, + { + "auxiliary_loss_clip": 0.01539551, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.34637618, + "balance_loss_mlp": 1.02717602, + "epoch": 0.12199007966330978, + "flos": 22022110312920.0, + "grad_norm": 1.931467597467481, + "language_loss": 0.77495205, + "learning_rate": 3.911887531387839e-06, + "loss": 0.80084336, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.22412109, + "step": 2029, + "time_per_iteration": 4.189545392990112 + }, + { + "auxiliary_loss_clip": 0.01539398, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.34464049, + "balance_loss_mlp": 1.02642298, + "epoch": 0.12205020291597775, + "flos": 23300516877480.0, + "grad_norm": 1.6236519609885822, + "language_loss": 0.78879833, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81467438, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.21765137, + "step": 2030, + "time_per_iteration": 2.809272527694702 + }, + { + "auxiliary_loss_clip": 0.01534801, + "auxiliary_loss_mlp": 0.01054456, + "balance_loss_clip": 1.34449053, + "balance_loss_mlp": 1.03137732, + "epoch": 0.12211032616864573, + "flos": 26037261960480.0, + "grad_norm": 2.0918937172377574, + "language_loss": 0.74946415, + "learning_rate": 3.911658733556155e-06, + "loss": 0.77535677, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.23071289, + "step": 2031, + "time_per_iteration": 2.8069987297058105 + }, + { + "auxiliary_loss_clip": 0.01536862, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.34699345, + "balance_loss_mlp": 1.01925921, + "epoch": 0.12217044942131369, + "flos": 20415619205280.0, + "grad_norm": 1.7198462281527984, + "language_loss": 0.75671697, + "learning_rate": 3.911544225902707e-06, + "loss": 0.7824924, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.2142334, + "step": 2032, + "time_per_iteration": 4.443471670150757 + }, + { + "auxiliary_loss_clip": 0.01523979, + "auxiliary_loss_mlp": 0.0104713, + "balance_loss_clip": 1.3369782, + "balance_loss_mlp": 1.02575529, + "epoch": 0.12223057267398166, + "flos": 22862353405680.0, + "grad_norm": 1.4984054111834453, + "language_loss": 0.89630908, + "learning_rate": 3.911429645763311e-06, + "loss": 0.92202014, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.21374512, + "step": 2033, + "time_per_iteration": 4.298927068710327 + }, + { + "auxiliary_loss_clip": 0.01544632, + "auxiliary_loss_mlp": 0.01050664, + "balance_loss_clip": 1.34896255, + "balance_loss_mlp": 1.02880156, + "epoch": 0.12229069592664964, + "flos": 20052384803640.0, + "grad_norm": 2.8423347503637335, + "language_loss": 0.66491663, + "learning_rate": 3.911314993142311e-06, + "loss": 0.69086957, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.21862793, + "step": 2034, + "time_per_iteration": 4.215655088424683 + }, + { + "auxiliary_loss_clip": 0.01542821, + "auxiliary_loss_mlp": 0.01057159, + "balance_loss_clip": 1.35167527, + "balance_loss_mlp": 1.03014624, + "epoch": 0.1223508191793176, + "flos": 22279489055280.0, + "grad_norm": 1.5005399735401301, + "language_loss": 0.76772761, + "learning_rate": 3.911200268044055e-06, + "loss": 0.7937274, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.2701416, + "step": 2035, + "time_per_iteration": 2.9025444984436035 + }, + { + "auxiliary_loss_clip": 0.01549727, + "auxiliary_loss_mlp": 0.01048406, + "balance_loss_clip": 1.35243845, + "balance_loss_mlp": 1.02561378, + "epoch": 0.12241094243198557, + "flos": 21290687289720.0, + "grad_norm": 1.756867132283175, + "language_loss": 0.72255588, + "learning_rate": 3.911085470472892e-06, + "loss": 0.74853718, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.22790527, + "step": 2036, + "time_per_iteration": 2.762903928756714 + }, + { + "auxiliary_loss_clip": 0.01545857, + "auxiliary_loss_mlp": 0.01054155, + "balance_loss_clip": 1.3531462, + "balance_loss_mlp": 1.02989626, + "epoch": 0.12247106568465355, + "flos": 17386629779880.0, + "grad_norm": 1.8144045094176107, + "language_loss": 0.83301646, + "learning_rate": 3.910970600433178e-06, + "loss": 0.8590166, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.24243164, + "step": 2037, + "time_per_iteration": 2.810882568359375 + }, + { + "auxiliary_loss_clip": 0.01555436, + "auxiliary_loss_mlp": 0.0105604, + "balance_loss_clip": 1.35901642, + "balance_loss_mlp": 1.03144681, + "epoch": 0.12253118893732151, + "flos": 27050208719040.0, + "grad_norm": 3.162493942668324, + "language_loss": 0.79992276, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82603753, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24584961, + "step": 2038, + "time_per_iteration": 2.843215227127075 + }, + { + "auxiliary_loss_clip": 0.01401538, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.29554725, + "balance_loss_mlp": 1.02813995, + "epoch": 0.12259131218998948, + "flos": 53874428958000.0, + "grad_norm": 0.822275430568934, + "language_loss": 0.58682656, + "learning_rate": 3.910740642965518e-06, + "loss": 0.61117578, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.05249023, + "step": 2039, + "time_per_iteration": 3.1285808086395264 + }, + { + "auxiliary_loss_clip": 0.01552884, + "auxiliary_loss_mlp": 0.01053345, + "balance_loss_clip": 1.35724294, + "balance_loss_mlp": 1.02879977, + "epoch": 0.12265143544265744, + "flos": 17896067569440.0, + "grad_norm": 1.9464457948690768, + "language_loss": 0.81450975, + "learning_rate": 3.910625555546292e-06, + "loss": 0.840572, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.24560547, + "step": 2040, + "time_per_iteration": 2.71136474609375 + }, + { + "auxiliary_loss_clip": 0.01546398, + "auxiliary_loss_mlp": 0.0105232, + "balance_loss_clip": 1.35594141, + "balance_loss_mlp": 1.02930081, + "epoch": 0.12271155869532542, + "flos": 21805119907560.0, + "grad_norm": 1.9126677518779212, + "language_loss": 0.831967, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85795414, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.23022461, + "step": 2041, + "time_per_iteration": 2.8144659996032715 + }, + { + "auxiliary_loss_clip": 0.01567402, + "auxiliary_loss_mlp": 0.01055836, + "balance_loss_clip": 1.36933398, + "balance_loss_mlp": 1.03211308, + "epoch": 0.12277168194799339, + "flos": 19833567022080.0, + "grad_norm": 1.5877671913297273, + "language_loss": 0.67599326, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.70222569, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.23730469, + "step": 2042, + "time_per_iteration": 2.807056188583374 + }, + { + "auxiliary_loss_clip": 0.01544921, + "auxiliary_loss_mlp": 0.01051232, + "balance_loss_clip": 1.35177541, + "balance_loss_mlp": 1.02772415, + "epoch": 0.12283180520066135, + "flos": 23226156324360.0, + "grad_norm": 1.5655794439598427, + "language_loss": 0.81629193, + "learning_rate": 3.910279858599409e-06, + "loss": 0.84225345, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.23547363, + "step": 2043, + "time_per_iteration": 2.7724125385284424 + }, + { + "auxiliary_loss_clip": 0.01553404, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_clip": 1.35792422, + "balance_loss_mlp": 1.02403855, + "epoch": 0.12289192845332933, + "flos": 18593152901280.0, + "grad_norm": 1.6321116899981338, + "language_loss": 0.80729216, + "learning_rate": 3.910164481401946e-06, + "loss": 0.83329844, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.23181152, + "step": 2044, + "time_per_iteration": 2.8323817253112793 + }, + { + "auxiliary_loss_clip": 0.01548098, + "auxiliary_loss_mlp": 0.01057445, + "balance_loss_clip": 1.3564167, + "balance_loss_mlp": 1.03363907, + "epoch": 0.1229520517059973, + "flos": 25774279264440.0, + "grad_norm": 1.7828992363336138, + "language_loss": 0.7862016, + "learning_rate": 3.910049031770853e-06, + "loss": 0.81225705, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.23815918, + "step": 2045, + "time_per_iteration": 2.7823898792266846 + }, + { + "auxiliary_loss_clip": 0.01553249, + "auxiliary_loss_mlp": 0.01063091, + "balance_loss_clip": 1.35965335, + "balance_loss_mlp": 1.03924894, + "epoch": 0.12301217495866526, + "flos": 20892140596080.0, + "grad_norm": 1.972247599987293, + "language_loss": 0.68277943, + "learning_rate": 3.90993350971051e-06, + "loss": 0.70894283, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.23840332, + "step": 2046, + "time_per_iteration": 2.762730598449707 + }, + { + "auxiliary_loss_clip": 0.0155127, + "auxiliary_loss_mlp": 0.01053276, + "balance_loss_clip": 1.35906792, + "balance_loss_mlp": 1.03003001, + "epoch": 0.12307229821133324, + "flos": 22383436121640.0, + "grad_norm": 2.390511964177344, + "language_loss": 0.73601234, + "learning_rate": 3.909817915225297e-06, + "loss": 0.76205778, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.2322998, + "step": 2047, + "time_per_iteration": 2.744324207305908 + }, + { + "auxiliary_loss_clip": 0.01556667, + "auxiliary_loss_mlp": 0.01061483, + "balance_loss_clip": 1.3638463, + "balance_loss_mlp": 1.03675902, + "epoch": 0.1231324214640012, + "flos": 23372603362440.0, + "grad_norm": 1.5634231790243611, + "language_loss": 0.76849306, + "learning_rate": 3.909702248319597e-06, + "loss": 0.79467458, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.24743652, + "step": 2048, + "time_per_iteration": 2.916073799133301 + }, + { + "auxiliary_loss_clip": 0.01549142, + "auxiliary_loss_mlp": 0.01050678, + "balance_loss_clip": 1.35868824, + "balance_loss_mlp": 1.02887499, + "epoch": 0.12319254471666917, + "flos": 23772205873440.0, + "grad_norm": 2.1018684764191486, + "language_loss": 0.85138392, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87738216, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.21813965, + "step": 2049, + "time_per_iteration": 2.810713052749634 + }, + { + "auxiliary_loss_clip": 0.01559526, + "auxiliary_loss_mlp": 0.01056993, + "balance_loss_clip": 1.36407077, + "balance_loss_mlp": 1.03272247, + "epoch": 0.12325266796933713, + "flos": 23555418509880.0, + "grad_norm": 1.8937807914739764, + "language_loss": 0.75909579, + "learning_rate": 3.909470697264285e-06, + "loss": 0.78526103, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.24304199, + "step": 2050, + "time_per_iteration": 2.788135528564453 + }, + { + "auxiliary_loss_clip": 0.01555685, + "auxiliary_loss_mlp": 0.01058075, + "balance_loss_clip": 1.36205196, + "balance_loss_mlp": 1.03280246, + "epoch": 0.12331279122200511, + "flos": 24429146518440.0, + "grad_norm": 1.9908743692325725, + "language_loss": 0.81145948, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83759701, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.25268555, + "step": 2051, + "time_per_iteration": 2.813791275024414 + }, + { + "auxiliary_loss_clip": 0.01552876, + "auxiliary_loss_mlp": 0.01055157, + "balance_loss_clip": 1.36218131, + "balance_loss_mlp": 1.03254318, + "epoch": 0.12337291447467308, + "flos": 25489872335520.0, + "grad_norm": 1.9741098929074854, + "language_loss": 0.80029064, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82637095, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.22619629, + "step": 2052, + "time_per_iteration": 2.8039183616638184 + }, + { + "auxiliary_loss_clip": 0.01560711, + "auxiliary_loss_mlp": 0.01061093, + "balance_loss_clip": 1.36493802, + "balance_loss_mlp": 1.03716719, + "epoch": 0.12343303772734104, + "flos": 23555256076440.0, + "grad_norm": 2.276909973707339, + "language_loss": 0.74060118, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76681924, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.23937988, + "step": 2053, + "time_per_iteration": 2.8154258728027344 + }, + { + "auxiliary_loss_clip": 0.01560333, + "auxiliary_loss_mlp": 0.01059367, + "balance_loss_clip": 1.36284065, + "balance_loss_mlp": 1.03363001, + "epoch": 0.12349316098000902, + "flos": 47565389287800.0, + "grad_norm": 2.3231208022607035, + "language_loss": 0.74496913, + "learning_rate": 3.909006726300991e-06, + "loss": 0.77116615, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.25744629, + "step": 2054, + "time_per_iteration": 3.020256280899048 + }, + { + "auxiliary_loss_clip": 0.01539415, + "auxiliary_loss_mlp": 0.01047855, + "balance_loss_clip": 1.35044372, + "balance_loss_mlp": 1.02618277, + "epoch": 0.12355328423267699, + "flos": 25051424605200.0, + "grad_norm": 2.360232742082999, + "language_loss": 0.85072732, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87660003, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.21679688, + "step": 2055, + "time_per_iteration": 2.9334964752197266 + }, + { + "auxiliary_loss_clip": 0.01554597, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.36039793, + "balance_loss_mlp": 1.03739834, + "epoch": 0.12361340748534495, + "flos": 27714905560800.0, + "grad_norm": 1.8515459248593544, + "language_loss": 0.7852025, + "learning_rate": 3.908774306463384e-06, + "loss": 0.81135941, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.23693848, + "step": 2056, + "time_per_iteration": 2.8580422401428223 + }, + { + "auxiliary_loss_clip": 0.01551467, + "auxiliary_loss_mlp": 0.01058263, + "balance_loss_clip": 1.357651, + "balance_loss_mlp": 1.03548193, + "epoch": 0.12367353073801293, + "flos": 26146203855120.0, + "grad_norm": 2.487898869080669, + "language_loss": 0.84062815, + "learning_rate": 3.908657987971009e-06, + "loss": 0.86672544, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.22766113, + "step": 2057, + "time_per_iteration": 2.777754306793213 + }, + { + "auxiliary_loss_clip": 0.01565483, + "auxiliary_loss_mlp": 0.01061546, + "balance_loss_clip": 1.36930442, + "balance_loss_mlp": 1.03691745, + "epoch": 0.1237336539906809, + "flos": 25161584750640.0, + "grad_norm": 1.6591486652944134, + "language_loss": 0.78472853, + "learning_rate": 3.90854159710213e-06, + "loss": 0.81099886, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.24633789, + "step": 2058, + "time_per_iteration": 2.828284502029419 + }, + { + "auxiliary_loss_clip": 0.01569017, + "auxiliary_loss_mlp": 0.01059163, + "balance_loss_clip": 1.37131393, + "balance_loss_mlp": 1.03290093, + "epoch": 0.12379377724334886, + "flos": 15308815151520.0, + "grad_norm": 1.8265396003317864, + "language_loss": 0.83618939, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.86247116, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.26245117, + "step": 2059, + "time_per_iteration": 2.8126585483551025 + }, + { + "auxiliary_loss_clip": 0.01560102, + "auxiliary_loss_mlp": 0.01064207, + "balance_loss_clip": 1.36214268, + "balance_loss_mlp": 1.03831518, + "epoch": 0.12385390049601683, + "flos": 21320679886560.0, + "grad_norm": 2.2851745543536026, + "language_loss": 0.81302536, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83926845, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.25891113, + "step": 2060, + "time_per_iteration": 2.7561190128326416 + }, + { + "auxiliary_loss_clip": 0.01553096, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.35896862, + "balance_loss_mlp": 1.02874649, + "epoch": 0.1239140237486848, + "flos": 15119746316640.0, + "grad_norm": 1.900662191889655, + "language_loss": 0.86245155, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88853216, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.2623291, + "step": 2061, + "time_per_iteration": 2.7183759212493896 + }, + { + "auxiliary_loss_clip": 0.01552563, + "auxiliary_loss_mlp": 0.01053652, + "balance_loss_clip": 1.36095786, + "balance_loss_mlp": 1.03004813, + "epoch": 0.12397414700135277, + "flos": 21981275283960.0, + "grad_norm": 1.9286434127664533, + "language_loss": 0.8479178, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87397993, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.23596191, + "step": 2062, + "time_per_iteration": 2.7772090435028076 + }, + { + "auxiliary_loss_clip": 0.01559342, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.36861336, + "balance_loss_mlp": 1.03583479, + "epoch": 0.12403427025402074, + "flos": 13404029055840.0, + "grad_norm": 2.096855254965665, + "language_loss": 0.78960544, + "learning_rate": 3.907958557264774e-06, + "loss": 0.81580234, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.24499512, + "step": 2063, + "time_per_iteration": 2.7226991653442383 + }, + { + "auxiliary_loss_clip": 0.01565765, + "auxiliary_loss_mlp": 0.0105835, + "balance_loss_clip": 1.37153327, + "balance_loss_mlp": 1.03349471, + "epoch": 0.12409439350668872, + "flos": 15308571501360.0, + "grad_norm": 2.0274857642389974, + "language_loss": 0.79458153, + "learning_rate": 3.907841732229663e-06, + "loss": 0.82082266, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.24841309, + "step": 2064, + "time_per_iteration": 2.771155834197998 + }, + { + "auxiliary_loss_clip": 0.01555352, + "auxiliary_loss_mlp": 0.0106066, + "balance_loss_clip": 1.36166883, + "balance_loss_mlp": 1.03631759, + "epoch": 0.12415451675935668, + "flos": 25014812845680.0, + "grad_norm": 2.3510644992638796, + "language_loss": 0.92738211, + "learning_rate": 3.907724834849002e-06, + "loss": 0.95354223, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.2434082, + "step": 2065, + "time_per_iteration": 2.8001108169555664 + }, + { + "auxiliary_loss_clip": 0.01554844, + "auxiliary_loss_mlp": 0.01052199, + "balance_loss_clip": 1.35908151, + "balance_loss_mlp": 1.02801144, + "epoch": 0.12421464001202465, + "flos": 23664969529920.0, + "grad_norm": 2.0360735656035565, + "language_loss": 0.81100452, + "learning_rate": 3.907607865127225e-06, + "loss": 0.837075, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.24133301, + "step": 2066, + "time_per_iteration": 2.764425277709961 + }, + { + "auxiliary_loss_clip": 0.01423164, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.31953239, + "balance_loss_mlp": 1.0284642, + "epoch": 0.12427476326469263, + "flos": 65749129436160.0, + "grad_norm": 0.8651627692105429, + "language_loss": 0.63335609, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65791696, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.04467773, + "step": 2067, + "time_per_iteration": 3.297550678253174 + }, + { + "auxiliary_loss_clip": 0.0156607, + "auxiliary_loss_mlp": 0.01052701, + "balance_loss_clip": 1.36987185, + "balance_loss_mlp": 1.02791774, + "epoch": 0.12433488651736059, + "flos": 24540849781560.0, + "grad_norm": 1.7880381749288625, + "language_loss": 0.9354893, + "learning_rate": 3.907373708678063e-06, + "loss": 0.96167701, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.2479248, + "step": 2068, + "time_per_iteration": 4.324353933334351 + }, + { + "auxiliary_loss_clip": 0.01554175, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.36203241, + "balance_loss_mlp": 1.03843641, + "epoch": 0.12439500977002856, + "flos": 21036475999440.0, + "grad_norm": 1.9387022130984901, + "language_loss": 0.81546772, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.84162211, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.22827148, + "step": 2069, + "time_per_iteration": 2.8218185901641846 + }, + { + "auxiliary_loss_clip": 0.01556495, + "auxiliary_loss_mlp": 0.01056458, + "balance_loss_clip": 1.36198723, + "balance_loss_mlp": 1.03259277, + "epoch": 0.12445513302269653, + "flos": 26835898465440.0, + "grad_norm": 1.5905565262644201, + "language_loss": 0.77529216, + "learning_rate": 3.907139262917696e-06, + "loss": 0.80142176, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.23876953, + "step": 2070, + "time_per_iteration": 2.881732702255249 + }, + { + "auxiliary_loss_clip": 0.01560116, + "auxiliary_loss_mlp": 0.01058529, + "balance_loss_clip": 1.36572349, + "balance_loss_mlp": 1.03442502, + "epoch": 0.1245152562753645, + "flos": 18373482344160.0, + "grad_norm": 1.9848614624455674, + "language_loss": 0.81051576, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83670217, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.2409668, + "step": 2071, + "time_per_iteration": 4.304964065551758 + }, + { + "auxiliary_loss_clip": 0.01545341, + "auxiliary_loss_mlp": 0.01055915, + "balance_loss_clip": 1.35716748, + "balance_loss_mlp": 1.03229952, + "epoch": 0.12457537952803246, + "flos": 33114319432200.0, + "grad_norm": 1.7055554584798243, + "language_loss": 0.7826972, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80870974, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.23632812, + "step": 2072, + "time_per_iteration": 4.424708127975464 + }, + { + "auxiliary_loss_clip": 0.01549942, + "auxiliary_loss_mlp": 0.01054597, + "balance_loss_clip": 1.35788488, + "balance_loss_mlp": 1.03175664, + "epoch": 0.12463550278070043, + "flos": 22274940918960.0, + "grad_norm": 1.8302890212050724, + "language_loss": 0.7535916, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77963698, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.22839355, + "step": 2073, + "time_per_iteration": 4.252338647842407 + }, + { + "auxiliary_loss_clip": 0.01546987, + "auxiliary_loss_mlp": 0.01059269, + "balance_loss_clip": 1.35797739, + "balance_loss_mlp": 1.03543913, + "epoch": 0.12469562603336841, + "flos": 14682476228760.0, + "grad_norm": 2.0601815734636806, + "language_loss": 0.90455389, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93061644, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.23852539, + "step": 2074, + "time_per_iteration": 2.7890055179595947 + }, + { + "auxiliary_loss_clip": 0.01551014, + "auxiliary_loss_mlp": 0.0105941, + "balance_loss_clip": 1.35592103, + "balance_loss_mlp": 1.03361273, + "epoch": 0.12475574928603637, + "flos": 24650197759800.0, + "grad_norm": 2.5279482664879214, + "language_loss": 0.83489132, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86099553, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.25805664, + "step": 2075, + "time_per_iteration": 2.79736065864563 + }, + { + "auxiliary_loss_clip": 0.01553784, + "auxiliary_loss_mlp": 0.01052393, + "balance_loss_clip": 1.36039305, + "balance_loss_mlp": 1.02873015, + "epoch": 0.12481587253870434, + "flos": 21768102064440.0, + "grad_norm": 1.7697650525761608, + "language_loss": 0.74018848, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76625025, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.23669434, + "step": 2076, + "time_per_iteration": 2.8889167308807373 + }, + { + "auxiliary_loss_clip": 0.0154864, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.36129713, + "balance_loss_mlp": 1.02369952, + "epoch": 0.12487599579137232, + "flos": 21437337369600.0, + "grad_norm": 1.7182472959210762, + "language_loss": 0.75923121, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78516972, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.21520996, + "step": 2077, + "time_per_iteration": 2.825979232788086 + }, + { + "auxiliary_loss_clip": 0.01546022, + "auxiliary_loss_mlp": 0.0105817, + "balance_loss_clip": 1.35606897, + "balance_loss_mlp": 1.03362489, + "epoch": 0.12493611904404028, + "flos": 16111999792800.0, + "grad_norm": 1.8828077186215428, + "language_loss": 0.82545334, + "learning_rate": 3.906198587476043e-06, + "loss": 0.85149527, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.24523926, + "step": 2078, + "time_per_iteration": 2.751410484313965 + }, + { + "auxiliary_loss_clip": 0.01548742, + "auxiliary_loss_mlp": 0.01050924, + "balance_loss_clip": 1.35673237, + "balance_loss_mlp": 1.02701104, + "epoch": 0.12499624229670825, + "flos": 21585327525360.0, + "grad_norm": 1.5992567995398288, + "language_loss": 0.75654769, + "learning_rate": 3.906080677724374e-06, + "loss": 0.78254437, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.23913574, + "step": 2079, + "time_per_iteration": 2.9210453033447266 + }, + { + "auxiliary_loss_clip": 0.01564975, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.36943746, + "balance_loss_mlp": 1.03393459, + "epoch": 0.1250563655493762, + "flos": 25704101372400.0, + "grad_norm": 2.2629076555860848, + "language_loss": 0.84095442, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86716849, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.22509766, + "step": 2080, + "time_per_iteration": 2.7872893810272217 + }, + { + "auxiliary_loss_clip": 0.0154151, + "auxiliary_loss_mlp": 0.01058892, + "balance_loss_clip": 1.35179663, + "balance_loss_mlp": 1.03540826, + "epoch": 0.12511648880204418, + "flos": 16913884966560.0, + "grad_norm": 1.9790487389054456, + "language_loss": 0.85117483, + "learning_rate": 3.9058446413892e-06, + "loss": 0.87717885, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.23461914, + "step": 2081, + "time_per_iteration": 2.790691614151001 + }, + { + "auxiliary_loss_clip": 0.01545466, + "auxiliary_loss_mlp": 0.01051788, + "balance_loss_clip": 1.35614157, + "balance_loss_mlp": 1.02870953, + "epoch": 0.12517661205471217, + "flos": 17572125079080.0, + "grad_norm": 1.8283413707648588, + "language_loss": 0.77024472, + "learning_rate": 3.905726514814646e-06, + "loss": 0.79621726, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.23083496, + "step": 2082, + "time_per_iteration": 2.811058759689331 + }, + { + "auxiliary_loss_clip": 0.01572653, + "auxiliary_loss_mlp": 0.0105703, + "balance_loss_clip": 1.37076211, + "balance_loss_mlp": 1.03266382, + "epoch": 0.12523673530738014, + "flos": 16038045323280.0, + "grad_norm": 2.21893867202495, + "language_loss": 0.8022989, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.82859576, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.24377441, + "step": 2083, + "time_per_iteration": 2.764464855194092 + }, + { + "auxiliary_loss_clip": 0.01550192, + "auxiliary_loss_mlp": 0.01050971, + "balance_loss_clip": 1.3558954, + "balance_loss_mlp": 1.02672386, + "epoch": 0.1252968585600481, + "flos": 18812498591520.0, + "grad_norm": 2.1051497684591847, + "language_loss": 0.91016239, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.93617404, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.24255371, + "step": 2084, + "time_per_iteration": 2.7384963035583496 + }, + { + "auxiliary_loss_clip": 0.01553189, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.36166, + "balance_loss_mlp": 1.02870166, + "epoch": 0.12535698181271607, + "flos": 27277310606040.0, + "grad_norm": 1.6833516336013568, + "language_loss": 0.80419123, + "learning_rate": 3.905371701516869e-06, + "loss": 0.83023632, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.22607422, + "step": 2085, + "time_per_iteration": 2.8176229000091553 + }, + { + "auxiliary_loss_clip": 0.01537439, + "auxiliary_loss_mlp": 0.01053568, + "balance_loss_clip": 1.34794617, + "balance_loss_mlp": 1.03011942, + "epoch": 0.12541710506538403, + "flos": 22059168764400.0, + "grad_norm": 1.647166245980738, + "language_loss": 0.88468051, + "learning_rate": 3.905253285907856e-06, + "loss": 0.91059059, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.23449707, + "step": 2086, + "time_per_iteration": 2.808774709701538 + }, + { + "auxiliary_loss_clip": 0.01539474, + "auxiliary_loss_mlp": 0.01050608, + "balance_loss_clip": 1.35480607, + "balance_loss_mlp": 1.02915001, + "epoch": 0.125477228318052, + "flos": 12606854451840.0, + "grad_norm": 2.001159486912042, + "language_loss": 0.87259185, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89849269, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.21459961, + "step": 2087, + "time_per_iteration": 2.711864709854126 + }, + { + "auxiliary_loss_clip": 0.01550441, + "auxiliary_loss_mlp": 0.01055088, + "balance_loss_clip": 1.36033583, + "balance_loss_mlp": 1.03161526, + "epoch": 0.12553735157071996, + "flos": 23883868528200.0, + "grad_norm": 1.8434814234785657, + "language_loss": 0.73654139, + "learning_rate": 3.905016237952136e-06, + "loss": 0.76259661, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.23474121, + "step": 2088, + "time_per_iteration": 2.8958661556243896 + }, + { + "auxiliary_loss_clip": 0.01425276, + "auxiliary_loss_mlp": 0.01017075, + "balance_loss_clip": 1.31914711, + "balance_loss_mlp": 1.01273537, + "epoch": 0.12559747482338796, + "flos": 69936578665920.0, + "grad_norm": 0.810803627878535, + "language_loss": 0.61793596, + "learning_rate": 3.904897605614418e-06, + "loss": 0.6423595, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04345703, + "step": 2089, + "time_per_iteration": 3.228060722351074 + }, + { + "auxiliary_loss_clip": 0.01536459, + "auxiliary_loss_mlp": 0.01053386, + "balance_loss_clip": 1.34871674, + "balance_loss_mlp": 1.03072453, + "epoch": 0.12565759807605592, + "flos": 24284892331800.0, + "grad_norm": 2.659367540937695, + "language_loss": 0.78505808, + "learning_rate": 3.904778901042793e-06, + "loss": 0.81095654, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.22668457, + "step": 2090, + "time_per_iteration": 2.8298044204711914 + }, + { + "auxiliary_loss_clip": 0.01421384, + "auxiliary_loss_mlp": 0.01020027, + "balance_loss_clip": 1.31390548, + "balance_loss_mlp": 1.0153538, + "epoch": 0.12571772132872389, + "flos": 56464320919320.0, + "grad_norm": 0.9012644623199819, + "language_loss": 0.59534395, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61975807, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.04663086, + "step": 2091, + "time_per_iteration": 3.172837972640991 + }, + { + "auxiliary_loss_clip": 0.01548826, + "auxiliary_loss_mlp": 0.01057803, + "balance_loss_clip": 1.35944748, + "balance_loss_mlp": 1.03510571, + "epoch": 0.12577784458139185, + "flos": 41253442796880.0, + "grad_norm": 1.7598869762454246, + "language_loss": 0.63942313, + "learning_rate": 3.904541275215825e-06, + "loss": 0.66548944, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.22680664, + "step": 2092, + "time_per_iteration": 2.9469523429870605 + }, + { + "auxiliary_loss_clip": 0.01555962, + "auxiliary_loss_mlp": 0.01061494, + "balance_loss_clip": 1.36132324, + "balance_loss_mlp": 1.03705609, + "epoch": 0.12583796783405982, + "flos": 19760262286320.0, + "grad_norm": 1.998798277570671, + "language_loss": 0.81030655, + "learning_rate": 3.904422353969493e-06, + "loss": 0.83648115, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.2442627, + "step": 2093, + "time_per_iteration": 2.793360948562622 + }, + { + "auxiliary_loss_clip": 0.01539863, + "auxiliary_loss_mlp": 0.01049447, + "balance_loss_clip": 1.35448372, + "balance_loss_mlp": 1.02647543, + "epoch": 0.12589809108672778, + "flos": 22607329948200.0, + "grad_norm": 1.729175339594252, + "language_loss": 0.7638945, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78978759, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.22973633, + "step": 2094, + "time_per_iteration": 2.778944492340088 + }, + { + "auxiliary_loss_clip": 0.01535891, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_clip": 1.34939277, + "balance_loss_mlp": 1.03205156, + "epoch": 0.12595821433939577, + "flos": 45230967475920.0, + "grad_norm": 1.48710293383165, + "language_loss": 0.77302682, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79892677, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.22045898, + "step": 2095, + "time_per_iteration": 2.987466812133789 + }, + { + "auxiliary_loss_clip": 0.01532777, + "auxiliary_loss_mlp": 0.010595, + "balance_loss_clip": 1.34131455, + "balance_loss_mlp": 1.03619409, + "epoch": 0.12601833759206374, + "flos": 14323952396880.0, + "grad_norm": 2.1155330193808792, + "language_loss": 0.83720016, + "learning_rate": 3.904065156953232e-06, + "loss": 0.86312294, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.2331543, + "step": 2096, + "time_per_iteration": 2.7703487873077393 + }, + { + "auxiliary_loss_clip": 0.01541268, + "auxiliary_loss_mlp": 0.0106099, + "balance_loss_clip": 1.35075545, + "balance_loss_mlp": 1.03826845, + "epoch": 0.1260784608447317, + "flos": 21293245616400.0, + "grad_norm": 1.8841197638940244, + "language_loss": 0.7589367, + "learning_rate": 3.903945946870439e-06, + "loss": 0.78495932, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22729492, + "step": 2097, + "time_per_iteration": 2.805060863494873 + }, + { + "auxiliary_loss_clip": 0.01546154, + "auxiliary_loss_mlp": 0.01056738, + "balance_loss_clip": 1.35748327, + "balance_loss_mlp": 1.03450537, + "epoch": 0.12613858409739967, + "flos": 26257094951040.0, + "grad_norm": 2.067653158618665, + "language_loss": 0.87456727, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.90059614, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.22241211, + "step": 2098, + "time_per_iteration": 2.8173861503601074 + }, + { + "auxiliary_loss_clip": 0.01558675, + "auxiliary_loss_mlp": 0.01065779, + "balance_loss_clip": 1.36242461, + "balance_loss_mlp": 1.04006577, + "epoch": 0.12619870735006763, + "flos": 21585002658480.0, + "grad_norm": 2.064628444116124, + "language_loss": 0.69954085, + "learning_rate": 3.903707310115912e-06, + "loss": 0.72578543, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.25720215, + "step": 2099, + "time_per_iteration": 2.840226411819458 + }, + { + "auxiliary_loss_clip": 0.01547071, + "auxiliary_loss_mlp": 0.01057231, + "balance_loss_clip": 1.35317588, + "balance_loss_mlp": 1.03157711, + "epoch": 0.1262588306027356, + "flos": 23372197278840.0, + "grad_norm": 1.951988540702652, + "language_loss": 0.82315361, + "learning_rate": 3.903587883453228e-06, + "loss": 0.84919667, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.2565918, + "step": 2100, + "time_per_iteration": 2.8901617527008057 + }, + { + "auxiliary_loss_clip": 0.01560185, + "auxiliary_loss_mlp": 0.01061871, + "balance_loss_clip": 1.36615849, + "balance_loss_mlp": 1.03724217, + "epoch": 0.12631895385540357, + "flos": 23954208853680.0, + "grad_norm": 1.7913849848643006, + "language_loss": 0.80628473, + "learning_rate": 3.903468384606302e-06, + "loss": 0.83250523, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.24621582, + "step": 2101, + "time_per_iteration": 2.776214361190796 + }, + { + "auxiliary_loss_clip": 0.01387448, + "auxiliary_loss_mlp": 0.01015662, + "balance_loss_clip": 1.28339243, + "balance_loss_mlp": 1.0100354, + "epoch": 0.12637907710807156, + "flos": 70297457782680.0, + "grad_norm": 0.758348550885393, + "language_loss": 0.57128429, + "learning_rate": 3.903348813579662e-06, + "loss": 0.5953154, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.05615234, + "step": 2102, + "time_per_iteration": 3.315375804901123 + }, + { + "auxiliary_loss_clip": 0.01551243, + "auxiliary_loss_mlp": 0.01055489, + "balance_loss_clip": 1.35663712, + "balance_loss_mlp": 1.03161192, + "epoch": 0.12643920036073952, + "flos": 18919572501600.0, + "grad_norm": 2.2571823829371427, + "language_loss": 0.93692082, + "learning_rate": 3.903229170377845e-06, + "loss": 0.96298814, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.23901367, + "step": 2103, + "time_per_iteration": 2.7825090885162354 + }, + { + "auxiliary_loss_clip": 0.01538934, + "auxiliary_loss_mlp": 0.01044654, + "balance_loss_clip": 1.35436285, + "balance_loss_mlp": 1.02279067, + "epoch": 0.1264993236134075, + "flos": 27788291513280.0, + "grad_norm": 1.587213924355077, + "language_loss": 0.78352964, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80936551, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.21838379, + "step": 2104, + "time_per_iteration": 2.8215737342834473 + }, + { + "auxiliary_loss_clip": 0.01556458, + "auxiliary_loss_mlp": 0.01059482, + "balance_loss_clip": 1.3644948, + "balance_loss_mlp": 1.03746367, + "epoch": 0.12655944686607545, + "flos": 24760195471800.0, + "grad_norm": 1.8160387978245465, + "language_loss": 0.81452489, + "learning_rate": 3.902989667466828e-06, + "loss": 0.84068429, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.22021484, + "step": 2105, + "time_per_iteration": 2.803889751434326 + }, + { + "auxiliary_loss_clip": 0.01563702, + "auxiliary_loss_mlp": 0.01058462, + "balance_loss_clip": 1.36523056, + "balance_loss_mlp": 1.03397679, + "epoch": 0.12661957011874342, + "flos": 24138282860280.0, + "grad_norm": 2.084208386863278, + "language_loss": 0.82941413, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85563576, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.24499512, + "step": 2106, + "time_per_iteration": 4.230465650558472 + }, + { + "auxiliary_loss_clip": 0.01549644, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.35766411, + "balance_loss_mlp": 1.0334394, + "epoch": 0.12667969337141138, + "flos": 24574740780960.0, + "grad_norm": 1.8360885728615837, + "language_loss": 0.74004436, + "learning_rate": 3.902749875909578e-06, + "loss": 0.76611221, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.23693848, + "step": 2107, + "time_per_iteration": 2.7903761863708496 + }, + { + "auxiliary_loss_clip": 0.01537801, + "auxiliary_loss_mlp": 0.01052924, + "balance_loss_clip": 1.35046387, + "balance_loss_mlp": 1.0310725, + "epoch": 0.12673981662407935, + "flos": 22966259863680.0, + "grad_norm": 1.8792011939308284, + "language_loss": 0.79147243, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81737959, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.21875, + "step": 2108, + "time_per_iteration": 2.7932653427124023 + }, + { + "auxiliary_loss_clip": 0.01554014, + "auxiliary_loss_mlp": 0.01053784, + "balance_loss_clip": 1.36021829, + "balance_loss_mlp": 1.02910733, + "epoch": 0.12679993987674734, + "flos": 17279352828000.0, + "grad_norm": 2.4463713118361943, + "language_loss": 0.76493776, + "learning_rate": 3.902509795742467e-06, + "loss": 0.79101574, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.24682617, + "step": 2109, + "time_per_iteration": 2.7469711303710938 + }, + { + "auxiliary_loss_clip": 0.0153999, + "auxiliary_loss_mlp": 0.01049981, + "balance_loss_clip": 1.35309827, + "balance_loss_mlp": 1.02643764, + "epoch": 0.1268600631294153, + "flos": 17279596478160.0, + "grad_norm": 1.6597367938611265, + "language_loss": 0.83260649, + "learning_rate": 3.902389647441592e-06, + "loss": 0.8585062, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.2355957, + "step": 2110, + "time_per_iteration": 4.408202409744263 + }, + { + "auxiliary_loss_clip": 0.0155851, + "auxiliary_loss_mlp": 0.01061222, + "balance_loss_clip": 1.36572361, + "balance_loss_mlp": 1.03612804, + "epoch": 0.12692018638208327, + "flos": 24066399417120.0, + "grad_norm": 1.6404391463710508, + "language_loss": 0.79187274, + "learning_rate": 3.90226942700191e-06, + "loss": 0.81807011, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.25085449, + "step": 2111, + "time_per_iteration": 4.340290784835815 + }, + { + "auxiliary_loss_clip": 0.01564594, + "auxiliary_loss_mlp": 0.01073802, + "balance_loss_clip": 1.3652004, + "balance_loss_mlp": 1.04757583, + "epoch": 0.12698030963475124, + "flos": 31838186935800.0, + "grad_norm": 2.0728868215919904, + "language_loss": 0.77450895, + "learning_rate": 3.902149134427982e-06, + "loss": 0.80089283, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.26196289, + "step": 2112, + "time_per_iteration": 4.295496225357056 + }, + { + "auxiliary_loss_clip": 0.01548685, + "auxiliary_loss_mlp": 0.01056851, + "balance_loss_clip": 1.35784793, + "balance_loss_mlp": 1.03258002, + "epoch": 0.1270404328874192, + "flos": 25192673773200.0, + "grad_norm": 1.8506222417376426, + "language_loss": 0.86005437, + "learning_rate": 3.902028769724367e-06, + "loss": 0.88610971, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.24291992, + "step": 2113, + "time_per_iteration": 2.8548567295074463 + }, + { + "auxiliary_loss_clip": 0.01553292, + "auxiliary_loss_mlp": 0.01065768, + "balance_loss_clip": 1.36141825, + "balance_loss_mlp": 1.041008, + "epoch": 0.12710055614008717, + "flos": 16001595997200.0, + "grad_norm": 2.0931473053211733, + "language_loss": 0.73920059, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76539123, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.24768066, + "step": 2114, + "time_per_iteration": 2.7964115142822266 + }, + { + "auxiliary_loss_clip": 0.01551877, + "auxiliary_loss_mlp": 0.01065408, + "balance_loss_clip": 1.36054707, + "balance_loss_mlp": 1.0382762, + "epoch": 0.12716067939275516, + "flos": 15089834936520.0, + "grad_norm": 1.7215428038485738, + "language_loss": 0.83535433, + "learning_rate": 3.901787823946341e-06, + "loss": 0.8615272, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.2713623, + "step": 2115, + "time_per_iteration": 2.7265543937683105 + }, + { + "auxiliary_loss_clip": 0.01552965, + "auxiliary_loss_mlp": 0.01059333, + "balance_loss_clip": 1.36099863, + "balance_loss_mlp": 1.03582478, + "epoch": 0.12722080264542313, + "flos": 28372698981360.0, + "grad_norm": 1.8562457993531463, + "language_loss": 0.86786449, + "learning_rate": 3.901667242881065e-06, + "loss": 0.89398742, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.23522949, + "step": 2116, + "time_per_iteration": 2.837646484375 + }, + { + "auxiliary_loss_clip": 0.01538795, + "auxiliary_loss_mlp": 0.01053663, + "balance_loss_clip": 1.34956074, + "balance_loss_mlp": 1.03070307, + "epoch": 0.1272809258980911, + "flos": 32386348119600.0, + "grad_norm": 1.661429083483199, + "language_loss": 0.70375055, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72967517, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.22961426, + "step": 2117, + "time_per_iteration": 2.841576099395752 + }, + { + "auxiliary_loss_clip": 0.01545959, + "auxiliary_loss_mlp": 0.01062155, + "balance_loss_clip": 1.35261333, + "balance_loss_mlp": 1.03735924, + "epoch": 0.12734104915075906, + "flos": 16038735665400.0, + "grad_norm": 2.137071918561477, + "language_loss": 0.87034619, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89642733, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.24780273, + "step": 2118, + "time_per_iteration": 2.691892147064209 + }, + { + "auxiliary_loss_clip": 0.01542009, + "auxiliary_loss_mlp": 0.01054749, + "balance_loss_clip": 1.35133886, + "balance_loss_mlp": 1.03169417, + "epoch": 0.12740117240342702, + "flos": 18264987141480.0, + "grad_norm": 1.8583351757587818, + "language_loss": 0.87376118, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89972872, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.23059082, + "step": 2119, + "time_per_iteration": 2.762409210205078 + }, + { + "auxiliary_loss_clip": 0.01553782, + "auxiliary_loss_mlp": 0.01051197, + "balance_loss_clip": 1.36231637, + "balance_loss_mlp": 1.02667618, + "epoch": 0.127461295656095, + "flos": 12124485457200.0, + "grad_norm": 2.184137571988748, + "language_loss": 0.88004756, + "learning_rate": 3.901184197551605e-06, + "loss": 0.90609735, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.24523926, + "step": 2120, + "time_per_iteration": 2.7971267700195312 + }, + { + "auxiliary_loss_clip": 0.01548408, + "auxiliary_loss_mlp": 0.01049547, + "balance_loss_clip": 1.35652375, + "balance_loss_mlp": 1.02601492, + "epoch": 0.12752141890876295, + "flos": 23154354097920.0, + "grad_norm": 2.3531335168845784, + "language_loss": 0.76050377, + "learning_rate": 3.901063255975046e-06, + "loss": 0.78648329, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.23522949, + "step": 2121, + "time_per_iteration": 2.768671989440918 + }, + { + "auxiliary_loss_clip": 0.01546515, + "auxiliary_loss_mlp": 0.01062742, + "balance_loss_clip": 1.35427916, + "balance_loss_mlp": 1.03843498, + "epoch": 0.12758154216143094, + "flos": 21620964684240.0, + "grad_norm": 2.226005222404334, + "language_loss": 0.83826864, + "learning_rate": 3.900942242309978e-06, + "loss": 0.86436123, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.24279785, + "step": 2122, + "time_per_iteration": 2.7362120151519775 + }, + { + "auxiliary_loss_clip": 0.01551404, + "auxiliary_loss_mlp": 0.01052978, + "balance_loss_clip": 1.35776019, + "balance_loss_mlp": 1.03020918, + "epoch": 0.1276416654140989, + "flos": 15929184645360.0, + "grad_norm": 1.708064543119852, + "language_loss": 0.79424763, + "learning_rate": 3.90082115656099e-06, + "loss": 0.8202914, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.22753906, + "step": 2123, + "time_per_iteration": 2.79457950592041 + }, + { + "auxiliary_loss_clip": 0.01553295, + "auxiliary_loss_mlp": 0.01054552, + "balance_loss_clip": 1.36097789, + "balance_loss_mlp": 1.03134191, + "epoch": 0.12770178866676687, + "flos": 22387821824520.0, + "grad_norm": 1.7476494694991733, + "language_loss": 0.79358101, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81965947, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.23205566, + "step": 2124, + "time_per_iteration": 2.7673943042755127 + }, + { + "auxiliary_loss_clip": 0.01554481, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.35910106, + "balance_loss_mlp": 1.03812456, + "epoch": 0.12776191191943484, + "flos": 21657454618680.0, + "grad_norm": 1.8513082096178914, + "language_loss": 0.75654626, + "learning_rate": 3.900578768829623e-06, + "loss": 0.78272235, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.25012207, + "step": 2125, + "time_per_iteration": 2.793621063232422 + }, + { + "auxiliary_loss_clip": 0.01544317, + "auxiliary_loss_mlp": 0.01053921, + "balance_loss_clip": 1.35117948, + "balance_loss_mlp": 1.02974558, + "epoch": 0.1278220351721028, + "flos": 25740631915200.0, + "grad_norm": 2.1352413065045237, + "language_loss": 0.78460765, + "learning_rate": 3.900457466856434e-06, + "loss": 0.81059003, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.24145508, + "step": 2126, + "time_per_iteration": 2.8464457988739014 + }, + { + "auxiliary_loss_clip": 0.01550371, + "auxiliary_loss_mlp": 0.01054251, + "balance_loss_clip": 1.3594861, + "balance_loss_mlp": 1.03222108, + "epoch": 0.12788215842477077, + "flos": 41250478386600.0, + "grad_norm": 1.5245395980919425, + "language_loss": 0.69235188, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71839809, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.22033691, + "step": 2127, + "time_per_iteration": 3.054508686065674 + }, + { + "auxiliary_loss_clip": 0.0136848, + "auxiliary_loss_mlp": 0.01005755, + "balance_loss_clip": 1.26210546, + "balance_loss_mlp": 0.99991351, + "epoch": 0.12794228167743876, + "flos": 70894332017280.0, + "grad_norm": 0.8438819337106299, + "language_loss": 0.62849319, + "learning_rate": 3.900214646718047e-06, + "loss": 0.65223557, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.05834961, + "step": 2128, + "time_per_iteration": 3.3285884857177734 + }, + { + "auxiliary_loss_clip": 0.01552262, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_clip": 1.35694373, + "balance_loss_mlp": 1.02732992, + "epoch": 0.12800240493010673, + "flos": 16294083989760.0, + "grad_norm": 2.069775746176171, + "language_loss": 0.77821863, + "learning_rate": 3.900093128562056e-06, + "loss": 0.80425954, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.24487305, + "step": 2129, + "time_per_iteration": 2.7380459308624268 + }, + { + "auxiliary_loss_clip": 0.01561938, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_clip": 1.3617456, + "balance_loss_mlp": 1.03101587, + "epoch": 0.1280625281827747, + "flos": 20636467404840.0, + "grad_norm": 2.2486724938864895, + "language_loss": 0.79913342, + "learning_rate": 3.899971538354343e-06, + "loss": 0.82531893, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.25585938, + "step": 2130, + "time_per_iteration": 2.766414165496826 + }, + { + "auxiliary_loss_clip": 0.01552067, + "auxiliary_loss_mlp": 0.0105466, + "balance_loss_clip": 1.35416341, + "balance_loss_mlp": 1.03199863, + "epoch": 0.12812265143544266, + "flos": 22643413799040.0, + "grad_norm": 2.593583072321479, + "language_loss": 0.71547961, + "learning_rate": 3.899849876099518e-06, + "loss": 0.74154687, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.22668457, + "step": 2131, + "time_per_iteration": 2.809133768081665 + }, + { + "auxiliary_loss_clip": 0.01541625, + "auxiliary_loss_mlp": 0.01056556, + "balance_loss_clip": 1.34851408, + "balance_loss_mlp": 1.03310728, + "epoch": 0.12818277468811062, + "flos": 34721419665240.0, + "grad_norm": 1.9181156001025577, + "language_loss": 0.72786403, + "learning_rate": 3.899728141802197e-06, + "loss": 0.75384581, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.23413086, + "step": 2132, + "time_per_iteration": 2.9030280113220215 + }, + { + "auxiliary_loss_clip": 0.01542101, + "auxiliary_loss_mlp": 0.01052759, + "balance_loss_clip": 1.35400403, + "balance_loss_mlp": 1.02975154, + "epoch": 0.1282428979407786, + "flos": 23117295646440.0, + "grad_norm": 2.511671315073572, + "language_loss": 0.82547671, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.85142529, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.23010254, + "step": 2133, + "time_per_iteration": 2.8070836067199707 + }, + { + "auxiliary_loss_clip": 0.01555968, + "auxiliary_loss_mlp": 0.01067258, + "balance_loss_clip": 1.35569811, + "balance_loss_mlp": 1.04242682, + "epoch": 0.12830302119344655, + "flos": 20891409645600.0, + "grad_norm": 3.4497310839285333, + "language_loss": 0.80630565, + "learning_rate": 3.899484457098528e-06, + "loss": 0.83253789, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.24829102, + "step": 2134, + "time_per_iteration": 2.7576217651367188 + }, + { + "auxiliary_loss_clip": 0.01545677, + "auxiliary_loss_mlp": 0.01051931, + "balance_loss_clip": 1.35187721, + "balance_loss_mlp": 1.02886355, + "epoch": 0.12836314444611455, + "flos": 21402796636440.0, + "grad_norm": 1.66117322264972, + "language_loss": 0.8342995, + "learning_rate": 3.899362506701421e-06, + "loss": 0.86027563, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.23083496, + "step": 2135, + "time_per_iteration": 2.7657179832458496 + }, + { + "auxiliary_loss_clip": 0.01534181, + "auxiliary_loss_mlp": 0.01063158, + "balance_loss_clip": 1.34374154, + "balance_loss_mlp": 1.03916132, + "epoch": 0.1284232676987825, + "flos": 13666443234840.0, + "grad_norm": 2.157709716323528, + "language_loss": 0.77504349, + "learning_rate": 3.899240484280298e-06, + "loss": 0.80101693, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.23974609, + "step": 2136, + "time_per_iteration": 2.7252519130706787 + }, + { + "auxiliary_loss_clip": 0.01373326, + "auxiliary_loss_mlp": 0.01017558, + "balance_loss_clip": 1.26693487, + "balance_loss_mlp": 1.01183593, + "epoch": 0.12848339095145048, + "flos": 60008920605000.0, + "grad_norm": 0.9208318371922003, + "language_loss": 0.59183824, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61574709, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.05712891, + "step": 2137, + "time_per_iteration": 3.382551431655884 + }, + { + "auxiliary_loss_clip": 0.01539881, + "auxiliary_loss_mlp": 0.01054128, + "balance_loss_clip": 1.34514713, + "balance_loss_mlp": 1.03140724, + "epoch": 0.12854351420411844, + "flos": 13885382841480.0, + "grad_norm": 2.1716621892236203, + "language_loss": 0.83258069, + "learning_rate": 3.898996223384512e-06, + "loss": 0.85852075, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.22717285, + "step": 2138, + "time_per_iteration": 2.766625165939331 + }, + { + "auxiliary_loss_clip": 0.01544573, + "auxiliary_loss_mlp": 0.01054125, + "balance_loss_clip": 1.3483547, + "balance_loss_mlp": 1.02946019, + "epoch": 0.1286036374567864, + "flos": 22643088932160.0, + "grad_norm": 2.262996528747599, + "language_loss": 0.79322565, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81921268, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24645996, + "step": 2139, + "time_per_iteration": 2.784180164337158 + }, + { + "auxiliary_loss_clip": 0.01540936, + "auxiliary_loss_mlp": 0.01050531, + "balance_loss_clip": 1.34653974, + "balance_loss_mlp": 1.02754796, + "epoch": 0.12866376070945437, + "flos": 16328949589800.0, + "grad_norm": 2.0449762820541793, + "language_loss": 0.85420823, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.8801229, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.2298584, + "step": 2140, + "time_per_iteration": 2.7947590351104736 + }, + { + "auxiliary_loss_clip": 0.01528679, + "auxiliary_loss_mlp": 0.01053564, + "balance_loss_clip": 1.3371017, + "balance_loss_mlp": 1.03176022, + "epoch": 0.12872388396212234, + "flos": 11878274013840.0, + "grad_norm": 2.1358233693806348, + "language_loss": 0.862535, + "learning_rate": 3.898629291976476e-06, + "loss": 0.8883574, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.21789551, + "step": 2141, + "time_per_iteration": 2.761688470840454 + }, + { + "auxiliary_loss_clip": 0.01541752, + "auxiliary_loss_mlp": 0.01053979, + "balance_loss_clip": 1.34488106, + "balance_loss_mlp": 1.03093553, + "epoch": 0.12878400721479033, + "flos": 28372658373000.0, + "grad_norm": 2.3002508072482475, + "language_loss": 0.68433243, + "learning_rate": 3.898506837508518e-06, + "loss": 0.71028978, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.23046875, + "step": 2142, + "time_per_iteration": 2.8037683963775635 + }, + { + "auxiliary_loss_clip": 0.0154072, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_clip": 1.34366393, + "balance_loss_mlp": 1.0234344, + "epoch": 0.1288441304674583, + "flos": 25891261614360.0, + "grad_norm": 2.3753459524284573, + "language_loss": 0.83433414, + "learning_rate": 3.89838431104899e-06, + "loss": 0.86022103, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.24523926, + "step": 2143, + "time_per_iteration": 2.8644447326660156 + }, + { + "auxiliary_loss_clip": 0.01541912, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.34499252, + "balance_loss_mlp": 1.02690399, + "epoch": 0.12890425372012626, + "flos": 20818714035240.0, + "grad_norm": 1.6112060745648795, + "language_loss": 0.82232344, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84825802, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.24658203, + "step": 2144, + "time_per_iteration": 4.1798624992370605 + }, + { + "auxiliary_loss_clip": 0.01536557, + "auxiliary_loss_mlp": 0.01052464, + "balance_loss_clip": 1.34349608, + "balance_loss_mlp": 1.02828813, + "epoch": 0.12896437697279423, + "flos": 22571124272280.0, + "grad_norm": 2.257819230515767, + "language_loss": 0.78964418, + "learning_rate": 3.898139042173813e-06, + "loss": 0.81553441, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.24169922, + "step": 2145, + "time_per_iteration": 2.7936651706695557 + }, + { + "auxiliary_loss_clip": 0.0153305, + "auxiliary_loss_mlp": 0.01054741, + "balance_loss_clip": 1.33928561, + "balance_loss_mlp": 1.03061295, + "epoch": 0.1290245002254622, + "flos": 17498292434640.0, + "grad_norm": 2.166035568505126, + "language_loss": 0.83518916, + "learning_rate": 3.898016299767465e-06, + "loss": 0.86106706, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.24145508, + "step": 2146, + "time_per_iteration": 2.7444796562194824 + }, + { + "auxiliary_loss_clip": 0.01541741, + "auxiliary_loss_mlp": 0.01057534, + "balance_loss_clip": 1.34871888, + "balance_loss_mlp": 1.03290582, + "epoch": 0.12908462347813016, + "flos": 36322347427560.0, + "grad_norm": 2.4146213927033244, + "language_loss": 0.71460009, + "learning_rate": 3.897893485388149e-06, + "loss": 0.74059284, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.24658203, + "step": 2147, + "time_per_iteration": 2.93450665473938 + }, + { + "auxiliary_loss_clip": 0.01536291, + "auxiliary_loss_mlp": 0.01055238, + "balance_loss_clip": 1.34323585, + "balance_loss_mlp": 1.03307724, + "epoch": 0.12914474673079815, + "flos": 22533862779000.0, + "grad_norm": 2.0452535290869553, + "language_loss": 0.72826612, + "learning_rate": 3.897770599040521e-06, + "loss": 0.75418144, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.22155762, + "step": 2148, + "time_per_iteration": 2.7440345287323 + }, + { + "auxiliary_loss_clip": 0.01522558, + "auxiliary_loss_mlp": 0.01046505, + "balance_loss_clip": 1.33268309, + "balance_loss_mlp": 1.02378368, + "epoch": 0.12920486998346611, + "flos": 21477075972840.0, + "grad_norm": 1.6039601995594925, + "language_loss": 0.79096699, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81665766, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.22705078, + "step": 2149, + "time_per_iteration": 4.2881574630737305 + }, + { + "auxiliary_loss_clip": 0.0153402, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_clip": 1.34030211, + "balance_loss_mlp": 1.02565217, + "epoch": 0.12926499323613408, + "flos": 27314369057520.0, + "grad_norm": 2.0690558113677255, + "language_loss": 0.76360393, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78943563, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.23510742, + "step": 2150, + "time_per_iteration": 4.235165596008301 + }, + { + "auxiliary_loss_clip": 0.01536131, + "auxiliary_loss_mlp": 0.01056528, + "balance_loss_clip": 1.34075832, + "balance_loss_mlp": 1.0322572, + "epoch": 0.12932511648880204, + "flos": 22096267824240.0, + "grad_norm": 2.8135391474051246, + "language_loss": 0.71296304, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.73888963, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.24279785, + "step": 2151, + "time_per_iteration": 4.24524188041687 + }, + { + "auxiliary_loss_clip": 0.01532845, + "auxiliary_loss_mlp": 0.01048436, + "balance_loss_clip": 1.34138155, + "balance_loss_mlp": 1.02669239, + "epoch": 0.12938523974147, + "flos": 20307611302920.0, + "grad_norm": 1.796281880206373, + "language_loss": 0.84525937, + "learning_rate": 3.897278334060137e-06, + "loss": 0.87107217, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.2175293, + "step": 2152, + "time_per_iteration": 2.7630183696746826 + }, + { + "auxiliary_loss_clip": 0.01538927, + "auxiliary_loss_mlp": 0.01053563, + "balance_loss_clip": 1.34729111, + "balance_loss_mlp": 1.03063917, + "epoch": 0.12944536299413797, + "flos": 19504670311800.0, + "grad_norm": 1.800152798521506, + "language_loss": 0.78731859, + "learning_rate": 3.897155087940906e-06, + "loss": 0.81324351, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.22924805, + "step": 2153, + "time_per_iteration": 2.886021137237549 + }, + { + "auxiliary_loss_clip": 0.01535903, + "auxiliary_loss_mlp": 0.01060746, + "balance_loss_clip": 1.34543204, + "balance_loss_mlp": 1.03698754, + "epoch": 0.12950548624680594, + "flos": 27713524876560.0, + "grad_norm": 3.1869154501827923, + "language_loss": 0.8050586, + "learning_rate": 3.897031769881364e-06, + "loss": 0.83102506, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.23779297, + "step": 2154, + "time_per_iteration": 2.7896971702575684 + }, + { + "auxiliary_loss_clip": 0.01534373, + "auxiliary_loss_mlp": 0.01053891, + "balance_loss_clip": 1.34158564, + "balance_loss_mlp": 1.02976322, + "epoch": 0.12956560949947393, + "flos": 17570135269440.0, + "grad_norm": 2.035499664664855, + "language_loss": 0.8323307, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85821331, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.2409668, + "step": 2155, + "time_per_iteration": 2.734889030456543 + }, + { + "auxiliary_loss_clip": 0.01536015, + "auxiliary_loss_mlp": 0.01055398, + "balance_loss_clip": 1.34036899, + "balance_loss_mlp": 1.03193712, + "epoch": 0.1296257327521419, + "flos": 20745612341280.0, + "grad_norm": 4.746842603663551, + "language_loss": 0.76383132, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78974545, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.23461914, + "step": 2156, + "time_per_iteration": 2.8053488731384277 + }, + { + "auxiliary_loss_clip": 0.01537527, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_clip": 1.34624815, + "balance_loss_mlp": 1.02952361, + "epoch": 0.12968585600480986, + "flos": 16399655390520.0, + "grad_norm": 1.9806804655943602, + "language_loss": 0.87034607, + "learning_rate": 3.896661384107648e-06, + "loss": 0.89624685, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.23022461, + "step": 2157, + "time_per_iteration": 2.7187552452087402 + }, + { + "auxiliary_loss_clip": 0.01542735, + "auxiliary_loss_mlp": 0.01062682, + "balance_loss_clip": 1.34535539, + "balance_loss_mlp": 1.03748071, + "epoch": 0.12974597925747783, + "flos": 28335396879720.0, + "grad_norm": 3.2416493154275097, + "language_loss": 0.81163836, + "learning_rate": 3.896537778333651e-06, + "loss": 0.8376925, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.25219727, + "step": 2158, + "time_per_iteration": 2.8086938858032227 + }, + { + "auxiliary_loss_clip": 0.01539058, + "auxiliary_loss_mlp": 0.01058485, + "balance_loss_clip": 1.34275675, + "balance_loss_mlp": 1.03475022, + "epoch": 0.1298061025101458, + "flos": 9686157187320.0, + "grad_norm": 2.099228588596435, + "language_loss": 0.7513774, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77735281, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.23718262, + "step": 2159, + "time_per_iteration": 2.689530611038208 + }, + { + "auxiliary_loss_clip": 0.01524236, + "auxiliary_loss_mlp": 0.01052837, + "balance_loss_clip": 1.33425176, + "balance_loss_mlp": 1.02881682, + "epoch": 0.12986622576281376, + "flos": 27715352252760.0, + "grad_norm": 1.8928306921801565, + "language_loss": 0.82925218, + "learning_rate": 3.89629035103964e-06, + "loss": 0.85502291, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.24023438, + "step": 2160, + "time_per_iteration": 2.8063318729400635 + }, + { + "auxiliary_loss_clip": 0.01529115, + "auxiliary_loss_mlp": 0.01054581, + "balance_loss_clip": 1.34257531, + "balance_loss_mlp": 1.02954674, + "epoch": 0.12992634901548175, + "flos": 18806813421120.0, + "grad_norm": 1.591911541571755, + "language_loss": 0.82397652, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84981352, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.25048828, + "step": 2161, + "time_per_iteration": 2.773881673812866 + }, + { + "auxiliary_loss_clip": 0.015384, + "auxiliary_loss_mlp": 0.01054041, + "balance_loss_clip": 1.3464433, + "balance_loss_mlp": 1.02936482, + "epoch": 0.12998647226814972, + "flos": 29132936958960.0, + "grad_norm": 1.981526235197842, + "language_loss": 0.83477306, + "learning_rate": 3.896042636115551e-06, + "loss": 0.86069745, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.2467041, + "step": 2162, + "time_per_iteration": 2.8202428817749023 + }, + { + "auxiliary_loss_clip": 0.01535395, + "auxiliary_loss_mlp": 0.01055599, + "balance_loss_clip": 1.33970058, + "balance_loss_mlp": 1.032305, + "epoch": 0.13004659552081768, + "flos": 19578502956240.0, + "grad_norm": 2.57231728364107, + "language_loss": 0.73652256, + "learning_rate": 3.895918670803968e-06, + "loss": 0.76243246, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.23278809, + "step": 2163, + "time_per_iteration": 2.8104493618011475 + }, + { + "auxiliary_loss_clip": 0.01544471, + "auxiliary_loss_mlp": 0.01064141, + "balance_loss_clip": 1.34862566, + "balance_loss_mlp": 1.03780806, + "epoch": 0.13010671877348565, + "flos": 22495667293440.0, + "grad_norm": 3.272611222649361, + "language_loss": 0.81383312, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83991927, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.26330566, + "step": 2164, + "time_per_iteration": 2.8448734283447266 + }, + { + "auxiliary_loss_clip": 0.01538171, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.34285259, + "balance_loss_mlp": 1.02719378, + "epoch": 0.1301668420261536, + "flos": 23883503052960.0, + "grad_norm": 1.986889016779083, + "language_loss": 0.72144914, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74732572, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.22290039, + "step": 2165, + "time_per_iteration": 2.904811382293701 + }, + { + "auxiliary_loss_clip": 0.0154761, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_clip": 1.35053539, + "balance_loss_mlp": 1.02626479, + "epoch": 0.13022696527882158, + "flos": 23155450523640.0, + "grad_norm": 1.6102220661338786, + "language_loss": 0.75309253, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.77908915, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.25830078, + "step": 2166, + "time_per_iteration": 2.7756893634796143 + }, + { + "auxiliary_loss_clip": 0.01537719, + "auxiliary_loss_mlp": 0.01053982, + "balance_loss_clip": 1.34283686, + "balance_loss_mlp": 1.0314517, + "epoch": 0.13028708853148954, + "flos": 26914482288000.0, + "grad_norm": 1.5607844193302642, + "language_loss": 0.83220708, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85812414, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.22546387, + "step": 2167, + "time_per_iteration": 2.8805201053619385 + }, + { + "auxiliary_loss_clip": 0.01534488, + "auxiliary_loss_mlp": 0.01060799, + "balance_loss_clip": 1.34346223, + "balance_loss_mlp": 1.03578877, + "epoch": 0.13034721178415754, + "flos": 21256430815080.0, + "grad_norm": 1.6956489392167868, + "language_loss": 0.84000999, + "learning_rate": 3.89529776593877e-06, + "loss": 0.86596286, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.25036621, + "step": 2168, + "time_per_iteration": 2.8320200443267822 + }, + { + "auxiliary_loss_clip": 0.01534779, + "auxiliary_loss_mlp": 0.01060996, + "balance_loss_clip": 1.34100556, + "balance_loss_mlp": 1.03561616, + "epoch": 0.1304073350368255, + "flos": 18771460520760.0, + "grad_norm": 1.8685676437724548, + "language_loss": 0.80218285, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82814062, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.25378418, + "step": 2169, + "time_per_iteration": 2.759096384048462 + }, + { + "auxiliary_loss_clip": 0.01537499, + "auxiliary_loss_mlp": 0.01049046, + "balance_loss_clip": 1.34320569, + "balance_loss_mlp": 1.0241195, + "epoch": 0.13046745828949347, + "flos": 28370343696480.0, + "grad_norm": 2.915440245908459, + "language_loss": 0.67936528, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.70523071, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.24926758, + "step": 2170, + "time_per_iteration": 2.835097551345825 + }, + { + "auxiliary_loss_clip": 0.01532576, + "auxiliary_loss_mlp": 0.01051198, + "balance_loss_clip": 1.34171891, + "balance_loss_mlp": 1.02785647, + "epoch": 0.13052758154216143, + "flos": 29610879642360.0, + "grad_norm": 1.4712880259036347, + "language_loss": 0.67336798, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69920576, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.23352051, + "step": 2171, + "time_per_iteration": 2.811612367630005 + }, + { + "auxiliary_loss_clip": 0.01527638, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.33563399, + "balance_loss_mlp": 1.0299654, + "epoch": 0.1305877047948294, + "flos": 19395606592080.0, + "grad_norm": 2.1383423512860005, + "language_loss": 0.72288436, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74871171, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.25146484, + "step": 2172, + "time_per_iteration": 2.8909878730773926 + }, + { + "auxiliary_loss_clip": 0.0152287, + "auxiliary_loss_mlp": 0.01054551, + "balance_loss_clip": 1.33671236, + "balance_loss_mlp": 1.03128123, + "epoch": 0.13064782804749736, + "flos": 16877963549160.0, + "grad_norm": 1.9954803929827114, + "language_loss": 0.75526065, + "learning_rate": 3.894675064326678e-06, + "loss": 0.78103483, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23278809, + "step": 2173, + "time_per_iteration": 2.9150302410125732 + }, + { + "auxiliary_loss_clip": 0.01534097, + "auxiliary_loss_mlp": 0.01058278, + "balance_loss_clip": 1.33984232, + "balance_loss_mlp": 1.03258896, + "epoch": 0.13070795130016533, + "flos": 24504522280560.0, + "grad_norm": 2.5255271049297625, + "language_loss": 0.71094584, + "learning_rate": 3.894550308446551e-06, + "loss": 0.73686957, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.25683594, + "step": 2174, + "time_per_iteration": 2.7990429401397705 + }, + { + "auxiliary_loss_clip": 0.01349297, + "auxiliary_loss_mlp": 0.01007066, + "balance_loss_clip": 1.24408913, + "balance_loss_mlp": 1.00158262, + "epoch": 0.13076807455283332, + "flos": 71070731043840.0, + "grad_norm": 0.7957250190634, + "language_loss": 0.59061742, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61418104, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.05493164, + "step": 2175, + "time_per_iteration": 3.375551700592041 + }, + { + "auxiliary_loss_clip": 0.01515395, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.32395697, + "balance_loss_mlp": 1.03402376, + "epoch": 0.13082819780550128, + "flos": 20269090950480.0, + "grad_norm": 2.093222152190804, + "language_loss": 0.80768299, + "learning_rate": 3.894300581166417e-06, + "loss": 0.83341473, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.23779297, + "step": 2176, + "time_per_iteration": 2.876739501953125 + }, + { + "auxiliary_loss_clip": 0.01520525, + "auxiliary_loss_mlp": 0.01054768, + "balance_loss_clip": 1.32913506, + "balance_loss_mlp": 1.0314033, + "epoch": 0.13088832105816925, + "flos": 34210032674400.0, + "grad_norm": 2.404707474157556, + "language_loss": 0.75379205, + "learning_rate": 3.894175609775881e-06, + "loss": 0.77954495, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.23400879, + "step": 2177, + "time_per_iteration": 2.9072823524475098 + }, + { + "auxiliary_loss_clip": 0.01523346, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.33431399, + "balance_loss_mlp": 1.02813411, + "epoch": 0.13094844431083721, + "flos": 17899519280040.0, + "grad_norm": 1.8537111678786096, + "language_loss": 0.82422018, + "learning_rate": 3.894050566558015e-06, + "loss": 0.85000563, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.27062988, + "step": 2178, + "time_per_iteration": 2.7443649768829346 + }, + { + "auxiliary_loss_clip": 0.01516829, + "auxiliary_loss_mlp": 0.01045636, + "balance_loss_clip": 1.32615638, + "balance_loss_mlp": 1.02351046, + "epoch": 0.13100856756350518, + "flos": 17315802154080.0, + "grad_norm": 2.3819909390495333, + "language_loss": 0.74865699, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77428162, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.22143555, + "step": 2179, + "time_per_iteration": 2.7421998977661133 + }, + { + "auxiliary_loss_clip": 0.0151711, + "auxiliary_loss_mlp": 0.01054962, + "balance_loss_clip": 1.3307122, + "balance_loss_mlp": 1.03322983, + "epoch": 0.13106869081617314, + "flos": 22205697019200.0, + "grad_norm": 2.230923269335419, + "language_loss": 0.85011268, + "learning_rate": 3.893800264659266e-06, + "loss": 0.87583339, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.21728516, + "step": 2180, + "time_per_iteration": 2.7561395168304443 + }, + { + "auxiliary_loss_clip": 0.01516906, + "auxiliary_loss_mlp": 0.01054975, + "balance_loss_clip": 1.32548368, + "balance_loss_mlp": 1.03202677, + "epoch": 0.13112881406884114, + "flos": 21768264497880.0, + "grad_norm": 1.7972673727784862, + "language_loss": 0.90137851, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92709732, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.22949219, + "step": 2181, + "time_per_iteration": 2.755540370941162 + }, + { + "auxiliary_loss_clip": 0.01515686, + "auxiliary_loss_mlp": 0.01052767, + "balance_loss_clip": 1.32550049, + "balance_loss_mlp": 1.03072548, + "epoch": 0.1311889373215091, + "flos": 23336316469800.0, + "grad_norm": 2.1633609716166746, + "language_loss": 0.69229615, + "learning_rate": 3.893549675508137e-06, + "loss": 0.71798074, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.22045898, + "step": 2182, + "time_per_iteration": 2.746774911880493 + }, + { + "auxiliary_loss_clip": 0.01508867, + "auxiliary_loss_mlp": 0.01052009, + "balance_loss_clip": 1.31674004, + "balance_loss_mlp": 1.02893043, + "epoch": 0.13124906057417707, + "flos": 21471959319480.0, + "grad_norm": 2.0757520687151825, + "language_loss": 0.78705275, + "learning_rate": 3.893424273224806e-06, + "loss": 0.81266153, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.23095703, + "step": 2183, + "time_per_iteration": 4.206738471984863 + }, + { + "auxiliary_loss_clip": 0.01506435, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.31833088, + "balance_loss_mlp": 1.02544761, + "epoch": 0.13130918382684503, + "flos": 23260169148840.0, + "grad_norm": 1.529628151815561, + "language_loss": 0.85482943, + "learning_rate": 3.893298799142636e-06, + "loss": 0.88037848, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.23022461, + "step": 2184, + "time_per_iteration": 2.759995937347412 + }, + { + "auxiliary_loss_clip": 0.01520014, + "auxiliary_loss_mlp": 0.01051396, + "balance_loss_clip": 1.32905722, + "balance_loss_mlp": 1.02785194, + "epoch": 0.131369307079513, + "flos": 20855203969680.0, + "grad_norm": 1.8993525731170073, + "language_loss": 0.82848668, + "learning_rate": 3.893173253266387e-06, + "loss": 0.85420084, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.23535156, + "step": 2185, + "time_per_iteration": 2.750993013381958 + }, + { + "auxiliary_loss_clip": 0.01517353, + "auxiliary_loss_mlp": 0.01055487, + "balance_loss_clip": 1.32641041, + "balance_loss_mlp": 1.03058457, + "epoch": 0.13142943033218096, + "flos": 17862866912160.0, + "grad_norm": 1.8620974284601108, + "language_loss": 0.7271862, + "learning_rate": 3.893047635600818e-06, + "loss": 0.75291461, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.24914551, + "step": 2186, + "time_per_iteration": 2.795496702194214 + }, + { + "auxiliary_loss_clip": 0.01505419, + "auxiliary_loss_mlp": 0.0105494, + "balance_loss_clip": 1.31685257, + "balance_loss_mlp": 1.0302515, + "epoch": 0.13148955358484893, + "flos": 21000960665640.0, + "grad_norm": 2.0958058040611034, + "language_loss": 0.80469072, + "learning_rate": 3.892921946150693e-06, + "loss": 0.83029431, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.24707031, + "step": 2187, + "time_per_iteration": 4.317295789718628 + }, + { + "auxiliary_loss_clip": 0.01355305, + "auxiliary_loss_mlp": 0.01021695, + "balance_loss_clip": 1.24704671, + "balance_loss_mlp": 1.0165925, + "epoch": 0.13154967683751692, + "flos": 70187744329200.0, + "grad_norm": 0.8459944832427595, + "language_loss": 0.59025693, + "learning_rate": 3.892796184920778e-06, + "loss": 0.6140269, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.05102539, + "step": 2188, + "time_per_iteration": 4.863296270370483 + }, + { + "auxiliary_loss_clip": 0.01511374, + "auxiliary_loss_mlp": 0.01051274, + "balance_loss_clip": 1.32387471, + "balance_loss_mlp": 1.02925587, + "epoch": 0.1316098000901849, + "flos": 20381037863760.0, + "grad_norm": 1.6527983337276788, + "language_loss": 0.74449229, + "learning_rate": 3.892670351915842e-06, + "loss": 0.77011883, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.22009277, + "step": 2189, + "time_per_iteration": 4.233031272888184 + }, + { + "auxiliary_loss_clip": 0.01506541, + "auxiliary_loss_mlp": 0.0104507, + "balance_loss_clip": 1.31819963, + "balance_loss_mlp": 1.02299237, + "epoch": 0.13166992334285285, + "flos": 23226278149440.0, + "grad_norm": 1.9769129296852408, + "language_loss": 0.72601604, + "learning_rate": 3.892544447140657e-06, + "loss": 0.7515322, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.22106934, + "step": 2190, + "time_per_iteration": 2.836238384246826 + }, + { + "auxiliary_loss_clip": 0.01510625, + "auxiliary_loss_mlp": 0.01055417, + "balance_loss_clip": 1.32350802, + "balance_loss_mlp": 1.03302944, + "epoch": 0.13173004659552082, + "flos": 23336154036360.0, + "grad_norm": 7.760911185811171, + "language_loss": 0.74847871, + "learning_rate": 3.892418470599996e-06, + "loss": 0.77413911, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.22399902, + "step": 2191, + "time_per_iteration": 2.841308116912842 + }, + { + "auxiliary_loss_clip": 0.01513091, + "auxiliary_loss_mlp": 0.01051591, + "balance_loss_clip": 1.32178986, + "balance_loss_mlp": 1.02922738, + "epoch": 0.13179016984818878, + "flos": 21256471423440.0, + "grad_norm": 2.2154879883587144, + "language_loss": 0.79510927, + "learning_rate": 3.892292422298637e-06, + "loss": 0.82075614, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.22351074, + "step": 2192, + "time_per_iteration": 2.8207926750183105 + }, + { + "auxiliary_loss_clip": 0.01520773, + "auxiliary_loss_mlp": 0.01048049, + "balance_loss_clip": 1.33079302, + "balance_loss_mlp": 1.02495861, + "epoch": 0.13185029310085675, + "flos": 17782333888320.0, + "grad_norm": 1.912810503985894, + "language_loss": 0.85505605, + "learning_rate": 3.892166302241361e-06, + "loss": 0.88074422, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.23071289, + "step": 2193, + "time_per_iteration": 2.766292095184326 + }, + { + "auxiliary_loss_clip": 0.01352835, + "auxiliary_loss_mlp": 0.01011482, + "balance_loss_clip": 1.24602985, + "balance_loss_mlp": 1.00649917, + "epoch": 0.1319104163535247, + "flos": 69868025108280.0, + "grad_norm": 0.7513571150853664, + "language_loss": 0.54078698, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56443012, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04980469, + "step": 2194, + "time_per_iteration": 3.2100744247436523 + }, + { + "auxiliary_loss_clip": 0.01513832, + "auxiliary_loss_mlp": 0.01052448, + "balance_loss_clip": 1.32577014, + "balance_loss_mlp": 1.0296433, + "epoch": 0.1319705396061927, + "flos": 25198724418840.0, + "grad_norm": 1.6242333764312418, + "language_loss": 0.72159284, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74725568, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.22790527, + "step": 2195, + "time_per_iteration": 2.7953481674194336 + }, + { + "auxiliary_loss_clip": 0.01519988, + "auxiliary_loss_mlp": 0.01049675, + "balance_loss_clip": 1.32607293, + "balance_loss_mlp": 1.02553558, + "epoch": 0.13203066285886067, + "flos": 20745328082760.0, + "grad_norm": 1.5468102659124259, + "language_loss": 0.78511584, + "learning_rate": 3.891787511581859e-06, + "loss": 0.81081247, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.24169922, + "step": 2196, + "time_per_iteration": 2.7785074710845947 + }, + { + "auxiliary_loss_clip": 0.01523758, + "auxiliary_loss_mlp": 0.01053693, + "balance_loss_clip": 1.32991672, + "balance_loss_mlp": 1.03086412, + "epoch": 0.13209078611152864, + "flos": 22059493631280.0, + "grad_norm": 2.4074955748446305, + "language_loss": 0.75012803, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77590257, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.22839355, + "step": 2197, + "time_per_iteration": 2.857781410217285 + }, + { + "auxiliary_loss_clip": 0.01525385, + "auxiliary_loss_mlp": 0.0105438, + "balance_loss_clip": 1.3313911, + "balance_loss_mlp": 1.03064561, + "epoch": 0.1321509093641966, + "flos": 16288683077880.0, + "grad_norm": 1.8270202234305892, + "language_loss": 0.80297935, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82877707, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.23730469, + "step": 2198, + "time_per_iteration": 2.7896475791931152 + }, + { + "auxiliary_loss_clip": 0.01516316, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.32478607, + "balance_loss_mlp": 1.03671229, + "epoch": 0.13221103261686457, + "flos": 16987473960840.0, + "grad_norm": 2.284170943153709, + "language_loss": 0.82837075, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85413575, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.23474121, + "step": 2199, + "time_per_iteration": 2.735281467437744 + }, + { + "auxiliary_loss_clip": 0.01506884, + "auxiliary_loss_mlp": 0.01052748, + "balance_loss_clip": 1.31759262, + "balance_loss_mlp": 1.03040838, + "epoch": 0.13227115586953253, + "flos": 34239375537480.0, + "grad_norm": 1.6575042226720693, + "language_loss": 0.69437426, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71997058, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.22338867, + "step": 2200, + "time_per_iteration": 2.854187488555908 + }, + { + "auxiliary_loss_clip": 0.01501187, + "auxiliary_loss_mlp": 0.01059771, + "balance_loss_clip": 1.31419516, + "balance_loss_mlp": 1.03653669, + "epoch": 0.13233127912220052, + "flos": 20709609707160.0, + "grad_norm": 1.709517783997554, + "language_loss": 0.85341978, + "learning_rate": 3.891154759144557e-06, + "loss": 0.87902939, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.23266602, + "step": 2201, + "time_per_iteration": 2.7614328861236572 + }, + { + "auxiliary_loss_clip": 0.0151634, + "auxiliary_loss_mlp": 0.01060794, + "balance_loss_clip": 1.32395375, + "balance_loss_mlp": 1.03779888, + "epoch": 0.1323914023748685, + "flos": 25809672773160.0, + "grad_norm": 1.859128987346674, + "language_loss": 0.87294424, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89871562, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.22998047, + "step": 2202, + "time_per_iteration": 2.8216216564178467 + }, + { + "auxiliary_loss_clip": 0.01515057, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.32535744, + "balance_loss_mlp": 1.0264895, + "epoch": 0.13245152562753645, + "flos": 21256512031800.0, + "grad_norm": 1.9869433829735905, + "language_loss": 0.72309142, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74873018, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22302246, + "step": 2203, + "time_per_iteration": 2.8040103912353516 + }, + { + "auxiliary_loss_clip": 0.01506899, + "auxiliary_loss_mlp": 0.01055724, + "balance_loss_clip": 1.3166275, + "balance_loss_mlp": 1.03362226, + "epoch": 0.13251164888020442, + "flos": 26616106083240.0, + "grad_norm": 2.4119934653896826, + "language_loss": 0.74358982, + "learning_rate": 3.890774247090444e-06, + "loss": 0.769216, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.22106934, + "step": 2204, + "time_per_iteration": 2.7865726947784424 + }, + { + "auxiliary_loss_clip": 0.01510847, + "auxiliary_loss_mlp": 0.01055335, + "balance_loss_clip": 1.32034111, + "balance_loss_mlp": 1.03245842, + "epoch": 0.13257177213287238, + "flos": 29832743050920.0, + "grad_norm": 1.8065969936046964, + "language_loss": 0.78727174, + "learning_rate": 3.89064726633596e-06, + "loss": 0.81293356, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.22875977, + "step": 2205, + "time_per_iteration": 2.8907155990600586 + }, + { + "auxiliary_loss_clip": 0.01508729, + "auxiliary_loss_mlp": 0.01052514, + "balance_loss_clip": 1.32151902, + "balance_loss_mlp": 1.03015089, + "epoch": 0.13263189538554035, + "flos": 21293489266560.0, + "grad_norm": 1.81350959544635, + "language_loss": 0.79564577, + "learning_rate": 3.890520213887941e-06, + "loss": 0.82125819, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.22363281, + "step": 2206, + "time_per_iteration": 2.774545669555664 + }, + { + "auxiliary_loss_clip": 0.0150768, + "auxiliary_loss_mlp": 0.01057424, + "balance_loss_clip": 1.31632447, + "balance_loss_mlp": 1.03497648, + "epoch": 0.13269201863820831, + "flos": 16878613282920.0, + "grad_norm": 2.020258126357644, + "language_loss": 0.74915898, + "learning_rate": 3.890393089751208e-06, + "loss": 0.77481008, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.2244873, + "step": 2207, + "time_per_iteration": 2.7786777019500732 + }, + { + "auxiliary_loss_clip": 0.01505337, + "auxiliary_loss_mlp": 0.01059464, + "balance_loss_clip": 1.31971562, + "balance_loss_mlp": 1.03185499, + "epoch": 0.1327521418908763, + "flos": 23774195683080.0, + "grad_norm": 1.6446506533798455, + "language_loss": 0.841034, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86668205, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.27636719, + "step": 2208, + "time_per_iteration": 2.8287699222564697 + }, + { + "auxiliary_loss_clip": 0.01497237, + "auxiliary_loss_mlp": 0.01048189, + "balance_loss_clip": 1.31630468, + "balance_loss_mlp": 1.02700567, + "epoch": 0.13281226514354427, + "flos": 26511265632960.0, + "grad_norm": 1.5746972615365045, + "language_loss": 0.85438025, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87983453, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21179199, + "step": 2209, + "time_per_iteration": 2.8536646366119385 + }, + { + "auxiliary_loss_clip": 0.01507691, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.31791496, + "balance_loss_mlp": 1.02569973, + "epoch": 0.13287238839621224, + "flos": 24504156805320.0, + "grad_norm": 2.487357190271578, + "language_loss": 0.82570094, + "learning_rate": 3.890011287256929e-06, + "loss": 0.85126543, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.23059082, + "step": 2210, + "time_per_iteration": 2.8181917667388916 + }, + { + "auxiliary_loss_clip": 0.0135849, + "auxiliary_loss_mlp": 0.01008719, + "balance_loss_clip": 1.24912047, + "balance_loss_mlp": 1.00309265, + "epoch": 0.1329325116488802, + "flos": 67709677456080.0, + "grad_norm": 0.7784944981646027, + "language_loss": 0.5805369, + "learning_rate": 3.889883876413563e-06, + "loss": 0.60420901, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.05615234, + "step": 2211, + "time_per_iteration": 3.420210838317871 + }, + { + "auxiliary_loss_clip": 0.01353459, + "auxiliary_loss_mlp": 0.01011152, + "balance_loss_clip": 1.24266267, + "balance_loss_mlp": 1.00509632, + "epoch": 0.13299263490154817, + "flos": 72278716066200.0, + "grad_norm": 0.8378595165933392, + "language_loss": 0.55339515, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57704127, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.06054688, + "step": 2212, + "time_per_iteration": 3.3055169582366943 + }, + { + "auxiliary_loss_clip": 0.01518721, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_clip": 1.32517052, + "balance_loss_mlp": 1.03015649, + "epoch": 0.13305275815421613, + "flos": 17935806172680.0, + "grad_norm": 2.2773790135674847, + "language_loss": 0.7519778, + "learning_rate": 3.889628839737908e-06, + "loss": 0.7776953, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.22851562, + "step": 2213, + "time_per_iteration": 2.790268659591675 + }, + { + "auxiliary_loss_clip": 0.01498189, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_clip": 1.31363487, + "balance_loss_mlp": 1.02929425, + "epoch": 0.13311288140688413, + "flos": 22345321852800.0, + "grad_norm": 1.7239726809328417, + "language_loss": 0.79419285, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81967545, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.20776367, + "step": 2214, + "time_per_iteration": 2.731470823287964 + }, + { + "auxiliary_loss_clip": 0.01502099, + "auxiliary_loss_mlp": 0.01052777, + "balance_loss_clip": 1.31225944, + "balance_loss_mlp": 1.02982974, + "epoch": 0.1331730046595521, + "flos": 31875245387280.0, + "grad_norm": 1.6024351555108707, + "language_loss": 0.6985507, + "learning_rate": 3.889373516442597e-06, + "loss": 0.7240994, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.22924805, + "step": 2215, + "time_per_iteration": 2.8247506618499756 + }, + { + "auxiliary_loss_clip": 0.01503396, + "auxiliary_loss_mlp": 0.01053577, + "balance_loss_clip": 1.3147763, + "balance_loss_mlp": 1.03078413, + "epoch": 0.13323312791222006, + "flos": 22571936439480.0, + "grad_norm": 1.739851456582798, + "language_loss": 0.81280613, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83837587, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.22802734, + "step": 2216, + "time_per_iteration": 2.8606784343719482 + }, + { + "auxiliary_loss_clip": 0.01506094, + "auxiliary_loss_mlp": 0.01064795, + "balance_loss_clip": 1.31758964, + "balance_loss_mlp": 1.04177523, + "epoch": 0.13329325116488802, + "flos": 15089469461280.0, + "grad_norm": 2.1666296581057845, + "language_loss": 0.87106276, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89677167, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.23022461, + "step": 2217, + "time_per_iteration": 2.7581863403320312 + }, + { + "auxiliary_loss_clip": 0.0149789, + "auxiliary_loss_mlp": 0.01061194, + "balance_loss_clip": 1.3112421, + "balance_loss_mlp": 1.03676796, + "epoch": 0.133353374417556, + "flos": 27459394803000.0, + "grad_norm": 2.1600370684175614, + "language_loss": 0.73932803, + "learning_rate": 3.888989994172501e-06, + "loss": 0.76491886, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.2442627, + "step": 2218, + "time_per_iteration": 2.7724573612213135 + }, + { + "auxiliary_loss_clip": 0.01496853, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.30665326, + "balance_loss_mlp": 1.03744388, + "epoch": 0.13341349767022395, + "flos": 24099681291120.0, + "grad_norm": 1.8542472383276085, + "language_loss": 0.87963712, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.90520614, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22644043, + "step": 2219, + "time_per_iteration": 2.8276314735412598 + }, + { + "auxiliary_loss_clip": 0.01504125, + "auxiliary_loss_mlp": 0.01062021, + "balance_loss_clip": 1.31743574, + "balance_loss_mlp": 1.04145777, + "epoch": 0.13347362092289192, + "flos": 24138607727160.0, + "grad_norm": 1.520392358526463, + "language_loss": 0.77801704, + "learning_rate": 3.888733954497574e-06, + "loss": 0.80367851, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.20556641, + "step": 2220, + "time_per_iteration": 2.8833415508270264 + }, + { + "auxiliary_loss_clip": 0.01492975, + "auxiliary_loss_mlp": 0.01059742, + "balance_loss_clip": 1.30527067, + "balance_loss_mlp": 1.0396198, + "epoch": 0.1335337441755599, + "flos": 18440411567400.0, + "grad_norm": 2.2662224652131124, + "language_loss": 0.79122841, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81675553, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.20117188, + "step": 2221, + "time_per_iteration": 2.741990327835083 + }, + { + "auxiliary_loss_clip": 0.01369049, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_clip": 1.25366294, + "balance_loss_mlp": 1.09686089, + "epoch": 0.13359386742822787, + "flos": 50624185249440.0, + "grad_norm": 1.0033402315609887, + "language_loss": 0.69124806, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71595657, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.04931641, + "step": 2222, + "time_per_iteration": 4.534109115600586 + }, + { + "auxiliary_loss_clip": 0.01497762, + "auxiliary_loss_mlp": 0.01062066, + "balance_loss_clip": 1.31121325, + "balance_loss_mlp": 1.04194379, + "epoch": 0.13365399068089584, + "flos": 22783038632640.0, + "grad_norm": 2.5939895661805052, + "language_loss": 0.67762947, + "learning_rate": 3.888349357839982e-06, + "loss": 0.70322776, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.20117188, + "step": 2223, + "time_per_iteration": 2.9227354526519775 + }, + { + "auxiliary_loss_clip": 0.01493531, + "auxiliary_loss_mlp": 0.01057289, + "balance_loss_clip": 1.30570769, + "balance_loss_mlp": 1.03606987, + "epoch": 0.1337141139335638, + "flos": 12535458309000.0, + "grad_norm": 1.9170697036341187, + "language_loss": 0.83044499, + "learning_rate": 3.88822101573484e-06, + "loss": 0.85595322, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.2121582, + "step": 2224, + "time_per_iteration": 2.776780605316162 + }, + { + "auxiliary_loss_clip": 0.01499207, + "auxiliary_loss_mlp": 0.01050925, + "balance_loss_clip": 1.30738735, + "balance_loss_mlp": 1.02896678, + "epoch": 0.13377423718623177, + "flos": 23044112735760.0, + "grad_norm": 1.9818334261049377, + "language_loss": 0.66349345, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68899482, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.21972656, + "step": 2225, + "time_per_iteration": 2.8107717037200928 + }, + { + "auxiliary_loss_clip": 0.01494152, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_clip": 1.30529666, + "balance_loss_mlp": 1.02735353, + "epoch": 0.13383436043889974, + "flos": 16220698037280.0, + "grad_norm": 2.49563824757182, + "language_loss": 0.90325093, + "learning_rate": 3.887964116724835e-06, + "loss": 0.92866731, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.20117188, + "step": 2226, + "time_per_iteration": 4.240520477294922 + }, + { + "auxiliary_loss_clip": 0.01498901, + "auxiliary_loss_mlp": 0.01051103, + "balance_loss_clip": 1.30920172, + "balance_loss_mlp": 1.03108788, + "epoch": 0.1338944836915677, + "flos": 24285014156880.0, + "grad_norm": 1.8785836978563684, + "language_loss": 0.74083674, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76633668, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.20019531, + "step": 2227, + "time_per_iteration": 4.316313028335571 + }, + { + "auxiliary_loss_clip": 0.01498867, + "auxiliary_loss_mlp": 0.0104632, + "balance_loss_clip": 1.3112781, + "balance_loss_mlp": 1.02585137, + "epoch": 0.1339546069442357, + "flos": 17602686192960.0, + "grad_norm": 1.9449166647906582, + "language_loss": 0.85608411, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.88153601, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.20471191, + "step": 2228, + "time_per_iteration": 4.3192458152771 + }, + { + "auxiliary_loss_clip": 0.01487417, + "auxiliary_loss_mlp": 0.01050961, + "balance_loss_clip": 1.30270863, + "balance_loss_mlp": 1.02956331, + "epoch": 0.13401473019690366, + "flos": 18994826438640.0, + "grad_norm": 1.8225055229807088, + "language_loss": 0.82005191, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.84543568, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.21398926, + "step": 2229, + "time_per_iteration": 2.9088165760040283 + }, + { + "auxiliary_loss_clip": 0.01498548, + "auxiliary_loss_mlp": 0.01057355, + "balance_loss_clip": 1.31142747, + "balance_loss_mlp": 1.03668392, + "epoch": 0.13407485344957162, + "flos": 26949591538200.0, + "grad_norm": 1.7582995795903338, + "language_loss": 0.74803281, + "learning_rate": 3.887449459642378e-06, + "loss": 0.77359182, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.20678711, + "step": 2230, + "time_per_iteration": 2.799372673034668 + }, + { + "auxiliary_loss_clip": 0.01492307, + "auxiliary_loss_mlp": 0.01057682, + "balance_loss_clip": 1.30406642, + "balance_loss_mlp": 1.03710628, + "epoch": 0.1341349767022396, + "flos": 20344223062440.0, + "grad_norm": 1.8985550870725088, + "language_loss": 0.80378377, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82928371, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.20568848, + "step": 2231, + "time_per_iteration": 2.789391040802002 + }, + { + "auxiliary_loss_clip": 0.01497929, + "auxiliary_loss_mlp": 0.0105517, + "balance_loss_clip": 1.3062644, + "balance_loss_mlp": 1.03267574, + "epoch": 0.13419509995490755, + "flos": 29868095951280.0, + "grad_norm": 1.6741390168561625, + "language_loss": 0.73386401, + "learning_rate": 3.887191701647992e-06, + "loss": 0.75939494, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.22509766, + "step": 2232, + "time_per_iteration": 2.832335948944092 + }, + { + "auxiliary_loss_clip": 0.01501807, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.3118372, + "balance_loss_mlp": 1.02116549, + "epoch": 0.13425522320757552, + "flos": 26948657545920.0, + "grad_norm": 2.2811182219580415, + "language_loss": 0.66721296, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.69266677, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.22412109, + "step": 2233, + "time_per_iteration": 2.861078977584839 + }, + { + "auxiliary_loss_clip": 0.01497127, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_clip": 1.3076942, + "balance_loss_mlp": 1.02226448, + "epoch": 0.1343153464602435, + "flos": 15781356923040.0, + "grad_norm": 2.4903321132324843, + "language_loss": 0.82685596, + "learning_rate": 3.886933657403615e-06, + "loss": 0.85226643, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.21643066, + "step": 2234, + "time_per_iteration": 2.819187641143799 + }, + { + "auxiliary_loss_clip": 0.0149926, + "auxiliary_loss_mlp": 0.01053464, + "balance_loss_clip": 1.30997372, + "balance_loss_mlp": 1.03147006, + "epoch": 0.13437546971291148, + "flos": 24319920365280.0, + "grad_norm": 1.879535777360715, + "language_loss": 0.82243395, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84796119, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.22009277, + "step": 2235, + "time_per_iteration": 2.811251163482666 + }, + { + "auxiliary_loss_clip": 0.01501401, + "auxiliary_loss_mlp": 0.01051179, + "balance_loss_clip": 1.31272268, + "balance_loss_mlp": 1.02793288, + "epoch": 0.13443559296557944, + "flos": 26656250770080.0, + "grad_norm": 1.6648093499430707, + "language_loss": 0.86993027, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.89545608, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.23254395, + "step": 2236, + "time_per_iteration": 2.8267757892608643 + }, + { + "auxiliary_loss_clip": 0.0149938, + "auxiliary_loss_mlp": 0.01053315, + "balance_loss_clip": 1.30685234, + "balance_loss_mlp": 1.02972341, + "epoch": 0.1344957162182474, + "flos": 21800896638120.0, + "grad_norm": 1.905392324606161, + "language_loss": 0.78031039, + "learning_rate": 3.886546054403946e-06, + "loss": 0.80583733, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.23583984, + "step": 2237, + "time_per_iteration": 2.7759532928466797 + }, + { + "auxiliary_loss_clip": 0.01503109, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_clip": 1.31335878, + "balance_loss_mlp": 1.02552557, + "epoch": 0.13455583947091537, + "flos": 19870422431760.0, + "grad_norm": 1.9249076749544565, + "language_loss": 0.79919267, + "learning_rate": 3.886416710321491e-06, + "loss": 0.82470381, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.22485352, + "step": 2238, + "time_per_iteration": 2.8179144859313965 + }, + { + "auxiliary_loss_clip": 0.01491404, + "auxiliary_loss_mlp": 0.01048742, + "balance_loss_clip": 1.30460775, + "balance_loss_mlp": 1.02585363, + "epoch": 0.13461596272358334, + "flos": 30852999314280.0, + "grad_norm": 2.0483364967450943, + "language_loss": 0.68547022, + "learning_rate": 3.886287294705924e-06, + "loss": 0.7108717, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.2286377, + "step": 2239, + "time_per_iteration": 2.8693130016326904 + }, + { + "auxiliary_loss_clip": 0.01497255, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.30617762, + "balance_loss_mlp": 1.03170776, + "epoch": 0.1346760859762513, + "flos": 12498196815720.0, + "grad_norm": 2.091742759123597, + "language_loss": 0.82350969, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.8490386, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.23925781, + "step": 2240, + "time_per_iteration": 2.8079960346221924 + }, + { + "auxiliary_loss_clip": 0.01497103, + "auxiliary_loss_mlp": 0.01050277, + "balance_loss_clip": 1.30506921, + "balance_loss_mlp": 1.02773452, + "epoch": 0.1347362092289193, + "flos": 21841163150040.0, + "grad_norm": 1.7059403232513337, + "language_loss": 0.78083909, + "learning_rate": 3.886028248895093e-06, + "loss": 0.80631292, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.22558594, + "step": 2241, + "time_per_iteration": 2.8909013271331787 + }, + { + "auxiliary_loss_clip": 0.01494165, + "auxiliary_loss_mlp": 0.01040284, + "balance_loss_clip": 1.31039202, + "balance_loss_mlp": 1.02051878, + "epoch": 0.13479633248158726, + "flos": 23514177397320.0, + "grad_norm": 1.7519231909544695, + "language_loss": 0.84024405, + "learning_rate": 3.88589861870965e-06, + "loss": 0.86558855, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.19750977, + "step": 2242, + "time_per_iteration": 2.7975263595581055 + }, + { + "auxiliary_loss_clip": 0.01501413, + "auxiliary_loss_mlp": 0.01057122, + "balance_loss_clip": 1.31058419, + "balance_loss_mlp": 1.03242171, + "epoch": 0.13485645573425523, + "flos": 29349440064000.0, + "grad_norm": 2.1588203990585813, + "language_loss": 0.65542698, + "learning_rate": 3.885768917010744e-06, + "loss": 0.68101233, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.24694824, + "step": 2243, + "time_per_iteration": 2.9068710803985596 + }, + { + "auxiliary_loss_clip": 0.01477061, + "auxiliary_loss_mlp": 0.01042298, + "balance_loss_clip": 1.29395378, + "balance_loss_mlp": 1.02002954, + "epoch": 0.1349165789869232, + "flos": 28042868278800.0, + "grad_norm": 1.325671706428426, + "language_loss": 0.73157001, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.75676358, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.22265625, + "step": 2244, + "time_per_iteration": 2.8038508892059326 + }, + { + "auxiliary_loss_clip": 0.01487379, + "auxiliary_loss_mlp": 0.01050647, + "balance_loss_clip": 1.3006556, + "balance_loss_mlp": 1.02794993, + "epoch": 0.13497670223959116, + "flos": 22858495611480.0, + "grad_norm": 1.4404578017107867, + "language_loss": 0.86599863, + "learning_rate": 3.88550929909221e-06, + "loss": 0.89137888, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22692871, + "step": 2245, + "time_per_iteration": 2.8243014812469482 + }, + { + "auxiliary_loss_clip": 0.01492654, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_clip": 1.3090384, + "balance_loss_mlp": 1.02589488, + "epoch": 0.13503682549225912, + "flos": 16508434851720.0, + "grad_norm": 1.5908375601214766, + "language_loss": 0.79330689, + "learning_rate": 3.88537938288243e-06, + "loss": 0.818717, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.2244873, + "step": 2246, + "time_per_iteration": 2.8220202922821045 + }, + { + "auxiliary_loss_clip": 0.01351553, + "auxiliary_loss_mlp": 0.0100331, + "balance_loss_clip": 1.24115384, + "balance_loss_mlp": 0.99825555, + "epoch": 0.1350969487449271, + "flos": 70771664496960.0, + "grad_norm": 0.7466729042851982, + "language_loss": 0.60554582, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62909442, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.05053711, + "step": 2247, + "time_per_iteration": 3.3767178058624268 + }, + { + "auxiliary_loss_clip": 0.01516622, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.32090032, + "balance_loss_mlp": 1.03412342, + "epoch": 0.13515707199759508, + "flos": 23081211795600.0, + "grad_norm": 1.7239393870701691, + "language_loss": 0.81651026, + "learning_rate": 3.885119335986473e-06, + "loss": 0.84228384, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.26635742, + "step": 2248, + "time_per_iteration": 2.8136849403381348 + }, + { + "auxiliary_loss_clip": 0.01493261, + "auxiliary_loss_mlp": 0.01047999, + "balance_loss_clip": 1.30864298, + "balance_loss_mlp": 1.02688742, + "epoch": 0.13521719525026304, + "flos": 23191737416280.0, + "grad_norm": 1.6932816618403166, + "language_loss": 0.77404648, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79945904, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.21105957, + "step": 2249, + "time_per_iteration": 2.796398162841797 + }, + { + "auxiliary_loss_clip": 0.01500453, + "auxiliary_loss_mlp": 0.0105886, + "balance_loss_clip": 1.31399333, + "balance_loss_mlp": 1.03612721, + "epoch": 0.135277318502931, + "flos": 24796401147720.0, + "grad_norm": 1.4369861411671836, + "language_loss": 0.84750736, + "learning_rate": 3.884859003154862e-06, + "loss": 0.87310052, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.22741699, + "step": 2250, + "time_per_iteration": 2.851388931274414 + }, + { + "auxiliary_loss_clip": 0.01498977, + "auxiliary_loss_mlp": 0.01052742, + "balance_loss_clip": 1.30912077, + "balance_loss_mlp": 1.02829206, + "epoch": 0.13533744175559898, + "flos": 21913615110240.0, + "grad_norm": 1.8408526194453925, + "language_loss": 0.82099944, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84651661, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.24462891, + "step": 2251, + "time_per_iteration": 2.770765781402588 + }, + { + "auxiliary_loss_clip": 0.01500645, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_clip": 1.31137538, + "balance_loss_mlp": 1.02598166, + "epoch": 0.13539756500826694, + "flos": 21216164303160.0, + "grad_norm": 1.7021274709538867, + "language_loss": 0.85890937, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88441265, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.23706055, + "step": 2252, + "time_per_iteration": 2.8793444633483887 + }, + { + "auxiliary_loss_clip": 0.01338693, + "auxiliary_loss_mlp": 0.01016387, + "balance_loss_clip": 1.23167205, + "balance_loss_mlp": 1.01056921, + "epoch": 0.1354576882609349, + "flos": 63256727811960.0, + "grad_norm": 0.7535998628799901, + "language_loss": 0.61805725, + "learning_rate": 3.884467967864485e-06, + "loss": 0.641608, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.05810547, + "step": 2253, + "time_per_iteration": 3.3517374992370605 + }, + { + "auxiliary_loss_clip": 0.01504994, + "auxiliary_loss_mlp": 0.01059555, + "balance_loss_clip": 1.31803107, + "balance_loss_mlp": 1.03740597, + "epoch": 0.1355178115136029, + "flos": 25488451042920.0, + "grad_norm": 1.8791383982335177, + "language_loss": 0.89871556, + "learning_rate": 3.884337479842671e-06, + "loss": 0.92436099, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.22155762, + "step": 2254, + "time_per_iteration": 2.829416036605835 + }, + { + "auxiliary_loss_clip": 0.01506641, + "auxiliary_loss_mlp": 0.0105817, + "balance_loss_clip": 1.31328678, + "balance_loss_mlp": 1.03237307, + "epoch": 0.13557793476627086, + "flos": 21622061109960.0, + "grad_norm": 1.9108389183603771, + "language_loss": 0.84968895, + "learning_rate": 3.884206920366591e-06, + "loss": 0.87533706, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.25830078, + "step": 2255, + "time_per_iteration": 2.831432580947876 + }, + { + "auxiliary_loss_clip": 0.01501184, + "auxiliary_loss_mlp": 0.01060088, + "balance_loss_clip": 1.31417131, + "balance_loss_mlp": 1.03641248, + "epoch": 0.13563805801893883, + "flos": 24932614879080.0, + "grad_norm": 2.7071672055825875, + "language_loss": 0.74986184, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77547461, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.23681641, + "step": 2256, + "time_per_iteration": 2.7752695083618164 + }, + { + "auxiliary_loss_clip": 0.01511719, + "auxiliary_loss_mlp": 0.0105355, + "balance_loss_clip": 1.31797028, + "balance_loss_mlp": 1.03002977, + "epoch": 0.1356981812716068, + "flos": 14753506896360.0, + "grad_norm": 2.0136991005687856, + "language_loss": 0.83197862, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85763133, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.23535156, + "step": 2257, + "time_per_iteration": 2.7353975772857666 + }, + { + "auxiliary_loss_clip": 0.01516344, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.32101154, + "balance_loss_mlp": 1.03637111, + "epoch": 0.13575830452427476, + "flos": 11112229040760.0, + "grad_norm": 2.328102045505959, + "language_loss": 0.82754642, + "learning_rate": 3.883814813262277e-06, + "loss": 0.85333043, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.25683594, + "step": 2258, + "time_per_iteration": 2.7138826847076416 + }, + { + "auxiliary_loss_clip": 0.01503121, + "auxiliary_loss_mlp": 0.01052566, + "balance_loss_clip": 1.31143272, + "balance_loss_mlp": 1.02792573, + "epoch": 0.13581842777694272, + "flos": 17963849568240.0, + "grad_norm": 2.2882244777955045, + "language_loss": 0.83641171, + "learning_rate": 3.883683968018669e-06, + "loss": 0.86196864, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.24658203, + "step": 2259, + "time_per_iteration": 2.777271270751953 + }, + { + "auxiliary_loss_clip": 0.01506931, + "auxiliary_loss_mlp": 0.01067704, + "balance_loss_clip": 1.31890273, + "balance_loss_mlp": 1.04530489, + "epoch": 0.1358785510296107, + "flos": 22862353405680.0, + "grad_norm": 1.7485095062273905, + "language_loss": 0.73808217, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.76382852, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22412109, + "step": 2260, + "time_per_iteration": 4.2257771492004395 + }, + { + "auxiliary_loss_clip": 0.0150202, + "auxiliary_loss_mlp": 0.01059375, + "balance_loss_clip": 1.31469512, + "balance_loss_mlp": 1.03754807, + "epoch": 0.13593867428227868, + "flos": 25744977009720.0, + "grad_norm": 2.255412022439784, + "language_loss": 0.75376821, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77938211, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.21838379, + "step": 2261, + "time_per_iteration": 2.7761268615722656 + }, + { + "auxiliary_loss_clip": 0.01509919, + "auxiliary_loss_mlp": 0.01054142, + "balance_loss_clip": 1.31989551, + "balance_loss_mlp": 1.03175497, + "epoch": 0.13599879753494665, + "flos": 31255566235560.0, + "grad_norm": 2.047863431747481, + "language_loss": 0.63507742, + "learning_rate": 3.883291003730794e-06, + "loss": 0.66071796, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.22363281, + "step": 2262, + "time_per_iteration": 2.8561601638793945 + }, + { + "auxiliary_loss_clip": 0.0150961, + "auxiliary_loss_mlp": 0.01058137, + "balance_loss_clip": 1.31790078, + "balance_loss_mlp": 1.03402114, + "epoch": 0.1360589207876146, + "flos": 23920358462640.0, + "grad_norm": 2.089797759139457, + "language_loss": 0.83018905, + "learning_rate": 3.883159872799043e-06, + "loss": 0.85586655, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.24145508, + "step": 2263, + "time_per_iteration": 2.864143133163452 + }, + { + "auxiliary_loss_clip": 0.01521026, + "auxiliary_loss_mlp": 0.01062226, + "balance_loss_clip": 1.32814097, + "balance_loss_mlp": 1.03746676, + "epoch": 0.13611904404028258, + "flos": 19978958242800.0, + "grad_norm": 1.7004849132026678, + "language_loss": 0.88512188, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.91095436, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.24755859, + "step": 2264, + "time_per_iteration": 2.8690919876098633 + }, + { + "auxiliary_loss_clip": 0.01520093, + "auxiliary_loss_mlp": 0.01054958, + "balance_loss_clip": 1.32653248, + "balance_loss_mlp": 1.03017449, + "epoch": 0.13617916729295054, + "flos": 15344817785640.0, + "grad_norm": 3.0858671633757417, + "language_loss": 0.71483731, + "learning_rate": 3.882897396711683e-06, + "loss": 0.74058783, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.24804688, + "step": 2265, + "time_per_iteration": 4.248081922531128 + }, + { + "auxiliary_loss_clip": 0.0150751, + "auxiliary_loss_mlp": 0.01050041, + "balance_loss_clip": 1.31967592, + "balance_loss_mlp": 1.02584195, + "epoch": 0.1362392905456185, + "flos": 27456877084680.0, + "grad_norm": 4.754294083467565, + "language_loss": 0.67563486, + "learning_rate": 3.882766051566027e-06, + "loss": 0.70121038, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.24194336, + "step": 2266, + "time_per_iteration": 2.821934938430786 + }, + { + "auxiliary_loss_clip": 0.01512352, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_clip": 1.32408786, + "balance_loss_mlp": 1.03928721, + "epoch": 0.1362994137982865, + "flos": 25014406762080.0, + "grad_norm": 1.5632090385854047, + "language_loss": 0.76756382, + "learning_rate": 3.882634635025694e-06, + "loss": 0.7933203, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.2401123, + "step": 2267, + "time_per_iteration": 4.233333110809326 + }, + { + "auxiliary_loss_clip": 0.01518349, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.32818389, + "balance_loss_mlp": 1.02273798, + "epoch": 0.13635953705095447, + "flos": 20307895561440.0, + "grad_norm": 1.9904444553446659, + "language_loss": 0.82262099, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84827, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.23840332, + "step": 2268, + "time_per_iteration": 2.7726571559906006 + }, + { + "auxiliary_loss_clip": 0.01520968, + "auxiliary_loss_mlp": 0.01052033, + "balance_loss_clip": 1.33295894, + "balance_loss_mlp": 1.02722502, + "epoch": 0.13641966030362243, + "flos": 31364751780360.0, + "grad_norm": 1.7834102352915078, + "language_loss": 0.76214927, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78787923, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.2479248, + "step": 2269, + "time_per_iteration": 2.8620009422302246 + }, + { + "auxiliary_loss_clip": 0.01525418, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.33460283, + "balance_loss_mlp": 1.02795529, + "epoch": 0.1364797835562904, + "flos": 20482020519840.0, + "grad_norm": 1.9026318611767976, + "language_loss": 0.81136608, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83713514, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.2355957, + "step": 2270, + "time_per_iteration": 2.7976014614105225 + }, + { + "auxiliary_loss_clip": 0.01522834, + "auxiliary_loss_mlp": 0.01063341, + "balance_loss_clip": 1.3291142, + "balance_loss_mlp": 1.03778243, + "epoch": 0.13653990680895836, + "flos": 13082198200200.0, + "grad_norm": 2.2725985023316766, + "language_loss": 0.76070654, + "learning_rate": 3.882108255017295e-06, + "loss": 0.78656828, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.2557373, + "step": 2271, + "time_per_iteration": 2.750617742538452 + }, + { + "auxiliary_loss_clip": 0.01522045, + "auxiliary_loss_mlp": 0.01060046, + "balance_loss_clip": 1.33170795, + "balance_loss_mlp": 1.0346663, + "epoch": 0.13660003006162633, + "flos": 16951187068200.0, + "grad_norm": 1.9343544493376803, + "language_loss": 0.8053329, + "learning_rate": 3.881976481578379e-06, + "loss": 0.83115387, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.25378418, + "step": 2272, + "time_per_iteration": 2.7744033336639404 + }, + { + "auxiliary_loss_clip": 0.0134555, + "auxiliary_loss_mlp": 0.01006803, + "balance_loss_clip": 1.23869658, + "balance_loss_mlp": 1.00270176, + "epoch": 0.1366601533142943, + "flos": 68698641655080.0, + "grad_norm": 0.6881515138604717, + "language_loss": 0.60692489, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.63044846, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.04101562, + "step": 2273, + "time_per_iteration": 3.3636882305145264 + }, + { + "auxiliary_loss_clip": 0.01519106, + "auxiliary_loss_mlp": 0.01051419, + "balance_loss_clip": 1.33092713, + "balance_loss_mlp": 1.02787542, + "epoch": 0.13672027656696228, + "flos": 19248875295480.0, + "grad_norm": 1.6653183815901087, + "language_loss": 0.77479935, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80050457, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.2355957, + "step": 2274, + "time_per_iteration": 2.9077725410461426 + }, + { + "auxiliary_loss_clip": 0.01524662, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.33401692, + "balance_loss_mlp": 1.0270009, + "epoch": 0.13678039981963025, + "flos": 24540281264520.0, + "grad_norm": 1.8615577715530367, + "language_loss": 0.789217, + "learning_rate": 3.881580733093211e-06, + "loss": 0.81498808, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.2545166, + "step": 2275, + "time_per_iteration": 2.815088987350464 + }, + { + "auxiliary_loss_clip": 0.01523435, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_clip": 1.3322767, + "balance_loss_mlp": 1.02654982, + "epoch": 0.13684052307229821, + "flos": 15673592670840.0, + "grad_norm": 2.1833569054704083, + "language_loss": 0.80819583, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83392382, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.22814941, + "step": 2276, + "time_per_iteration": 2.747086524963379 + }, + { + "auxiliary_loss_clip": 0.01539177, + "auxiliary_loss_mlp": 0.01060086, + "balance_loss_clip": 1.34346569, + "balance_loss_mlp": 1.03359759, + "epoch": 0.13690064632496618, + "flos": 28370384304840.0, + "grad_norm": 2.217406502025154, + "language_loss": 0.70127481, + "learning_rate": 3.881316544012779e-06, + "loss": 0.72726744, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.26501465, + "step": 2277, + "time_per_iteration": 2.8581860065460205 + }, + { + "auxiliary_loss_clip": 0.01533364, + "auxiliary_loss_mlp": 0.01065241, + "balance_loss_clip": 1.33950233, + "balance_loss_mlp": 1.03990853, + "epoch": 0.13696076957763414, + "flos": 23410108505880.0, + "grad_norm": 2.2417161358033644, + "language_loss": 0.81117725, + "learning_rate": 3.88118434246049e-06, + "loss": 0.83716321, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.25378418, + "step": 2278, + "time_per_iteration": 2.8072543144226074 + }, + { + "auxiliary_loss_clip": 0.01522381, + "auxiliary_loss_mlp": 0.01048986, + "balance_loss_clip": 1.33224607, + "balance_loss_mlp": 1.02523971, + "epoch": 0.1370208928303021, + "flos": 37203750416160.0, + "grad_norm": 2.4865552801686444, + "language_loss": 0.75510532, + "learning_rate": 3.881052069573502e-06, + "loss": 0.78081894, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.23742676, + "step": 2279, + "time_per_iteration": 2.882222890853882 + }, + { + "auxiliary_loss_clip": 0.01529112, + "auxiliary_loss_mlp": 0.01057268, + "balance_loss_clip": 1.33563161, + "balance_loss_mlp": 1.03367662, + "epoch": 0.13708101608297008, + "flos": 26981573944680.0, + "grad_norm": 1.6230162032803626, + "language_loss": 0.77052033, + "learning_rate": 3.880919725356831e-06, + "loss": 0.79638416, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.23608398, + "step": 2280, + "time_per_iteration": 2.851712226867676 + }, + { + "auxiliary_loss_clip": 0.01519865, + "auxiliary_loss_mlp": 0.01058193, + "balance_loss_clip": 1.3323791, + "balance_loss_mlp": 1.03469658, + "epoch": 0.13714113933563807, + "flos": 32562625321080.0, + "grad_norm": 1.6459504183910412, + "language_loss": 0.80227488, + "learning_rate": 3.880787309815496e-06, + "loss": 0.8280555, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.23510742, + "step": 2281, + "time_per_iteration": 2.901052236557007 + }, + { + "auxiliary_loss_clip": 0.01540609, + "auxiliary_loss_mlp": 0.01063393, + "balance_loss_clip": 1.34530234, + "balance_loss_mlp": 1.03999186, + "epoch": 0.13720126258830603, + "flos": 16104812113080.0, + "grad_norm": 1.7765132011072384, + "language_loss": 0.84137678, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86741674, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.23388672, + "step": 2282, + "time_per_iteration": 2.8737096786499023 + }, + { + "auxiliary_loss_clip": 0.01528378, + "auxiliary_loss_mlp": 0.01054307, + "balance_loss_clip": 1.34107792, + "balance_loss_mlp": 1.03193092, + "epoch": 0.137261385840974, + "flos": 18958295895840.0, + "grad_norm": 1.4803963762986971, + "language_loss": 0.73429716, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76012409, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.22375488, + "step": 2283, + "time_per_iteration": 2.8053431510925293 + }, + { + "auxiliary_loss_clip": 0.01531782, + "auxiliary_loss_mlp": 0.01057275, + "balance_loss_clip": 1.34444618, + "balance_loss_mlp": 1.03505468, + "epoch": 0.13732150909364196, + "flos": 23300679310920.0, + "grad_norm": 1.8829865648402062, + "language_loss": 0.85012138, + "learning_rate": 3.880389635293729e-06, + "loss": 0.87601197, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.22229004, + "step": 2284, + "time_per_iteration": 2.8064568042755127 + }, + { + "auxiliary_loss_clip": 0.01542045, + "auxiliary_loss_mlp": 0.01065538, + "balance_loss_clip": 1.34555137, + "balance_loss_mlp": 1.04039669, + "epoch": 0.13738163234630993, + "flos": 29357277477480.0, + "grad_norm": 1.780479011577055, + "language_loss": 0.75806773, + "learning_rate": 3.880256934503974e-06, + "loss": 0.78414357, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.25170898, + "step": 2285, + "time_per_iteration": 2.9645590782165527 + }, + { + "auxiliary_loss_clip": 0.01526405, + "auxiliary_loss_mlp": 0.01056876, + "balance_loss_clip": 1.33781815, + "balance_loss_mlp": 1.03368938, + "epoch": 0.1374417555989779, + "flos": 26656981720560.0, + "grad_norm": 1.7066860018400531, + "language_loss": 0.75427276, + "learning_rate": 3.880124162414689e-06, + "loss": 0.78010553, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.23181152, + "step": 2286, + "time_per_iteration": 2.8192086219787598 + }, + { + "auxiliary_loss_clip": 0.01539513, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_clip": 1.34759545, + "balance_loss_mlp": 1.02720308, + "epoch": 0.1375018788516459, + "flos": 28409391957600.0, + "grad_norm": 2.1657346930983317, + "language_loss": 0.86550713, + "learning_rate": 3.879991319030908e-06, + "loss": 0.8914308, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.25671387, + "step": 2287, + "time_per_iteration": 2.844397783279419 + }, + { + "auxiliary_loss_clip": 0.01526448, + "auxiliary_loss_mlp": 0.01055538, + "balance_loss_clip": 1.33587599, + "balance_loss_mlp": 1.03350806, + "epoch": 0.13756200210431385, + "flos": 37420740821520.0, + "grad_norm": 2.0168650027986597, + "language_loss": 0.69089609, + "learning_rate": 3.879858404357666e-06, + "loss": 0.71671593, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.22045898, + "step": 2288, + "time_per_iteration": 2.932070016860962 + }, + { + "auxiliary_loss_clip": 0.01523878, + "auxiliary_loss_mlp": 0.01058201, + "balance_loss_clip": 1.33292925, + "balance_loss_mlp": 1.03365588, + "epoch": 0.13762212535698182, + "flos": 22716190626120.0, + "grad_norm": 2.3836423731543928, + "language_loss": 0.87474072, + "learning_rate": 3.879725418400005e-06, + "loss": 0.90056157, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.24560547, + "step": 2289, + "time_per_iteration": 2.8247735500335693 + }, + { + "auxiliary_loss_clip": 0.01509406, + "auxiliary_loss_mlp": 0.01057731, + "balance_loss_clip": 1.32435203, + "balance_loss_mlp": 1.03566515, + "epoch": 0.13768224860964978, + "flos": 23957538739200.0, + "grad_norm": 2.0293214590467485, + "language_loss": 0.75109005, + "learning_rate": 3.879592361162969e-06, + "loss": 0.77676141, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22058105, + "step": 2290, + "time_per_iteration": 2.8435490131378174 + }, + { + "auxiliary_loss_clip": 0.01333945, + "auxiliary_loss_mlp": 0.01063721, + "balance_loss_clip": 1.22733951, + "balance_loss_mlp": 1.05926228, + "epoch": 0.13774237186231775, + "flos": 63607715470080.0, + "grad_norm": 0.7133829549198335, + "language_loss": 0.51622826, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.54020488, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.04467773, + "step": 2291, + "time_per_iteration": 3.4015321731567383 + }, + { + "auxiliary_loss_clip": 0.01532004, + "auxiliary_loss_mlp": 0.01059677, + "balance_loss_clip": 1.34135056, + "balance_loss_mlp": 1.03500021, + "epoch": 0.1378024951149857, + "flos": 24284770506720.0, + "grad_norm": 1.8356698451185227, + "language_loss": 0.71532983, + "learning_rate": 3.879326032870952e-06, + "loss": 0.74124664, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.24682617, + "step": 2292, + "time_per_iteration": 2.798027992248535 + }, + { + "auxiliary_loss_clip": 0.01526732, + "auxiliary_loss_mlp": 0.01053883, + "balance_loss_clip": 1.33758521, + "balance_loss_mlp": 1.03149557, + "epoch": 0.13786261836765368, + "flos": 14024804633280.0, + "grad_norm": 2.7034796343150758, + "language_loss": 0.8128643, + "learning_rate": 3.879192761826071e-06, + "loss": 0.83867037, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.22399902, + "step": 2293, + "time_per_iteration": 2.769782066345215 + }, + { + "auxiliary_loss_clip": 0.01529951, + "auxiliary_loss_mlp": 0.01050755, + "balance_loss_clip": 1.33668876, + "balance_loss_mlp": 1.02674675, + "epoch": 0.13792274162032167, + "flos": 28884857531040.0, + "grad_norm": 2.630247894161145, + "language_loss": 0.78696048, + "learning_rate": 3.879059419522011e-06, + "loss": 0.81276757, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.24035645, + "step": 2294, + "time_per_iteration": 2.844391107559204 + }, + { + "auxiliary_loss_clip": 0.01516894, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.32964683, + "balance_loss_mlp": 1.03343081, + "epoch": 0.13798286487298964, + "flos": 21145986411120.0, + "grad_norm": 2.489242535502722, + "language_loss": 0.8083024, + "learning_rate": 3.878926005963831e-06, + "loss": 0.83402008, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.21447754, + "step": 2295, + "time_per_iteration": 2.8599257469177246 + }, + { + "auxiliary_loss_clip": 0.01521277, + "auxiliary_loss_mlp": 0.01055299, + "balance_loss_clip": 1.3317275, + "balance_loss_mlp": 1.03062296, + "epoch": 0.1380429881256576, + "flos": 22492012541040.0, + "grad_norm": 2.610157633600602, + "language_loss": 0.78430921, + "learning_rate": 3.878792521156588e-06, + "loss": 0.81007499, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.24682617, + "step": 2296, + "time_per_iteration": 2.848520517349243 + }, + { + "auxiliary_loss_clip": 0.0152277, + "auxiliary_loss_mlp": 0.01069622, + "balance_loss_clip": 1.33345962, + "balance_loss_mlp": 1.04697192, + "epoch": 0.13810311137832557, + "flos": 21398167283400.0, + "grad_norm": 1.8213881153115408, + "language_loss": 0.78605247, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.81197643, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.22668457, + "step": 2297, + "time_per_iteration": 2.783273220062256 + }, + { + "auxiliary_loss_clip": 0.01515764, + "auxiliary_loss_mlp": 0.01059267, + "balance_loss_clip": 1.33002996, + "balance_loss_mlp": 1.03453124, + "epoch": 0.13816323463099353, + "flos": 25994802597120.0, + "grad_norm": 2.528857630559525, + "language_loss": 0.69776559, + "learning_rate": 3.878525337815164e-06, + "loss": 0.72351593, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.24755859, + "step": 2298, + "time_per_iteration": 2.799180030822754 + }, + { + "auxiliary_loss_clip": 0.0152327, + "auxiliary_loss_mlp": 0.01050457, + "balance_loss_clip": 1.33188987, + "balance_loss_mlp": 1.02799785, + "epoch": 0.1382233578836615, + "flos": 19248956512200.0, + "grad_norm": 1.68332307978913, + "language_loss": 0.86998522, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89572251, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.22460938, + "step": 2299, + "time_per_iteration": 4.191086530685425 + }, + { + "auxiliary_loss_clip": 0.01524511, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_clip": 1.33350778, + "balance_loss_mlp": 1.03038454, + "epoch": 0.1382834811363295, + "flos": 25671834707400.0, + "grad_norm": 1.9884410813020388, + "language_loss": 0.7588588, + "learning_rate": 3.878257869538267e-06, + "loss": 0.78464997, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.24230957, + "step": 2300, + "time_per_iteration": 2.783738136291504 + }, + { + "auxiliary_loss_clip": 0.01517038, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.32959771, + "balance_loss_mlp": 1.03049302, + "epoch": 0.13834360438899745, + "flos": 19787940206640.0, + "grad_norm": 2.897250044532031, + "language_loss": 0.83529615, + "learning_rate": 3.878124028561692e-06, + "loss": 0.86099243, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.2208252, + "step": 2301, + "time_per_iteration": 2.7445473670959473 + }, + { + "auxiliary_loss_clip": 0.01502829, + "auxiliary_loss_mlp": 0.01052383, + "balance_loss_clip": 1.31623411, + "balance_loss_mlp": 1.03018618, + "epoch": 0.13840372764166542, + "flos": 26657590845960.0, + "grad_norm": 1.8424259769672413, + "language_loss": 0.86242181, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8879739, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.2220459, + "step": 2302, + "time_per_iteration": 2.8249337673187256 + }, + { + "auxiliary_loss_clip": 0.01326536, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.21877205, + "balance_loss_mlp": 1.02530158, + "epoch": 0.13846385089433338, + "flos": 70527645905040.0, + "grad_norm": 0.7610800039814757, + "language_loss": 0.65676147, + "learning_rate": 3.877856132957667e-06, + "loss": 0.68032992, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.05004883, + "step": 2303, + "time_per_iteration": 3.4074742794036865 + }, + { + "auxiliary_loss_clip": 0.01514561, + "auxiliary_loss_mlp": 0.01049761, + "balance_loss_clip": 1.32963252, + "balance_loss_mlp": 1.02730179, + "epoch": 0.13852397414700135, + "flos": 17353347905880.0, + "grad_norm": 2.0808430781299325, + "language_loss": 0.7890501, + "learning_rate": 3.877722078340374e-06, + "loss": 0.81469333, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.2244873, + "step": 2304, + "time_per_iteration": 5.985597610473633 + }, + { + "auxiliary_loss_clip": 0.01520507, + "auxiliary_loss_mlp": 0.01047836, + "balance_loss_clip": 1.33069754, + "balance_loss_mlp": 1.02522242, + "epoch": 0.13858409739966931, + "flos": 21548837590920.0, + "grad_norm": 1.6241236101521956, + "language_loss": 0.7777077, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8033911, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.22619629, + "step": 2305, + "time_per_iteration": 4.261961221694946 + }, + { + "auxiliary_loss_clip": 0.01501622, + "auxiliary_loss_mlp": 0.01052576, + "balance_loss_clip": 1.31781483, + "balance_loss_mlp": 1.02996218, + "epoch": 0.13864422065233728, + "flos": 21584921441760.0, + "grad_norm": 1.7525594799794066, + "language_loss": 0.88075525, + "learning_rate": 3.877453755500647e-06, + "loss": 0.90629721, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22619629, + "step": 2306, + "time_per_iteration": 2.965153932571411 + }, + { + "auxiliary_loss_clip": 0.01314675, + "auxiliary_loss_mlp": 0.01017305, + "balance_loss_clip": 1.20722318, + "balance_loss_mlp": 1.01284623, + "epoch": 0.13870434390500527, + "flos": 53381907873360.0, + "grad_norm": 0.8675603536604525, + "language_loss": 0.59059459, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61391437, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.04467773, + "step": 2307, + "time_per_iteration": 3.2762060165405273 + }, + { + "auxiliary_loss_clip": 0.01521513, + "auxiliary_loss_mlp": 0.01056639, + "balance_loss_clip": 1.33041072, + "balance_loss_mlp": 1.03208148, + "epoch": 0.13876446715767324, + "flos": 22571164880640.0, + "grad_norm": 1.6430043743587448, + "language_loss": 0.79816473, + "learning_rate": 3.877185147887984e-06, + "loss": 0.82394624, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.24536133, + "step": 2308, + "time_per_iteration": 2.8112547397613525 + }, + { + "auxiliary_loss_clip": 0.01512685, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.32769728, + "balance_loss_mlp": 1.02445173, + "epoch": 0.1388245904103412, + "flos": 20710259440920.0, + "grad_norm": 2.177690329781893, + "language_loss": 0.77850223, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80410123, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22766113, + "step": 2309, + "time_per_iteration": 2.7767748832702637 + }, + { + "auxiliary_loss_clip": 0.01525298, + "auxiliary_loss_mlp": 0.01062515, + "balance_loss_clip": 1.33348477, + "balance_loss_mlp": 1.03935242, + "epoch": 0.13888471366300917, + "flos": 20559142441440.0, + "grad_norm": 1.9321890424314905, + "language_loss": 0.68787384, + "learning_rate": 3.876916255543129e-06, + "loss": 0.71375197, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.23156738, + "step": 2310, + "time_per_iteration": 2.7206761837005615 + }, + { + "auxiliary_loss_clip": 0.01513914, + "auxiliary_loss_mlp": 0.01059996, + "balance_loss_clip": 1.32630193, + "balance_loss_mlp": 1.03621399, + "epoch": 0.13894483691567713, + "flos": 13841136710280.0, + "grad_norm": 1.7636453996973962, + "language_loss": 0.83831429, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86405343, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.23779297, + "step": 2311, + "time_per_iteration": 2.7321693897247314 + }, + { + "auxiliary_loss_clip": 0.01522473, + "auxiliary_loss_mlp": 0.01061625, + "balance_loss_clip": 1.32980454, + "balance_loss_mlp": 1.03864193, + "epoch": 0.1390049601683451, + "flos": 28036167899400.0, + "grad_norm": 1.9809807027260973, + "language_loss": 0.81677198, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84261292, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.22973633, + "step": 2312, + "time_per_iteration": 2.781595230102539 + }, + { + "auxiliary_loss_clip": 0.01519119, + "auxiliary_loss_mlp": 0.01059758, + "balance_loss_clip": 1.32810569, + "balance_loss_mlp": 1.03731084, + "epoch": 0.13906508342101306, + "flos": 26762025212640.0, + "grad_norm": 2.0401177849571326, + "language_loss": 0.86804205, + "learning_rate": 3.876512383242215e-06, + "loss": 0.89383084, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.22424316, + "step": 2313, + "time_per_iteration": 2.800654411315918 + }, + { + "auxiliary_loss_clip": 0.01514244, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.32754171, + "balance_loss_mlp": 1.03390765, + "epoch": 0.13912520667368106, + "flos": 24540484306320.0, + "grad_norm": 2.519173507895881, + "language_loss": 0.8053152, + "learning_rate": 3.876377616820024e-06, + "loss": 0.831025, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22839355, + "step": 2314, + "time_per_iteration": 2.756016492843628 + }, + { + "auxiliary_loss_clip": 0.01512661, + "auxiliary_loss_mlp": 0.01057217, + "balance_loss_clip": 1.32453382, + "balance_loss_mlp": 1.0358547, + "epoch": 0.13918532992634902, + "flos": 19386957011400.0, + "grad_norm": 2.3905143737708343, + "language_loss": 0.85626197, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88196075, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.21362305, + "step": 2315, + "time_per_iteration": 2.7123334407806396 + }, + { + "auxiliary_loss_clip": 0.01510457, + "auxiliary_loss_mlp": 0.01059573, + "balance_loss_clip": 1.32514119, + "balance_loss_mlp": 1.03730488, + "epoch": 0.139245453179017, + "flos": 21328639125120.0, + "grad_norm": 2.1490051001272166, + "language_loss": 0.77631187, + "learning_rate": 3.876107870523477e-06, + "loss": 0.80201221, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22277832, + "step": 2316, + "time_per_iteration": 2.755894422531128 + }, + { + "auxiliary_loss_clip": 0.01511455, + "auxiliary_loss_mlp": 0.01071055, + "balance_loss_clip": 1.32553625, + "balance_loss_mlp": 1.04805923, + "epoch": 0.13930557643168495, + "flos": 19505401262280.0, + "grad_norm": 1.851622746956335, + "language_loss": 0.77656662, + "learning_rate": 3.875972890659349e-06, + "loss": 0.80239171, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.22998047, + "step": 2317, + "time_per_iteration": 2.8437631130218506 + }, + { + "auxiliary_loss_clip": 0.01518334, + "auxiliary_loss_mlp": 0.01052218, + "balance_loss_clip": 1.32874501, + "balance_loss_mlp": 1.03049862, + "epoch": 0.13936569968435292, + "flos": 25416323949600.0, + "grad_norm": 1.9336893448472139, + "language_loss": 0.80635154, + "learning_rate": 3.875837839658139e-06, + "loss": 0.83205706, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.21716309, + "step": 2318, + "time_per_iteration": 2.824751853942871 + }, + { + "auxiliary_loss_clip": 0.01324343, + "auxiliary_loss_mlp": 0.01050398, + "balance_loss_clip": 1.21945429, + "balance_loss_mlp": 1.04629695, + "epoch": 0.13942582293702088, + "flos": 70788354532920.0, + "grad_norm": 0.8721280668282135, + "language_loss": 0.59045041, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61419785, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04101562, + "step": 2319, + "time_per_iteration": 3.2789371013641357 + }, + { + "auxiliary_loss_clip": 0.01514311, + "auxiliary_loss_mlp": 0.01053943, + "balance_loss_clip": 1.32403481, + "balance_loss_mlp": 1.03075683, + "epoch": 0.13948594618968888, + "flos": 35597787217200.0, + "grad_norm": 3.378249725498066, + "language_loss": 0.66506696, + "learning_rate": 3.875567524264967e-06, + "loss": 0.69074953, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.23193359, + "step": 2320, + "time_per_iteration": 2.878075122833252 + }, + { + "auxiliary_loss_clip": 0.01500322, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.31805813, + "balance_loss_mlp": 1.02718735, + "epoch": 0.13954606944235684, + "flos": 21110105602080.0, + "grad_norm": 1.4787143308645976, + "language_loss": 0.70717275, + "learning_rate": 3.875432259883256e-06, + "loss": 0.73266381, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.21594238, + "step": 2321, + "time_per_iteration": 2.743492364883423 + }, + { + "auxiliary_loss_clip": 0.01505742, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.31920612, + "balance_loss_mlp": 1.02521479, + "epoch": 0.1396061926950248, + "flos": 25049678445720.0, + "grad_norm": 1.75080919688218, + "language_loss": 0.86197984, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88752055, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.23132324, + "step": 2322, + "time_per_iteration": 2.796706199645996 + }, + { + "auxiliary_loss_clip": 0.01480581, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.30202997, + "balance_loss_mlp": 1.02931619, + "epoch": 0.13966631594769277, + "flos": 37641182937480.0, + "grad_norm": 1.5678363888439941, + "language_loss": 0.67385244, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69916236, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.2109375, + "step": 2323, + "time_per_iteration": 2.9054627418518066 + }, + { + "auxiliary_loss_clip": 0.01515104, + "auxiliary_loss_mlp": 0.01058756, + "balance_loss_clip": 1.32290769, + "balance_loss_mlp": 1.03576088, + "epoch": 0.13972643920036074, + "flos": 16695392051880.0, + "grad_norm": 2.006225006477941, + "language_loss": 0.89113033, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91686893, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.2298584, + "step": 2324, + "time_per_iteration": 2.754171133041382 + }, + { + "auxiliary_loss_clip": 0.01500681, + "auxiliary_loss_mlp": 0.01058341, + "balance_loss_clip": 1.31427538, + "balance_loss_mlp": 1.03471375, + "epoch": 0.1397865624530287, + "flos": 23336275861440.0, + "grad_norm": 2.3554689645727445, + "language_loss": 0.72645587, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.75204611, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.2364502, + "step": 2325, + "time_per_iteration": 2.9039535522460938 + }, + { + "auxiliary_loss_clip": 0.01510001, + "auxiliary_loss_mlp": 0.01054489, + "balance_loss_clip": 1.32244599, + "balance_loss_mlp": 1.03310263, + "epoch": 0.13984668570569667, + "flos": 22782754374120.0, + "grad_norm": 3.2823332946595816, + "language_loss": 0.82203114, + "learning_rate": 3.874754871328688e-06, + "loss": 0.84767604, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.21374512, + "step": 2326, + "time_per_iteration": 2.7637548446655273 + }, + { + "auxiliary_loss_clip": 0.01494856, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.31338978, + "balance_loss_mlp": 1.03000104, + "epoch": 0.13990680895836466, + "flos": 19469398628160.0, + "grad_norm": 2.0445300419798884, + "language_loss": 0.89235461, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91781813, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21496582, + "step": 2327, + "time_per_iteration": 2.758876085281372 + }, + { + "auxiliary_loss_clip": 0.01500895, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.3183651, + "balance_loss_mlp": 1.03323793, + "epoch": 0.13996693221103262, + "flos": 20308179819960.0, + "grad_norm": 2.2324357164030117, + "language_loss": 0.85637498, + "learning_rate": 3.874483418234632e-06, + "loss": 0.88193518, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.21899414, + "step": 2328, + "time_per_iteration": 2.7315263748168945 + }, + { + "auxiliary_loss_clip": 0.015043, + "auxiliary_loss_mlp": 0.01054801, + "balance_loss_clip": 1.31902099, + "balance_loss_mlp": 1.03099442, + "epoch": 0.1400270554637006, + "flos": 26623740454920.0, + "grad_norm": 1.6339761603266012, + "language_loss": 0.74363446, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76922548, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.23815918, + "step": 2329, + "time_per_iteration": 2.832738161087036 + }, + { + "auxiliary_loss_clip": 0.01509604, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.32266903, + "balance_loss_mlp": 1.02608848, + "epoch": 0.14008717871636855, + "flos": 19396296934200.0, + "grad_norm": 1.8932797861389448, + "language_loss": 0.78934705, + "learning_rate": 3.874211680818183e-06, + "loss": 0.81493658, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.23254395, + "step": 2330, + "time_per_iteration": 2.8129069805145264 + }, + { + "auxiliary_loss_clip": 0.01505131, + "auxiliary_loss_mlp": 0.01051592, + "balance_loss_clip": 1.31986701, + "balance_loss_mlp": 1.03082585, + "epoch": 0.14014730196903652, + "flos": 15308449676280.0, + "grad_norm": 2.1360011299174655, + "language_loss": 0.72323853, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74880576, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.2076416, + "step": 2331, + "time_per_iteration": 2.755314826965332 + }, + { + "auxiliary_loss_clip": 0.01489924, + "auxiliary_loss_mlp": 0.01061015, + "balance_loss_clip": 1.31298065, + "balance_loss_mlp": 1.0401175, + "epoch": 0.14020742522170448, + "flos": 14943875198760.0, + "grad_norm": 1.5531796032164782, + "language_loss": 0.73092949, + "learning_rate": 3.873939659120557e-06, + "loss": 0.75643885, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.20874023, + "step": 2332, + "time_per_iteration": 2.7382919788360596 + }, + { + "auxiliary_loss_clip": 0.01307581, + "auxiliary_loss_mlp": 0.01020158, + "balance_loss_clip": 1.20256066, + "balance_loss_mlp": 1.01479363, + "epoch": 0.14026754847437245, + "flos": 48836584545480.0, + "grad_norm": 0.8200270938766162, + "language_loss": 0.56133127, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58460867, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.05371094, + "step": 2333, + "time_per_iteration": 3.093254566192627 + }, + { + "auxiliary_loss_clip": 0.01504365, + "auxiliary_loss_mlp": 0.01054052, + "balance_loss_clip": 1.32197094, + "balance_loss_mlp": 1.03255868, + "epoch": 0.14032767172704044, + "flos": 25778502533880.0, + "grad_norm": 1.6676757588094047, + "language_loss": 0.82958323, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85516745, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.21496582, + "step": 2334, + "time_per_iteration": 2.809330463409424 + }, + { + "auxiliary_loss_clip": 0.01496381, + "auxiliary_loss_mlp": 0.01051832, + "balance_loss_clip": 1.31221914, + "balance_loss_mlp": 1.0308392, + "epoch": 0.1403877949797084, + "flos": 21221443389960.0, + "grad_norm": 1.7429583834036857, + "language_loss": 0.81413746, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83961952, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.20983887, + "step": 2335, + "time_per_iteration": 2.7724828720092773 + }, + { + "auxiliary_loss_clip": 0.01510416, + "auxiliary_loss_mlp": 0.01055338, + "balance_loss_clip": 1.32005572, + "balance_loss_mlp": 1.02973211, + "epoch": 0.14044791823237637, + "flos": 22753208469240.0, + "grad_norm": 1.561540234510696, + "language_loss": 0.82240903, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84806657, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.25622559, + "step": 2336, + "time_per_iteration": 2.83337140083313 + }, + { + "auxiliary_loss_clip": 0.01498421, + "auxiliary_loss_mlp": 0.01056618, + "balance_loss_clip": 1.31382775, + "balance_loss_mlp": 1.0348624, + "epoch": 0.14050804148504434, + "flos": 22969386707400.0, + "grad_norm": 1.561494170771974, + "language_loss": 0.80540937, + "learning_rate": 3.873258361417225e-06, + "loss": 0.8309598, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.21777344, + "step": 2337, + "time_per_iteration": 2.747378349304199 + }, + { + "auxiliary_loss_clip": 0.01500898, + "auxiliary_loss_mlp": 0.01059004, + "balance_loss_clip": 1.31632566, + "balance_loss_mlp": 1.03772557, + "epoch": 0.1405681647377123, + "flos": 22205453369040.0, + "grad_norm": 1.7089157184511663, + "language_loss": 0.7962271, + "learning_rate": 3.873121888753442e-06, + "loss": 0.8218261, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.21276855, + "step": 2338, + "time_per_iteration": 4.192859411239624 + }, + { + "auxiliary_loss_clip": 0.01510436, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_clip": 1.32283163, + "balance_loss_mlp": 1.0296917, + "epoch": 0.14062828799038027, + "flos": 23738111832240.0, + "grad_norm": 2.2358898636652107, + "language_loss": 0.80654186, + "learning_rate": 3.87298534506069e-06, + "loss": 0.83218598, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.24304199, + "step": 2339, + "time_per_iteration": 2.783653736114502 + }, + { + "auxiliary_loss_clip": 0.01506613, + "auxiliary_loss_mlp": 0.01061228, + "balance_loss_clip": 1.32239604, + "balance_loss_mlp": 1.03972316, + "epoch": 0.14068841124304826, + "flos": 39209803426440.0, + "grad_norm": 1.7568146095414074, + "language_loss": 0.66585934, + "learning_rate": 3.872848730344146e-06, + "loss": 0.69153774, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.21496582, + "step": 2340, + "time_per_iteration": 2.9040582180023193 + }, + { + "auxiliary_loss_clip": 0.01495283, + "auxiliary_loss_mlp": 0.01060069, + "balance_loss_clip": 1.31448293, + "balance_loss_mlp": 1.03840816, + "epoch": 0.14074853449571623, + "flos": 20196963857160.0, + "grad_norm": 2.8719079352026906, + "language_loss": 0.79171532, + "learning_rate": 3.87271204460899e-06, + "loss": 0.81726885, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21679688, + "step": 2341, + "time_per_iteration": 2.83176589012146 + }, + { + "auxiliary_loss_clip": 0.01495149, + "auxiliary_loss_mlp": 0.01059859, + "balance_loss_clip": 1.31183743, + "balance_loss_mlp": 1.03972435, + "epoch": 0.1408086577483842, + "flos": 18410500187280.0, + "grad_norm": 2.1484364076758657, + "language_loss": 0.81009197, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.8356421, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.20141602, + "step": 2342, + "time_per_iteration": 2.779686689376831 + }, + { + "auxiliary_loss_clip": 0.01501278, + "auxiliary_loss_mlp": 0.01049644, + "balance_loss_clip": 1.32171106, + "balance_loss_mlp": 1.0292592, + "epoch": 0.14086878100105216, + "flos": 25270039344960.0, + "grad_norm": 1.8599655219641897, + "language_loss": 0.77850699, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80401623, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.20385742, + "step": 2343, + "time_per_iteration": 4.231879472732544 + }, + { + "auxiliary_loss_clip": 0.01308022, + "auxiliary_loss_mlp": 0.01009226, + "balance_loss_clip": 1.20278132, + "balance_loss_mlp": 1.00321794, + "epoch": 0.14092890425372012, + "flos": 65993351069880.0, + "grad_norm": 0.8366440084584849, + "language_loss": 0.61545801, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63863051, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.06005859, + "step": 2344, + "time_per_iteration": 4.701382637023926 + }, + { + "auxiliary_loss_clip": 0.01490061, + "auxiliary_loss_mlp": 0.01044899, + "balance_loss_clip": 1.30879378, + "balance_loss_mlp": 1.0244664, + "epoch": 0.1409890275063881, + "flos": 23700119388480.0, + "grad_norm": 1.4434553008517388, + "language_loss": 0.6521703, + "learning_rate": 3.872164591585956e-06, + "loss": 0.67751992, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.20422363, + "step": 2345, + "time_per_iteration": 4.269144535064697 + }, + { + "auxiliary_loss_clip": 0.01506507, + "auxiliary_loss_mlp": 0.01044737, + "balance_loss_clip": 1.31520343, + "balance_loss_mlp": 1.02208734, + "epoch": 0.14104915075905605, + "flos": 23628438987120.0, + "grad_norm": 1.9600421993809174, + "language_loss": 0.74155211, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.76706457, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.22644043, + "step": 2346, + "time_per_iteration": 2.8777620792388916 + }, + { + "auxiliary_loss_clip": 0.01500238, + "auxiliary_loss_mlp": 0.01044974, + "balance_loss_clip": 1.31419754, + "balance_loss_mlp": 1.02246702, + "epoch": 0.14110927401172405, + "flos": 20600099295480.0, + "grad_norm": 1.7005800464980412, + "language_loss": 0.77634513, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80179721, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.22509766, + "step": 2347, + "time_per_iteration": 2.804182767868042 + }, + { + "auxiliary_loss_clip": 0.01500205, + "auxiliary_loss_mlp": 0.01054138, + "balance_loss_clip": 1.31519032, + "balance_loss_mlp": 1.03314567, + "epoch": 0.141169397264392, + "flos": 28553889794400.0, + "grad_norm": 1.6511182851985278, + "language_loss": 0.77014583, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79568928, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.20983887, + "step": 2348, + "time_per_iteration": 2.819000482559204 + }, + { + "auxiliary_loss_clip": 0.01492754, + "auxiliary_loss_mlp": 0.01046175, + "balance_loss_clip": 1.30825031, + "balance_loss_mlp": 1.02528906, + "epoch": 0.14122952051705998, + "flos": 17096700114000.0, + "grad_norm": 1.6389085347295178, + "language_loss": 0.86859453, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89398384, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.2088623, + "step": 2349, + "time_per_iteration": 2.7027196884155273 + }, + { + "auxiliary_loss_clip": 0.01494582, + "auxiliary_loss_mlp": 0.01053353, + "balance_loss_clip": 1.31346679, + "balance_loss_mlp": 1.02988064, + "epoch": 0.14128964376972794, + "flos": 28952355271320.0, + "grad_norm": 1.6550046243648384, + "language_loss": 0.89101541, + "learning_rate": 3.871478678011177e-06, + "loss": 0.91649473, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.23449707, + "step": 2350, + "time_per_iteration": 2.8108654022216797 + }, + { + "auxiliary_loss_clip": 0.01507361, + "auxiliary_loss_mlp": 0.01047486, + "balance_loss_clip": 1.32095385, + "balance_loss_mlp": 1.02342951, + "epoch": 0.1413497670223959, + "flos": 18994501571760.0, + "grad_norm": 1.7675555561903744, + "language_loss": 0.81011271, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83566117, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.24047852, + "step": 2351, + "time_per_iteration": 2.828108072280884 + }, + { + "auxiliary_loss_clip": 0.01507373, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.32046926, + "balance_loss_mlp": 1.02544141, + "epoch": 0.14140989027506387, + "flos": 29868339601440.0, + "grad_norm": 2.4874019121813897, + "language_loss": 0.83935052, + "learning_rate": 3.871203815778219e-06, + "loss": 0.86491156, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.23266602, + "step": 2352, + "time_per_iteration": 2.7836129665374756 + }, + { + "auxiliary_loss_clip": 0.0130743, + "auxiliary_loss_mlp": 0.01009321, + "balance_loss_clip": 1.20256591, + "balance_loss_mlp": 1.00438583, + "epoch": 0.14147001352773186, + "flos": 62094653863560.0, + "grad_norm": 0.9141148828366202, + "language_loss": 0.61915004, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64231753, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.04931641, + "step": 2353, + "time_per_iteration": 3.2240688800811768 + }, + { + "auxiliary_loss_clip": 0.01498695, + "auxiliary_loss_mlp": 0.01050822, + "balance_loss_clip": 1.31423545, + "balance_loss_mlp": 1.02937651, + "epoch": 0.14153013678039983, + "flos": 22022557004880.0, + "grad_norm": 1.6735409645638608, + "language_loss": 0.87732184, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.90281695, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.21435547, + "step": 2354, + "time_per_iteration": 2.767249345779419 + }, + { + "auxiliary_loss_clip": 0.01500813, + "auxiliary_loss_mlp": 0.01056817, + "balance_loss_clip": 1.31572759, + "balance_loss_mlp": 1.03314209, + "epoch": 0.1415902600330678, + "flos": 19724990602680.0, + "grad_norm": 2.0750157345481384, + "language_loss": 0.74649107, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77206743, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.23681641, + "step": 2355, + "time_per_iteration": 2.7741565704345703 + }, + { + "auxiliary_loss_clip": 0.0130946, + "auxiliary_loss_mlp": 0.0101736, + "balance_loss_clip": 1.20769572, + "balance_loss_mlp": 1.01283002, + "epoch": 0.14165038328573576, + "flos": 65915295156000.0, + "grad_norm": 0.6804586181198701, + "language_loss": 0.5184803, + "learning_rate": 3.870653239879212e-06, + "loss": 0.54174852, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.04541016, + "step": 2356, + "time_per_iteration": 3.1790945529937744 + }, + { + "auxiliary_loss_clip": 0.01501137, + "auxiliary_loss_mlp": 0.01064362, + "balance_loss_clip": 1.31829882, + "balance_loss_mlp": 1.04081869, + "epoch": 0.14171050653840372, + "flos": 12133947205080.0, + "grad_norm": 2.355242921808562, + "language_loss": 0.70757276, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.73322773, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23535156, + "step": 2357, + "time_per_iteration": 2.738034248352051 + }, + { + "auxiliary_loss_clip": 0.01509235, + "auxiliary_loss_mlp": 0.01057101, + "balance_loss_clip": 1.32150316, + "balance_loss_mlp": 1.03353333, + "epoch": 0.1417706297910717, + "flos": 20417365364760.0, + "grad_norm": 1.84887617460227, + "language_loss": 0.83190501, + "learning_rate": 3.870377526296674e-06, + "loss": 0.85756832, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.23571777, + "step": 2358, + "time_per_iteration": 2.7648520469665527 + }, + { + "auxiliary_loss_clip": 0.01518442, + "auxiliary_loss_mlp": 0.01051933, + "balance_loss_clip": 1.32687783, + "balance_loss_mlp": 1.02812672, + "epoch": 0.14183075304373965, + "flos": 22385466539640.0, + "grad_norm": 1.8956196436566903, + "language_loss": 0.72500819, + "learning_rate": 3.870239563115436e-06, + "loss": 0.75071198, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.23803711, + "step": 2359, + "time_per_iteration": 2.770871162414551 + }, + { + "auxiliary_loss_clip": 0.01513339, + "auxiliary_loss_mlp": 0.0105399, + "balance_loss_clip": 1.3263917, + "balance_loss_mlp": 1.03182888, + "epoch": 0.14189087629640765, + "flos": 21586058475840.0, + "grad_norm": 2.0271768080363666, + "language_loss": 0.7603488, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78602207, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.22180176, + "step": 2360, + "time_per_iteration": 2.784945011138916 + }, + { + "auxiliary_loss_clip": 0.01506919, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.32556725, + "balance_loss_mlp": 1.02728057, + "epoch": 0.1419509995490756, + "flos": 20013052284000.0, + "grad_norm": 2.0752889455347012, + "language_loss": 0.82136631, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84693313, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.22485352, + "step": 2361, + "time_per_iteration": 2.725691318511963 + }, + { + "auxiliary_loss_clip": 0.01497368, + "auxiliary_loss_mlp": 0.01050808, + "balance_loss_clip": 1.31529045, + "balance_loss_mlp": 1.02917194, + "epoch": 0.14201112280174358, + "flos": 31947778564200.0, + "grad_norm": 2.0872039816639547, + "language_loss": 0.75106388, + "learning_rate": 3.86982524807463e-06, + "loss": 0.77654564, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.21630859, + "step": 2362, + "time_per_iteration": 2.9419119358062744 + }, + { + "auxiliary_loss_clip": 0.01500802, + "auxiliary_loss_mlp": 0.01061243, + "balance_loss_clip": 1.31847906, + "balance_loss_mlp": 1.03843784, + "epoch": 0.14207124605441154, + "flos": 41471692061400.0, + "grad_norm": 1.6592304155857887, + "language_loss": 0.73967856, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76529902, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22814941, + "step": 2363, + "time_per_iteration": 2.8930459022521973 + }, + { + "auxiliary_loss_clip": 0.01508173, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_clip": 1.32398963, + "balance_loss_mlp": 1.03617001, + "epoch": 0.1421313693070795, + "flos": 31911410454840.0, + "grad_norm": 1.696675296532266, + "language_loss": 0.73506117, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.76073408, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22912598, + "step": 2364, + "time_per_iteration": 2.850062847137451 + }, + { + "auxiliary_loss_clip": 0.01495543, + "auxiliary_loss_mlp": 0.01056334, + "balance_loss_clip": 1.31772268, + "balance_loss_mlp": 1.0366168, + "epoch": 0.14219149255974747, + "flos": 26876977144560.0, + "grad_norm": 1.945155665122343, + "language_loss": 0.90769744, + "learning_rate": 3.869410294898195e-06, + "loss": 0.93321621, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.19689941, + "step": 2365, + "time_per_iteration": 2.800794839859009 + }, + { + "auxiliary_loss_clip": 0.0150562, + "auxiliary_loss_mlp": 0.0105852, + "balance_loss_clip": 1.32070422, + "balance_loss_mlp": 1.03520322, + "epoch": 0.14225161581241544, + "flos": 27460247578560.0, + "grad_norm": 1.6259764220852246, + "language_loss": 0.65784055, + "learning_rate": 3.869271835389268e-06, + "loss": 0.68348193, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23327637, + "step": 2366, + "time_per_iteration": 2.820605993270874 + }, + { + "auxiliary_loss_clip": 0.01504012, + "auxiliary_loss_mlp": 0.01061894, + "balance_loss_clip": 1.32375002, + "balance_loss_mlp": 1.04049611, + "epoch": 0.14231173906508343, + "flos": 10565042457600.0, + "grad_norm": 1.9913130321141632, + "language_loss": 0.8075633, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.83322233, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.21374512, + "step": 2367, + "time_per_iteration": 2.8202030658721924 + }, + { + "auxiliary_loss_clip": 0.01517657, + "auxiliary_loss_mlp": 0.01062321, + "balance_loss_clip": 1.33253527, + "balance_loss_mlp": 1.03905153, + "epoch": 0.1423718623177514, + "flos": 28366079818680.0, + "grad_norm": 2.0090790030242616, + "language_loss": 0.83394527, + "learning_rate": 3.868994703727742e-06, + "loss": 0.85974503, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.23266602, + "step": 2368, + "time_per_iteration": 2.7841665744781494 + }, + { + "auxiliary_loss_clip": 0.01509396, + "auxiliary_loss_mlp": 0.01056997, + "balance_loss_clip": 1.325369, + "balance_loss_mlp": 1.03192747, + "epoch": 0.14243198557041936, + "flos": 19358954224200.0, + "grad_norm": 2.5497828673485974, + "language_loss": 0.87220144, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89786536, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.25073242, + "step": 2369, + "time_per_iteration": 2.750312328338623 + }, + { + "auxiliary_loss_clip": 0.01512638, + "auxiliary_loss_mlp": 0.01059062, + "balance_loss_clip": 1.32426822, + "balance_loss_mlp": 1.03752112, + "epoch": 0.14249210882308733, + "flos": 28813177129680.0, + "grad_norm": 1.457506090699493, + "language_loss": 0.75902116, + "learning_rate": 3.868717288576354e-06, + "loss": 0.78473818, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.21533203, + "step": 2370, + "time_per_iteration": 2.890702486038208 + }, + { + "auxiliary_loss_clip": 0.01502411, + "auxiliary_loss_mlp": 0.01059357, + "balance_loss_clip": 1.32023382, + "balance_loss_mlp": 1.03693402, + "epoch": 0.1425522320757553, + "flos": 21840026115960.0, + "grad_norm": 1.5969822761849537, + "language_loss": 0.83137918, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85699695, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.22399902, + "step": 2371, + "time_per_iteration": 2.750823497772217 + }, + { + "auxiliary_loss_clip": 0.01513818, + "auxiliary_loss_mlp": 0.01056792, + "balance_loss_clip": 1.32898211, + "balance_loss_mlp": 1.03335512, + "epoch": 0.14261235532842326, + "flos": 17315964587520.0, + "grad_norm": 1.9302948905956894, + "language_loss": 0.83064473, + "learning_rate": 3.868439589977181e-06, + "loss": 0.8563509, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.234375, + "step": 2372, + "time_per_iteration": 2.7118170261383057 + }, + { + "auxiliary_loss_clip": 0.01506838, + "auxiliary_loss_mlp": 0.01053666, + "balance_loss_clip": 1.32333302, + "balance_loss_mlp": 1.03073013, + "epoch": 0.14267247858109125, + "flos": 18811361557440.0, + "grad_norm": 2.224826514111941, + "language_loss": 0.85285693, + "learning_rate": 3.868300634397836e-06, + "loss": 0.87846196, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22949219, + "step": 2373, + "time_per_iteration": 2.735522747039795 + }, + { + "auxiliary_loss_clip": 0.014993, + "auxiliary_loss_mlp": 0.01058196, + "balance_loss_clip": 1.31894445, + "balance_loss_mlp": 1.03835952, + "epoch": 0.14273260183375922, + "flos": 11361932803080.0, + "grad_norm": 1.9776489994570987, + "language_loss": 0.85988224, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88545716, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.19848633, + "step": 2374, + "time_per_iteration": 2.8539435863494873 + }, + { + "auxiliary_loss_clip": 0.01511093, + "auxiliary_loss_mlp": 0.010602, + "balance_loss_clip": 1.32372236, + "balance_loss_mlp": 1.03792, + "epoch": 0.14279272508642718, + "flos": 27574021868040.0, + "grad_norm": 8.239288500431798, + "language_loss": 0.79644442, + "learning_rate": 3.868022510705977e-06, + "loss": 0.82215732, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.22265625, + "step": 2375, + "time_per_iteration": 2.794045925140381 + }, + { + "auxiliary_loss_clip": 0.0150711, + "auxiliary_loss_mlp": 0.01056534, + "balance_loss_clip": 1.3247385, + "balance_loss_mlp": 1.03446865, + "epoch": 0.14285284833909515, + "flos": 16256822496480.0, + "grad_norm": 2.483297384109677, + "language_loss": 0.76847821, + "learning_rate": 3.867883342604009e-06, + "loss": 0.79411465, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.2208252, + "step": 2376, + "time_per_iteration": 4.107966899871826 + }, + { + "auxiliary_loss_clip": 0.01504788, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.32301331, + "balance_loss_mlp": 1.03462148, + "epoch": 0.1429129715917631, + "flos": 19760384111400.0, + "grad_norm": 1.7129557226337917, + "language_loss": 0.93593872, + "learning_rate": 3.867744103671717e-06, + "loss": 0.96155369, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22094727, + "step": 2377, + "time_per_iteration": 2.8516182899475098 + }, + { + "auxiliary_loss_clip": 0.01504358, + "auxiliary_loss_mlp": 0.01063073, + "balance_loss_clip": 1.32149839, + "balance_loss_mlp": 1.03925478, + "epoch": 0.14297309484443108, + "flos": 21141397666440.0, + "grad_norm": 1.8499423213881059, + "language_loss": 0.9161129, + "learning_rate": 3.867604793914382e-06, + "loss": 0.94178724, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.23803711, + "step": 2378, + "time_per_iteration": 2.7540462017059326 + }, + { + "auxiliary_loss_clip": 0.01509538, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.32385552, + "balance_loss_mlp": 1.02977777, + "epoch": 0.14303321809709904, + "flos": 23591949052680.0, + "grad_norm": 3.8380996622262082, + "language_loss": 0.74505806, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.7706728, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22167969, + "step": 2379, + "time_per_iteration": 2.8130991458892822 + }, + { + "auxiliary_loss_clip": 0.01503572, + "auxiliary_loss_mlp": 0.01051177, + "balance_loss_clip": 1.32012928, + "balance_loss_mlp": 1.02954078, + "epoch": 0.14309334134976703, + "flos": 15892572885840.0, + "grad_norm": 1.8031291127861746, + "language_loss": 0.79271054, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81825805, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.21643066, + "step": 2380, + "time_per_iteration": 2.708380699157715 + }, + { + "auxiliary_loss_clip": 0.01513198, + "auxiliary_loss_mlp": 0.01056181, + "balance_loss_clip": 1.32906246, + "balance_loss_mlp": 1.0341506, + "epoch": 0.143153464602435, + "flos": 16329883582080.0, + "grad_norm": 2.0818350483452033, + "language_loss": 0.88352013, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90921384, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.22033691, + "step": 2381, + "time_per_iteration": 4.178308010101318 + }, + { + "auxiliary_loss_clip": 0.01506026, + "auxiliary_loss_mlp": 0.0105408, + "balance_loss_clip": 1.32360244, + "balance_loss_mlp": 1.03246784, + "epoch": 0.14321358785510296, + "flos": 17096497072200.0, + "grad_norm": 5.812829167904496, + "language_loss": 0.76385236, + "learning_rate": 3.867046846740299e-06, + "loss": 0.78945339, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.21630859, + "step": 2382, + "time_per_iteration": 4.2495505809783936 + }, + { + "auxiliary_loss_clip": 0.01511655, + "auxiliary_loss_mlp": 0.0105398, + "balance_loss_clip": 1.3280673, + "balance_loss_mlp": 1.03150892, + "epoch": 0.14327371110777093, + "flos": 26328531702240.0, + "grad_norm": 2.036115334510126, + "language_loss": 0.7714287, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79708505, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22473145, + "step": 2383, + "time_per_iteration": 4.344728231430054 + }, + { + "auxiliary_loss_clip": 0.01513741, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.32783341, + "balance_loss_mlp": 1.02516317, + "epoch": 0.1433338343604389, + "flos": 18080913134880.0, + "grad_norm": 2.30178100306118, + "language_loss": 0.88542861, + "learning_rate": 3.866767448340471e-06, + "loss": 0.91105497, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.23742676, + "step": 2384, + "time_per_iteration": 2.726973533630371 + }, + { + "auxiliary_loss_clip": 0.01515349, + "auxiliary_loss_mlp": 0.01058339, + "balance_loss_clip": 1.32665598, + "balance_loss_mlp": 1.0334717, + "epoch": 0.14339395761310686, + "flos": 15526942590960.0, + "grad_norm": 2.22803183559685, + "language_loss": 0.79774773, + "learning_rate": 3.866627642955895e-06, + "loss": 0.8234846, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.24865723, + "step": 2385, + "time_per_iteration": 2.742622137069702 + }, + { + "auxiliary_loss_clip": 0.01507113, + "auxiliary_loss_mlp": 0.01055203, + "balance_loss_clip": 1.31994796, + "balance_loss_mlp": 1.03323293, + "epoch": 0.14345408086577485, + "flos": 28554701961600.0, + "grad_norm": 1.6392332185601217, + "language_loss": 0.75817627, + "learning_rate": 3.866487766788612e-06, + "loss": 0.78379941, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.21960449, + "step": 2386, + "time_per_iteration": 2.844770669937134 + }, + { + "auxiliary_loss_clip": 0.01510753, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.32831037, + "balance_loss_mlp": 1.02484989, + "epoch": 0.14351420411844282, + "flos": 20234915692560.0, + "grad_norm": 2.042837161676436, + "language_loss": 0.79036105, + "learning_rate": 3.866347819843925e-06, + "loss": 0.81593728, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.22021484, + "step": 2387, + "time_per_iteration": 2.811986207962036 + }, + { + "auxiliary_loss_clip": 0.01511595, + "auxiliary_loss_mlp": 0.01049551, + "balance_loss_clip": 1.3278892, + "balance_loss_mlp": 1.02718699, + "epoch": 0.14357432737111078, + "flos": 19869813306360.0, + "grad_norm": 2.0677067857839426, + "language_loss": 0.82628506, + "learning_rate": 3.866207802127143e-06, + "loss": 0.85189652, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22363281, + "step": 2388, + "time_per_iteration": 2.8222157955169678 + }, + { + "auxiliary_loss_clip": 0.01517995, + "auxiliary_loss_mlp": 0.01049162, + "balance_loss_clip": 1.33425069, + "balance_loss_mlp": 1.02840734, + "epoch": 0.14363445062377875, + "flos": 28262701269360.0, + "grad_norm": 2.0449210320180007, + "language_loss": 0.82548189, + "learning_rate": 3.866067713643573e-06, + "loss": 0.85115337, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.2076416, + "step": 2389, + "time_per_iteration": 2.7820186614990234 + }, + { + "auxiliary_loss_clip": 0.01521959, + "auxiliary_loss_mlp": 0.01053803, + "balance_loss_clip": 1.33346343, + "balance_loss_mlp": 1.03124809, + "epoch": 0.1436945738764467, + "flos": 18191235713760.0, + "grad_norm": 1.948456842998102, + "language_loss": 0.83314687, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85890448, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.22546387, + "step": 2390, + "time_per_iteration": 2.7332849502563477 + }, + { + "auxiliary_loss_clip": 0.01517978, + "auxiliary_loss_mlp": 0.01050671, + "balance_loss_clip": 1.33413064, + "balance_loss_mlp": 1.02873611, + "epoch": 0.14375469712911468, + "flos": 27313638107040.0, + "grad_norm": 1.5459990635711103, + "language_loss": 0.75135058, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77703708, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.21948242, + "step": 2391, + "time_per_iteration": 2.8448991775512695 + }, + { + "auxiliary_loss_clip": 0.0133858, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.24148977, + "balance_loss_mlp": 1.0341599, + "epoch": 0.14381482038178264, + "flos": 56904961501080.0, + "grad_norm": 0.8646394567379307, + "language_loss": 0.61793154, + "learning_rate": 3.865647023645277e-06, + "loss": 0.64169544, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03637695, + "step": 2392, + "time_per_iteration": 3.1358585357666016 + }, + { + "auxiliary_loss_clip": 0.01528128, + "auxiliary_loss_mlp": 0.0105825, + "balance_loss_clip": 1.33932579, + "balance_loss_mlp": 1.03402686, + "epoch": 0.14387494363445064, + "flos": 14286162994920.0, + "grad_norm": 1.9811658439037625, + "language_loss": 0.76983458, + "learning_rate": 3.865506652147709e-06, + "loss": 0.7956984, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.24243164, + "step": 2393, + "time_per_iteration": 2.7143216133117676 + }, + { + "auxiliary_loss_clip": 0.01521591, + "auxiliary_loss_mlp": 0.01061808, + "balance_loss_clip": 1.33572388, + "balance_loss_mlp": 1.03986156, + "epoch": 0.1439350668871186, + "flos": 26767263691080.0, + "grad_norm": 1.8647419468154758, + "language_loss": 0.76723909, + "learning_rate": 3.865366209909941e-06, + "loss": 0.79307306, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.21936035, + "step": 2394, + "time_per_iteration": 2.795443296432495 + }, + { + "auxiliary_loss_clip": 0.01519146, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.33610034, + "balance_loss_mlp": 1.03594208, + "epoch": 0.14399519013978657, + "flos": 40707758723040.0, + "grad_norm": 1.514949049437347, + "language_loss": 0.8587386, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88450146, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21191406, + "step": 2395, + "time_per_iteration": 2.9239888191223145 + }, + { + "auxiliary_loss_clip": 0.01514619, + "auxiliary_loss_mlp": 0.01047811, + "balance_loss_clip": 1.33461452, + "balance_loss_mlp": 1.02654409, + "epoch": 0.14405531339245453, + "flos": 20562309893520.0, + "grad_norm": 1.502780991317999, + "language_loss": 0.8247925, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85041678, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.21264648, + "step": 2396, + "time_per_iteration": 2.811350107192993 + }, + { + "auxiliary_loss_clip": 0.0151181, + "auxiliary_loss_mlp": 0.01042415, + "balance_loss_clip": 1.33174562, + "balance_loss_mlp": 1.02117205, + "epoch": 0.1441154366451225, + "flos": 19577650180680.0, + "grad_norm": 2.3523928158476326, + "language_loss": 0.83938217, + "learning_rate": 3.864944458808712e-06, + "loss": 0.86492443, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21264648, + "step": 2397, + "time_per_iteration": 2.746884346008301 + }, + { + "auxiliary_loss_clip": 0.01521088, + "auxiliary_loss_mlp": 0.01054328, + "balance_loss_clip": 1.33698988, + "balance_loss_mlp": 1.03238177, + "epoch": 0.14417555989779046, + "flos": 18520416682560.0, + "grad_norm": 1.675107559783046, + "language_loss": 0.80581486, + "learning_rate": 3.86480373366343e-06, + "loss": 0.83156908, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.21948242, + "step": 2398, + "time_per_iteration": 2.7380833625793457 + }, + { + "auxiliary_loss_clip": 0.01514025, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.33398938, + "balance_loss_mlp": 1.03792524, + "epoch": 0.14423568315045843, + "flos": 26037343177200.0, + "grad_norm": 2.3320145487357644, + "language_loss": 0.65031874, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67605305, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21472168, + "step": 2399, + "time_per_iteration": 2.8317410945892334 + }, + { + "auxiliary_loss_clip": 0.01520794, + "auxiliary_loss_mlp": 0.01051748, + "balance_loss_clip": 1.34036434, + "balance_loss_mlp": 1.02965903, + "epoch": 0.14429580640312642, + "flos": 21293814133440.0, + "grad_norm": 1.549024313275646, + "language_loss": 0.82477754, + "learning_rate": 3.864522071237571e-06, + "loss": 0.85050297, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22094727, + "step": 2400, + "time_per_iteration": 2.771512985229492 + }, + { + "auxiliary_loss_clip": 0.01530399, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_clip": 1.34421706, + "balance_loss_mlp": 1.02324474, + "epoch": 0.14435592965579438, + "flos": 25633151921520.0, + "grad_norm": 1.5206085888338026, + "language_loss": 0.74911487, + "learning_rate": 3.864381133967676e-06, + "loss": 0.77487665, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.22558594, + "step": 2401, + "time_per_iteration": 2.8452935218811035 + }, + { + "auxiliary_loss_clip": 0.01512734, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_clip": 1.33210063, + "balance_loss_mlp": 1.02874422, + "epoch": 0.14441605290846235, + "flos": 22970077049520.0, + "grad_norm": 1.534985337570794, + "language_loss": 0.81300551, + "learning_rate": 3.86424012600026e-06, + "loss": 0.83862936, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.20910645, + "step": 2402, + "time_per_iteration": 2.823854684829712 + }, + { + "auxiliary_loss_clip": 0.0151754, + "auxiliary_loss_mlp": 0.0104938, + "balance_loss_clip": 1.33698916, + "balance_loss_mlp": 1.02771986, + "epoch": 0.14447617616113032, + "flos": 17352129655080.0, + "grad_norm": 2.559543212224845, + "language_loss": 0.84623253, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87190169, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.2166748, + "step": 2403, + "time_per_iteration": 2.768944263458252 + }, + { + "auxiliary_loss_clip": 0.01520896, + "auxiliary_loss_mlp": 0.01053984, + "balance_loss_clip": 1.33874822, + "balance_loss_mlp": 1.03060746, + "epoch": 0.14453629941379828, + "flos": 24065424816480.0, + "grad_norm": 1.8352051619274181, + "language_loss": 0.70217413, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72792292, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23376465, + "step": 2404, + "time_per_iteration": 2.7844080924987793 + }, + { + "auxiliary_loss_clip": 0.01509266, + "auxiliary_loss_mlp": 0.01053217, + "balance_loss_clip": 1.32870436, + "balance_loss_mlp": 1.0324508, + "epoch": 0.14459642266646625, + "flos": 14433747067080.0, + "grad_norm": 2.1012574441089322, + "language_loss": 0.73583543, + "learning_rate": 3.863816677966381e-06, + "loss": 0.76146024, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.20776367, + "step": 2405, + "time_per_iteration": 2.752448081970215 + }, + { + "auxiliary_loss_clip": 0.01518075, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.33773196, + "balance_loss_mlp": 1.02920842, + "epoch": 0.14465654591913424, + "flos": 9870393627360.0, + "grad_norm": 2.231571388660693, + "language_loss": 0.73577219, + "learning_rate": 3.863675387262386e-06, + "loss": 0.76145887, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.21386719, + "step": 2406, + "time_per_iteration": 2.7782788276672363 + }, + { + "auxiliary_loss_clip": 0.01516981, + "auxiliary_loss_mlp": 0.01053389, + "balance_loss_clip": 1.33490896, + "balance_loss_mlp": 1.03053617, + "epoch": 0.1447166691718022, + "flos": 24978363519600.0, + "grad_norm": 4.554585210621027, + "language_loss": 0.7640025, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.78970617, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22839355, + "step": 2407, + "time_per_iteration": 2.8220434188842773 + }, + { + "auxiliary_loss_clip": 0.01514024, + "auxiliary_loss_mlp": 0.01049721, + "balance_loss_clip": 1.33349824, + "balance_loss_mlp": 1.02916932, + "epoch": 0.14477679242447017, + "flos": 21913046593200.0, + "grad_norm": 1.4983862839893425, + "language_loss": 0.7951777, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.82081509, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.20544434, + "step": 2408, + "time_per_iteration": 2.8236184120178223 + }, + { + "auxiliary_loss_clip": 0.01512002, + "auxiliary_loss_mlp": 0.01062847, + "balance_loss_clip": 1.32987928, + "balance_loss_mlp": 1.04050779, + "epoch": 0.14483691567713813, + "flos": 20745652949640.0, + "grad_norm": 1.846103889982219, + "language_loss": 0.82972789, + "learning_rate": 3.863251091147299e-06, + "loss": 0.85547638, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.22351074, + "step": 2409, + "time_per_iteration": 2.765786647796631 + }, + { + "auxiliary_loss_clip": 0.01526453, + "auxiliary_loss_mlp": 0.01056089, + "balance_loss_clip": 1.34264767, + "balance_loss_mlp": 1.03435659, + "epoch": 0.1448970389298061, + "flos": 35414484769440.0, + "grad_norm": 1.6218244927837353, + "language_loss": 0.74466145, + "learning_rate": 3.863109517792446e-06, + "loss": 0.77048689, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.2175293, + "step": 2410, + "time_per_iteration": 2.87007999420166 + }, + { + "auxiliary_loss_clip": 0.0151605, + "auxiliary_loss_mlp": 0.01056298, + "balance_loss_clip": 1.33658051, + "balance_loss_mlp": 1.03518605, + "epoch": 0.14495716218247406, + "flos": 15418650430080.0, + "grad_norm": 1.690530123687741, + "language_loss": 0.81297374, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83869731, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.2109375, + "step": 2411, + "time_per_iteration": 2.7483723163604736 + }, + { + "auxiliary_loss_clip": 0.01506596, + "auxiliary_loss_mlp": 0.01055599, + "balance_loss_clip": 1.32761621, + "balance_loss_mlp": 1.03479743, + "epoch": 0.14501728543514203, + "flos": 33699823326000.0, + "grad_norm": 1.7809258962406294, + "language_loss": 0.70250094, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72812289, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.20788574, + "step": 2412, + "time_per_iteration": 2.828648328781128 + }, + { + "auxiliary_loss_clip": 0.01511492, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_clip": 1.33180487, + "balance_loss_mlp": 1.03153968, + "epoch": 0.14507740868781002, + "flos": 15600328543440.0, + "grad_norm": 1.8288051510019874, + "language_loss": 0.7747187, + "learning_rate": 3.862684373853579e-06, + "loss": 0.8003642, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.21533203, + "step": 2413, + "time_per_iteration": 2.762324810028076 + }, + { + "auxiliary_loss_clip": 0.01343568, + "auxiliary_loss_mlp": 0.0102137, + "balance_loss_clip": 1.24595857, + "balance_loss_mlp": 1.01776969, + "epoch": 0.145137531940478, + "flos": 66690883093680.0, + "grad_norm": 0.9128661776103371, + "language_loss": 0.58875465, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.61240399, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.03588867, + "step": 2414, + "time_per_iteration": 3.18697190284729 + }, + { + "auxiliary_loss_clip": 0.01342068, + "auxiliary_loss_mlp": 0.01012004, + "balance_loss_clip": 1.24541318, + "balance_loss_mlp": 1.0086422, + "epoch": 0.14519765519314595, + "flos": 67538151432720.0, + "grad_norm": 0.8517490761714207, + "language_loss": 0.62274116, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64628196, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.03369141, + "step": 2415, + "time_per_iteration": 4.5772669315338135 + }, + { + "auxiliary_loss_clip": 0.01511267, + "auxiliary_loss_mlp": 0.01053685, + "balance_loss_clip": 1.33136034, + "balance_loss_mlp": 1.03272748, + "epoch": 0.14525777844581392, + "flos": 17203002465240.0, + "grad_norm": 1.9887232046641015, + "language_loss": 0.72627139, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.75192094, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.20959473, + "step": 2416, + "time_per_iteration": 2.7580654621124268 + }, + { + "auxiliary_loss_clip": 0.01340842, + "auxiliary_loss_mlp": 0.01020458, + "balance_loss_clip": 1.24480343, + "balance_loss_mlp": 1.01738226, + "epoch": 0.14531790169848188, + "flos": 65420801242920.0, + "grad_norm": 0.7154271977721408, + "language_loss": 0.60381401, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62742698, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.03063965, + "step": 2417, + "time_per_iteration": 3.233281373977661 + }, + { + "auxiliary_loss_clip": 0.01514109, + "auxiliary_loss_mlp": 0.01057951, + "balance_loss_clip": 1.33059371, + "balance_loss_mlp": 1.0370059, + "epoch": 0.14537802495114985, + "flos": 32568960225240.0, + "grad_norm": 2.171977331568813, + "language_loss": 0.79860198, + "learning_rate": 3.861974388030356e-06, + "loss": 0.82432258, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.20935059, + "step": 2418, + "time_per_iteration": 2.984121561050415 + }, + { + "auxiliary_loss_clip": 0.01499251, + "auxiliary_loss_mlp": 0.01059282, + "balance_loss_clip": 1.32192755, + "balance_loss_mlp": 1.03861094, + "epoch": 0.1454381482038178, + "flos": 20231098506720.0, + "grad_norm": 1.872226070098107, + "language_loss": 0.7191934, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74477863, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.20666504, + "step": 2419, + "time_per_iteration": 2.8191404342651367 + }, + { + "auxiliary_loss_clip": 0.01509633, + "auxiliary_loss_mlp": 0.01053028, + "balance_loss_clip": 1.32801151, + "balance_loss_mlp": 1.03129649, + "epoch": 0.1454982714564858, + "flos": 22898274823080.0, + "grad_norm": 2.564927854147512, + "language_loss": 0.90906274, + "learning_rate": 3.861689899419569e-06, + "loss": 0.9346894, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.21704102, + "step": 2420, + "time_per_iteration": 4.168237924575806 + }, + { + "auxiliary_loss_clip": 0.0150291, + "auxiliary_loss_mlp": 0.01067003, + "balance_loss_clip": 1.32215238, + "balance_loss_mlp": 1.04752398, + "epoch": 0.14555839470915377, + "flos": 20234672042400.0, + "grad_norm": 1.8620572195152707, + "language_loss": 0.82888663, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85458577, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.19470215, + "step": 2421, + "time_per_iteration": 4.230494499206543 + }, + { + "auxiliary_loss_clip": 0.01508051, + "auxiliary_loss_mlp": 0.01063941, + "balance_loss_clip": 1.32377458, + "balance_loss_mlp": 1.0423398, + "epoch": 0.14561851796182174, + "flos": 22241334178080.0, + "grad_norm": 1.389400601388548, + "language_loss": 0.82083857, + "learning_rate": 3.861405128426914e-06, + "loss": 0.84655845, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.21582031, + "step": 2422, + "time_per_iteration": 4.184580326080322 + }, + { + "auxiliary_loss_clip": 0.0134122, + "auxiliary_loss_mlp": 0.01014359, + "balance_loss_clip": 1.24135864, + "balance_loss_mlp": 1.01030564, + "epoch": 0.1456786412144897, + "flos": 52651598257080.0, + "grad_norm": 0.9054624594591318, + "language_loss": 0.63314962, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65670538, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.04052734, + "step": 2423, + "time_per_iteration": 3.2707409858703613 + }, + { + "auxiliary_loss_clip": 0.01507247, + "auxiliary_loss_mlp": 0.01050971, + "balance_loss_clip": 1.32585144, + "balance_loss_mlp": 1.03144455, + "epoch": 0.14573876446715767, + "flos": 23226887274840.0, + "grad_norm": 1.4927587980622192, + "language_loss": 0.82426202, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84984422, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.19519043, + "step": 2424, + "time_per_iteration": 2.8015637397766113 + }, + { + "auxiliary_loss_clip": 0.0150957, + "auxiliary_loss_mlp": 0.01049987, + "balance_loss_clip": 1.32974124, + "balance_loss_mlp": 1.02845788, + "epoch": 0.14579888771982563, + "flos": 18119027403720.0, + "grad_norm": 2.0408912759451194, + "language_loss": 0.79149014, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81708574, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.21520996, + "step": 2425, + "time_per_iteration": 2.8210787773132324 + }, + { + "auxiliary_loss_clip": 0.01517569, + "auxiliary_loss_mlp": 0.01052843, + "balance_loss_clip": 1.33531308, + "balance_loss_mlp": 1.03134978, + "epoch": 0.14585901097249362, + "flos": 23006079683640.0, + "grad_norm": 2.7467435053132494, + "language_loss": 0.83556885, + "learning_rate": 3.860834739468821e-06, + "loss": 0.86127293, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.21496582, + "step": 2426, + "time_per_iteration": 2.7462997436523438 + }, + { + "auxiliary_loss_clip": 0.01511823, + "auxiliary_loss_mlp": 0.01054256, + "balance_loss_clip": 1.32985711, + "balance_loss_mlp": 1.03267884, + "epoch": 0.1459191342251616, + "flos": 21913777543680.0, + "grad_norm": 2.222528074831741, + "language_loss": 0.8748883, + "learning_rate": 3.860691965808173e-06, + "loss": 0.90054911, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.21582031, + "step": 2427, + "time_per_iteration": 2.760282039642334 + }, + { + "auxiliary_loss_clip": 0.01512108, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.32683277, + "balance_loss_mlp": 1.02430701, + "epoch": 0.14597925747782955, + "flos": 14979593574360.0, + "grad_norm": 1.711555548599384, + "language_loss": 0.67279941, + "learning_rate": 3.8605491215899e-06, + "loss": 0.6983878, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.22436523, + "step": 2428, + "time_per_iteration": 2.8569061756134033 + }, + { + "auxiliary_loss_clip": 0.01501132, + "auxiliary_loss_mlp": 0.01047519, + "balance_loss_clip": 1.32009828, + "balance_loss_mlp": 1.02556086, + "epoch": 0.14603938073049752, + "flos": 21073818709440.0, + "grad_norm": 1.6701494309676979, + "language_loss": 0.83689624, + "learning_rate": 3.860406206819417e-06, + "loss": 0.86238277, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.21948242, + "step": 2429, + "time_per_iteration": 2.7571945190429688 + }, + { + "auxiliary_loss_clip": 0.01494608, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.31494725, + "balance_loss_mlp": 1.03172421, + "epoch": 0.14609950398316549, + "flos": 19869529047840.0, + "grad_norm": 1.5896249984462085, + "language_loss": 0.79478782, + "learning_rate": 3.860263221502145e-06, + "loss": 0.82030439, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.25341797, + "step": 2430, + "time_per_iteration": 2.8311307430267334 + }, + { + "auxiliary_loss_clip": 0.01515925, + "auxiliary_loss_mlp": 0.01047565, + "balance_loss_clip": 1.33273458, + "balance_loss_mlp": 1.02617919, + "epoch": 0.14615962723583345, + "flos": 22423946283720.0, + "grad_norm": 1.8855732581831601, + "language_loss": 0.83867061, + "learning_rate": 3.860120165643504e-06, + "loss": 0.8643055, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21374512, + "step": 2431, + "time_per_iteration": 2.7396962642669678 + }, + { + "auxiliary_loss_clip": 0.01510964, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.32505274, + "balance_loss_mlp": 1.02941155, + "epoch": 0.14621975048850142, + "flos": 22351372498440.0, + "grad_norm": 1.7145115262724193, + "language_loss": 0.79342818, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81905723, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.2253418, + "step": 2432, + "time_per_iteration": 2.7615108489990234 + }, + { + "auxiliary_loss_clip": 0.01506965, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.32568192, + "balance_loss_mlp": 1.03053713, + "epoch": 0.1462798737411694, + "flos": 24394524568560.0, + "grad_norm": 1.836352608826712, + "language_loss": 0.80446208, + "learning_rate": 3.859833842323822e-06, + "loss": 0.83005452, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21740723, + "step": 2433, + "time_per_iteration": 2.7335994243621826 + }, + { + "auxiliary_loss_clip": 0.01498193, + "auxiliary_loss_mlp": 0.01047503, + "balance_loss_clip": 1.3209796, + "balance_loss_mlp": 1.02610493, + "epoch": 0.14633999699383737, + "flos": 19249362595800.0, + "grad_norm": 1.819340509948791, + "language_loss": 0.78462732, + "learning_rate": 3.859690574873638e-06, + "loss": 0.81008428, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21411133, + "step": 2434, + "time_per_iteration": 2.76416277885437 + }, + { + "auxiliary_loss_clip": 0.01338042, + "auxiliary_loss_mlp": 0.01009953, + "balance_loss_clip": 1.239663, + "balance_loss_mlp": 1.00620961, + "epoch": 0.14640012024650534, + "flos": 62675487795960.0, + "grad_norm": 0.8501108072605797, + "language_loss": 0.58406007, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60754001, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.03735352, + "step": 2435, + "time_per_iteration": 3.227346420288086 + }, + { + "auxiliary_loss_clip": 0.01485205, + "auxiliary_loss_mlp": 0.01050797, + "balance_loss_clip": 1.31045353, + "balance_loss_mlp": 1.02992392, + "epoch": 0.1464602434991733, + "flos": 12280069376280.0, + "grad_norm": 1.9846014596489172, + "language_loss": 0.88729966, + "learning_rate": 3.859403828419744e-06, + "loss": 0.91265965, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.20874023, + "step": 2436, + "time_per_iteration": 2.7959189414978027 + }, + { + "auxiliary_loss_clip": 0.01502691, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.32048392, + "balance_loss_mlp": 1.02739847, + "epoch": 0.14652036675184127, + "flos": 20927128021200.0, + "grad_norm": 1.8023528827560733, + "language_loss": 0.74861956, + "learning_rate": 3.85926034942691e-06, + "loss": 0.77412915, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.20874023, + "step": 2437, + "time_per_iteration": 2.801752805709839 + }, + { + "auxiliary_loss_clip": 0.01503217, + "auxiliary_loss_mlp": 0.01048349, + "balance_loss_clip": 1.32151842, + "balance_loss_mlp": 1.0262475, + "epoch": 0.14658049000450923, + "flos": 27708733090080.0, + "grad_norm": 1.8778132183097747, + "language_loss": 0.74088442, + "learning_rate": 3.859116799930736e-06, + "loss": 0.7664001, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.22106934, + "step": 2438, + "time_per_iteration": 2.8050119876861572 + }, + { + "auxiliary_loss_clip": 0.01507665, + "auxiliary_loss_mlp": 0.01047499, + "balance_loss_clip": 1.32861829, + "balance_loss_mlp": 1.02648282, + "epoch": 0.14664061325717723, + "flos": 24941954801880.0, + "grad_norm": 1.725751437502758, + "language_loss": 0.75082517, + "learning_rate": 3.858973179936668e-06, + "loss": 0.77637684, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21008301, + "step": 2439, + "time_per_iteration": 2.8213839530944824 + }, + { + "auxiliary_loss_clip": 0.0150501, + "auxiliary_loss_mlp": 0.01046891, + "balance_loss_clip": 1.3258611, + "balance_loss_mlp": 1.02524233, + "epoch": 0.1467007365098452, + "flos": 40305191801760.0, + "grad_norm": 2.104793007356185, + "language_loss": 0.74576199, + "learning_rate": 3.85882948945015e-06, + "loss": 0.771281, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21643066, + "step": 2440, + "time_per_iteration": 2.981006622314453 + }, + { + "auxiliary_loss_clip": 0.01494201, + "auxiliary_loss_mlp": 0.01056855, + "balance_loss_clip": 1.31869173, + "balance_loss_mlp": 1.03680432, + "epoch": 0.14676085976251316, + "flos": 26546618533320.0, + "grad_norm": 1.4770721265611975, + "language_loss": 0.82904583, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85455632, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20056152, + "step": 2441, + "time_per_iteration": 2.8133227825164795 + }, + { + "auxiliary_loss_clip": 0.01518844, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_clip": 1.33202171, + "balance_loss_mlp": 1.02875555, + "epoch": 0.14682098301518112, + "flos": 23555377901520.0, + "grad_norm": 2.2142510919125797, + "language_loss": 0.72609389, + "learning_rate": 3.858541897021563e-06, + "loss": 0.75178277, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.2130127, + "step": 2442, + "time_per_iteration": 2.7496814727783203 + }, + { + "auxiliary_loss_clip": 0.01520709, + "auxiliary_loss_mlp": 0.01051698, + "balance_loss_clip": 1.33261704, + "balance_loss_mlp": 1.0307169, + "epoch": 0.1468811062678491, + "flos": 11654177145480.0, + "grad_norm": 4.425944628633186, + "language_loss": 0.81623518, + "learning_rate": 3.8583979950904e-06, + "loss": 0.84195924, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.20996094, + "step": 2443, + "time_per_iteration": 2.7493057250976562 + }, + { + "auxiliary_loss_clip": 0.01511793, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_clip": 1.33034945, + "balance_loss_mlp": 1.03158092, + "epoch": 0.14694122952051705, + "flos": 23007785234760.0, + "grad_norm": 1.6999368206196996, + "language_loss": 0.8318187, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85747176, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21911621, + "step": 2444, + "time_per_iteration": 2.7985613346099854 + }, + { + "auxiliary_loss_clip": 0.01511055, + "auxiliary_loss_mlp": 0.01051476, + "balance_loss_clip": 1.32902145, + "balance_loss_mlp": 1.03090096, + "epoch": 0.14700135277318502, + "flos": 26508301222680.0, + "grad_norm": 1.5910065640047186, + "language_loss": 0.71245795, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73808324, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.20568848, + "step": 2445, + "time_per_iteration": 2.813838481903076 + }, + { + "auxiliary_loss_clip": 0.01360267, + "auxiliary_loss_mlp": 0.01013713, + "balance_loss_clip": 1.26146805, + "balance_loss_mlp": 1.01044679, + "epoch": 0.147061476025853, + "flos": 59463805048200.0, + "grad_norm": 0.8213759410456148, + "language_loss": 0.63136643, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65510625, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.03271484, + "step": 2446, + "time_per_iteration": 3.206228017807007 + }, + { + "auxiliary_loss_clip": 0.01518249, + "auxiliary_loss_mlp": 0.01066487, + "balance_loss_clip": 1.3363868, + "balance_loss_mlp": 1.0448153, + "epoch": 0.14712159927852098, + "flos": 28336939997400.0, + "grad_norm": 1.804712763377238, + "language_loss": 0.7535972, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77944452, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.2166748, + "step": 2447, + "time_per_iteration": 2.826043128967285 + }, + { + "auxiliary_loss_clip": 0.01510299, + "auxiliary_loss_mlp": 0.01056282, + "balance_loss_clip": 1.3304646, + "balance_loss_mlp": 1.03511, + "epoch": 0.14718172253118894, + "flos": 27095632492680.0, + "grad_norm": 1.8551270664089357, + "language_loss": 0.85516882, + "learning_rate": 3.857677428484242e-06, + "loss": 0.8808347, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.21179199, + "step": 2448, + "time_per_iteration": 2.852152109146118 + }, + { + "auxiliary_loss_clip": 0.01360882, + "auxiliary_loss_mlp": 0.0100656, + "balance_loss_clip": 1.26226604, + "balance_loss_mlp": 1.00303161, + "epoch": 0.1472418457838569, + "flos": 66721647249360.0, + "grad_norm": 0.7627020673564613, + "language_loss": 0.5688014, + "learning_rate": 3.857533103811195e-06, + "loss": 0.59247583, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.03540039, + "step": 2449, + "time_per_iteration": 3.2272844314575195 + }, + { + "auxiliary_loss_clip": 0.01501336, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_clip": 1.32482219, + "balance_loss_mlp": 1.02877271, + "epoch": 0.14730196903652487, + "flos": 19578381131160.0, + "grad_norm": 1.769357748739765, + "language_loss": 0.85822433, + "learning_rate": 3.857388708700307e-06, + "loss": 0.88373351, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.20812988, + "step": 2450, + "time_per_iteration": 2.764159917831421 + }, + { + "auxiliary_loss_clip": 0.01515316, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.33363283, + "balance_loss_mlp": 1.02910626, + "epoch": 0.14736209228919284, + "flos": 16075388033280.0, + "grad_norm": 2.049914592782505, + "language_loss": 0.75483793, + "learning_rate": 3.857244243157052e-06, + "loss": 0.78049451, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.21228027, + "step": 2451, + "time_per_iteration": 2.8531265258789062 + }, + { + "auxiliary_loss_clip": 0.01501258, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_clip": 1.3263911, + "balance_loss_mlp": 1.02668893, + "epoch": 0.1474222155418608, + "flos": 23044681252800.0, + "grad_norm": 1.4979101020617516, + "language_loss": 0.82202125, + "learning_rate": 3.85709970718691e-06, + "loss": 0.84750426, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.20373535, + "step": 2452, + "time_per_iteration": 2.7884714603424072 + }, + { + "auxiliary_loss_clip": 0.01507961, + "auxiliary_loss_mlp": 0.01048827, + "balance_loss_clip": 1.33020973, + "balance_loss_mlp": 1.02795315, + "epoch": 0.1474823387945288, + "flos": 17022989294640.0, + "grad_norm": 1.6315135900706814, + "language_loss": 0.74764895, + "learning_rate": 3.856955100795361e-06, + "loss": 0.7732169, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.2088623, + "step": 2453, + "time_per_iteration": 2.763349771499634 + }, + { + "auxiliary_loss_clip": 0.01509557, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_clip": 1.3288908, + "balance_loss_mlp": 1.03210521, + "epoch": 0.14754246204719676, + "flos": 17899600496760.0, + "grad_norm": 2.044401032280104, + "language_loss": 0.76498538, + "learning_rate": 3.856810423987889e-06, + "loss": 0.79061812, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21618652, + "step": 2454, + "time_per_iteration": 4.234007835388184 + }, + { + "auxiliary_loss_clip": 0.01500242, + "auxiliary_loss_mlp": 0.0104491, + "balance_loss_clip": 1.32181168, + "balance_loss_mlp": 1.02415538, + "epoch": 0.14760258529986472, + "flos": 13083050975760.0, + "grad_norm": 2.0182131110823422, + "language_loss": 0.83218586, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85763735, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.20739746, + "step": 2455, + "time_per_iteration": 2.7239294052124023 + }, + { + "auxiliary_loss_clip": 0.01510515, + "auxiliary_loss_mlp": 0.01049314, + "balance_loss_clip": 1.32690883, + "balance_loss_mlp": 1.02847648, + "epoch": 0.1476627085525327, + "flos": 30812042460240.0, + "grad_norm": 1.8903074321807363, + "language_loss": 0.84460396, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8702023, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.20837402, + "step": 2456, + "time_per_iteration": 2.837293863296509 + }, + { + "auxiliary_loss_clip": 0.01498344, + "auxiliary_loss_mlp": 0.01049176, + "balance_loss_clip": 1.32445526, + "balance_loss_mlp": 1.02833867, + "epoch": 0.14772283180520066, + "flos": 21694472461800.0, + "grad_norm": 1.7276074015747547, + "language_loss": 0.84194404, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86741924, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.20825195, + "step": 2457, + "time_per_iteration": 2.7769176959991455 + }, + { + "auxiliary_loss_clip": 0.01498184, + "auxiliary_loss_mlp": 0.01047034, + "balance_loss_clip": 1.32444143, + "balance_loss_mlp": 1.02619672, + "epoch": 0.14778295505786862, + "flos": 18775034056440.0, + "grad_norm": 1.7295597748072697, + "language_loss": 0.75584936, + "learning_rate": 3.856231012708527e-06, + "loss": 0.78130162, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.20837402, + "step": 2458, + "time_per_iteration": 2.818950891494751 + }, + { + "auxiliary_loss_clip": 0.0151613, + "auxiliary_loss_mlp": 0.01052645, + "balance_loss_clip": 1.33131039, + "balance_loss_mlp": 1.02943456, + "epoch": 0.1478430783105366, + "flos": 22898762123400.0, + "grad_norm": 2.108295971163175, + "language_loss": 0.83658743, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86227524, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23217773, + "step": 2459, + "time_per_iteration": 4.252923965454102 + }, + { + "auxiliary_loss_clip": 0.01498596, + "auxiliary_loss_mlp": 0.01045057, + "balance_loss_clip": 1.32291651, + "balance_loss_mlp": 1.02462423, + "epoch": 0.14790320156320458, + "flos": 15089469461280.0, + "grad_norm": 1.9921257784851023, + "language_loss": 0.75960875, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78504527, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.2043457, + "step": 2460, + "time_per_iteration": 5.752053499221802 + }, + { + "auxiliary_loss_clip": 0.01509805, + "auxiliary_loss_mlp": 0.0104651, + "balance_loss_clip": 1.32611215, + "balance_loss_mlp": 1.02535021, + "epoch": 0.14796332481587254, + "flos": 26510331640680.0, + "grad_norm": 1.7458805362812255, + "language_loss": 0.812325, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83788818, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.21166992, + "step": 2461, + "time_per_iteration": 2.8766770362854004 + }, + { + "auxiliary_loss_clip": 0.01505887, + "auxiliary_loss_mlp": 0.01057035, + "balance_loss_clip": 1.32569659, + "balance_loss_mlp": 1.03401542, + "epoch": 0.1480234480685405, + "flos": 17567252075880.0, + "grad_norm": 2.4188139029894904, + "language_loss": 0.65961593, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68524516, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.23010254, + "step": 2462, + "time_per_iteration": 2.7305355072021484 + }, + { + "auxiliary_loss_clip": 0.01509009, + "auxiliary_loss_mlp": 0.01054754, + "balance_loss_clip": 1.3295604, + "balance_loss_mlp": 1.03339195, + "epoch": 0.14808357132120847, + "flos": 53591954249160.0, + "grad_norm": 1.5102553114690191, + "language_loss": 0.6750108, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.70064837, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21337891, + "step": 2463, + "time_per_iteration": 3.0984718799591064 + }, + { + "auxiliary_loss_clip": 0.01504788, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.32440042, + "balance_loss_mlp": 1.03041387, + "epoch": 0.14814369457387644, + "flos": 19834054322400.0, + "grad_norm": 1.7136224000475349, + "language_loss": 0.77136385, + "learning_rate": 3.855359784245646e-06, + "loss": 0.79692698, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.21105957, + "step": 2464, + "time_per_iteration": 2.759610652923584 + }, + { + "auxiliary_loss_clip": 0.01493011, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_clip": 1.31640637, + "balance_loss_mlp": 1.03048313, + "epoch": 0.1482038178265444, + "flos": 23920277245920.0, + "grad_norm": 1.78735569519167, + "language_loss": 0.80083144, + "learning_rate": 3.855214333225688e-06, + "loss": 0.82625997, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.19360352, + "step": 2465, + "time_per_iteration": 2.787832736968994 + }, + { + "auxiliary_loss_clip": 0.01512771, + "auxiliary_loss_mlp": 0.01054045, + "balance_loss_clip": 1.32983208, + "balance_loss_mlp": 1.03121686, + "epoch": 0.1482639410792124, + "flos": 24175869220440.0, + "grad_norm": 1.9334506657329293, + "language_loss": 0.76306248, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78873068, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.22814941, + "step": 2466, + "time_per_iteration": 2.7835450172424316 + }, + { + "auxiliary_loss_clip": 0.01353004, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.25417876, + "balance_loss_mlp": 1.02885377, + "epoch": 0.14832406433188036, + "flos": 66205224821880.0, + "grad_norm": 0.8308886720289473, + "language_loss": 0.600927, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62477839, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.03295898, + "step": 2467, + "time_per_iteration": 3.2566678524017334 + }, + { + "auxiliary_loss_clip": 0.01506195, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.32849431, + "balance_loss_mlp": 1.02754545, + "epoch": 0.14838418758454833, + "flos": 25416811249920.0, + "grad_norm": 1.8719169637566098, + "language_loss": 0.87495184, + "learning_rate": 3.85477755808841e-06, + "loss": 0.90049583, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20654297, + "step": 2468, + "time_per_iteration": 2.7982654571533203 + }, + { + "auxiliary_loss_clip": 0.015146, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.3314569, + "balance_loss_mlp": 1.03034878, + "epoch": 0.1484443108372163, + "flos": 23294506840200.0, + "grad_norm": 1.9953369805118095, + "language_loss": 0.76442051, + "learning_rate": 3.854631825701919e-06, + "loss": 0.79010099, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.23095703, + "step": 2469, + "time_per_iteration": 2.8569912910461426 + }, + { + "auxiliary_loss_clip": 0.01497335, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_clip": 1.31904531, + "balance_loss_mlp": 1.03168988, + "epoch": 0.14850443408988426, + "flos": 14651793289800.0, + "grad_norm": 2.234284828583638, + "language_loss": 0.76421458, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78970176, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.19702148, + "step": 2470, + "time_per_iteration": 2.7477190494537354 + }, + { + "auxiliary_loss_clip": 0.01499839, + "auxiliary_loss_mlp": 0.01052903, + "balance_loss_clip": 1.32392561, + "balance_loss_mlp": 1.03131437, + "epoch": 0.14856455734255222, + "flos": 23553347483520.0, + "grad_norm": 1.9834979643351642, + "language_loss": 0.72413874, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74966621, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.21594238, + "step": 2471, + "time_per_iteration": 2.788220167160034 + }, + { + "auxiliary_loss_clip": 0.01510775, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.32607365, + "balance_loss_mlp": 1.02421844, + "epoch": 0.1486246805952202, + "flos": 18081847127160.0, + "grad_norm": 1.9503587580573487, + "language_loss": 0.89865601, + "learning_rate": 3.854194206597615e-06, + "loss": 0.9242413, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.2355957, + "step": 2472, + "time_per_iteration": 2.931894302368164 + }, + { + "auxiliary_loss_clip": 0.01505606, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.32721353, + "balance_loss_mlp": 1.03312874, + "epoch": 0.14868480384788818, + "flos": 19358263882080.0, + "grad_norm": 2.2731037282288225, + "language_loss": 0.80485988, + "learning_rate": 3.854048192933008e-06, + "loss": 0.83047593, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.22851562, + "step": 2473, + "time_per_iteration": 2.7967529296875 + }, + { + "auxiliary_loss_clip": 0.01505915, + "auxiliary_loss_mlp": 0.0106188, + "balance_loss_clip": 1.32333875, + "balance_loss_mlp": 1.04109025, + "epoch": 0.14874492710055615, + "flos": 22205290935600.0, + "grad_norm": 2.6024642991023432, + "language_loss": 0.77417934, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79985726, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.20788574, + "step": 2474, + "time_per_iteration": 2.851318836212158 + }, + { + "auxiliary_loss_clip": 0.0151041, + "auxiliary_loss_mlp": 0.01055441, + "balance_loss_clip": 1.32608747, + "balance_loss_mlp": 1.03215981, + "epoch": 0.1488050503532241, + "flos": 21108115792440.0, + "grad_norm": 1.812326091168963, + "language_loss": 0.82992882, + "learning_rate": 3.853755954692255e-06, + "loss": 0.8555873, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.23278809, + "step": 2475, + "time_per_iteration": 2.7496564388275146 + }, + { + "auxiliary_loss_clip": 0.01504415, + "auxiliary_loss_mlp": 0.01051834, + "balance_loss_clip": 1.32796073, + "balance_loss_mlp": 1.03162849, + "epoch": 0.14886517360589208, + "flos": 12790400549760.0, + "grad_norm": 1.7130804653263458, + "language_loss": 0.8108393, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83640182, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.20214844, + "step": 2476, + "time_per_iteration": 2.785762071609497 + }, + { + "auxiliary_loss_clip": 0.01497156, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.32363534, + "balance_loss_mlp": 1.03037, + "epoch": 0.14892529685856004, + "flos": 29028827459160.0, + "grad_norm": 1.6746551130894007, + "language_loss": 0.77911699, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80459201, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.19995117, + "step": 2477, + "time_per_iteration": 2.8461601734161377 + }, + { + "auxiliary_loss_clip": 0.01358657, + "auxiliary_loss_mlp": 0.01004335, + "balance_loss_clip": 1.2593565, + "balance_loss_mlp": 1.00099683, + "epoch": 0.148985420111228, + "flos": 61940247586920.0, + "grad_norm": 0.8193890593522476, + "language_loss": 0.60216808, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62579799, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.03344727, + "step": 2478, + "time_per_iteration": 3.308387279510498 + }, + { + "auxiliary_loss_clip": 0.01503369, + "auxiliary_loss_mlp": 0.010486, + "balance_loss_clip": 1.32465565, + "balance_loss_mlp": 1.02944326, + "epoch": 0.149045543363896, + "flos": 23920439679360.0, + "grad_norm": 2.116069312438098, + "language_loss": 0.71332562, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73884523, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.19152832, + "step": 2479, + "time_per_iteration": 2.879943609237671 + }, + { + "auxiliary_loss_clip": 0.0150312, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.32396626, + "balance_loss_mlp": 1.0281589, + "epoch": 0.14910566661656396, + "flos": 23659000101000.0, + "grad_norm": 1.546527076842388, + "language_loss": 0.81405449, + "learning_rate": 3.853024129031751e-06, + "loss": 0.83957613, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.20898438, + "step": 2480, + "time_per_iteration": 2.8925974369049072 + }, + { + "auxiliary_loss_clip": 0.01504563, + "auxiliary_loss_mlp": 0.01050204, + "balance_loss_clip": 1.32230449, + "balance_loss_mlp": 1.03012931, + "epoch": 0.14916578986923193, + "flos": 20519322621480.0, + "grad_norm": 2.0137885253500016, + "language_loss": 0.84244883, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86799651, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.20080566, + "step": 2481, + "time_per_iteration": 2.7603259086608887 + }, + { + "auxiliary_loss_clip": 0.014969, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_clip": 1.31769609, + "balance_loss_mlp": 1.03583348, + "epoch": 0.1492259131218999, + "flos": 22496926152600.0, + "grad_norm": 1.8114961669594616, + "language_loss": 0.77776867, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.80331838, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22265625, + "step": 2482, + "time_per_iteration": 2.7681734561920166 + }, + { + "auxiliary_loss_clip": 0.01517233, + "auxiliary_loss_mlp": 0.0105199, + "balance_loss_clip": 1.33243072, + "balance_loss_mlp": 1.02854145, + "epoch": 0.14928603637456786, + "flos": 23191168899240.0, + "grad_norm": 3.5924663297630874, + "language_loss": 0.79340434, + "learning_rate": 3.852584190388713e-06, + "loss": 0.81909657, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23461914, + "step": 2483, + "time_per_iteration": 2.7943217754364014 + }, + { + "auxiliary_loss_clip": 0.01497245, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.32346678, + "balance_loss_mlp": 1.02676618, + "epoch": 0.14934615962723582, + "flos": 21658307394240.0, + "grad_norm": 1.5232319771873843, + "language_loss": 0.70380819, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72924298, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.19470215, + "step": 2484, + "time_per_iteration": 2.9356632232666016 + }, + { + "auxiliary_loss_clip": 0.01500739, + "auxiliary_loss_mlp": 0.01048827, + "balance_loss_clip": 1.32092869, + "balance_loss_mlp": 1.02714241, + "epoch": 0.1494062828799038, + "flos": 27014977643760.0, + "grad_norm": 1.847178239274528, + "language_loss": 0.84688139, + "learning_rate": 3.852290546699863e-06, + "loss": 0.87237704, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21691895, + "step": 2485, + "time_per_iteration": 2.816892385482788 + }, + { + "auxiliary_loss_clip": 0.01509353, + "auxiliary_loss_mlp": 0.01046253, + "balance_loss_clip": 1.328583, + "balance_loss_mlp": 1.02514148, + "epoch": 0.14946640613257178, + "flos": 21219900272280.0, + "grad_norm": 1.876918923853972, + "language_loss": 0.85081977, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87637585, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21105957, + "step": 2486, + "time_per_iteration": 2.785471200942993 + }, + { + "auxiliary_loss_clip": 0.01490456, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.31665111, + "balance_loss_mlp": 1.02651668, + "epoch": 0.14952652938523975, + "flos": 13374848626200.0, + "grad_norm": 1.89801511706092, + "language_loss": 0.75058508, + "learning_rate": 3.851996622054842e-06, + "loss": 0.77594781, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.19287109, + "step": 2487, + "time_per_iteration": 2.722522258758545 + }, + { + "auxiliary_loss_clip": 0.01503747, + "auxiliary_loss_mlp": 0.01049307, + "balance_loss_clip": 1.32467103, + "balance_loss_mlp": 1.02894568, + "epoch": 0.1495866526379077, + "flos": 35524563698160.0, + "grad_norm": 1.8709838264297052, + "language_loss": 0.72106022, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74659073, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.20349121, + "step": 2488, + "time_per_iteration": 2.891500949859619 + }, + { + "auxiliary_loss_clip": 0.01512185, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_clip": 1.32752109, + "balance_loss_mlp": 1.0309968, + "epoch": 0.14964677589057568, + "flos": 17636374150560.0, + "grad_norm": 2.4738246103439665, + "language_loss": 0.71100402, + "learning_rate": 3.851702416498235e-06, + "loss": 0.73666102, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.22509766, + "step": 2489, + "time_per_iteration": 2.6968274116516113 + }, + { + "auxiliary_loss_clip": 0.01511391, + "auxiliary_loss_mlp": 0.01049866, + "balance_loss_clip": 1.33091784, + "balance_loss_mlp": 1.02919555, + "epoch": 0.14970689914324364, + "flos": 20189045226960.0, + "grad_norm": 2.3795208722630243, + "language_loss": 0.82360125, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.84921378, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.20666504, + "step": 2490, + "time_per_iteration": 2.7558388710021973 + }, + { + "auxiliary_loss_clip": 0.01513308, + "auxiliary_loss_mlp": 0.01054337, + "balance_loss_clip": 1.3319521, + "balance_loss_mlp": 1.03341556, + "epoch": 0.1497670223959116, + "flos": 37235286130680.0, + "grad_norm": 1.654041079087356, + "language_loss": 0.80253774, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82821417, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.20922852, + "step": 2491, + "time_per_iteration": 2.9156394004821777 + }, + { + "auxiliary_loss_clip": 0.01504556, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.32153034, + "balance_loss_mlp": 1.02546859, + "epoch": 0.1498271456485796, + "flos": 24460397974440.0, + "grad_norm": 3.0140137011315176, + "language_loss": 0.91100407, + "learning_rate": 3.851260581551727e-06, + "loss": 0.93653488, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.23071289, + "step": 2492, + "time_per_iteration": 2.764816999435425 + }, + { + "auxiliary_loss_clip": 0.01511652, + "auxiliary_loss_mlp": 0.01058028, + "balance_loss_clip": 1.33218861, + "balance_loss_mlp": 1.03748786, + "epoch": 0.14988726890124757, + "flos": 16258000138920.0, + "grad_norm": 2.497182734867508, + "language_loss": 0.80601311, + "learning_rate": 3.851113162828802e-06, + "loss": 0.83170986, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.20544434, + "step": 2493, + "time_per_iteration": 4.216289281845093 + }, + { + "auxiliary_loss_clip": 0.01506851, + "auxiliary_loss_mlp": 0.01047486, + "balance_loss_clip": 1.32491565, + "balance_loss_mlp": 1.02279758, + "epoch": 0.14994739215391553, + "flos": 20671292396520.0, + "grad_norm": 1.6190095272417822, + "language_loss": 0.80862141, + "learning_rate": 3.85096567391148e-06, + "loss": 0.8341648, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.24719238, + "step": 2494, + "time_per_iteration": 2.816904067993164 + }, + { + "auxiliary_loss_clip": 0.01503052, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_clip": 1.32596028, + "balance_loss_mlp": 1.03604436, + "epoch": 0.1500075154065835, + "flos": 70663680221760.0, + "grad_norm": 2.107689037108088, + "language_loss": 0.66421366, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68983698, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.23217773, + "step": 2495, + "time_per_iteration": 3.1674399375915527 + }, + { + "auxiliary_loss_clip": 0.01346708, + "auxiliary_loss_mlp": 0.01013409, + "balance_loss_clip": 1.24849224, + "balance_loss_mlp": 1.00921249, + "epoch": 0.15006763865925146, + "flos": 68025904358040.0, + "grad_norm": 0.905704700322053, + "language_loss": 0.59544361, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61904478, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.04199219, + "step": 2496, + "time_per_iteration": 3.325460195541382 + }, + { + "auxiliary_loss_clip": 0.01513007, + "auxiliary_loss_mlp": 0.01056678, + "balance_loss_clip": 1.33100319, + "balance_loss_mlp": 1.03419518, + "epoch": 0.15012776191191943, + "flos": 18920912577480.0, + "grad_norm": 1.6584046223399345, + "language_loss": 0.65971732, + "learning_rate": 3.850522786049075e-06, + "loss": 0.6854142, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22473145, + "step": 2497, + "time_per_iteration": 2.8008625507354736 + }, + { + "auxiliary_loss_clip": 0.01511314, + "auxiliary_loss_mlp": 0.01050814, + "balance_loss_clip": 1.32996213, + "balance_loss_mlp": 1.03040576, + "epoch": 0.1501878851645874, + "flos": 23707307068200.0, + "grad_norm": 1.4553911893876017, + "language_loss": 0.75467122, + "learning_rate": 3.850375016410121e-06, + "loss": 0.78029245, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.20410156, + "step": 2498, + "time_per_iteration": 4.336648941040039 + }, + { + "auxiliary_loss_clip": 0.01511926, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.32895684, + "balance_loss_mlp": 1.0256778, + "epoch": 0.15024800841725539, + "flos": 20417405973120.0, + "grad_norm": 2.064332741299894, + "language_loss": 0.72858965, + "learning_rate": 3.850227176604761e-06, + "loss": 0.75418305, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21728516, + "step": 2499, + "time_per_iteration": 5.822682619094849 + }, + { + "auxiliary_loss_clip": 0.01512373, + "auxiliary_loss_mlp": 0.01057744, + "balance_loss_clip": 1.33054399, + "balance_loss_mlp": 1.03633392, + "epoch": 0.15030813166992335, + "flos": 31837171726800.0, + "grad_norm": 1.8493020126172037, + "language_loss": 0.72647148, + "learning_rate": 3.850079266638601e-06, + "loss": 0.75217259, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.2142334, + "step": 2500, + "time_per_iteration": 2.9057297706604004 + }, + { + "auxiliary_loss_clip": 0.01499445, + "auxiliary_loss_mlp": 0.01058555, + "balance_loss_clip": 1.31939757, + "balance_loss_mlp": 1.03726375, + "epoch": 0.15036825492259132, + "flos": 35663944881600.0, + "grad_norm": 1.7774347561885042, + "language_loss": 0.65525913, + "learning_rate": 3.849931286517249e-06, + "loss": 0.68083912, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.2130127, + "step": 2501, + "time_per_iteration": 2.9303581714630127 + }, + { + "auxiliary_loss_clip": 0.01498537, + "auxiliary_loss_mlp": 0.01053454, + "balance_loss_clip": 1.318097, + "balance_loss_mlp": 1.03145981, + "epoch": 0.15042837817525928, + "flos": 18842125713120.0, + "grad_norm": 2.0046041660325793, + "language_loss": 0.84021914, + "learning_rate": 3.849783236246318e-06, + "loss": 0.86573899, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22009277, + "step": 2502, + "time_per_iteration": 2.764843225479126 + }, + { + "auxiliary_loss_clip": 0.01499906, + "auxiliary_loss_mlp": 0.01060673, + "balance_loss_clip": 1.31969035, + "balance_loss_mlp": 1.04016924, + "epoch": 0.15048850142792725, + "flos": 19540145037240.0, + "grad_norm": 1.895689291982064, + "language_loss": 0.77344429, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79905009, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.20507812, + "step": 2503, + "time_per_iteration": 2.8076465129852295 + }, + { + "auxiliary_loss_clip": 0.01494253, + "auxiliary_loss_mlp": 0.01049951, + "balance_loss_clip": 1.3158468, + "balance_loss_mlp": 1.02972114, + "epoch": 0.1505486246805952, + "flos": 22022597613240.0, + "grad_norm": 1.719641695093926, + "language_loss": 0.85206175, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87750381, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.20227051, + "step": 2504, + "time_per_iteration": 2.8213329315185547 + }, + { + "auxiliary_loss_clip": 0.01487531, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.31009197, + "balance_loss_mlp": 1.02464366, + "epoch": 0.15060874793326318, + "flos": 20748373709760.0, + "grad_norm": 1.5967245560804229, + "language_loss": 0.83166808, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85699093, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.20092773, + "step": 2505, + "time_per_iteration": 2.86639666557312 + }, + { + "auxiliary_loss_clip": 0.0149435, + "auxiliary_loss_mlp": 0.0104722, + "balance_loss_clip": 1.31629527, + "balance_loss_mlp": 1.02685881, + "epoch": 0.15066887118593117, + "flos": 16476249403440.0, + "grad_norm": 1.8119048238718571, + "language_loss": 0.76436245, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.20361328, + "step": 2506, + "time_per_iteration": 2.7417192459106445 + }, + { + "auxiliary_loss_clip": 0.01504918, + "auxiliary_loss_mlp": 0.01045598, + "balance_loss_clip": 1.32037592, + "balance_loss_mlp": 1.02474785, + "epoch": 0.15072899443859913, + "flos": 19863356577120.0, + "grad_norm": 3.33460229880725, + "language_loss": 0.77104688, + "learning_rate": 3.849041932844552e-06, + "loss": 0.796552, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.20849609, + "step": 2507, + "time_per_iteration": 2.7584357261657715 + }, + { + "auxiliary_loss_clip": 0.01487572, + "auxiliary_loss_mlp": 0.01052364, + "balance_loss_clip": 1.3101902, + "balance_loss_mlp": 1.03182387, + "epoch": 0.1507891176912671, + "flos": 20781005850000.0, + "grad_norm": 2.655998408662565, + "language_loss": 0.69567406, + "learning_rate": 3.848893461794131e-06, + "loss": 0.72107351, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.20544434, + "step": 2508, + "time_per_iteration": 2.7732760906219482 + }, + { + "auxiliary_loss_clip": 0.01500569, + "auxiliary_loss_mlp": 0.01048302, + "balance_loss_clip": 1.31912911, + "balance_loss_mlp": 1.02785778, + "epoch": 0.15084924094393506, + "flos": 23591908444320.0, + "grad_norm": 1.6272818907572357, + "language_loss": 0.77654248, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.80203116, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.2043457, + "step": 2509, + "time_per_iteration": 2.848538875579834 + }, + { + "auxiliary_loss_clip": 0.01505064, + "auxiliary_loss_mlp": 0.01053093, + "balance_loss_clip": 1.31918705, + "balance_loss_mlp": 1.03099203, + "epoch": 0.15090936419660303, + "flos": 18915308623800.0, + "grad_norm": 2.6114798829553956, + "language_loss": 0.81264347, + "learning_rate": 3.848596309368246e-06, + "loss": 0.83822507, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22106934, + "step": 2510, + "time_per_iteration": 2.8524010181427 + }, + { + "auxiliary_loss_clip": 0.01501137, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.31867421, + "balance_loss_mlp": 1.02977777, + "epoch": 0.150969487449271, + "flos": 17932354462080.0, + "grad_norm": 1.8121918974512674, + "language_loss": 0.74047136, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76600558, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.22497559, + "step": 2511, + "time_per_iteration": 2.7196614742279053 + }, + { + "auxiliary_loss_clip": 0.01492146, + "auxiliary_loss_mlp": 0.01043113, + "balance_loss_clip": 1.31338477, + "balance_loss_mlp": 1.02281141, + "epoch": 0.151029610701939, + "flos": 24248402397360.0, + "grad_norm": 2.2741101796538037, + "language_loss": 0.69230163, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71765423, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.20300293, + "step": 2512, + "time_per_iteration": 2.8503098487854004 + }, + { + "auxiliary_loss_clip": 0.01491164, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_clip": 1.31115067, + "balance_loss_mlp": 1.02896631, + "epoch": 0.15108973395460695, + "flos": 30268510629480.0, + "grad_norm": 2.470189954395566, + "language_loss": 0.73879111, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76420003, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.2076416, + "step": 2513, + "time_per_iteration": 2.829737424850464 + }, + { + "auxiliary_loss_clip": 0.01303501, + "auxiliary_loss_mlp": 0.01009098, + "balance_loss_clip": 1.20570731, + "balance_loss_mlp": 1.00523567, + "epoch": 0.15114985720727492, + "flos": 60452525597040.0, + "grad_norm": 0.8793300555735041, + "language_loss": 0.64787823, + "learning_rate": 3.84800116337411e-06, + "loss": 0.67100424, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.03857422, + "step": 2514, + "time_per_iteration": 3.200305461883545 + }, + { + "auxiliary_loss_clip": 0.01488685, + "auxiliary_loss_mlp": 0.0105282, + "balance_loss_clip": 1.31030512, + "balance_loss_mlp": 1.03125536, + "epoch": 0.15120998045994288, + "flos": 20526550909560.0, + "grad_norm": 2.064533908295827, + "language_loss": 0.7324847, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75789976, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.21569824, + "step": 2515, + "time_per_iteration": 2.776963233947754 + }, + { + "auxiliary_loss_clip": 0.01481801, + "auxiliary_loss_mlp": 0.01047937, + "balance_loss_clip": 1.30454707, + "balance_loss_mlp": 1.02706385, + "epoch": 0.15127010371261085, + "flos": 21183897638160.0, + "grad_norm": 1.926315596478434, + "language_loss": 0.7805382, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.8058356, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.2088623, + "step": 2516, + "time_per_iteration": 2.8785743713378906 + }, + { + "auxiliary_loss_clip": 0.01306219, + "auxiliary_loss_mlp": 0.01004473, + "balance_loss_clip": 1.20749736, + "balance_loss_mlp": 1.00077772, + "epoch": 0.1513302269652788, + "flos": 65335395215880.0, + "grad_norm": 0.7234825688897035, + "language_loss": 0.5463568, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56946373, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.03686523, + "step": 2517, + "time_per_iteration": 3.258749008178711 + }, + { + "auxiliary_loss_clip": 0.01486287, + "auxiliary_loss_mlp": 0.0104884, + "balance_loss_clip": 1.30558777, + "balance_loss_mlp": 1.02760899, + "epoch": 0.15139035021794678, + "flos": 19140583134600.0, + "grad_norm": 1.864578059384739, + "language_loss": 0.78819436, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8135457, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21240234, + "step": 2518, + "time_per_iteration": 2.7468173503875732 + }, + { + "auxiliary_loss_clip": 0.01500762, + "auxiliary_loss_mlp": 0.01057674, + "balance_loss_clip": 1.317837, + "balance_loss_mlp": 1.03608561, + "epoch": 0.15145047347061477, + "flos": 26584245501840.0, + "grad_norm": 1.999347305657315, + "language_loss": 0.70705211, + "learning_rate": 3.847255654205137e-06, + "loss": 0.73263645, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.21594238, + "step": 2519, + "time_per_iteration": 2.8421149253845215 + }, + { + "auxiliary_loss_clip": 0.01483263, + "auxiliary_loss_mlp": 0.01049838, + "balance_loss_clip": 1.30325484, + "balance_loss_mlp": 1.02922678, + "epoch": 0.15151059672328274, + "flos": 20307814344720.0, + "grad_norm": 2.095034565349201, + "language_loss": 0.7946434, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81997442, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.20617676, + "step": 2520, + "time_per_iteration": 2.780769109725952 + }, + { + "auxiliary_loss_clip": 0.01488959, + "auxiliary_loss_mlp": 0.01050372, + "balance_loss_clip": 1.30642641, + "balance_loss_mlp": 1.02837753, + "epoch": 0.1515707199759507, + "flos": 27233226908280.0, + "grad_norm": 1.7174550907832464, + "language_loss": 0.75593138, + "learning_rate": 3.846956960161114e-06, + "loss": 0.78132463, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.2199707, + "step": 2521, + "time_per_iteration": 2.8437554836273193 + }, + { + "auxiliary_loss_clip": 0.0148917, + "auxiliary_loss_mlp": 0.01057674, + "balance_loss_clip": 1.305655, + "balance_loss_mlp": 1.03486955, + "epoch": 0.15163084322861867, + "flos": 23592720611520.0, + "grad_norm": 2.0114267134317068, + "language_loss": 0.82395363, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84942204, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.22790527, + "step": 2522, + "time_per_iteration": 2.805508852005005 + }, + { + "auxiliary_loss_clip": 0.01305888, + "auxiliary_loss_mlp": 0.01045901, + "balance_loss_clip": 1.20852947, + "balance_loss_mlp": 1.04153776, + "epoch": 0.15169096648128663, + "flos": 66904584221880.0, + "grad_norm": 0.8245302155680277, + "language_loss": 0.57889742, + "learning_rate": 3.846657985969922e-06, + "loss": 0.60241532, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.04370117, + "step": 2523, + "time_per_iteration": 3.209439754486084 + }, + { + "auxiliary_loss_clip": 0.01484641, + "auxiliary_loss_mlp": 0.01053459, + "balance_loss_clip": 1.30618286, + "balance_loss_mlp": 1.03155994, + "epoch": 0.1517510897339546, + "flos": 29101360636080.0, + "grad_norm": 2.3038735823397865, + "language_loss": 0.75114173, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77652276, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.21923828, + "step": 2524, + "time_per_iteration": 2.8946187496185303 + }, + { + "auxiliary_loss_clip": 0.01487115, + "auxiliary_loss_mlp": 0.0105775, + "balance_loss_clip": 1.30540025, + "balance_loss_mlp": 1.03619695, + "epoch": 0.1518112129866226, + "flos": 18411271746120.0, + "grad_norm": 1.688517999700599, + "language_loss": 0.74993789, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.77538645, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.21569824, + "step": 2525, + "time_per_iteration": 2.743528127670288 + }, + { + "auxiliary_loss_clip": 0.01487951, + "auxiliary_loss_mlp": 0.01050433, + "balance_loss_clip": 1.30409777, + "balance_loss_mlp": 1.02715182, + "epoch": 0.15187133623929056, + "flos": 19429903675080.0, + "grad_norm": 1.7566458818258788, + "language_loss": 0.80583668, + "learning_rate": 3.846208999506402e-06, + "loss": 0.83122051, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.23278809, + "step": 2526, + "time_per_iteration": 2.8061141967773438 + }, + { + "auxiliary_loss_clip": 0.01481173, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.30412984, + "balance_loss_mlp": 1.03048921, + "epoch": 0.15193145949195852, + "flos": 17570663178120.0, + "grad_norm": 1.837217987442799, + "language_loss": 0.85591781, + "learning_rate": 3.846059197327466e-06, + "loss": 0.88124222, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.20788574, + "step": 2527, + "time_per_iteration": 2.837291955947876 + }, + { + "auxiliary_loss_clip": 0.01485424, + "auxiliary_loss_mlp": 0.01049515, + "balance_loss_clip": 1.3061024, + "balance_loss_mlp": 1.02880859, + "epoch": 0.15199158274462649, + "flos": 36183453544440.0, + "grad_norm": 1.6171806421545412, + "language_loss": 0.69249308, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71784246, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.20727539, + "step": 2528, + "time_per_iteration": 2.9219391345977783 + }, + { + "auxiliary_loss_clip": 0.01476052, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.29932642, + "balance_loss_mlp": 1.03513169, + "epoch": 0.15205170599729445, + "flos": 23079018944160.0, + "grad_norm": 1.6566217750913452, + "language_loss": 0.87053502, + "learning_rate": 3.845759382967026e-06, + "loss": 0.89586037, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.21362305, + "step": 2529, + "time_per_iteration": 2.7912518978118896 + }, + { + "auxiliary_loss_clip": 0.01481419, + "auxiliary_loss_mlp": 0.01043435, + "balance_loss_clip": 1.30459094, + "balance_loss_mlp": 1.02219224, + "epoch": 0.15211182924996242, + "flos": 21913452676800.0, + "grad_norm": 1.8335943225379245, + "language_loss": 0.83808231, + "learning_rate": 3.845609370796893e-06, + "loss": 0.8633309, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21252441, + "step": 2530, + "time_per_iteration": 4.151379585266113 + }, + { + "auxiliary_loss_clip": 0.01477666, + "auxiliary_loss_mlp": 0.01049199, + "balance_loss_clip": 1.29859638, + "balance_loss_mlp": 1.02845669, + "epoch": 0.15217195250263038, + "flos": 13885545274920.0, + "grad_norm": 1.9656837162427048, + "language_loss": 0.80587637, + "learning_rate": 3.845459288641066e-06, + "loss": 0.83114499, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.20751953, + "step": 2531, + "time_per_iteration": 2.768916606903076 + }, + { + "auxiliary_loss_clip": 0.01476446, + "auxiliary_loss_mlp": 0.01053054, + "balance_loss_clip": 1.29860032, + "balance_loss_mlp": 1.03258586, + "epoch": 0.15223207575529837, + "flos": 24540727956480.0, + "grad_norm": 1.8308070195595385, + "language_loss": 0.79490972, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.82020473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.20483398, + "step": 2532, + "time_per_iteration": 2.7965314388275146 + }, + { + "auxiliary_loss_clip": 0.01477537, + "auxiliary_loss_mlp": 0.01051322, + "balance_loss_clip": 1.30227327, + "balance_loss_mlp": 1.02922046, + "epoch": 0.15229219900796634, + "flos": 25562324295720.0, + "grad_norm": 2.003241613789295, + "language_loss": 0.8787111, + "learning_rate": 3.845158914395105e-06, + "loss": 0.90399969, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22094727, + "step": 2533, + "time_per_iteration": 2.7744483947753906 + }, + { + "auxiliary_loss_clip": 0.01473266, + "auxiliary_loss_mlp": 0.01062915, + "balance_loss_clip": 1.29595673, + "balance_loss_mlp": 1.0412662, + "epoch": 0.1523523222606343, + "flos": 18221675002560.0, + "grad_norm": 2.2076971116545816, + "language_loss": 0.79744136, + "learning_rate": 3.84500862231636e-06, + "loss": 0.82280326, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.21630859, + "step": 2534, + "time_per_iteration": 2.8002634048461914 + }, + { + "auxiliary_loss_clip": 0.01487528, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.30408955, + "balance_loss_mlp": 1.02182746, + "epoch": 0.15241244551330227, + "flos": 13263876313560.0, + "grad_norm": 4.605632058314776, + "language_loss": 0.77175677, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79707456, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22424316, + "step": 2535, + "time_per_iteration": 2.7196505069732666 + }, + { + "auxiliary_loss_clip": 0.01483533, + "auxiliary_loss_mlp": 0.01055443, + "balance_loss_clip": 1.29933512, + "balance_loss_mlp": 1.0339973, + "epoch": 0.15247256876597023, + "flos": 19719873949320.0, + "grad_norm": 1.8338657531184026, + "language_loss": 0.79013062, + "learning_rate": 3.844707828275835e-06, + "loss": 0.81552041, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.2142334, + "step": 2536, + "time_per_iteration": 4.235260963439941 + }, + { + "auxiliary_loss_clip": 0.01475131, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_clip": 1.30033529, + "balance_loss_mlp": 1.03235126, + "epoch": 0.1525326920186382, + "flos": 20380712996880.0, + "grad_norm": 1.9865182262481693, + "language_loss": 0.75486672, + "learning_rate": 3.844557326325461e-06, + "loss": 0.78014028, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.19873047, + "step": 2537, + "time_per_iteration": 2.7753586769104004 + }, + { + "auxiliary_loss_clip": 0.014806, + "auxiliary_loss_mlp": 0.01046034, + "balance_loss_clip": 1.30061483, + "balance_loss_mlp": 1.02461231, + "epoch": 0.15259281527130616, + "flos": 13593991274640.0, + "grad_norm": 1.9995684785655246, + "language_loss": 0.777156, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.80242234, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.21411133, + "step": 2538, + "time_per_iteration": 5.87609338760376 + }, + { + "auxiliary_loss_clip": 0.01474419, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.29840231, + "balance_loss_mlp": 1.02453494, + "epoch": 0.15265293852397416, + "flos": 22866008158080.0, + "grad_norm": 1.4462701775369897, + "language_loss": 0.90001345, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92520422, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20117188, + "step": 2539, + "time_per_iteration": 2.794264554977417 + }, + { + "auxiliary_loss_clip": 0.0148534, + "auxiliary_loss_mlp": 0.01049127, + "balance_loss_clip": 1.30746913, + "balance_loss_mlp": 1.02665627, + "epoch": 0.15271306177664212, + "flos": 29243625013080.0, + "grad_norm": 2.890221776887019, + "language_loss": 0.93560141, + "learning_rate": 3.844105400822391e-06, + "loss": 0.96094608, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.22460938, + "step": 2540, + "time_per_iteration": 2.87432861328125 + }, + { + "auxiliary_loss_clip": 0.01468873, + "auxiliary_loss_mlp": 0.01059647, + "balance_loss_clip": 1.2950232, + "balance_loss_mlp": 1.03973866, + "epoch": 0.1527731850293101, + "flos": 31252236350040.0, + "grad_norm": 1.7095363429247796, + "language_loss": 0.75516725, + "learning_rate": 3.843954619123092e-06, + "loss": 0.78045243, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.19909668, + "step": 2541, + "time_per_iteration": 2.8505191802978516 + }, + { + "auxiliary_loss_clip": 0.01466059, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.2909832, + "balance_loss_mlp": 1.03673887, + "epoch": 0.15283330828197805, + "flos": 22387131482400.0, + "grad_norm": 1.6619281110326252, + "language_loss": 0.81335163, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83858633, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20678711, + "step": 2542, + "time_per_iteration": 2.7946183681488037 + }, + { + "auxiliary_loss_clip": 0.01477862, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_clip": 1.2992928, + "balance_loss_mlp": 1.0283215, + "epoch": 0.15289343153464602, + "flos": 25525631319480.0, + "grad_norm": 2.126047813125142, + "language_loss": 0.78257954, + "learning_rate": 3.843652845961383e-06, + "loss": 0.80785024, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.2088623, + "step": 2543, + "time_per_iteration": 2.8389768600463867 + }, + { + "auxiliary_loss_clip": 0.01465354, + "auxiliary_loss_mlp": 0.01054335, + "balance_loss_clip": 1.28980827, + "balance_loss_mlp": 1.02912211, + "epoch": 0.15295355478731398, + "flos": 22715053592040.0, + "grad_norm": 2.0514830003987226, + "language_loss": 0.8731274, + "learning_rate": 3.843501854510416e-06, + "loss": 0.89832425, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.2520752, + "step": 2544, + "time_per_iteration": 2.77374267578125 + }, + { + "auxiliary_loss_clip": 0.01483679, + "auxiliary_loss_mlp": 0.01057725, + "balance_loss_clip": 1.30194473, + "balance_loss_mlp": 1.03558779, + "epoch": 0.15301367803998198, + "flos": 23256311354640.0, + "grad_norm": 2.1104981243475924, + "language_loss": 0.83169842, + "learning_rate": 3.843350793153673e-06, + "loss": 0.85711241, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.22131348, + "step": 2545, + "time_per_iteration": 2.7717387676239014 + }, + { + "auxiliary_loss_clip": 0.01476286, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_clip": 1.30072188, + "balance_loss_mlp": 1.03018713, + "epoch": 0.15307380129264994, + "flos": 25891992564840.0, + "grad_norm": 2.3345622890043094, + "language_loss": 0.71669465, + "learning_rate": 3.843199661896884e-06, + "loss": 0.74196267, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20324707, + "step": 2546, + "time_per_iteration": 2.7878451347351074 + }, + { + "auxiliary_loss_clip": 0.01473345, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.29483414, + "balance_loss_mlp": 1.02810955, + "epoch": 0.1531339245453179, + "flos": 46981469120040.0, + "grad_norm": 1.5004057283967438, + "language_loss": 0.77942133, + "learning_rate": 3.843048460745779e-06, + "loss": 0.80464673, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.2109375, + "step": 2547, + "time_per_iteration": 2.9912941455841064 + }, + { + "auxiliary_loss_clip": 0.01480676, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_clip": 1.30211234, + "balance_loss_mlp": 1.03493357, + "epoch": 0.15319404779798587, + "flos": 35888894525520.0, + "grad_norm": 2.268964981293361, + "language_loss": 0.74795842, + "learning_rate": 3.842897189706092e-06, + "loss": 0.7733199, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20544434, + "step": 2548, + "time_per_iteration": 2.906541347503662 + }, + { + "auxiliary_loss_clip": 0.01472542, + "auxiliary_loss_mlp": 0.01049951, + "balance_loss_clip": 1.29606247, + "balance_loss_mlp": 1.0302099, + "epoch": 0.15325417105065384, + "flos": 25669966722840.0, + "grad_norm": 1.4333485337247178, + "language_loss": 0.80859518, + "learning_rate": 3.842745848783558e-06, + "loss": 0.8338201, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.19750977, + "step": 2549, + "time_per_iteration": 2.851881742477417 + }, + { + "auxiliary_loss_clip": 0.01476863, + "auxiliary_loss_mlp": 0.01052551, + "balance_loss_clip": 1.29868531, + "balance_loss_mlp": 1.03294122, + "epoch": 0.1533142943033218, + "flos": 18775521356760.0, + "grad_norm": 1.634540569329432, + "language_loss": 0.75055641, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77585059, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.19628906, + "step": 2550, + "time_per_iteration": 2.77358341217041 + }, + { + "auxiliary_loss_clip": 0.01479007, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.29986382, + "balance_loss_mlp": 1.01823962, + "epoch": 0.15337441755598977, + "flos": 23111935342920.0, + "grad_norm": 2.1378006831852687, + "language_loss": 0.77882791, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.80401462, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.2142334, + "step": 2551, + "time_per_iteration": 2.856560707092285 + }, + { + "auxiliary_loss_clip": 0.01328479, + "auxiliary_loss_mlp": 0.01011669, + "balance_loss_clip": 1.22452235, + "balance_loss_mlp": 1.00678146, + "epoch": 0.15343454080865776, + "flos": 59876848926360.0, + "grad_norm": 1.0604993705968477, + "language_loss": 0.5670352, + "learning_rate": 3.842291406776283e-06, + "loss": 0.5904367, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.04882812, + "step": 2552, + "time_per_iteration": 3.1946141719818115 + }, + { + "auxiliary_loss_clip": 0.01479487, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.29950559, + "balance_loss_mlp": 1.03199303, + "epoch": 0.15349466406132573, + "flos": 11914601514840.0, + "grad_norm": 1.9603208994202712, + "language_loss": 0.88943839, + "learning_rate": 3.84213978637978e-06, + "loss": 0.91475582, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.20263672, + "step": 2553, + "time_per_iteration": 2.7480528354644775 + }, + { + "auxiliary_loss_clip": 0.01484578, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.30312181, + "balance_loss_mlp": 1.03558755, + "epoch": 0.1535547873139937, + "flos": 24102199009440.0, + "grad_norm": 1.7267580817050823, + "language_loss": 0.78235936, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80777085, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.2097168, + "step": 2554, + "time_per_iteration": 2.867110013961792 + }, + { + "auxiliary_loss_clip": 0.01481135, + "auxiliary_loss_mlp": 0.01058737, + "balance_loss_clip": 1.29994297, + "balance_loss_mlp": 1.03736234, + "epoch": 0.15361491056666166, + "flos": 17570947436640.0, + "grad_norm": 2.2972811359822805, + "language_loss": 0.78581768, + "learning_rate": 3.841836336030151e-06, + "loss": 0.81121641, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.21398926, + "step": 2555, + "time_per_iteration": 2.8031187057495117 + }, + { + "auxiliary_loss_clip": 0.01467037, + "auxiliary_loss_mlp": 0.01050728, + "balance_loss_clip": 1.29193068, + "balance_loss_mlp": 1.03113043, + "epoch": 0.15367503381932962, + "flos": 25051505821920.0, + "grad_norm": 1.4991710989541758, + "language_loss": 0.77256519, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79774284, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.19592285, + "step": 2556, + "time_per_iteration": 2.8405277729034424 + }, + { + "auxiliary_loss_clip": 0.01471717, + "auxiliary_loss_mlp": 0.01054138, + "balance_loss_clip": 1.2995398, + "balance_loss_mlp": 1.03337193, + "epoch": 0.15373515707199759, + "flos": 21512388264840.0, + "grad_norm": 1.6321814357398456, + "language_loss": 0.9065758, + "learning_rate": 3.84153260631005e-06, + "loss": 0.93183434, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.2076416, + "step": 2557, + "time_per_iteration": 2.8233375549316406 + }, + { + "auxiliary_loss_clip": 0.01477931, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.2998507, + "balance_loss_mlp": 1.03057981, + "epoch": 0.15379528032466555, + "flos": 26000325334080.0, + "grad_norm": 1.7976921941134443, + "language_loss": 0.70556957, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73086405, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.20935059, + "step": 2558, + "time_per_iteration": 2.808527946472168 + }, + { + "auxiliary_loss_clip": 0.01479918, + "auxiliary_loss_mlp": 0.01057848, + "balance_loss_clip": 1.3028779, + "balance_loss_mlp": 1.03736806, + "epoch": 0.15385540357733354, + "flos": 19281507435720.0, + "grad_norm": 2.177125364603849, + "language_loss": 0.92486346, + "learning_rate": 3.841228597265548e-06, + "loss": 0.95024109, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.20483398, + "step": 2559, + "time_per_iteration": 2.7987170219421387 + }, + { + "auxiliary_loss_clip": 0.01486658, + "auxiliary_loss_mlp": 0.01067924, + "balance_loss_clip": 1.3077054, + "balance_loss_mlp": 1.04519045, + "epoch": 0.1539155268300015, + "flos": 28555108045200.0, + "grad_norm": 3.7319215195507414, + "language_loss": 0.64468223, + "learning_rate": 3.841076488011055e-06, + "loss": 0.670228, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.22753906, + "step": 2560, + "time_per_iteration": 2.886981725692749 + }, + { + "auxiliary_loss_clip": 0.01492184, + "auxiliary_loss_mlp": 0.01057603, + "balance_loss_clip": 1.31016648, + "balance_loss_mlp": 1.03627658, + "epoch": 0.15397565008266947, + "flos": 23552860183200.0, + "grad_norm": 1.6629188327784838, + "language_loss": 0.887178, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.91267586, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.21325684, + "step": 2561, + "time_per_iteration": 2.848728656768799 + }, + { + "auxiliary_loss_clip": 0.01472919, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.30224943, + "balance_loss_mlp": 1.0318079, + "epoch": 0.15403577333533744, + "flos": 17134408299240.0, + "grad_norm": 1.6886639401617949, + "language_loss": 0.83460188, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85985321, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20410156, + "step": 2562, + "time_per_iteration": 2.8522281646728516 + }, + { + "auxiliary_loss_clip": 0.01499232, + "auxiliary_loss_mlp": 0.01054003, + "balance_loss_clip": 1.31365371, + "balance_loss_mlp": 1.02880263, + "epoch": 0.1540958965880054, + "flos": 17898991371360.0, + "grad_norm": 1.766802635519684, + "language_loss": 0.7508806, + "learning_rate": 3.840619741387832e-06, + "loss": 0.77641296, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.2520752, + "step": 2563, + "time_per_iteration": 2.768414258956909 + }, + { + "auxiliary_loss_clip": 0.01489619, + "auxiliary_loss_mlp": 0.0105466, + "balance_loss_clip": 1.30752945, + "balance_loss_mlp": 1.03229666, + "epoch": 0.15415601984067337, + "flos": 32167489729680.0, + "grad_norm": 1.9785126440215783, + "language_loss": 0.76563871, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.79108149, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22363281, + "step": 2564, + "time_per_iteration": 2.8585493564605713 + }, + { + "auxiliary_loss_clip": 0.0148224, + "auxiliary_loss_mlp": 0.01058593, + "balance_loss_clip": 1.30594087, + "balance_loss_mlp": 1.03773165, + "epoch": 0.15421614309334136, + "flos": 24029747049240.0, + "grad_norm": 2.1655361528972072, + "language_loss": 0.71374881, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73915714, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.20849609, + "step": 2565, + "time_per_iteration": 2.764289140701294 + }, + { + "auxiliary_loss_clip": 0.01492316, + "auxiliary_loss_mlp": 0.01055916, + "balance_loss_clip": 1.31490123, + "balance_loss_mlp": 1.03350496, + "epoch": 0.15427626634600933, + "flos": 24391275899760.0, + "grad_norm": 3.185837227139872, + "language_loss": 0.72349435, + "learning_rate": 3.840162366596259e-06, + "loss": 0.74897671, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.22424316, + "step": 2566, + "time_per_iteration": 2.766263246536255 + }, + { + "auxiliary_loss_clip": 0.01479588, + "auxiliary_loss_mlp": 0.01044906, + "balance_loss_clip": 1.30481887, + "balance_loss_mlp": 1.02403235, + "epoch": 0.1543363895986773, + "flos": 23336803770120.0, + "grad_norm": 1.6027726920382872, + "language_loss": 0.85410833, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87935328, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.20874023, + "step": 2567, + "time_per_iteration": 2.759859561920166 + }, + { + "auxiliary_loss_clip": 0.01481435, + "auxiliary_loss_mlp": 0.01052553, + "balance_loss_clip": 1.30633283, + "balance_loss_mlp": 1.03240633, + "epoch": 0.15439651285134526, + "flos": 24278922902880.0, + "grad_norm": 1.9051246826888206, + "language_loss": 0.78788352, + "learning_rate": 3.839857101163202e-06, + "loss": 0.81322336, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20141602, + "step": 2568, + "time_per_iteration": 2.988314628601074 + }, + { + "auxiliary_loss_clip": 0.01493167, + "auxiliary_loss_mlp": 0.01045794, + "balance_loss_clip": 1.31757867, + "balance_loss_mlp": 1.02220225, + "epoch": 0.15445663610401322, + "flos": 22461370210440.0, + "grad_norm": 1.7024213111258477, + "language_loss": 0.70404744, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72943705, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.23583984, + "step": 2569, + "time_per_iteration": 4.271164894104004 + }, + { + "auxiliary_loss_clip": 0.01479336, + "auxiliary_loss_mlp": 0.0104283, + "balance_loss_clip": 1.30756235, + "balance_loss_mlp": 1.02319598, + "epoch": 0.1545167593566812, + "flos": 22053970894320.0, + "grad_norm": 1.6767418178938776, + "language_loss": 0.77079248, + "learning_rate": 3.839551556659884e-06, + "loss": 0.79601407, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.19641113, + "step": 2570, + "time_per_iteration": 2.713602304458618 + }, + { + "auxiliary_loss_clip": 0.01487064, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.31353688, + "balance_loss_mlp": 1.02345788, + "epoch": 0.15457688260934915, + "flos": 19323032806800.0, + "grad_norm": 2.280269555600424, + "language_loss": 0.78219599, + "learning_rate": 3.839398679771359e-06, + "loss": 0.80750847, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.20727539, + "step": 2571, + "time_per_iteration": 2.8433938026428223 + }, + { + "auxiliary_loss_clip": 0.01486883, + "auxiliary_loss_mlp": 0.01050091, + "balance_loss_clip": 1.3120743, + "balance_loss_mlp": 1.02933693, + "epoch": 0.15463700586201715, + "flos": 24139338677640.0, + "grad_norm": 1.6999832432966953, + "language_loss": 0.82602561, + "learning_rate": 3.839245733132652e-06, + "loss": 0.85139537, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.20751953, + "step": 2572, + "time_per_iteration": 2.786062717437744 + }, + { + "auxiliary_loss_clip": 0.0148876, + "auxiliary_loss_mlp": 0.01049629, + "balance_loss_clip": 1.31186557, + "balance_loss_mlp": 1.0294826, + "epoch": 0.1546971291146851, + "flos": 22426301568600.0, + "grad_norm": 1.6672502756184404, + "language_loss": 0.90957177, + "learning_rate": 3.839092716749563e-06, + "loss": 0.93495566, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.20141602, + "step": 2573, + "time_per_iteration": 2.756765127182007 + }, + { + "auxiliary_loss_clip": 0.01483221, + "auxiliary_loss_mlp": 0.01048045, + "balance_loss_clip": 1.30564988, + "balance_loss_mlp": 1.02771997, + "epoch": 0.15475725236735308, + "flos": 17534741760720.0, + "grad_norm": 1.5372713071994646, + "language_loss": 0.70607257, + "learning_rate": 3.838939630627893e-06, + "loss": 0.73138523, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20300293, + "step": 2574, + "time_per_iteration": 2.699755907058716 + }, + { + "auxiliary_loss_clip": 0.01484463, + "auxiliary_loss_mlp": 0.01045966, + "balance_loss_clip": 1.30694509, + "balance_loss_mlp": 1.02466369, + "epoch": 0.15481737562002104, + "flos": 22566454310880.0, + "grad_norm": 1.5663264901372573, + "language_loss": 0.82925653, + "learning_rate": 3.838786474773448e-06, + "loss": 0.85456079, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21289062, + "step": 2575, + "time_per_iteration": 4.184113025665283 + }, + { + "auxiliary_loss_clip": 0.01480776, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_clip": 1.30541348, + "balance_loss_mlp": 1.02965784, + "epoch": 0.154877498872689, + "flos": 24906073992840.0, + "grad_norm": 1.6518172370048256, + "language_loss": 0.85008198, + "learning_rate": 3.838633249192036e-06, + "loss": 0.875386, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.19970703, + "step": 2576, + "time_per_iteration": 4.328003644943237 + }, + { + "auxiliary_loss_clip": 0.01479695, + "auxiliary_loss_mlp": 0.01047653, + "balance_loss_clip": 1.30454063, + "balance_loss_mlp": 1.02698219, + "epoch": 0.15493762212535697, + "flos": 28153393899480.0, + "grad_norm": 1.6458618987897695, + "language_loss": 0.82219779, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84747124, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20678711, + "step": 2577, + "time_per_iteration": 4.252169132232666 + }, + { + "auxiliary_loss_clip": 0.01485026, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_clip": 1.31003833, + "balance_loss_mlp": 1.0294714, + "epoch": 0.15499774537802496, + "flos": 25416608208120.0, + "grad_norm": 2.8911205028078997, + "language_loss": 0.76408875, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78942633, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.19274902, + "step": 2578, + "time_per_iteration": 2.87711238861084 + }, + { + "auxiliary_loss_clip": 0.01482087, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.30605805, + "balance_loss_mlp": 1.02745843, + "epoch": 0.15505786863069293, + "flos": 22096755124560.0, + "grad_norm": 2.62996602014219, + "language_loss": 0.82982993, + "learning_rate": 3.83817315414411e-06, + "loss": 0.85512578, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.20031738, + "step": 2579, + "time_per_iteration": 2.7754757404327393 + }, + { + "auxiliary_loss_clip": 0.01482396, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.30714154, + "balance_loss_mlp": 1.0270139, + "epoch": 0.1551179918833609, + "flos": 18921887178120.0, + "grad_norm": 1.5551862831864112, + "language_loss": 0.80998093, + "learning_rate": 3.838019649712958e-06, + "loss": 0.83527219, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.19726562, + "step": 2580, + "time_per_iteration": 2.7390248775482178 + }, + { + "auxiliary_loss_clip": 0.01367141, + "auxiliary_loss_mlp": 0.0101861, + "balance_loss_clip": 1.26770663, + "balance_loss_mlp": 1.01489115, + "epoch": 0.15517811513602886, + "flos": 66254871864960.0, + "grad_norm": 0.8389226230677744, + "language_loss": 0.58878386, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.61264133, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.03710938, + "step": 2581, + "time_per_iteration": 3.3768186569213867 + }, + { + "auxiliary_loss_clip": 0.01480455, + "auxiliary_loss_mlp": 0.0105162, + "balance_loss_clip": 1.30283415, + "balance_loss_mlp": 1.03012705, + "epoch": 0.15523823838869683, + "flos": 24025970471760.0, + "grad_norm": 1.817867797745129, + "language_loss": 0.85130608, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87662685, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21508789, + "step": 2582, + "time_per_iteration": 2.8742494583129883 + }, + { + "auxiliary_loss_clip": 0.01485662, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_clip": 1.30929947, + "balance_loss_mlp": 1.04562569, + "epoch": 0.1552983616413648, + "flos": 20489898541680.0, + "grad_norm": 2.4199170675293233, + "language_loss": 0.79389286, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81941354, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.20776367, + "step": 2583, + "time_per_iteration": 2.767221212387085 + }, + { + "auxiliary_loss_clip": 0.01483676, + "auxiliary_loss_mlp": 0.01052306, + "balance_loss_clip": 1.3074348, + "balance_loss_mlp": 1.03143287, + "epoch": 0.15535848489403276, + "flos": 32130025194600.0, + "grad_norm": 1.7241470727849166, + "language_loss": 0.76810044, + "learning_rate": 3.837404935067705e-06, + "loss": 0.79346025, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.2088623, + "step": 2584, + "time_per_iteration": 2.8884379863739014 + }, + { + "auxiliary_loss_clip": 0.01488144, + "auxiliary_loss_mlp": 0.01050807, + "balance_loss_clip": 1.31137037, + "balance_loss_mlp": 1.03172147, + "epoch": 0.15541860814670075, + "flos": 19103077991160.0, + "grad_norm": 1.7978197846355868, + "language_loss": 0.7611382, + "learning_rate": 3.837251082205368e-06, + "loss": 0.78652775, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.1907959, + "step": 2585, + "time_per_iteration": 2.751105546951294 + }, + { + "auxiliary_loss_clip": 0.01477883, + "auxiliary_loss_mlp": 0.01052079, + "balance_loss_clip": 1.30591488, + "balance_loss_mlp": 1.03221917, + "epoch": 0.1554787313993687, + "flos": 19176829418880.0, + "grad_norm": 2.4359533042977404, + "language_loss": 0.61937016, + "learning_rate": 3.837097159674286e-06, + "loss": 0.64466977, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.1986084, + "step": 2586, + "time_per_iteration": 2.711618661880493 + }, + { + "auxiliary_loss_clip": 0.01479578, + "auxiliary_loss_mlp": 0.01054936, + "balance_loss_clip": 1.30257344, + "balance_loss_mlp": 1.03512287, + "epoch": 0.15553885465203668, + "flos": 16148164860360.0, + "grad_norm": 1.7400450128504443, + "language_loss": 0.82201666, + "learning_rate": 3.836943167480296e-06, + "loss": 0.84736174, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.19812012, + "step": 2587, + "time_per_iteration": 2.740556478500366 + }, + { + "auxiliary_loss_clip": 0.01489034, + "auxiliary_loss_mlp": 0.01061046, + "balance_loss_clip": 1.30958271, + "balance_loss_mlp": 1.0380981, + "epoch": 0.15559897790470464, + "flos": 25343262864000.0, + "grad_norm": 2.1254447131550895, + "language_loss": 0.88919282, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91469359, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22937012, + "step": 2588, + "time_per_iteration": 2.7692809104919434 + }, + { + "auxiliary_loss_clip": 0.01481736, + "auxiliary_loss_mlp": 0.01055337, + "balance_loss_clip": 1.30855119, + "balance_loss_mlp": 1.0351193, + "epoch": 0.1556591011573726, + "flos": 23153988622680.0, + "grad_norm": 2.3638719255999963, + "language_loss": 0.64976341, + "learning_rate": 3.83663497412695e-06, + "loss": 0.67513406, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20202637, + "step": 2589, + "time_per_iteration": 2.7565457820892334 + }, + { + "auxiliary_loss_clip": 0.0147998, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.30601788, + "balance_loss_mlp": 1.02583027, + "epoch": 0.15571922441004057, + "flos": 25375935612600.0, + "grad_norm": 1.8009557568528747, + "language_loss": 0.82775521, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85303164, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.21813965, + "step": 2590, + "time_per_iteration": 2.7762229442596436 + }, + { + "auxiliary_loss_clip": 0.01489918, + "auxiliary_loss_mlp": 0.01052976, + "balance_loss_clip": 1.3124342, + "balance_loss_mlp": 1.03331852, + "epoch": 0.15577934766270854, + "flos": 14505427468440.0, + "grad_norm": 2.5984459330304728, + "language_loss": 0.799896, + "learning_rate": 3.836326502192077e-06, + "loss": 0.82532489, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.19641113, + "step": 2591, + "time_per_iteration": 2.8099300861358643 + }, + { + "auxiliary_loss_clip": 0.01478215, + "auxiliary_loss_mlp": 0.01050333, + "balance_loss_clip": 1.30554593, + "balance_loss_mlp": 1.03215384, + "epoch": 0.15583947091537653, + "flos": 37421593597080.0, + "grad_norm": 1.944919435786965, + "language_loss": 0.65424746, + "learning_rate": 3.836172161771189e-06, + "loss": 0.67953295, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.18164062, + "step": 2592, + "time_per_iteration": 2.9403483867645264 + }, + { + "auxiliary_loss_clip": 0.01493328, + "auxiliary_loss_mlp": 0.01050326, + "balance_loss_clip": 1.31609058, + "balance_loss_mlp": 1.03017974, + "epoch": 0.1558995941680445, + "flos": 21839701249080.0, + "grad_norm": 2.2253328594049515, + "language_loss": 0.82993823, + "learning_rate": 3.836017751722467e-06, + "loss": 0.85537481, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.20153809, + "step": 2593, + "time_per_iteration": 2.838003635406494 + }, + { + "auxiliary_loss_clip": 0.01482921, + "auxiliary_loss_mlp": 0.01051427, + "balance_loss_clip": 1.31214666, + "balance_loss_mlp": 1.03188872, + "epoch": 0.15595971742071246, + "flos": 19797483171240.0, + "grad_norm": 1.8711419753479785, + "language_loss": 0.73340249, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75874597, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.19519043, + "step": 2594, + "time_per_iteration": 2.946049213409424 + }, + { + "auxiliary_loss_clip": 0.01478636, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_clip": 1.30965376, + "balance_loss_mlp": 1.02468812, + "epoch": 0.15601984067338043, + "flos": 26727525087840.0, + "grad_norm": 2.8474902275078655, + "language_loss": 0.82042062, + "learning_rate": 3.835708722764952e-06, + "loss": 0.84563988, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.18603516, + "step": 2595, + "time_per_iteration": 2.7992351055145264 + }, + { + "auxiliary_loss_clip": 0.01485045, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.30945146, + "balance_loss_mlp": 1.02699041, + "epoch": 0.1560799639260484, + "flos": 18373888427760.0, + "grad_norm": 1.874713476128768, + "language_loss": 0.87083614, + "learning_rate": 3.835554103867876e-06, + "loss": 0.89615613, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.1998291, + "step": 2596, + "time_per_iteration": 2.79967999458313 + }, + { + "auxiliary_loss_clip": 0.01483632, + "auxiliary_loss_mlp": 0.01044239, + "balance_loss_clip": 1.3134681, + "balance_loss_mlp": 1.02410436, + "epoch": 0.15614008717871636, + "flos": 22603797020880.0, + "grad_norm": 1.5746672680056528, + "language_loss": 0.68966204, + "learning_rate": 3.835399415366404e-06, + "loss": 0.71494073, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.20129395, + "step": 2597, + "time_per_iteration": 2.7538466453552246 + }, + { + "auxiliary_loss_clip": 0.01480218, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.31176734, + "balance_loss_mlp": 1.02534223, + "epoch": 0.15620021043138435, + "flos": 22751949610080.0, + "grad_norm": 1.5797757704062614, + "language_loss": 0.79950017, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82473516, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.17932129, + "step": 2598, + "time_per_iteration": 2.8432211875915527 + }, + { + "auxiliary_loss_clip": 0.01481707, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.31203043, + "balance_loss_mlp": 1.02151942, + "epoch": 0.15626033368405232, + "flos": 13118809959720.0, + "grad_norm": 1.717548646237255, + "language_loss": 0.82776386, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85298699, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19091797, + "step": 2599, + "time_per_iteration": 2.725836753845215 + }, + { + "auxiliary_loss_clip": 0.01499165, + "auxiliary_loss_mlp": 0.0104598, + "balance_loss_clip": 1.3192749, + "balance_loss_mlp": 1.02415276, + "epoch": 0.15632045693672028, + "flos": 16476696095400.0, + "grad_norm": 2.0525146641393253, + "language_loss": 0.82352757, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84897906, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.21826172, + "step": 2600, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.01489332, + "auxiliary_loss_mlp": 0.01049507, + "balance_loss_clip": 1.31447577, + "balance_loss_mlp": 1.02841866, + "epoch": 0.15638058018938825, + "flos": 20855285186400.0, + "grad_norm": 1.8227901860250724, + "language_loss": 0.89128602, + "learning_rate": 3.834779965433917e-06, + "loss": 0.91667449, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.21081543, + "step": 2601, + "time_per_iteration": 2.804795265197754 + }, + { + "auxiliary_loss_clip": 0.01492487, + "auxiliary_loss_mlp": 0.01061948, + "balance_loss_clip": 1.31655645, + "balance_loss_mlp": 1.03916717, + "epoch": 0.1564407034420562, + "flos": 21877328217600.0, + "grad_norm": 1.870804400308706, + "language_loss": 0.78829896, + "learning_rate": 3.834624928998508e-06, + "loss": 0.81384331, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.2277832, + "step": 2602, + "time_per_iteration": 2.7510464191436768 + }, + { + "auxiliary_loss_clip": 0.01489867, + "auxiliary_loss_mlp": 0.01043931, + "balance_loss_clip": 1.31510437, + "balance_loss_mlp": 1.02279496, + "epoch": 0.15650082669472418, + "flos": 21839498207280.0, + "grad_norm": 2.0545277075275328, + "language_loss": 0.74378908, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76912707, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.21154785, + "step": 2603, + "time_per_iteration": 2.829867362976074 + }, + { + "auxiliary_loss_clip": 0.01485277, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.31118214, + "balance_loss_mlp": 1.02401555, + "epoch": 0.15656094994739214, + "flos": 13803509741760.0, + "grad_norm": 3.0060345736319127, + "language_loss": 0.88627827, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.9115811, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.20983887, + "step": 2604, + "time_per_iteration": 2.7357254028320312 + }, + { + "auxiliary_loss_clip": 0.01499489, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.32245207, + "balance_loss_mlp": 1.01808155, + "epoch": 0.15662107320006013, + "flos": 27313962973920.0, + "grad_norm": 1.9591658950064903, + "language_loss": 0.85050213, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87588477, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.20703125, + "step": 2605, + "time_per_iteration": 2.8267955780029297 + }, + { + "auxiliary_loss_clip": 0.01501447, + "auxiliary_loss_mlp": 0.01043783, + "balance_loss_clip": 1.31996989, + "balance_loss_mlp": 1.02176547, + "epoch": 0.1566811964527281, + "flos": 26690385419640.0, + "grad_norm": 2.028686596156568, + "language_loss": 0.73277795, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75823027, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.22009277, + "step": 2606, + "time_per_iteration": 2.9074504375457764 + }, + { + "auxiliary_loss_clip": 0.01498743, + "auxiliary_loss_mlp": 0.01045109, + "balance_loss_clip": 1.32322562, + "balance_loss_mlp": 1.02554703, + "epoch": 0.15674131970539606, + "flos": 16107248614680.0, + "grad_norm": 2.043775235180364, + "language_loss": 0.76975846, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.79519701, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.19580078, + "step": 2607, + "time_per_iteration": 2.7129876613616943 + }, + { + "auxiliary_loss_clip": 0.0149364, + "auxiliary_loss_mlp": 0.01047276, + "balance_loss_clip": 1.32009029, + "balance_loss_mlp": 1.02690291, + "epoch": 0.15680144295806403, + "flos": 19174108658760.0, + "grad_norm": 1.6648073300647803, + "language_loss": 0.82633114, + "learning_rate": 3.833693249639615e-06, + "loss": 0.85174036, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20373535, + "step": 2608, + "time_per_iteration": 2.767001152038574 + }, + { + "auxiliary_loss_clip": 0.01502523, + "auxiliary_loss_mlp": 0.01050694, + "balance_loss_clip": 1.32241654, + "balance_loss_mlp": 1.02725732, + "epoch": 0.156861566210732, + "flos": 20818307951640.0, + "grad_norm": 1.6707923763691, + "language_loss": 0.73318624, + "learning_rate": 3.833537726343684e-06, + "loss": 0.75871849, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.23425293, + "step": 2609, + "time_per_iteration": 4.148451566696167 + }, + { + "auxiliary_loss_clip": 0.01499545, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.32030058, + "balance_loss_mlp": 1.02742267, + "epoch": 0.15692168946339996, + "flos": 20052790887240.0, + "grad_norm": 1.632448543635007, + "language_loss": 0.7241385, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74962461, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.21655273, + "step": 2610, + "time_per_iteration": 2.7578203678131104 + }, + { + "auxiliary_loss_clip": 0.01500099, + "auxiliary_loss_mlp": 0.01048764, + "balance_loss_clip": 1.31983781, + "balance_loss_mlp": 1.02562594, + "epoch": 0.15698181271606793, + "flos": 21403080894960.0, + "grad_norm": 1.7814975083405677, + "language_loss": 0.72711825, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75260693, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.23156738, + "step": 2611, + "time_per_iteration": 2.7528343200683594 + }, + { + "auxiliary_loss_clip": 0.0150012, + "auxiliary_loss_mlp": 0.01051553, + "balance_loss_clip": 1.32366395, + "balance_loss_mlp": 1.0301311, + "epoch": 0.15704193596873592, + "flos": 20850127924680.0, + "grad_norm": 2.006587103089819, + "language_loss": 0.70893633, + "learning_rate": 3.833070739311887e-06, + "loss": 0.73445308, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.2142334, + "step": 2612, + "time_per_iteration": 2.7615320682525635 + }, + { + "auxiliary_loss_clip": 0.015018, + "auxiliary_loss_mlp": 0.01047949, + "balance_loss_clip": 1.32424998, + "balance_loss_mlp": 1.02613366, + "epoch": 0.15710205922140388, + "flos": 21768020847720.0, + "grad_norm": 2.089612251406446, + "language_loss": 0.76485443, + "learning_rate": 3.83291493793963e-06, + "loss": 0.79035187, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.21801758, + "step": 2613, + "time_per_iteration": 2.920292377471924 + }, + { + "auxiliary_loss_clip": 0.01504637, + "auxiliary_loss_mlp": 0.01053014, + "balance_loss_clip": 1.32533467, + "balance_loss_mlp": 1.03074622, + "epoch": 0.15716218247407185, + "flos": 25012985469480.0, + "grad_norm": 1.781861301491829, + "language_loss": 0.66250104, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68807757, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.22277832, + "step": 2614, + "time_per_iteration": 4.235349416732788 + }, + { + "auxiliary_loss_clip": 0.0150697, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.32548165, + "balance_loss_mlp": 1.02853417, + "epoch": 0.1572223057267398, + "flos": 20196395340120.0, + "grad_norm": 2.3598077347899564, + "language_loss": 0.75766969, + "learning_rate": 3.832603126688072e-06, + "loss": 0.78326213, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.23779297, + "step": 2615, + "time_per_iteration": 4.2912304401397705 + }, + { + "auxiliary_loss_clip": 0.01496653, + "auxiliary_loss_mlp": 0.01050732, + "balance_loss_clip": 1.32405984, + "balance_loss_mlp": 1.0307045, + "epoch": 0.15728242897940778, + "flos": 20964430122840.0, + "grad_norm": 1.492620298523876, + "language_loss": 0.73405147, + "learning_rate": 3.832447116820594e-06, + "loss": 0.7595253, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20031738, + "step": 2616, + "time_per_iteration": 4.3132007122039795 + }, + { + "auxiliary_loss_clip": 0.01501457, + "auxiliary_loss_mlp": 0.0105228, + "balance_loss_clip": 1.32390022, + "balance_loss_mlp": 1.03020263, + "epoch": 0.15734255223207574, + "flos": 23043341176920.0, + "grad_norm": 2.0679852540830357, + "language_loss": 0.72693861, + "learning_rate": 3.832291037466539e-06, + "loss": 0.75247598, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.2208252, + "step": 2617, + "time_per_iteration": 2.8521370887756348 + }, + { + "auxiliary_loss_clip": 0.01502467, + "auxiliary_loss_mlp": 0.01048592, + "balance_loss_clip": 1.32703626, + "balance_loss_mlp": 1.02804041, + "epoch": 0.15740267548474374, + "flos": 20555244038880.0, + "grad_norm": 2.0345493941745803, + "language_loss": 0.75254703, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.77805763, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.20556641, + "step": 2618, + "time_per_iteration": 2.731827735900879 + }, + { + "auxiliary_loss_clip": 0.01515105, + "auxiliary_loss_mlp": 0.01056404, + "balance_loss_clip": 1.33155251, + "balance_loss_mlp": 1.03193092, + "epoch": 0.1574627987374117, + "flos": 22671091719360.0, + "grad_norm": 2.6816994672447625, + "language_loss": 0.78940988, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81512499, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.24475098, + "step": 2619, + "time_per_iteration": 2.805521011352539 + }, + { + "auxiliary_loss_clip": 0.014985, + "auxiliary_loss_mlp": 0.01058169, + "balance_loss_clip": 1.32267964, + "balance_loss_mlp": 1.03666377, + "epoch": 0.15752292199007967, + "flos": 16805105505360.0, + "grad_norm": 1.5933134345219904, + "language_loss": 0.77184296, + "learning_rate": 3.831822382544101e-06, + "loss": 0.79740965, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21508789, + "step": 2620, + "time_per_iteration": 2.7013964653015137 + }, + { + "auxiliary_loss_clip": 0.01504176, + "auxiliary_loss_mlp": 0.01057351, + "balance_loss_clip": 1.32394946, + "balance_loss_mlp": 1.03335452, + "epoch": 0.15758304524274763, + "flos": 29832133925520.0, + "grad_norm": 1.5622652508338104, + "language_loss": 0.71328413, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73889941, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.2401123, + "step": 2621, + "time_per_iteration": 2.836916923522949 + }, + { + "auxiliary_loss_clip": 0.01511756, + "auxiliary_loss_mlp": 0.01053345, + "balance_loss_clip": 1.32965505, + "balance_loss_mlp": 1.0290029, + "epoch": 0.1576431684954156, + "flos": 53586553337280.0, + "grad_norm": 1.8455351351945453, + "language_loss": 0.72341448, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74906552, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.2434082, + "step": 2622, + "time_per_iteration": 3.0535337924957275 + }, + { + "auxiliary_loss_clip": 0.01501299, + "auxiliary_loss_mlp": 0.01052267, + "balance_loss_clip": 1.32437587, + "balance_loss_mlp": 1.03220379, + "epoch": 0.15770329174808356, + "flos": 20818510993440.0, + "grad_norm": 1.6076811911749473, + "language_loss": 0.87885827, + "learning_rate": 3.831353102455684e-06, + "loss": 0.90439391, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.20068359, + "step": 2623, + "time_per_iteration": 2.796220064163208 + }, + { + "auxiliary_loss_clip": 0.01498878, + "auxiliary_loss_mlp": 0.01056425, + "balance_loss_clip": 1.32126832, + "balance_loss_mlp": 1.03391838, + "epoch": 0.15776341500075153, + "flos": 24979581770400.0, + "grad_norm": 1.762727255386819, + "language_loss": 0.8192153, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84476829, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.22509766, + "step": 2624, + "time_per_iteration": 2.812695026397705 + }, + { + "auxiliary_loss_clip": 0.0150668, + "auxiliary_loss_mlp": 0.01054704, + "balance_loss_clip": 1.32311785, + "balance_loss_mlp": 1.03131533, + "epoch": 0.15782353825341952, + "flos": 21912924768120.0, + "grad_norm": 2.1364289550410764, + "language_loss": 0.80475879, + "learning_rate": 3.831039901828054e-06, + "loss": 0.83037257, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23388672, + "step": 2625, + "time_per_iteration": 2.73201584815979 + }, + { + "auxiliary_loss_clip": 0.01499406, + "auxiliary_loss_mlp": 0.01048825, + "balance_loss_clip": 1.31965017, + "balance_loss_mlp": 1.02803445, + "epoch": 0.15788366150608749, + "flos": 26182815614640.0, + "grad_norm": 2.1211967842290305, + "language_loss": 0.810157, + "learning_rate": 3.830883197361445e-06, + "loss": 0.83563936, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.2076416, + "step": 2626, + "time_per_iteration": 2.8090600967407227 + }, + { + "auxiliary_loss_clip": 0.01502346, + "auxiliary_loss_mlp": 0.01054019, + "balance_loss_clip": 1.32430506, + "balance_loss_mlp": 1.03030872, + "epoch": 0.15794378475875545, + "flos": 27715677119640.0, + "grad_norm": 1.7649003287270817, + "language_loss": 0.74765503, + "learning_rate": 3.830726423467561e-06, + "loss": 0.77321869, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.23730469, + "step": 2627, + "time_per_iteration": 2.8876090049743652 + }, + { + "auxiliary_loss_clip": 0.01495042, + "auxiliary_loss_mlp": 0.01060753, + "balance_loss_clip": 1.31580067, + "balance_loss_mlp": 1.03770983, + "epoch": 0.15800390801142342, + "flos": 12133947205080.0, + "grad_norm": 1.924703746858333, + "language_loss": 0.85772938, + "learning_rate": 3.830569580152348e-06, + "loss": 0.88328731, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.23046875, + "step": 2628, + "time_per_iteration": 2.729182243347168 + }, + { + "auxiliary_loss_clip": 0.01496301, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.31787109, + "balance_loss_mlp": 1.02596986, + "epoch": 0.15806403126409138, + "flos": 20709690923880.0, + "grad_norm": 2.8331862240126946, + "language_loss": 0.77027583, + "learning_rate": 3.830412667421752e-06, + "loss": 0.79570478, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20617676, + "step": 2629, + "time_per_iteration": 2.7340774536132812 + }, + { + "auxiliary_loss_clip": 0.01499851, + "auxiliary_loss_mlp": 0.01062561, + "balance_loss_clip": 1.32025719, + "balance_loss_mlp": 1.03800416, + "epoch": 0.15812415451675935, + "flos": 17826214544280.0, + "grad_norm": 2.5413843010835913, + "language_loss": 0.74313831, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.76876241, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.24536133, + "step": 2630, + "time_per_iteration": 2.767512559890747 + }, + { + "auxiliary_loss_clip": 0.0150557, + "auxiliary_loss_mlp": 0.010516, + "balance_loss_clip": 1.32120121, + "balance_loss_mlp": 1.02934313, + "epoch": 0.15818427776942734, + "flos": 20088874738080.0, + "grad_norm": 2.1672531793571994, + "language_loss": 0.84039319, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.86596489, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.22277832, + "step": 2631, + "time_per_iteration": 2.739168167114258 + }, + { + "auxiliary_loss_clip": 0.01493658, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_clip": 1.31489158, + "balance_loss_mlp": 1.02745962, + "epoch": 0.1582444010220953, + "flos": 21219859663920.0, + "grad_norm": 1.6301282467966147, + "language_loss": 0.79277551, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.81819916, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21264648, + "step": 2632, + "time_per_iteration": 2.7512800693511963 + }, + { + "auxiliary_loss_clip": 0.01513651, + "auxiliary_loss_mlp": 0.01055978, + "balance_loss_clip": 1.3316009, + "balance_loss_mlp": 1.03317356, + "epoch": 0.15830452427476327, + "flos": 17862866912160.0, + "grad_norm": 1.9007466433165983, + "language_loss": 0.83038831, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85608459, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22827148, + "step": 2633, + "time_per_iteration": 2.750182867050171 + }, + { + "auxiliary_loss_clip": 0.01510148, + "auxiliary_loss_mlp": 0.01052194, + "balance_loss_clip": 1.32729197, + "balance_loss_mlp": 1.02974677, + "epoch": 0.15836464752743123, + "flos": 24540321872880.0, + "grad_norm": 1.8598562830666256, + "language_loss": 0.7763586, + "learning_rate": 3.829627062746394e-06, + "loss": 0.80198205, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.2244873, + "step": 2634, + "time_per_iteration": 2.7967944145202637 + }, + { + "auxiliary_loss_clip": 0.01503174, + "auxiliary_loss_mlp": 0.01051194, + "balance_loss_clip": 1.32071042, + "balance_loss_mlp": 1.02860403, + "epoch": 0.1584247707800992, + "flos": 20125689539400.0, + "grad_norm": 1.9286758750449806, + "language_loss": 0.89008784, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91563153, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.22583008, + "step": 2635, + "time_per_iteration": 2.7936365604400635 + }, + { + "auxiliary_loss_clip": 0.01513449, + "auxiliary_loss_mlp": 0.0106049, + "balance_loss_clip": 1.33039594, + "balance_loss_mlp": 1.03761435, + "epoch": 0.15848489403276717, + "flos": 20380834821960.0, + "grad_norm": 2.24375309756477, + "language_loss": 0.76877284, + "learning_rate": 3.829312335177034e-06, + "loss": 0.79451227, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.22875977, + "step": 2636, + "time_per_iteration": 2.732210636138916 + }, + { + "auxiliary_loss_clip": 0.01510837, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_clip": 1.32816648, + "balance_loss_mlp": 1.02571344, + "epoch": 0.15854501728543513, + "flos": 39354017004720.0, + "grad_norm": 2.439665626040308, + "language_loss": 0.7233777, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74900091, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.25744629, + "step": 2637, + "time_per_iteration": 2.920401096343994 + }, + { + "auxiliary_loss_clip": 0.01506821, + "auxiliary_loss_mlp": 0.01052126, + "balance_loss_clip": 1.32839763, + "balance_loss_mlp": 1.03001237, + "epoch": 0.15860514053810312, + "flos": 24869746491840.0, + "grad_norm": 2.4208711572721584, + "language_loss": 0.78241897, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80800843, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.22106934, + "step": 2638, + "time_per_iteration": 2.75940203666687 + }, + { + "auxiliary_loss_clip": 0.01514077, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.33059692, + "balance_loss_mlp": 1.03079271, + "epoch": 0.1586652637907711, + "flos": 26183384131680.0, + "grad_norm": 2.04484420513764, + "language_loss": 0.76093698, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78662908, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.2434082, + "step": 2639, + "time_per_iteration": 2.981576442718506 + }, + { + "auxiliary_loss_clip": 0.01516952, + "auxiliary_loss_mlp": 0.01055221, + "balance_loss_clip": 1.33406341, + "balance_loss_mlp": 1.0321424, + "epoch": 0.15872538704343905, + "flos": 19796833437480.0, + "grad_norm": 1.8815864702940936, + "language_loss": 0.81638205, + "learning_rate": 3.82868204767362e-06, + "loss": 0.84210378, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23083496, + "step": 2640, + "time_per_iteration": 2.7920892238616943 + }, + { + "auxiliary_loss_clip": 0.01504862, + "auxiliary_loss_mlp": 0.01058817, + "balance_loss_clip": 1.32736015, + "balance_loss_mlp": 1.03574955, + "epoch": 0.15878551029610702, + "flos": 28481234792400.0, + "grad_norm": 1.3361487480375163, + "language_loss": 0.67326301, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69889987, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.23071289, + "step": 2641, + "time_per_iteration": 2.8061749935150146 + }, + { + "auxiliary_loss_clip": 0.01527612, + "auxiliary_loss_mlp": 0.01061075, + "balance_loss_clip": 1.33952856, + "balance_loss_mlp": 1.0342648, + "epoch": 0.15884563354877498, + "flos": 24212034288000.0, + "grad_norm": 1.9920590204248931, + "language_loss": 0.76065701, + "learning_rate": 3.828366487835167e-06, + "loss": 0.78654385, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.26806641, + "step": 2642, + "time_per_iteration": 2.8375940322875977 + }, + { + "auxiliary_loss_clip": 0.0151529, + "auxiliary_loss_mlp": 0.01050274, + "balance_loss_clip": 1.33669233, + "balance_loss_mlp": 1.02659857, + "epoch": 0.15890575680144295, + "flos": 23954899195800.0, + "grad_norm": 2.5059337546649223, + "language_loss": 0.70790267, + "learning_rate": 3.828208603915186e-06, + "loss": 0.7335583, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.23681641, + "step": 2643, + "time_per_iteration": 2.7429614067077637 + }, + { + "auxiliary_loss_clip": 0.01512955, + "auxiliary_loss_mlp": 0.01047164, + "balance_loss_clip": 1.33548903, + "balance_loss_mlp": 1.025635, + "epoch": 0.15896588005411091, + "flos": 21219981489000.0, + "grad_norm": 1.9493756355773606, + "language_loss": 0.78215277, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80775404, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.21533203, + "step": 2644, + "time_per_iteration": 2.759302854537964 + }, + { + "auxiliary_loss_clip": 0.01518292, + "auxiliary_loss_mlp": 0.01052653, + "balance_loss_clip": 1.33915782, + "balance_loss_mlp": 1.02962136, + "epoch": 0.1590260033067789, + "flos": 24357709767240.0, + "grad_norm": 1.6972391710458274, + "language_loss": 0.82281291, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84852237, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.23010254, + "step": 2645, + "time_per_iteration": 2.7753922939300537 + }, + { + "auxiliary_loss_clip": 0.01512969, + "auxiliary_loss_mlp": 0.01066952, + "balance_loss_clip": 1.33200538, + "balance_loss_mlp": 1.04224026, + "epoch": 0.15908612655944687, + "flos": 32055014907720.0, + "grad_norm": 1.9525774811325445, + "language_loss": 0.70235145, + "learning_rate": 3.827734536224087e-06, + "loss": 0.72815073, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.24719238, + "step": 2646, + "time_per_iteration": 2.841758966445923 + }, + { + "auxiliary_loss_clip": 0.01505723, + "auxiliary_loss_mlp": 0.01056531, + "balance_loss_clip": 1.32908106, + "balance_loss_mlp": 1.03373766, + "epoch": 0.15914624981211484, + "flos": 17789887043280.0, + "grad_norm": 2.0180742301631844, + "language_loss": 0.63026428, + "learning_rate": 3.827576375036642e-06, + "loss": 0.65588677, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.22802734, + "step": 2647, + "time_per_iteration": 4.108550786972046 + }, + { + "auxiliary_loss_clip": 0.0150926, + "auxiliary_loss_mlp": 0.01049696, + "balance_loss_clip": 1.33117676, + "balance_loss_mlp": 1.02650976, + "epoch": 0.1592063730647828, + "flos": 17717028999480.0, + "grad_norm": 1.9171441060762169, + "language_loss": 0.8938179, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91940749, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.23193359, + "step": 2648, + "time_per_iteration": 2.7084007263183594 + }, + { + "auxiliary_loss_clip": 0.01511438, + "auxiliary_loss_mlp": 0.01049745, + "balance_loss_clip": 1.3362633, + "balance_loss_mlp": 1.0278821, + "epoch": 0.15926649631745077, + "flos": 18807747413400.0, + "grad_norm": 1.8048454689465843, + "language_loss": 0.91948426, + "learning_rate": 3.827259844762114e-06, + "loss": 0.94509602, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.21862793, + "step": 2649, + "time_per_iteration": 2.7698092460632324 + }, + { + "auxiliary_loss_clip": 0.01528872, + "auxiliary_loss_mlp": 0.0105293, + "balance_loss_clip": 1.33883643, + "balance_loss_mlp": 1.02881396, + "epoch": 0.15932661957011873, + "flos": 17570703786480.0, + "grad_norm": 2.4024106260913523, + "language_loss": 0.71794689, + "learning_rate": 3.827101475687033e-06, + "loss": 0.74376488, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.24121094, + "step": 2650, + "time_per_iteration": 2.8800852298736572 + }, + { + "auxiliary_loss_clip": 0.01504867, + "auxiliary_loss_mlp": 0.01049669, + "balance_loss_clip": 1.33075368, + "balance_loss_mlp": 1.02874804, + "epoch": 0.15938674282278673, + "flos": 13338318083400.0, + "grad_norm": 2.233798172206248, + "language_loss": 0.71354121, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73908663, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20922852, + "step": 2651, + "time_per_iteration": 2.7924559116363525 + }, + { + "auxiliary_loss_clip": 0.01518743, + "auxiliary_loss_mlp": 0.01054655, + "balance_loss_clip": 1.33779621, + "balance_loss_mlp": 1.03038406, + "epoch": 0.1594468660754547, + "flos": 22493555658720.0, + "grad_norm": 1.9664991720181124, + "language_loss": 0.80030471, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82603866, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.24267578, + "step": 2652, + "time_per_iteration": 2.8554580211639404 + }, + { + "auxiliary_loss_clip": 0.0150975, + "auxiliary_loss_mlp": 0.01047851, + "balance_loss_clip": 1.3356185, + "balance_loss_mlp": 1.02729988, + "epoch": 0.15950698932812266, + "flos": 15010804422000.0, + "grad_norm": 2.7220118878137294, + "language_loss": 0.70436728, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72994328, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20544434, + "step": 2653, + "time_per_iteration": 4.163519859313965 + }, + { + "auxiliary_loss_clip": 0.01506471, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_clip": 1.32942271, + "balance_loss_mlp": 1.02465677, + "epoch": 0.15956711258079062, + "flos": 30161314894320.0, + "grad_norm": 1.837429250278963, + "language_loss": 0.77561182, + "learning_rate": 3.826467306608095e-06, + "loss": 0.80114698, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.22375488, + "step": 2654, + "time_per_iteration": 4.302571058273315 + }, + { + "auxiliary_loss_clip": 0.01507793, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_clip": 1.33085465, + "balance_loss_mlp": 1.02781713, + "epoch": 0.1596272358334586, + "flos": 21037653641880.0, + "grad_norm": 1.8943914775502517, + "language_loss": 0.82597464, + "learning_rate": 3.826308591173765e-06, + "loss": 0.85154867, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21801758, + "step": 2655, + "time_per_iteration": 4.235063552856445 + }, + { + "auxiliary_loss_clip": 0.01506776, + "auxiliary_loss_mlp": 0.01055901, + "balance_loss_clip": 1.32826257, + "balance_loss_mlp": 1.03470528, + "epoch": 0.15968735908612655, + "flos": 15272244000360.0, + "grad_norm": 2.2649459885388423, + "language_loss": 0.73887128, + "learning_rate": 3.826149806485631e-06, + "loss": 0.76449805, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21179199, + "step": 2656, + "time_per_iteration": 2.8409581184387207 + }, + { + "auxiliary_loss_clip": 0.01500485, + "auxiliary_loss_mlp": 0.01053855, + "balance_loss_clip": 1.3272084, + "balance_loss_mlp": 1.0327071, + "epoch": 0.15974748233879452, + "flos": 52674792276600.0, + "grad_norm": 1.6944148418348244, + "language_loss": 0.77507389, + "learning_rate": 3.825990952549713e-06, + "loss": 0.80061722, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21154785, + "step": 2657, + "time_per_iteration": 3.005376100540161 + }, + { + "auxiliary_loss_clip": 0.01503093, + "auxiliary_loss_mlp": 0.01045669, + "balance_loss_clip": 1.32832623, + "balance_loss_mlp": 1.02357984, + "epoch": 0.1598076055914625, + "flos": 18737772563160.0, + "grad_norm": 1.7122133679185236, + "language_loss": 0.74856073, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77404839, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.22070312, + "step": 2658, + "time_per_iteration": 2.7463178634643555 + }, + { + "auxiliary_loss_clip": 0.01512189, + "auxiliary_loss_mlp": 0.0105885, + "balance_loss_clip": 1.33320618, + "balance_loss_mlp": 1.03467417, + "epoch": 0.15986772884413047, + "flos": 34355829978720.0, + "grad_norm": 1.5940983473948245, + "language_loss": 0.75724316, + "learning_rate": 3.825673036958624e-06, + "loss": 0.78295362, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.24157715, + "step": 2659, + "time_per_iteration": 2.8541948795318604 + }, + { + "auxiliary_loss_clip": 0.01513511, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_clip": 1.33379936, + "balance_loss_mlp": 1.04452384, + "epoch": 0.15992785209679844, + "flos": 22060143365040.0, + "grad_norm": 3.8616514370035975, + "language_loss": 0.91207552, + "learning_rate": 3.825513975315508e-06, + "loss": 0.93787789, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.22216797, + "step": 2660, + "time_per_iteration": 2.8643205165863037 + }, + { + "auxiliary_loss_clip": 0.01516653, + "auxiliary_loss_mlp": 0.01059603, + "balance_loss_clip": 1.33734667, + "balance_loss_mlp": 1.03690577, + "epoch": 0.1599879753494664, + "flos": 33072347369160.0, + "grad_norm": 1.8628027904645572, + "language_loss": 0.77955091, + "learning_rate": 3.82535484444872e-06, + "loss": 0.80531347, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22705078, + "step": 2661, + "time_per_iteration": 2.850278854370117 + }, + { + "auxiliary_loss_clip": 0.01506496, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_clip": 1.32637358, + "balance_loss_mlp": 1.03123617, + "epoch": 0.16004809860213437, + "flos": 28043883487800.0, + "grad_norm": 1.6618306150546134, + "language_loss": 0.74458033, + "learning_rate": 3.825195644364292e-06, + "loss": 0.77018392, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22644043, + "step": 2662, + "time_per_iteration": 2.7853026390075684 + }, + { + "auxiliary_loss_clip": 0.0150814, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.32737398, + "balance_loss_mlp": 1.03065932, + "epoch": 0.16010822185480234, + "flos": 22784784792120.0, + "grad_norm": 3.8593201979060705, + "language_loss": 0.82237983, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84799492, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.22729492, + "step": 2663, + "time_per_iteration": 2.789340019226074 + }, + { + "auxiliary_loss_clip": 0.01511829, + "auxiliary_loss_mlp": 0.010566, + "balance_loss_clip": 1.33215487, + "balance_loss_mlp": 1.03186393, + "epoch": 0.16016834510747033, + "flos": 20088834129720.0, + "grad_norm": 2.388829362625242, + "language_loss": 0.80120575, + "learning_rate": 3.824877036566672e-06, + "loss": 0.82688999, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.24719238, + "step": 2664, + "time_per_iteration": 2.756760358810425 + }, + { + "auxiliary_loss_clip": 0.01504032, + "auxiliary_loss_mlp": 0.01059485, + "balance_loss_clip": 1.32491899, + "balance_loss_mlp": 1.03668022, + "epoch": 0.1602284683601383, + "flos": 21178171859400.0, + "grad_norm": 1.5897904474359736, + "language_loss": 0.93935466, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96498978, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22790527, + "step": 2665, + "time_per_iteration": 2.8384804725646973 + }, + { + "auxiliary_loss_clip": 0.01507847, + "auxiliary_loss_mlp": 0.01053746, + "balance_loss_clip": 1.32777679, + "balance_loss_mlp": 1.03120351, + "epoch": 0.16028859161280626, + "flos": 14651671464720.0, + "grad_norm": 1.920976654034208, + "language_loss": 0.85408437, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87970024, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.22546387, + "step": 2666, + "time_per_iteration": 2.6983280181884766 + }, + { + "auxiliary_loss_clip": 0.01511187, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_clip": 1.33262849, + "balance_loss_mlp": 1.02868342, + "epoch": 0.16034871486547422, + "flos": 20994503936400.0, + "grad_norm": 2.347949823288801, + "language_loss": 0.8172015, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.84282684, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22668457, + "step": 2667, + "time_per_iteration": 2.8008639812469482 + }, + { + "auxiliary_loss_clip": 0.01508729, + "auxiliary_loss_mlp": 0.01055055, + "balance_loss_clip": 1.33383501, + "balance_loss_mlp": 1.03072464, + "epoch": 0.1604088381181422, + "flos": 21402756028080.0, + "grad_norm": 1.671660669827542, + "language_loss": 0.73997533, + "learning_rate": 3.824238990625567e-06, + "loss": 0.7656132, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.24316406, + "step": 2668, + "time_per_iteration": 2.7579658031463623 + }, + { + "auxiliary_loss_clip": 0.01516571, + "auxiliary_loss_mlp": 0.01063493, + "balance_loss_clip": 1.33676004, + "balance_loss_mlp": 1.04140353, + "epoch": 0.16046896137081015, + "flos": 23882162977080.0, + "grad_norm": 1.5208760986976655, + "language_loss": 0.77680898, + "learning_rate": 3.824079306186848e-06, + "loss": 0.80260968, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.22106934, + "step": 2669, + "time_per_iteration": 2.793367385864258 + }, + { + "auxiliary_loss_clip": 0.01345818, + "auxiliary_loss_mlp": 0.01006962, + "balance_loss_clip": 1.25386333, + "balance_loss_mlp": 1.00193107, + "epoch": 0.16052908462347812, + "flos": 59820136028640.0, + "grad_norm": 0.7922624347980244, + "language_loss": 0.55539703, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57892483, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.05029297, + "step": 2670, + "time_per_iteration": 3.174823522567749 + }, + { + "auxiliary_loss_clip": 0.015134, + "auxiliary_loss_mlp": 0.01054434, + "balance_loss_clip": 1.3328203, + "balance_loss_mlp": 1.03309608, + "epoch": 0.1605892078761461, + "flos": 18301192817400.0, + "grad_norm": 1.9214211948319424, + "language_loss": 0.78308499, + "learning_rate": 3.82375972980766e-06, + "loss": 0.80876327, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21337891, + "step": 2671, + "time_per_iteration": 2.7567434310913086 + }, + { + "auxiliary_loss_clip": 0.01518601, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.33870482, + "balance_loss_mlp": 1.02490735, + "epoch": 0.16064933112881408, + "flos": 32167124254440.0, + "grad_norm": 1.8754172354795615, + "language_loss": 0.65796149, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.68361294, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21643066, + "step": 2672, + "time_per_iteration": 2.916452169418335 + }, + { + "auxiliary_loss_clip": 0.01518124, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.33675706, + "balance_loss_mlp": 1.02341747, + "epoch": 0.16070945438148204, + "flos": 19833607630440.0, + "grad_norm": 1.9420342038553988, + "language_loss": 0.86163127, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.88728392, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.23730469, + "step": 2673, + "time_per_iteration": 2.761098861694336 + }, + { + "auxiliary_loss_clip": 0.01512243, + "auxiliary_loss_mlp": 0.01051257, + "balance_loss_clip": 1.33536661, + "balance_loss_mlp": 1.02970386, + "epoch": 0.16076957763415, + "flos": 18917379650160.0, + "grad_norm": 2.2429410289611402, + "language_loss": 0.73498869, + "learning_rate": 3.823279846575403e-06, + "loss": 0.76062369, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21533203, + "step": 2674, + "time_per_iteration": 2.772732734680176 + }, + { + "auxiliary_loss_clip": 0.01518638, + "auxiliary_loss_mlp": 0.01048228, + "balance_loss_clip": 1.3392489, + "balance_loss_mlp": 1.02506518, + "epoch": 0.16082970088681797, + "flos": 16768818612720.0, + "grad_norm": 1.7179324466196026, + "language_loss": 0.84600949, + "learning_rate": 3.823119747211986e-06, + "loss": 0.87167817, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.23181152, + "step": 2675, + "time_per_iteration": 2.752103805541992 + }, + { + "auxiliary_loss_clip": 0.01518941, + "auxiliary_loss_mlp": 0.01052406, + "balance_loss_clip": 1.34076476, + "balance_loss_mlp": 1.02869534, + "epoch": 0.16088982413948594, + "flos": 35156740551840.0, + "grad_norm": 1.7943590823885884, + "language_loss": 0.82269633, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84840977, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.23706055, + "step": 2676, + "time_per_iteration": 2.9653637409210205 + }, + { + "auxiliary_loss_clip": 0.01507253, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_clip": 1.33419752, + "balance_loss_mlp": 1.02670801, + "epoch": 0.1609499473921539, + "flos": 18629805269160.0, + "grad_norm": 2.3066697771988736, + "language_loss": 0.73912466, + "learning_rate": 3.822799341092573e-06, + "loss": 0.76466835, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.20410156, + "step": 2677, + "time_per_iteration": 2.7416489124298096 + }, + { + "auxiliary_loss_clip": 0.01508151, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.33383942, + "balance_loss_mlp": 1.02408242, + "epoch": 0.1610100706448219, + "flos": 33152230659240.0, + "grad_norm": 1.5976337209091627, + "language_loss": 0.76641977, + "learning_rate": 3.822639034348728e-06, + "loss": 0.79194319, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.2010498, + "step": 2678, + "time_per_iteration": 2.85331130027771 + }, + { + "auxiliary_loss_clip": 0.01506995, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.32912135, + "balance_loss_mlp": 1.02392244, + "epoch": 0.16107019389748986, + "flos": 34683224179680.0, + "grad_norm": 2.13134577803691, + "language_loss": 0.70474887, + "learning_rate": 3.822478658490228e-06, + "loss": 0.73027343, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21533203, + "step": 2679, + "time_per_iteration": 2.9128637313842773 + }, + { + "auxiliary_loss_clip": 0.0134057, + "auxiliary_loss_mlp": 0.01004493, + "balance_loss_clip": 1.24913156, + "balance_loss_mlp": 0.99953413, + "epoch": 0.16113031715015783, + "flos": 65727079096680.0, + "grad_norm": 0.7745927716866465, + "language_loss": 0.51816893, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5416196, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.04956055, + "step": 2680, + "time_per_iteration": 3.2981581687927246 + }, + { + "auxiliary_loss_clip": 0.015096, + "auxiliary_loss_mlp": 0.01055028, + "balance_loss_clip": 1.32906032, + "balance_loss_mlp": 1.03136468, + "epoch": 0.1611904404028258, + "flos": 20814896849400.0, + "grad_norm": 1.6028459631269734, + "language_loss": 0.81033468, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.83598101, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.23681641, + "step": 2681, + "time_per_iteration": 2.7983827590942383 + }, + { + "auxiliary_loss_clip": 0.01505518, + "auxiliary_loss_mlp": 0.01053327, + "balance_loss_clip": 1.33253646, + "balance_loss_mlp": 1.03236961, + "epoch": 0.16125056365549376, + "flos": 27018673004520.0, + "grad_norm": 1.8929560993754213, + "language_loss": 0.69395125, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71953976, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.20959473, + "step": 2682, + "time_per_iteration": 2.7980802059173584 + }, + { + "auxiliary_loss_clip": 0.01507905, + "auxiliary_loss_mlp": 0.01054057, + "balance_loss_clip": 1.3305707, + "balance_loss_mlp": 1.03194356, + "epoch": 0.16131068690816172, + "flos": 19280614051800.0, + "grad_norm": 1.6937375060669035, + "language_loss": 0.8734597, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89907932, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22119141, + "step": 2683, + "time_per_iteration": 2.82255482673645 + }, + { + "auxiliary_loss_clip": 0.01509682, + "auxiliary_loss_mlp": 0.01051864, + "balance_loss_clip": 1.33342719, + "balance_loss_mlp": 1.03077614, + "epoch": 0.16137081016082971, + "flos": 35345119044600.0, + "grad_norm": 1.833018729137729, + "language_loss": 0.74782896, + "learning_rate": 3.821675742690849e-06, + "loss": 0.77344441, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.2109375, + "step": 2684, + "time_per_iteration": 2.89383602142334 + }, + { + "auxiliary_loss_clip": 0.01508854, + "auxiliary_loss_mlp": 0.01049598, + "balance_loss_clip": 1.3307786, + "balance_loss_mlp": 1.02700794, + "epoch": 0.16143093341349768, + "flos": 34241243522040.0, + "grad_norm": 4.701245942304466, + "language_loss": 0.70496476, + "learning_rate": 3.821514952272223e-06, + "loss": 0.73054922, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.22595215, + "step": 2685, + "time_per_iteration": 2.8537063598632812 + }, + { + "auxiliary_loss_clip": 0.01494453, + "auxiliary_loss_mlp": 0.01054683, + "balance_loss_clip": 1.3221693, + "balance_loss_mlp": 1.03177094, + "epoch": 0.16149105666616564, + "flos": 28004835226680.0, + "grad_norm": 1.8867078813652276, + "language_loss": 0.72099531, + "learning_rate": 3.821354092781567e-06, + "loss": 0.74648666, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.22912598, + "step": 2686, + "time_per_iteration": 4.237035512924194 + }, + { + "auxiliary_loss_clip": 0.01504354, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.32626152, + "balance_loss_mlp": 1.02829182, + "epoch": 0.1615511799188336, + "flos": 19426289531040.0, + "grad_norm": 1.9115348337065816, + "language_loss": 0.81515706, + "learning_rate": 3.821193164224981e-06, + "loss": 0.8407222, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.23864746, + "step": 2687, + "time_per_iteration": 2.7958498001098633 + }, + { + "auxiliary_loss_clip": 0.01511296, + "auxiliary_loss_mlp": 0.01052241, + "balance_loss_clip": 1.3281579, + "balance_loss_mlp": 1.02985358, + "epoch": 0.16161130317150157, + "flos": 22859835687360.0, + "grad_norm": 1.7737725996412999, + "language_loss": 0.72417188, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74980724, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.22399902, + "step": 2688, + "time_per_iteration": 2.770397901535034 + }, + { + "auxiliary_loss_clip": 0.01503309, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.32586312, + "balance_loss_mlp": 1.02706099, + "epoch": 0.16167142642416954, + "flos": 26117023425480.0, + "grad_norm": 1.6111644152880884, + "language_loss": 0.76790321, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.79340732, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.20031738, + "step": 2689, + "time_per_iteration": 2.858466148376465 + }, + { + "auxiliary_loss_clip": 0.01498152, + "auxiliary_loss_mlp": 0.01051217, + "balance_loss_clip": 1.32453394, + "balance_loss_mlp": 1.02975953, + "epoch": 0.1617315496768375, + "flos": 22784256883440.0, + "grad_norm": 2.646668002934412, + "language_loss": 0.8772651, + "learning_rate": 3.820709964220683e-06, + "loss": 0.90275878, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21435547, + "step": 2690, + "time_per_iteration": 2.8127646446228027 + }, + { + "auxiliary_loss_clip": 0.01495976, + "auxiliary_loss_mlp": 0.01049215, + "balance_loss_clip": 1.32090664, + "balance_loss_mlp": 1.02889013, + "epoch": 0.1617916729295055, + "flos": 22022313354720.0, + "grad_norm": 1.4658144036353935, + "language_loss": 0.88687778, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.91232967, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.203125, + "step": 2691, + "time_per_iteration": 4.2573394775390625 + }, + { + "auxiliary_loss_clip": 0.01513836, + "auxiliary_loss_mlp": 0.01056994, + "balance_loss_clip": 1.33209562, + "balance_loss_mlp": 1.03161442, + "epoch": 0.16185179618217346, + "flos": 23443187338080.0, + "grad_norm": 2.3075565569300878, + "language_loss": 0.82525468, + "learning_rate": 3.820387485666784e-06, + "loss": 0.850963, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.25378418, + "step": 2692, + "time_per_iteration": 2.8159677982330322 + }, + { + "auxiliary_loss_clip": 0.01517598, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_clip": 1.3359015, + "balance_loss_mlp": 1.02962637, + "epoch": 0.16191191943484143, + "flos": 25671631665600.0, + "grad_norm": 2.084352723057889, + "language_loss": 0.81840384, + "learning_rate": 3.820226142842862e-06, + "loss": 0.84410781, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.23168945, + "step": 2693, + "time_per_iteration": 4.4984400272369385 + }, + { + "auxiliary_loss_clip": 0.01489824, + "auxiliary_loss_mlp": 0.0105298, + "balance_loss_clip": 1.31842601, + "balance_loss_mlp": 1.03354931, + "epoch": 0.1619720426875094, + "flos": 23482641682800.0, + "grad_norm": 1.8255105411573984, + "language_loss": 0.84089172, + "learning_rate": 3.820064730995783e-06, + "loss": 0.86631978, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.19445801, + "step": 2694, + "time_per_iteration": 4.3463733196258545 + }, + { + "auxiliary_loss_clip": 0.01504168, + "auxiliary_loss_mlp": 0.01060103, + "balance_loss_clip": 1.32268953, + "balance_loss_mlp": 1.03486681, + "epoch": 0.16203216594017736, + "flos": 24138973202400.0, + "grad_norm": 1.7032678137536976, + "language_loss": 0.69104171, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71668446, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.25268555, + "step": 2695, + "time_per_iteration": 2.78420352935791 + }, + { + "auxiliary_loss_clip": 0.01518579, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_clip": 1.33923721, + "balance_loss_mlp": 1.02653825, + "epoch": 0.16209228919284532, + "flos": 22345321852800.0, + "grad_norm": 2.4402987757822996, + "language_loss": 0.8316741, + "learning_rate": 3.819741700256637e-06, + "loss": 0.85735047, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.22509766, + "step": 2696, + "time_per_iteration": 2.7827346324920654 + }, + { + "auxiliary_loss_clip": 0.01519792, + "auxiliary_loss_mlp": 0.01068166, + "balance_loss_clip": 1.33458853, + "balance_loss_mlp": 1.04271483, + "epoch": 0.1621524124455133, + "flos": 15819389975160.0, + "grad_norm": 2.0591882673045547, + "language_loss": 0.89023423, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.91611379, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.25439453, + "step": 2697, + "time_per_iteration": 2.7100136280059814 + }, + { + "auxiliary_loss_clip": 0.01496492, + "auxiliary_loss_mlp": 0.01048495, + "balance_loss_clip": 1.32495546, + "balance_loss_mlp": 1.02700198, + "epoch": 0.16221253569818128, + "flos": 30192322700160.0, + "grad_norm": 1.444941510685254, + "language_loss": 0.80672121, + "learning_rate": 3.819418393498343e-06, + "loss": 0.83217108, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.21496582, + "step": 2698, + "time_per_iteration": 2.8679916858673096 + }, + { + "auxiliary_loss_clip": 0.01500923, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.32815337, + "balance_loss_mlp": 1.02800286, + "epoch": 0.16227265895084925, + "flos": 24611108890320.0, + "grad_norm": 1.5889669273918596, + "language_loss": 0.77705157, + "learning_rate": 3.819256636627339e-06, + "loss": 0.80254936, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.20861816, + "step": 2699, + "time_per_iteration": 2.792085647583008 + }, + { + "auxiliary_loss_clip": 0.01504675, + "auxiliary_loss_mlp": 0.01049771, + "balance_loss_clip": 1.32837391, + "balance_loss_mlp": 1.02720463, + "epoch": 0.1623327822035172, + "flos": 19578096872640.0, + "grad_norm": 1.869742503852256, + "language_loss": 0.86960638, + "learning_rate": 3.81909481076994e-06, + "loss": 0.8951509, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.22570801, + "step": 2700, + "time_per_iteration": 2.74672532081604 + }, + { + "auxiliary_loss_clip": 0.01505354, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.33024311, + "balance_loss_mlp": 1.02764928, + "epoch": 0.16239290545618518, + "flos": 26474044748040.0, + "grad_norm": 1.5169043142606164, + "language_loss": 0.80519682, + "learning_rate": 3.818932915932284e-06, + "loss": 0.83076024, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.23352051, + "step": 2701, + "time_per_iteration": 2.849977731704712 + }, + { + "auxiliary_loss_clip": 0.01508045, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.33093238, + "balance_loss_mlp": 1.02767575, + "epoch": 0.16245302870885314, + "flos": 15856245384840.0, + "grad_norm": 1.8232866666932255, + "language_loss": 0.73430437, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75988829, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.22668457, + "step": 2702, + "time_per_iteration": 2.756319522857666 + }, + { + "auxiliary_loss_clip": 0.01509198, + "auxiliary_loss_mlp": 0.01056117, + "balance_loss_clip": 1.33053708, + "balance_loss_mlp": 1.03299022, + "epoch": 0.1625131519615211, + "flos": 14760572751000.0, + "grad_norm": 1.9852983555583426, + "language_loss": 0.73394024, + "learning_rate": 3.81860891934076e-06, + "loss": 0.75959337, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.23144531, + "step": 2703, + "time_per_iteration": 2.7488982677459717 + }, + { + "auxiliary_loss_clip": 0.0151158, + "auxiliary_loss_mlp": 0.01058773, + "balance_loss_clip": 1.33264673, + "balance_loss_mlp": 1.03454971, + "epoch": 0.1625732752141891, + "flos": 28226414376720.0, + "grad_norm": 1.882604383399786, + "language_loss": 0.70547986, + "learning_rate": 3.818446817599176e-06, + "loss": 0.73118341, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.2421875, + "step": 2704, + "time_per_iteration": 2.8800177574157715 + }, + { + "auxiliary_loss_clip": 0.01347728, + "auxiliary_loss_mlp": 0.01002978, + "balance_loss_clip": 1.25612879, + "balance_loss_mlp": 0.99797076, + "epoch": 0.16263339846685707, + "flos": 67343031952200.0, + "grad_norm": 0.7786909624809456, + "language_loss": 0.53358942, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55709648, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.05004883, + "step": 2705, + "time_per_iteration": 3.262281894683838 + }, + { + "auxiliary_loss_clip": 0.01507795, + "auxiliary_loss_mlp": 0.0105114, + "balance_loss_clip": 1.32938886, + "balance_loss_mlp": 1.02894354, + "epoch": 0.16269352171952503, + "flos": 14323221446400.0, + "grad_norm": 2.911307734831081, + "language_loss": 0.75502384, + "learning_rate": 3.818122407255102e-06, + "loss": 0.78061318, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22192383, + "step": 2706, + "time_per_iteration": 2.79488468170166 + }, + { + "auxiliary_loss_clip": 0.01503468, + "auxiliary_loss_mlp": 0.01052557, + "balance_loss_clip": 1.32506216, + "balance_loss_mlp": 1.03179049, + "epoch": 0.162753644972193, + "flos": 28366445293920.0, + "grad_norm": 1.9712118108160421, + "language_loss": 0.73340726, + "learning_rate": 3.817960098664914e-06, + "loss": 0.75896752, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.20776367, + "step": 2707, + "time_per_iteration": 2.8467042446136475 + }, + { + "auxiliary_loss_clip": 0.01509565, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.3334949, + "balance_loss_mlp": 1.03968179, + "epoch": 0.16281376822486096, + "flos": 19942630741800.0, + "grad_norm": 2.3770829359651002, + "language_loss": 0.83584309, + "learning_rate": 3.817797721137495e-06, + "loss": 0.86155713, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.22167969, + "step": 2708, + "time_per_iteration": 2.7540111541748047 + }, + { + "auxiliary_loss_clip": 0.01515559, + "auxiliary_loss_mlp": 0.01049714, + "balance_loss_clip": 1.33382547, + "balance_loss_mlp": 1.02510905, + "epoch": 0.16287389147752893, + "flos": 21256715073600.0, + "grad_norm": 5.727388332591561, + "language_loss": 0.86550713, + "learning_rate": 3.817635274679006e-06, + "loss": 0.89115989, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.24584961, + "step": 2709, + "time_per_iteration": 2.7958052158355713 + }, + { + "auxiliary_loss_clip": 0.01504359, + "auxiliary_loss_mlp": 0.01055132, + "balance_loss_clip": 1.32636213, + "balance_loss_mlp": 1.0339371, + "epoch": 0.1629340147301969, + "flos": 19249362595800.0, + "grad_norm": 1.6820200209909386, + "language_loss": 0.91695797, + "learning_rate": 3.817472759295605e-06, + "loss": 0.94255292, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21203613, + "step": 2710, + "time_per_iteration": 2.831947088241577 + }, + { + "auxiliary_loss_clip": 0.01508449, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_clip": 1.33312035, + "balance_loss_mlp": 1.03865123, + "epoch": 0.16299413798286488, + "flos": 21254562830520.0, + "grad_norm": 2.373720489947261, + "language_loss": 0.82318372, + "learning_rate": 3.817310174993453e-06, + "loss": 0.84887844, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22375488, + "step": 2711, + "time_per_iteration": 2.7642643451690674 + }, + { + "auxiliary_loss_clip": 0.0152178, + "auxiliary_loss_mlp": 0.01048965, + "balance_loss_clip": 1.33819866, + "balance_loss_mlp": 1.02612472, + "epoch": 0.16305426123553285, + "flos": 18775237098240.0, + "grad_norm": 2.0658248503459657, + "language_loss": 0.81258988, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83829737, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.22839355, + "step": 2712, + "time_per_iteration": 2.795954942703247 + }, + { + "auxiliary_loss_clip": 0.01519229, + "auxiliary_loss_mlp": 0.01068109, + "balance_loss_clip": 1.33707201, + "balance_loss_mlp": 1.04510188, + "epoch": 0.16311438448820081, + "flos": 22092328813320.0, + "grad_norm": 1.7761206706484156, + "language_loss": 0.77353209, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79940546, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.23010254, + "step": 2713, + "time_per_iteration": 2.7740111351013184 + }, + { + "auxiliary_loss_clip": 0.01507375, + "auxiliary_loss_mlp": 0.01064716, + "balance_loss_clip": 1.33666277, + "balance_loss_mlp": 1.0426743, + "epoch": 0.16317450774086878, + "flos": 16471498225320.0, + "grad_norm": 2.0878825637731233, + "language_loss": 0.79295045, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81867135, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22045898, + "step": 2714, + "time_per_iteration": 2.748668909072876 + }, + { + "auxiliary_loss_clip": 0.01514129, + "auxiliary_loss_mlp": 0.0106932, + "balance_loss_clip": 1.33814657, + "balance_loss_mlp": 1.04675364, + "epoch": 0.16323463099353674, + "flos": 24358318892640.0, + "grad_norm": 1.5820693293913892, + "language_loss": 0.78376287, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80959737, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.22570801, + "step": 2715, + "time_per_iteration": 2.761542320251465 + }, + { + "auxiliary_loss_clip": 0.01513339, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.33591795, + "balance_loss_mlp": 1.0302918, + "epoch": 0.1632947542462047, + "flos": 24906317643000.0, + "grad_norm": 2.1608651296860946, + "language_loss": 0.81581903, + "learning_rate": 3.816496219917336e-06, + "loss": 0.84147036, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.21508789, + "step": 2716, + "time_per_iteration": 2.8533928394317627 + }, + { + "auxiliary_loss_clip": 0.01520118, + "auxiliary_loss_mlp": 0.01055889, + "balance_loss_clip": 1.34214091, + "balance_loss_mlp": 1.03500319, + "epoch": 0.1633548774988727, + "flos": 24905749125960.0, + "grad_norm": 1.8686779198670969, + "language_loss": 0.86615896, + "learning_rate": 3.816333222232251e-06, + "loss": 0.89191902, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.20898438, + "step": 2717, + "time_per_iteration": 2.863943338394165 + }, + { + "auxiliary_loss_clip": 0.01510763, + "auxiliary_loss_mlp": 0.01052434, + "balance_loss_clip": 1.33624935, + "balance_loss_mlp": 1.03076196, + "epoch": 0.16341500075154067, + "flos": 30447427374360.0, + "grad_norm": 1.9044211898081782, + "language_loss": 0.76764369, + "learning_rate": 3.816170155671629e-06, + "loss": 0.79327565, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21691895, + "step": 2718, + "time_per_iteration": 2.9339277744293213 + }, + { + "auxiliary_loss_clip": 0.01519236, + "auxiliary_loss_mlp": 0.01058518, + "balance_loss_clip": 1.33939385, + "balance_loss_mlp": 1.0334363, + "epoch": 0.16347512400420863, + "flos": 22789779620400.0, + "grad_norm": 1.9445750153506454, + "language_loss": 0.73919868, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76497626, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.25085449, + "step": 2719, + "time_per_iteration": 2.848257541656494 + }, + { + "auxiliary_loss_clip": 0.01512027, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.33450019, + "balance_loss_mlp": 1.0325433, + "epoch": 0.1635352472568766, + "flos": 22637809845360.0, + "grad_norm": 1.5659867692776408, + "language_loss": 0.72542536, + "learning_rate": 3.815843815948507e-06, + "loss": 0.75109065, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.21948242, + "step": 2720, + "time_per_iteration": 2.814424753189087 + }, + { + "auxiliary_loss_clip": 0.01505733, + "auxiliary_loss_mlp": 0.0106401, + "balance_loss_clip": 1.33151555, + "balance_loss_mlp": 1.03901172, + "epoch": 0.16359537050954456, + "flos": 15527186241120.0, + "grad_norm": 3.0275395444578743, + "language_loss": 0.75363207, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77932954, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.25024414, + "step": 2721, + "time_per_iteration": 2.7541213035583496 + }, + { + "auxiliary_loss_clip": 0.01519302, + "auxiliary_loss_mlp": 0.01056206, + "balance_loss_clip": 1.338287, + "balance_loss_mlp": 1.03318715, + "epoch": 0.16365549376221253, + "flos": 22095171398520.0, + "grad_norm": 1.7587202660939125, + "language_loss": 0.79727864, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.82303369, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.23010254, + "step": 2722, + "time_per_iteration": 2.7574453353881836 + }, + { + "auxiliary_loss_clip": 0.01523555, + "auxiliary_loss_mlp": 0.01055671, + "balance_loss_clip": 1.34239447, + "balance_loss_mlp": 1.03111339, + "epoch": 0.1637156170148805, + "flos": 24065668466640.0, + "grad_norm": 2.4427702036127323, + "language_loss": 0.84740716, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.87319934, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.24560547, + "step": 2723, + "time_per_iteration": 2.7869644165039062 + }, + { + "auxiliary_loss_clip": 0.01504182, + "auxiliary_loss_mlp": 0.01047602, + "balance_loss_clip": 1.33328795, + "balance_loss_mlp": 1.02572703, + "epoch": 0.1637757402675485, + "flos": 26690710286520.0, + "grad_norm": 2.036535565565918, + "language_loss": 0.71289188, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73840976, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21887207, + "step": 2724, + "time_per_iteration": 2.7999093532562256 + }, + { + "auxiliary_loss_clip": 0.01510578, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_clip": 1.33901274, + "balance_loss_mlp": 1.0299449, + "epoch": 0.16383586352021645, + "flos": 16111228233960.0, + "grad_norm": 2.032630076538082, + "language_loss": 0.71430767, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73992497, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21203613, + "step": 2725, + "time_per_iteration": 4.151240110397339 + }, + { + "auxiliary_loss_clip": 0.01504777, + "auxiliary_loss_mlp": 0.0104385, + "balance_loss_clip": 1.33375371, + "balance_loss_mlp": 1.0227623, + "epoch": 0.16389598677288442, + "flos": 19169925997680.0, + "grad_norm": 3.6283013100666506, + "language_loss": 0.88600993, + "learning_rate": 3.814863144409855e-06, + "loss": 0.91149622, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.2109375, + "step": 2726, + "time_per_iteration": 2.739107370376587 + }, + { + "auxiliary_loss_clip": 0.01515285, + "auxiliary_loss_mlp": 0.01050714, + "balance_loss_clip": 1.33901572, + "balance_loss_mlp": 1.02874374, + "epoch": 0.16395611002555238, + "flos": 21512022789600.0, + "grad_norm": 1.811987706135321, + "language_loss": 0.73889714, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76455712, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.21960449, + "step": 2727, + "time_per_iteration": 2.8903276920318604 + }, + { + "auxiliary_loss_clip": 0.01510342, + "auxiliary_loss_mlp": 0.01052917, + "balance_loss_clip": 1.33926284, + "balance_loss_mlp": 1.03278267, + "epoch": 0.16401623327822035, + "flos": 21475979547120.0, + "grad_norm": 1.5533432783130476, + "language_loss": 0.82767248, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.8533051, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.20141602, + "step": 2728, + "time_per_iteration": 2.769118070602417 + }, + { + "auxiliary_loss_clip": 0.01521823, + "auxiliary_loss_mlp": 0.01050393, + "balance_loss_clip": 1.3440783, + "balance_loss_mlp": 1.02600312, + "epoch": 0.1640763565308883, + "flos": 13630237558920.0, + "grad_norm": 2.602205445170488, + "language_loss": 0.85520244, + "learning_rate": 3.814371879489633e-06, + "loss": 0.88092458, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.24401855, + "step": 2729, + "time_per_iteration": 4.179281949996948 + }, + { + "auxiliary_loss_clip": 0.01518226, + "auxiliary_loss_mlp": 0.01049086, + "balance_loss_clip": 1.34111881, + "balance_loss_mlp": 1.02848709, + "epoch": 0.16413647978355628, + "flos": 15455830706640.0, + "grad_norm": 1.8244361976958874, + "language_loss": 0.73229957, + "learning_rate": 3.814207986905616e-06, + "loss": 0.75797266, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.20581055, + "step": 2730, + "time_per_iteration": 2.7390036582946777 + }, + { + "auxiliary_loss_clip": 0.01521103, + "auxiliary_loss_mlp": 0.01052003, + "balance_loss_clip": 1.3415885, + "balance_loss_mlp": 1.0288049, + "epoch": 0.16419660303622427, + "flos": 45886771086840.0, + "grad_norm": 1.584897294508201, + "language_loss": 0.74435711, + "learning_rate": 3.814044025526651e-06, + "loss": 0.7700882, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.23205566, + "step": 2731, + "time_per_iteration": 2.9687294960021973 + }, + { + "auxiliary_loss_clip": 0.01524413, + "auxiliary_loss_mlp": 0.01049864, + "balance_loss_clip": 1.347242, + "balance_loss_mlp": 1.027632, + "epoch": 0.16425672628889224, + "flos": 18957483728640.0, + "grad_norm": 1.9844840179863288, + "language_loss": 0.79309523, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.818838, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.22216797, + "step": 2732, + "time_per_iteration": 4.248352289199829 + }, + { + "auxiliary_loss_clip": 0.01517844, + "auxiliary_loss_mlp": 0.0104839, + "balance_loss_clip": 1.34079504, + "balance_loss_mlp": 1.02622938, + "epoch": 0.1643168495415602, + "flos": 24317930555640.0, + "grad_norm": 3.402120585774494, + "language_loss": 0.69415671, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71981901, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.22180176, + "step": 2733, + "time_per_iteration": 4.373496770858765 + }, + { + "auxiliary_loss_clip": 0.01515267, + "auxiliary_loss_mlp": 0.01049208, + "balance_loss_clip": 1.33929276, + "balance_loss_mlp": 1.02629638, + "epoch": 0.16437697279422817, + "flos": 26433534585960.0, + "grad_norm": 1.6533414172736107, + "language_loss": 0.80943811, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.83508289, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.22900391, + "step": 2734, + "time_per_iteration": 2.8070578575134277 + }, + { + "auxiliary_loss_clip": 0.01517676, + "auxiliary_loss_mlp": 0.01058541, + "balance_loss_clip": 1.34292579, + "balance_loss_mlp": 1.03626037, + "epoch": 0.16443709604689613, + "flos": 34538929384680.0, + "grad_norm": 2.0025912739892147, + "language_loss": 0.82094288, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84670508, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.22265625, + "step": 2735, + "time_per_iteration": 2.905642032623291 + }, + { + "auxiliary_loss_clip": 0.01515181, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.34330571, + "balance_loss_mlp": 1.02300763, + "epoch": 0.1644972192995641, + "flos": 23263255384200.0, + "grad_norm": 3.5801069345065284, + "language_loss": 0.79062366, + "learning_rate": 3.813223186925296e-06, + "loss": 0.81622207, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.21655273, + "step": 2736, + "time_per_iteration": 2.7535736560821533 + }, + { + "auxiliary_loss_clip": 0.01527516, + "auxiliary_loss_mlp": 0.0105703, + "balance_loss_clip": 1.34910285, + "balance_loss_mlp": 1.03541708, + "epoch": 0.1645573425522321, + "flos": 26985228697080.0, + "grad_norm": 2.2736975789110763, + "language_loss": 0.81799662, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.84384203, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.21606445, + "step": 2737, + "time_per_iteration": 2.9549126625061035 + }, + { + "auxiliary_loss_clip": 0.01526934, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.3480612, + "balance_loss_mlp": 1.02894616, + "epoch": 0.16461746580490005, + "flos": 28737801367560.0, + "grad_norm": 1.8226185153273797, + "language_loss": 0.876158, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.90192223, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.20544434, + "step": 2738, + "time_per_iteration": 2.8409719467163086 + }, + { + "auxiliary_loss_clip": 0.01524507, + "auxiliary_loss_mlp": 0.01055739, + "balance_loss_clip": 1.34690547, + "balance_loss_mlp": 1.03298211, + "epoch": 0.16467758905756802, + "flos": 24934767122160.0, + "grad_norm": 2.1810455386914613, + "language_loss": 0.72099614, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74679857, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.22753906, + "step": 2739, + "time_per_iteration": 2.78686785697937 + }, + { + "auxiliary_loss_clip": 0.01514375, + "auxiliary_loss_mlp": 0.01056669, + "balance_loss_clip": 1.33813715, + "balance_loss_mlp": 1.0341506, + "epoch": 0.16473771231023598, + "flos": 24831875873160.0, + "grad_norm": 2.0658521677089, + "language_loss": 0.81869829, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.84440869, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.22521973, + "step": 2740, + "time_per_iteration": 2.8160648345947266 + }, + { + "auxiliary_loss_clip": 0.01529195, + "auxiliary_loss_mlp": 0.01055256, + "balance_loss_clip": 1.3490808, + "balance_loss_mlp": 1.03041291, + "epoch": 0.16479783556290395, + "flos": 39903518264400.0, + "grad_norm": 2.1124386933700734, + "language_loss": 0.70099849, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.72684306, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.24841309, + "step": 2741, + "time_per_iteration": 2.950315237045288 + }, + { + "auxiliary_loss_clip": 0.01522961, + "auxiliary_loss_mlp": 0.01056006, + "balance_loss_clip": 1.34648538, + "balance_loss_mlp": 1.03166342, + "epoch": 0.16485795881557191, + "flos": 19901267804160.0, + "grad_norm": 1.8888765746911669, + "language_loss": 0.79711908, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82290876, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.24365234, + "step": 2742, + "time_per_iteration": 2.766572952270508 + }, + { + "auxiliary_loss_clip": 0.01519429, + "auxiliary_loss_mlp": 0.01054058, + "balance_loss_clip": 1.34550023, + "balance_loss_mlp": 1.03114581, + "epoch": 0.16491808206823988, + "flos": 20560726167480.0, + "grad_norm": 3.260892683440003, + "language_loss": 0.84877443, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.87450922, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.22924805, + "step": 2743, + "time_per_iteration": 2.739957332611084 + }, + { + "auxiliary_loss_clip": 0.01515563, + "auxiliary_loss_mlp": 0.01049776, + "balance_loss_clip": 1.34100282, + "balance_loss_mlp": 1.02729344, + "epoch": 0.16497820532090787, + "flos": 23805487747440.0, + "grad_norm": 1.5275351339892806, + "language_loss": 0.8589468, + "learning_rate": 3.811906270092265e-06, + "loss": 0.88460016, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.22460938, + "step": 2744, + "time_per_iteration": 2.8270375728607178 + }, + { + "auxiliary_loss_clip": 0.01503533, + "auxiliary_loss_mlp": 0.01046929, + "balance_loss_clip": 1.33301401, + "balance_loss_mlp": 1.02716446, + "epoch": 0.16503832857357584, + "flos": 25488004350960.0, + "grad_norm": 1.7264697770505082, + "language_loss": 0.83119678, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85670143, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19775391, + "step": 2745, + "time_per_iteration": 2.7758138179779053 + }, + { + "auxiliary_loss_clip": 0.01517478, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_clip": 1.33925796, + "balance_loss_mlp": 1.03811097, + "epoch": 0.1650984518262438, + "flos": 17680579673400.0, + "grad_norm": 1.859982820925699, + "language_loss": 0.7709859, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.79679036, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.24853516, + "step": 2746, + "time_per_iteration": 2.912104845046997 + }, + { + "auxiliary_loss_clip": 0.01517686, + "auxiliary_loss_mlp": 0.0106283, + "balance_loss_clip": 1.34070253, + "balance_loss_mlp": 1.04095531, + "epoch": 0.16515857507891177, + "flos": 18703069396560.0, + "grad_norm": 1.5948293814586363, + "language_loss": 0.80369586, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82950103, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21887207, + "step": 2747, + "time_per_iteration": 2.784313678741455 + }, + { + "auxiliary_loss_clip": 0.01516856, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.34082353, + "balance_loss_mlp": 1.02563679, + "epoch": 0.16521869833157973, + "flos": 15014377957680.0, + "grad_norm": 2.0777938990437845, + "language_loss": 0.70947635, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.73511767, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21643066, + "step": 2748, + "time_per_iteration": 2.909842014312744 + }, + { + "auxiliary_loss_clip": 0.01515834, + "auxiliary_loss_mlp": 0.01053143, + "balance_loss_clip": 1.3391993, + "balance_loss_mlp": 1.03206706, + "epoch": 0.1652788215842477, + "flos": 22125448253880.0, + "grad_norm": 2.4217120319690193, + "language_loss": 0.87787879, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90356863, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.2109375, + "step": 2749, + "time_per_iteration": 3.048692226409912 + }, + { + "auxiliary_loss_clip": 0.01507611, + "auxiliary_loss_mlp": 0.01054214, + "balance_loss_clip": 1.32866764, + "balance_loss_mlp": 1.03230298, + "epoch": 0.16533894483691566, + "flos": 18337723360200.0, + "grad_norm": 1.8349537029366456, + "language_loss": 0.79677051, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.82238877, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21911621, + "step": 2750, + "time_per_iteration": 2.8618948459625244 + }, + { + "auxiliary_loss_clip": 0.01513109, + "auxiliary_loss_mlp": 0.01057607, + "balance_loss_clip": 1.3376708, + "balance_loss_mlp": 1.03544569, + "epoch": 0.16539906808958366, + "flos": 22387172090760.0, + "grad_norm": 1.7002979371165299, + "language_loss": 0.95356125, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97926843, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22143555, + "step": 2751, + "time_per_iteration": 2.8213441371917725 + }, + { + "auxiliary_loss_clip": 0.01503059, + "auxiliary_loss_mlp": 0.0105854, + "balance_loss_clip": 1.33056748, + "balance_loss_mlp": 1.03624737, + "epoch": 0.16545919134225162, + "flos": 22716271842840.0, + "grad_norm": 2.539960674874798, + "language_loss": 0.7143296, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73994559, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.22302246, + "step": 2752, + "time_per_iteration": 2.744516134262085 + }, + { + "auxiliary_loss_clip": 0.01354889, + "auxiliary_loss_mlp": 0.01068917, + "balance_loss_clip": 1.26785529, + "balance_loss_mlp": 1.06371963, + "epoch": 0.1655193145949196, + "flos": 67818213267120.0, + "grad_norm": 0.790554080134331, + "language_loss": 0.54024363, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56448174, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.05200195, + "step": 2753, + "time_per_iteration": 3.3947527408599854 + }, + { + "auxiliary_loss_clip": 0.01508012, + "auxiliary_loss_mlp": 0.01048357, + "balance_loss_clip": 1.33246708, + "balance_loss_mlp": 1.02599335, + "epoch": 0.16557943784758755, + "flos": 24285826324080.0, + "grad_norm": 1.827067899669539, + "language_loss": 0.75766158, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.78322524, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.22351074, + "step": 2754, + "time_per_iteration": 2.829139471054077 + }, + { + "auxiliary_loss_clip": 0.0152329, + "auxiliary_loss_mlp": 0.0106253, + "balance_loss_clip": 1.33930659, + "balance_loss_mlp": 1.03887868, + "epoch": 0.16563956110025552, + "flos": 20088387437760.0, + "grad_norm": 2.294012640416305, + "language_loss": 0.8708117, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89666992, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.2364502, + "step": 2755, + "time_per_iteration": 2.7295029163360596 + }, + { + "auxiliary_loss_clip": 0.01508835, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.33356977, + "balance_loss_mlp": 1.03510737, + "epoch": 0.16569968435292348, + "flos": 28039863260160.0, + "grad_norm": 1.6931516162939035, + "language_loss": 0.74063581, + "learning_rate": 3.80992265092595e-06, + "loss": 0.76629555, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.22033691, + "step": 2756, + "time_per_iteration": 2.8180339336395264 + }, + { + "auxiliary_loss_clip": 0.01495737, + "auxiliary_loss_mlp": 0.01052938, + "balance_loss_clip": 1.32731664, + "balance_loss_mlp": 1.03084838, + "epoch": 0.16575980760559147, + "flos": 26255876700240.0, + "grad_norm": 1.8777038072184529, + "language_loss": 0.75409007, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77957684, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.2208252, + "step": 2757, + "time_per_iteration": 2.7867424488067627 + }, + { + "auxiliary_loss_clip": 0.01506583, + "auxiliary_loss_mlp": 0.01050842, + "balance_loss_clip": 1.33195353, + "balance_loss_mlp": 1.03062391, + "epoch": 0.16581993085825944, + "flos": 26949753971640.0, + "grad_norm": 1.5266089914850858, + "language_loss": 0.85105866, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.87663293, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.20202637, + "step": 2758, + "time_per_iteration": 2.877575397491455 + }, + { + "auxiliary_loss_clip": 0.01505477, + "auxiliary_loss_mlp": 0.01047678, + "balance_loss_clip": 1.33179975, + "balance_loss_mlp": 1.02760315, + "epoch": 0.1658800541109274, + "flos": 21658754086200.0, + "grad_norm": 1.8210684705394862, + "language_loss": 0.79337859, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81891018, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20056152, + "step": 2759, + "time_per_iteration": 2.766340970993042 + }, + { + "auxiliary_loss_clip": 0.01503422, + "auxiliary_loss_mlp": 0.0104894, + "balance_loss_clip": 1.32946181, + "balance_loss_mlp": 1.02773285, + "epoch": 0.16594017736359537, + "flos": 16439840685720.0, + "grad_norm": 2.0349818319707644, + "language_loss": 0.74880731, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77433097, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.2121582, + "step": 2760, + "time_per_iteration": 2.847055435180664 + }, + { + "auxiliary_loss_clip": 0.01513361, + "auxiliary_loss_mlp": 0.01048171, + "balance_loss_clip": 1.3348701, + "balance_loss_mlp": 1.02640307, + "epoch": 0.16600030061626334, + "flos": 22642357981680.0, + "grad_norm": 1.563796883628879, + "language_loss": 0.73393083, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75954616, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21777344, + "step": 2761, + "time_per_iteration": 2.7621588706970215 + }, + { + "auxiliary_loss_clip": 0.01506473, + "auxiliary_loss_mlp": 0.01054319, + "balance_loss_clip": 1.33124804, + "balance_loss_mlp": 1.0314548, + "epoch": 0.1660604238689313, + "flos": 26402283129960.0, + "grad_norm": 1.6617267272770262, + "language_loss": 0.88521492, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91082287, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.2286377, + "step": 2762, + "time_per_iteration": 2.7804927825927734 + }, + { + "auxiliary_loss_clip": 0.01506684, + "auxiliary_loss_mlp": 0.01050528, + "balance_loss_clip": 1.32898962, + "balance_loss_mlp": 1.02929676, + "epoch": 0.16612054712159927, + "flos": 23045290378200.0, + "grad_norm": 1.6597610658296396, + "language_loss": 0.88074577, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90631789, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.2121582, + "step": 2763, + "time_per_iteration": 2.7993030548095703 + }, + { + "auxiliary_loss_clip": 0.0135985, + "auxiliary_loss_mlp": 0.01007907, + "balance_loss_clip": 1.26935577, + "balance_loss_mlp": 1.00323439, + "epoch": 0.16618067037426726, + "flos": 59256966732840.0, + "grad_norm": 0.7743494812106618, + "language_loss": 0.59796804, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.62164563, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.04663086, + "step": 2764, + "time_per_iteration": 4.656231880187988 + }, + { + "auxiliary_loss_clip": 0.01508484, + "auxiliary_loss_mlp": 0.01052773, + "balance_loss_clip": 1.33276749, + "balance_loss_mlp": 1.02974212, + "epoch": 0.16624079362693522, + "flos": 27204574387320.0, + "grad_norm": 1.6972867826229712, + "language_loss": 0.82463384, + "learning_rate": 3.808428450193401e-06, + "loss": 0.85024643, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.23059082, + "step": 2765, + "time_per_iteration": 2.8211193084716797 + }, + { + "auxiliary_loss_clip": 0.01526695, + "auxiliary_loss_mlp": 0.01056011, + "balance_loss_clip": 1.34182084, + "balance_loss_mlp": 1.03196609, + "epoch": 0.1663009168796032, + "flos": 10928561117760.0, + "grad_norm": 3.954907904207836, + "language_loss": 0.70619023, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.73201722, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.24035645, + "step": 2766, + "time_per_iteration": 2.742413282394409 + }, + { + "auxiliary_loss_clip": 0.01510039, + "auxiliary_loss_mlp": 0.01049888, + "balance_loss_clip": 1.33674097, + "balance_loss_mlp": 1.02874064, + "epoch": 0.16636104013227115, + "flos": 17898910154640.0, + "grad_norm": 2.344390063416246, + "language_loss": 0.89220732, + "learning_rate": 3.808095651090769e-06, + "loss": 0.91780663, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21166992, + "step": 2767, + "time_per_iteration": 2.727134943008423 + }, + { + "auxiliary_loss_clip": 0.01346162, + "auxiliary_loss_mlp": 0.01010341, + "balance_loss_clip": 1.25622511, + "balance_loss_mlp": 1.00557315, + "epoch": 0.16642116338493912, + "flos": 66743453938680.0, + "grad_norm": 0.641367658541888, + "language_loss": 0.52936846, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.55293345, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.04760742, + "step": 2768, + "time_per_iteration": 3.35502290725708 + }, + { + "auxiliary_loss_clip": 0.01527751, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.34702933, + "balance_loss_mlp": 1.02940869, + "epoch": 0.16648128663760708, + "flos": 19030747856040.0, + "grad_norm": 2.7153812846669205, + "language_loss": 0.85498756, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.88078082, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.22155762, + "step": 2769, + "time_per_iteration": 4.172307014465332 + }, + { + "auxiliary_loss_clip": 0.01340741, + "auxiliary_loss_mlp": 0.0101651, + "balance_loss_clip": 1.24941802, + "balance_loss_mlp": 1.01169431, + "epoch": 0.16654140989027508, + "flos": 70150117360680.0, + "grad_norm": 0.8147155299267385, + "language_loss": 0.5748322, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59840471, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.0480957, + "step": 2770, + "time_per_iteration": 4.737793207168579 + }, + { + "auxiliary_loss_clip": 0.01336629, + "auxiliary_loss_mlp": 0.01008973, + "balance_loss_clip": 1.24586093, + "balance_loss_mlp": 1.00461006, + "epoch": 0.16660153314294304, + "flos": 70286452917120.0, + "grad_norm": 1.1689534232835574, + "language_loss": 0.56211025, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58556628, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.04370117, + "step": 2771, + "time_per_iteration": 3.1012134552001953 + }, + { + "auxiliary_loss_clip": 0.01515565, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_clip": 1.34003067, + "balance_loss_mlp": 1.04684091, + "epoch": 0.166661656395611, + "flos": 23080155978240.0, + "grad_norm": 2.34011193205874, + "language_loss": 0.70733261, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.73318756, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.23083496, + "step": 2772, + "time_per_iteration": 4.177093982696533 + }, + { + "auxiliary_loss_clip": 0.01505454, + "auxiliary_loss_mlp": 0.01060059, + "balance_loss_clip": 1.33182132, + "balance_loss_mlp": 1.03753996, + "epoch": 0.16672177964827897, + "flos": 28372252289400.0, + "grad_norm": 1.7737597131576888, + "language_loss": 0.86372292, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88937807, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.22521973, + "step": 2773, + "time_per_iteration": 2.8510701656341553 + }, + { + "auxiliary_loss_clip": 0.01513122, + "auxiliary_loss_mlp": 0.01048351, + "balance_loss_clip": 1.33815503, + "balance_loss_mlp": 1.02803731, + "epoch": 0.16678190290094694, + "flos": 19094672060640.0, + "grad_norm": 2.0110252735841225, + "language_loss": 0.82011783, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84573257, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20324707, + "step": 2774, + "time_per_iteration": 2.7193262577056885 + }, + { + "auxiliary_loss_clip": 0.0152115, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_clip": 1.343117, + "balance_loss_mlp": 1.03048444, + "epoch": 0.1668420261536149, + "flos": 21804185915280.0, + "grad_norm": 2.1558598977761165, + "language_loss": 0.8315624, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85730356, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.22485352, + "step": 2775, + "time_per_iteration": 2.8152339458465576 + }, + { + "auxiliary_loss_clip": 0.01512427, + "auxiliary_loss_mlp": 0.01064472, + "balance_loss_clip": 1.33801091, + "balance_loss_mlp": 1.04449272, + "epoch": 0.16690214940628287, + "flos": 19067237790480.0, + "grad_norm": 2.1162344461587237, + "language_loss": 0.81024492, + "learning_rate": 3.806594661981897e-06, + "loss": 0.83601391, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.19970703, + "step": 2776, + "time_per_iteration": 2.7402634620666504 + }, + { + "auxiliary_loss_clip": 0.01503829, + "auxiliary_loss_mlp": 0.01053786, + "balance_loss_clip": 1.33545518, + "balance_loss_mlp": 1.03379416, + "epoch": 0.16696227265895086, + "flos": 18593234118000.0, + "grad_norm": 1.903479437677736, + "language_loss": 0.80816972, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.83374584, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1998291, + "step": 2777, + "time_per_iteration": 2.7470388412475586 + }, + { + "auxiliary_loss_clip": 0.01511944, + "auxiliary_loss_mlp": 0.01062241, + "balance_loss_clip": 1.33671379, + "balance_loss_mlp": 1.03975797, + "epoch": 0.16702239591161883, + "flos": 23299298626680.0, + "grad_norm": 1.6557229769375974, + "language_loss": 0.85211521, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87785709, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22485352, + "step": 2778, + "time_per_iteration": 2.826390504837036 + }, + { + "auxiliary_loss_clip": 0.01516552, + "auxiliary_loss_mlp": 0.01046506, + "balance_loss_clip": 1.33971989, + "balance_loss_mlp": 1.02544188, + "epoch": 0.1670825191642868, + "flos": 24431014503000.0, + "grad_norm": 2.4541686535836633, + "language_loss": 0.74049175, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76612234, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21044922, + "step": 2779, + "time_per_iteration": 2.79988694190979 + }, + { + "auxiliary_loss_clip": 0.01506489, + "auxiliary_loss_mlp": 0.01054501, + "balance_loss_clip": 1.33164275, + "balance_loss_mlp": 1.03377032, + "epoch": 0.16714264241695476, + "flos": 26803225716840.0, + "grad_norm": 1.8889418125359598, + "language_loss": 0.65506876, + "learning_rate": 3.805925774274554e-06, + "loss": 0.68067861, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.20739746, + "step": 2780, + "time_per_iteration": 2.8625118732452393 + }, + { + "auxiliary_loss_clip": 0.01507869, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_clip": 1.33379579, + "balance_loss_mlp": 1.02403045, + "epoch": 0.16720276566962272, + "flos": 21840350982840.0, + "grad_norm": 1.991591350871756, + "language_loss": 0.7883566, + "learning_rate": 3.805758381129643e-06, + "loss": 0.81388068, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.20507812, + "step": 2781, + "time_per_iteration": 2.881422996520996 + }, + { + "auxiliary_loss_clip": 0.01502562, + "auxiliary_loss_mlp": 0.01048065, + "balance_loss_clip": 1.3258965, + "balance_loss_mlp": 1.02733493, + "epoch": 0.1672628889222907, + "flos": 21475411030080.0, + "grad_norm": 1.4000395865539548, + "language_loss": 0.75343478, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77894104, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.20739746, + "step": 2782, + "time_per_iteration": 2.816225528717041 + }, + { + "auxiliary_loss_clip": 0.0151683, + "auxiliary_loss_mlp": 0.0105138, + "balance_loss_clip": 1.33404398, + "balance_loss_mlp": 1.02908742, + "epoch": 0.16732301217495865, + "flos": 30780181878840.0, + "grad_norm": 2.199046012819395, + "language_loss": 0.68059635, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70627844, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.2232666, + "step": 2783, + "time_per_iteration": 2.8403513431549072 + }, + { + "auxiliary_loss_clip": 0.01508623, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.33351588, + "balance_loss_mlp": 1.03101063, + "epoch": 0.16738313542762664, + "flos": 23479555447440.0, + "grad_norm": 1.8574897512327244, + "language_loss": 0.70086539, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72646761, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20593262, + "step": 2784, + "time_per_iteration": 2.818894386291504 + }, + { + "auxiliary_loss_clip": 0.01506136, + "auxiliary_loss_mlp": 0.01054082, + "balance_loss_clip": 1.32920361, + "balance_loss_mlp": 1.03095579, + "epoch": 0.1674432586802946, + "flos": 29795156690760.0, + "grad_norm": 1.902747636234683, + "language_loss": 0.61300159, + "learning_rate": 3.805088123868126e-06, + "loss": 0.63860375, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.23120117, + "step": 2785, + "time_per_iteration": 2.9180214405059814 + }, + { + "auxiliary_loss_clip": 0.01330847, + "auxiliary_loss_mlp": 0.01008523, + "balance_loss_clip": 1.24231815, + "balance_loss_mlp": 1.00473261, + "epoch": 0.16750338193296258, + "flos": 66152589741360.0, + "grad_norm": 0.7867557612670447, + "language_loss": 0.58829433, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.61168802, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.0378418, + "step": 2786, + "time_per_iteration": 3.307096004486084 + }, + { + "auxiliary_loss_clip": 0.01507479, + "auxiliary_loss_mlp": 0.01045472, + "balance_loss_clip": 1.33013964, + "balance_loss_mlp": 1.0243125, + "epoch": 0.16756350518563054, + "flos": 25701543045720.0, + "grad_norm": 1.770343802138273, + "language_loss": 0.76541901, + "learning_rate": 3.80475258451721e-06, + "loss": 0.79094851, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.21166992, + "step": 2787, + "time_per_iteration": 2.866013526916504 + }, + { + "auxiliary_loss_clip": 0.01507569, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.33129334, + "balance_loss_mlp": 1.02073109, + "epoch": 0.1676236284382985, + "flos": 23840637606000.0, + "grad_norm": 1.7549380438351627, + "language_loss": 0.77577484, + "learning_rate": 3.804584712183972e-06, + "loss": 0.80127108, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.21337891, + "step": 2788, + "time_per_iteration": 2.8155219554901123 + }, + { + "auxiliary_loss_clip": 0.01335276, + "auxiliary_loss_mlp": 0.01009073, + "balance_loss_clip": 1.2451365, + "balance_loss_mlp": 1.00502038, + "epoch": 0.16768375169096647, + "flos": 59887714985640.0, + "grad_norm": 0.8669997120226641, + "language_loss": 0.59471416, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61815763, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.04052734, + "step": 2789, + "time_per_iteration": 3.1624443531036377 + }, + { + "auxiliary_loss_clip": 0.01503066, + "auxiliary_loss_mlp": 0.01061785, + "balance_loss_clip": 1.32649744, + "balance_loss_mlp": 1.03960013, + "epoch": 0.16774387494363446, + "flos": 38442783852720.0, + "grad_norm": 1.416351591985315, + "language_loss": 0.70467758, + "learning_rate": 3.804248762233765e-06, + "loss": 0.73032612, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.2220459, + "step": 2790, + "time_per_iteration": 2.9927785396575928 + }, + { + "auxiliary_loss_clip": 0.01508445, + "auxiliary_loss_mlp": 0.01053664, + "balance_loss_clip": 1.33294153, + "balance_loss_mlp": 1.03315997, + "epoch": 0.16780399819630243, + "flos": 22642723456920.0, + "grad_norm": 1.5967168153853726, + "language_loss": 0.79717863, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.82279974, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.20507812, + "step": 2791, + "time_per_iteration": 2.874742269515991 + }, + { + "auxiliary_loss_clip": 0.01507276, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.3321743, + "balance_loss_mlp": 1.0295341, + "epoch": 0.1678641214489704, + "flos": 32898141194040.0, + "grad_norm": 1.6679020034965857, + "language_loss": 0.71665037, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.74223071, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.21228027, + "step": 2792, + "time_per_iteration": 3.0047004222869873 + }, + { + "auxiliary_loss_clip": 0.01495654, + "auxiliary_loss_mlp": 0.01054838, + "balance_loss_clip": 1.32084274, + "balance_loss_mlp": 1.03442931, + "epoch": 0.16792424470163836, + "flos": 19979770410000.0, + "grad_norm": 1.9251239594705636, + "language_loss": 0.72308409, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74858898, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.20422363, + "step": 2793, + "time_per_iteration": 2.774047613143921 + }, + { + "auxiliary_loss_clip": 0.01507429, + "auxiliary_loss_mlp": 0.01054429, + "balance_loss_clip": 1.33154929, + "balance_loss_mlp": 1.03397298, + "epoch": 0.16798436795430632, + "flos": 19724746952520.0, + "grad_norm": 1.7618695719110127, + "language_loss": 0.77338183, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79900038, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.20458984, + "step": 2794, + "time_per_iteration": 2.95147705078125 + }, + { + "auxiliary_loss_clip": 0.01500519, + "auxiliary_loss_mlp": 0.01050928, + "balance_loss_clip": 1.32383144, + "balance_loss_mlp": 1.03034031, + "epoch": 0.1680444912069743, + "flos": 28109919327120.0, + "grad_norm": 2.262834550968506, + "language_loss": 0.72814339, + "learning_rate": 3.803407690167187e-06, + "loss": 0.75365788, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.20593262, + "step": 2795, + "time_per_iteration": 2.8230926990509033 + }, + { + "auxiliary_loss_clip": 0.014952, + "auxiliary_loss_mlp": 0.01049947, + "balance_loss_clip": 1.31996179, + "balance_loss_mlp": 1.03076589, + "epoch": 0.16810461445964225, + "flos": 18079491842280.0, + "grad_norm": 1.7984479379451452, + "language_loss": 0.84333473, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86878622, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.19177246, + "step": 2796, + "time_per_iteration": 2.7744436264038086 + }, + { + "auxiliary_loss_clip": 0.01512304, + "auxiliary_loss_mlp": 0.01055826, + "balance_loss_clip": 1.33280253, + "balance_loss_mlp": 1.03368878, + "epoch": 0.16816473771231025, + "flos": 23883990353280.0, + "grad_norm": 1.8020535412728265, + "language_loss": 0.82440454, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.85008579, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.22155762, + "step": 2797, + "time_per_iteration": 2.8108057975769043 + }, + { + "auxiliary_loss_clip": 0.01489647, + "auxiliary_loss_mlp": 0.01042318, + "balance_loss_clip": 1.32097483, + "balance_loss_mlp": 1.0237813, + "epoch": 0.1682248609649782, + "flos": 22788723803040.0, + "grad_norm": 1.3960595157787077, + "language_loss": 0.75279856, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77811819, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.18530273, + "step": 2798, + "time_per_iteration": 2.780613899230957 + }, + { + "auxiliary_loss_clip": 0.01508009, + "auxiliary_loss_mlp": 0.0105448, + "balance_loss_clip": 1.33349478, + "balance_loss_mlp": 1.03490639, + "epoch": 0.16828498421764618, + "flos": 20710137615840.0, + "grad_norm": 1.4782885151527163, + "language_loss": 0.79922885, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.82485372, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.19567871, + "step": 2799, + "time_per_iteration": 2.835059642791748 + }, + { + "auxiliary_loss_clip": 0.01507057, + "auxiliary_loss_mlp": 0.01044548, + "balance_loss_clip": 1.32739365, + "balance_loss_mlp": 1.02342415, + "epoch": 0.16834510747031414, + "flos": 29426155902000.0, + "grad_norm": 2.0722673433093646, + "language_loss": 0.71061081, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.73612684, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.21118164, + "step": 2800, + "time_per_iteration": 2.8275444507598877 + }, + { + "auxiliary_loss_clip": 0.01500246, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_clip": 1.32562768, + "balance_loss_mlp": 1.023803, + "epoch": 0.1684052307229821, + "flos": 18149304259080.0, + "grad_norm": 1.8055011395266685, + "language_loss": 0.84316611, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.86861807, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.21154785, + "step": 2801, + "time_per_iteration": 2.7706844806671143 + }, + { + "auxiliary_loss_clip": 0.01502129, + "auxiliary_loss_mlp": 0.01048805, + "balance_loss_clip": 1.32628202, + "balance_loss_mlp": 1.02927876, + "epoch": 0.16846535397565007, + "flos": 16578450310320.0, + "grad_norm": 3.9258627349897317, + "language_loss": 0.82502306, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.85053241, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.19519043, + "step": 2802, + "time_per_iteration": 4.199822664260864 + }, + { + "auxiliary_loss_clip": 0.01505806, + "auxiliary_loss_mlp": 0.01053262, + "balance_loss_clip": 1.32919812, + "balance_loss_mlp": 1.03066015, + "epoch": 0.16852547722831807, + "flos": 30414957667560.0, + "grad_norm": 1.5297037606987693, + "language_loss": 0.81347144, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83906215, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.22595215, + "step": 2803, + "time_per_iteration": 2.90938663482666 + }, + { + "auxiliary_loss_clip": 0.01501463, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.32967782, + "balance_loss_mlp": 1.02642655, + "epoch": 0.16858560048098603, + "flos": 33513556467960.0, + "grad_norm": 2.4113139942880832, + "language_loss": 0.7714026, + "learning_rate": 3.801889452704297e-06, + "loss": 0.79689395, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21240234, + "step": 2804, + "time_per_iteration": 2.8871076107025146 + }, + { + "auxiliary_loss_clip": 0.01344113, + "auxiliary_loss_mlp": 0.01024506, + "balance_loss_clip": 1.25138092, + "balance_loss_mlp": 1.01880789, + "epoch": 0.168645723733654, + "flos": 67386384699480.0, + "grad_norm": 0.828195038751725, + "language_loss": 0.55449247, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57817864, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.05688477, + "step": 2805, + "time_per_iteration": 3.2783398628234863 + }, + { + "auxiliary_loss_clip": 0.0149895, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.32837355, + "balance_loss_mlp": 1.02792692, + "epoch": 0.16870584698632196, + "flos": 21329613725760.0, + "grad_norm": 2.2326617198369503, + "language_loss": 0.73756361, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.7630291, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19677734, + "step": 2806, + "time_per_iteration": 4.2264978885650635 + }, + { + "auxiliary_loss_clip": 0.01494384, + "auxiliary_loss_mlp": 0.01046177, + "balance_loss_clip": 1.32197928, + "balance_loss_mlp": 1.02593517, + "epoch": 0.16876597023898993, + "flos": 20745652949640.0, + "grad_norm": 2.1094103796933217, + "language_loss": 0.70172185, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72712743, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20239258, + "step": 2807, + "time_per_iteration": 2.861586570739746 + }, + { + "auxiliary_loss_clip": 0.01491249, + "auxiliary_loss_mlp": 0.01046902, + "balance_loss_clip": 1.31826484, + "balance_loss_mlp": 1.0248239, + "epoch": 0.1688260934916579, + "flos": 20308423470120.0, + "grad_norm": 4.9386797697930165, + "language_loss": 0.70222318, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72760463, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.22094727, + "step": 2808, + "time_per_iteration": 2.8018453121185303 + }, + { + "auxiliary_loss_clip": 0.01506468, + "auxiliary_loss_mlp": 0.01044217, + "balance_loss_clip": 1.32706928, + "balance_loss_mlp": 1.0220685, + "epoch": 0.16888621674432586, + "flos": 20345725571760.0, + "grad_norm": 2.3106495898295023, + "language_loss": 0.80797607, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.83348292, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22131348, + "step": 2809, + "time_per_iteration": 4.344112873077393 + }, + { + "auxiliary_loss_clip": 0.01505136, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_clip": 1.32566953, + "balance_loss_mlp": 1.0262512, + "epoch": 0.16894633999699385, + "flos": 16248051090720.0, + "grad_norm": 2.0114425300185212, + "language_loss": 0.8853929, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.91091526, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.20849609, + "step": 2810, + "time_per_iteration": 2.7437915802001953 + }, + { + "auxiliary_loss_clip": 0.0150422, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.32531643, + "balance_loss_mlp": 1.03317821, + "epoch": 0.16900646324966181, + "flos": 19614992890680.0, + "grad_norm": 1.8793903204302196, + "language_loss": 0.92676955, + "learning_rate": 3.800704774747416e-06, + "loss": 0.95234859, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.20495605, + "step": 2811, + "time_per_iteration": 4.172362565994263 + }, + { + "auxiliary_loss_clip": 0.01507916, + "auxiliary_loss_mlp": 0.01048372, + "balance_loss_clip": 1.33217621, + "balance_loss_mlp": 1.02829719, + "epoch": 0.16906658650232978, + "flos": 22023206738640.0, + "grad_norm": 2.3640153614988906, + "language_loss": 0.79628158, + "learning_rate": 3.800535261856291e-06, + "loss": 0.82184446, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20068359, + "step": 2812, + "time_per_iteration": 2.765941858291626 + }, + { + "auxiliary_loss_clip": 0.01498779, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.32384205, + "balance_loss_mlp": 1.02227092, + "epoch": 0.16912670975499774, + "flos": 11767220484480.0, + "grad_norm": 2.551944205320744, + "language_loss": 0.74867082, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.77407628, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.19519043, + "step": 2813, + "time_per_iteration": 2.8922674655914307 + }, + { + "auxiliary_loss_clip": 0.01504462, + "auxiliary_loss_mlp": 0.01044765, + "balance_loss_clip": 1.32680237, + "balance_loss_mlp": 1.02492821, + "epoch": 0.1691868330076657, + "flos": 17165334888360.0, + "grad_norm": 2.5736820702808547, + "language_loss": 0.6947909, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.72028315, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.19848633, + "step": 2814, + "time_per_iteration": 2.6780803203582764 + }, + { + "auxiliary_loss_clip": 0.01503925, + "auxiliary_loss_mlp": 0.01049691, + "balance_loss_clip": 1.32643092, + "balance_loss_mlp": 1.02896047, + "epoch": 0.16924695626033368, + "flos": 22421347348680.0, + "grad_norm": 1.616090447565278, + "language_loss": 0.62162805, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64716423, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.20727539, + "step": 2815, + "time_per_iteration": 2.761624813079834 + }, + { + "auxiliary_loss_clip": 0.01487695, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.31266022, + "balance_loss_mlp": 1.01932955, + "epoch": 0.16930707951300164, + "flos": 25745261268240.0, + "grad_norm": 2.1070471790548804, + "language_loss": 0.82637405, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.85165882, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.21459961, + "step": 2816, + "time_per_iteration": 2.826601266860962 + }, + { + "auxiliary_loss_clip": 0.01503207, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.32775939, + "balance_loss_mlp": 1.03038311, + "epoch": 0.16936720276566963, + "flos": 22752152651880.0, + "grad_norm": 3.664192771743788, + "language_loss": 0.87664127, + "learning_rate": 3.799686673382153e-06, + "loss": 0.90218467, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20751953, + "step": 2817, + "time_per_iteration": 2.788821220397949 + }, + { + "auxiliary_loss_clip": 0.01491607, + "auxiliary_loss_mlp": 0.01050342, + "balance_loss_clip": 1.31821442, + "balance_loss_mlp": 1.02746594, + "epoch": 0.1694273260183376, + "flos": 19578827823120.0, + "grad_norm": 1.6492435432718753, + "language_loss": 0.82009077, + "learning_rate": 3.799516750928672e-06, + "loss": 0.84551024, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.22875977, + "step": 2818, + "time_per_iteration": 2.82362699508667 + }, + { + "auxiliary_loss_clip": 0.01502217, + "auxiliary_loss_mlp": 0.01047377, + "balance_loss_clip": 1.32729459, + "balance_loss_mlp": 1.02594364, + "epoch": 0.16948744927100556, + "flos": 12461585056200.0, + "grad_norm": 2.4738441487064247, + "language_loss": 0.8111254, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83662128, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.21435547, + "step": 2819, + "time_per_iteration": 2.7419376373291016 + }, + { + "auxiliary_loss_clip": 0.01328512, + "auxiliary_loss_mlp": 0.01007489, + "balance_loss_clip": 1.23594975, + "balance_loss_mlp": 1.00231576, + "epoch": 0.16954757252367353, + "flos": 71305572146400.0, + "grad_norm": 0.9287548448920874, + "language_loss": 0.61035073, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63371074, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.05175781, + "step": 2820, + "time_per_iteration": 3.181306838989258 + }, + { + "auxiliary_loss_clip": 0.01495802, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.32153583, + "balance_loss_mlp": 1.03545809, + "epoch": 0.1696076957763415, + "flos": 29612544585120.0, + "grad_norm": 1.8476504427170855, + "language_loss": 0.78954995, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.81508321, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.22045898, + "step": 2821, + "time_per_iteration": 2.8042421340942383 + }, + { + "auxiliary_loss_clip": 0.01498862, + "auxiliary_loss_mlp": 0.01053454, + "balance_loss_clip": 1.32244897, + "balance_loss_mlp": 1.03122115, + "epoch": 0.16966781902900946, + "flos": 24393874834800.0, + "grad_norm": 3.2938350816406183, + "language_loss": 0.78509349, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.81061661, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.22216797, + "step": 2822, + "time_per_iteration": 2.780344009399414 + }, + { + "auxiliary_loss_clip": 0.01487113, + "auxiliary_loss_mlp": 0.01046563, + "balance_loss_clip": 1.31634903, + "balance_loss_mlp": 1.02624941, + "epoch": 0.16972794228167745, + "flos": 23043950302320.0, + "grad_norm": 1.947525876605381, + "language_loss": 0.75241727, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77775401, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20324707, + "step": 2823, + "time_per_iteration": 2.776287794113159 + }, + { + "auxiliary_loss_clip": 0.01504839, + "auxiliary_loss_mlp": 0.01055711, + "balance_loss_clip": 1.32898831, + "balance_loss_mlp": 1.03432512, + "epoch": 0.16978806553434542, + "flos": 35235892891440.0, + "grad_norm": 1.80289461726734, + "language_loss": 0.60471874, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.63032424, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21386719, + "step": 2824, + "time_per_iteration": 2.9424755573272705 + }, + { + "auxiliary_loss_clip": 0.01502083, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.32818866, + "balance_loss_mlp": 1.02454984, + "epoch": 0.16984818878701338, + "flos": 32020595999640.0, + "grad_norm": 1.6361385629797938, + "language_loss": 0.73413795, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75961828, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.21398926, + "step": 2825, + "time_per_iteration": 2.9097986221313477 + }, + { + "auxiliary_loss_clip": 0.01513087, + "auxiliary_loss_mlp": 0.0105595, + "balance_loss_clip": 1.33331871, + "balance_loss_mlp": 1.03327692, + "epoch": 0.16990831203968135, + "flos": 22823548794720.0, + "grad_norm": 1.8761739603272638, + "language_loss": 0.85489357, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.88058388, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.22680664, + "step": 2826, + "time_per_iteration": 2.7471606731414795 + }, + { + "auxiliary_loss_clip": 0.01509586, + "auxiliary_loss_mlp": 0.01052567, + "balance_loss_clip": 1.3304193, + "balance_loss_mlp": 1.03071642, + "epoch": 0.1699684352923493, + "flos": 23044843686240.0, + "grad_norm": 1.6303412419291599, + "language_loss": 0.8269071, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.85252857, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21838379, + "step": 2827, + "time_per_iteration": 2.7569732666015625 + }, + { + "auxiliary_loss_clip": 0.01510937, + "auxiliary_loss_mlp": 0.01048226, + "balance_loss_clip": 1.3319068, + "balance_loss_mlp": 1.02438414, + "epoch": 0.17002855854501728, + "flos": 21439124137440.0, + "grad_norm": 1.6384629962960533, + "language_loss": 0.73880601, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76439762, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.23852539, + "step": 2828, + "time_per_iteration": 2.7547457218170166 + }, + { + "auxiliary_loss_clip": 0.01332273, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.24281788, + "balance_loss_mlp": 1.03876817, + "epoch": 0.17008868179768524, + "flos": 71469480779280.0, + "grad_norm": 0.8875081176202344, + "language_loss": 0.56449902, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58824921, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.03979492, + "step": 2829, + "time_per_iteration": 3.325474500656128 + }, + { + "auxiliary_loss_clip": 0.01495113, + "auxiliary_loss_mlp": 0.0105465, + "balance_loss_clip": 1.32148814, + "balance_loss_mlp": 1.03364491, + "epoch": 0.17014880505035324, + "flos": 24905789734320.0, + "grad_norm": 2.1024755624322493, + "language_loss": 0.83690047, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.86239809, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21008301, + "step": 2830, + "time_per_iteration": 2.796689510345459 + }, + { + "auxiliary_loss_clip": 0.01505593, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_clip": 1.32911527, + "balance_loss_mlp": 1.02625048, + "epoch": 0.1702089283030212, + "flos": 29868542643240.0, + "grad_norm": 2.0199455468032084, + "language_loss": 0.78527021, + "learning_rate": 3.797301551737529e-06, + "loss": 0.8108083, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.21984863, + "step": 2831, + "time_per_iteration": 2.8287248611450195 + }, + { + "auxiliary_loss_clip": 0.01507808, + "auxiliary_loss_mlp": 0.01054681, + "balance_loss_clip": 1.32978737, + "balance_loss_mlp": 1.03390324, + "epoch": 0.17026905155568917, + "flos": 17748077413680.0, + "grad_norm": 1.7976975852007298, + "language_loss": 0.79258037, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81820518, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.20788574, + "step": 2832, + "time_per_iteration": 2.774054527282715 + }, + { + "auxiliary_loss_clip": 0.01506585, + "auxiliary_loss_mlp": 0.01053206, + "balance_loss_clip": 1.33320594, + "balance_loss_mlp": 1.03185546, + "epoch": 0.17032917480835713, + "flos": 23153866797600.0, + "grad_norm": 1.7368255242056123, + "language_loss": 0.89208806, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91768593, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.21337891, + "step": 2833, + "time_per_iteration": 2.878037214279175 + }, + { + "auxiliary_loss_clip": 0.01502552, + "auxiliary_loss_mlp": 0.01048905, + "balance_loss_clip": 1.3291738, + "balance_loss_mlp": 1.02908063, + "epoch": 0.1703892980610251, + "flos": 39209884643160.0, + "grad_norm": 2.0900272185447797, + "language_loss": 0.72755176, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.7530663, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.19824219, + "step": 2834, + "time_per_iteration": 2.918238878250122 + }, + { + "auxiliary_loss_clip": 0.015112, + "auxiliary_loss_mlp": 0.01053343, + "balance_loss_clip": 1.33369648, + "balance_loss_mlp": 1.03326833, + "epoch": 0.17044942131369306, + "flos": 23044275169200.0, + "grad_norm": 1.8729556311546693, + "language_loss": 0.87420827, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.89985371, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.20080566, + "step": 2835, + "time_per_iteration": 2.7377591133117676 + }, + { + "auxiliary_loss_clip": 0.01513783, + "auxiliary_loss_mlp": 0.01054043, + "balance_loss_clip": 1.33385825, + "balance_loss_mlp": 1.03178692, + "epoch": 0.17050954456636103, + "flos": 17059316795640.0, + "grad_norm": 2.061388686227094, + "language_loss": 0.75121301, + "learning_rate": 3.796446484348989e-06, + "loss": 0.77689123, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22265625, + "step": 2836, + "time_per_iteration": 2.827080488204956 + }, + { + "auxiliary_loss_clip": 0.01513292, + "auxiliary_loss_mlp": 0.01046404, + "balance_loss_clip": 1.33264458, + "balance_loss_mlp": 1.02408814, + "epoch": 0.17056966781902902, + "flos": 16841392398000.0, + "grad_norm": 2.5339565002748388, + "language_loss": 0.80489457, + "learning_rate": 3.796275266481036e-06, + "loss": 0.83049154, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.2232666, + "step": 2837, + "time_per_iteration": 2.699842691421509 + }, + { + "auxiliary_loss_clip": 0.01502289, + "auxiliary_loss_mlp": 0.01048983, + "balance_loss_clip": 1.33324265, + "balance_loss_mlp": 1.02829993, + "epoch": 0.17062979107169698, + "flos": 17717069607840.0, + "grad_norm": 1.860134629936925, + "language_loss": 0.83765674, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.86316943, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20690918, + "step": 2838, + "time_per_iteration": 2.7551016807556152 + }, + { + "auxiliary_loss_clip": 0.01507154, + "auxiliary_loss_mlp": 0.0104857, + "balance_loss_clip": 1.33417416, + "balance_loss_mlp": 1.02892458, + "epoch": 0.17068991432436495, + "flos": 22529720726280.0, + "grad_norm": 1.6450211273520736, + "language_loss": 0.93618381, + "learning_rate": 3.795932626406812e-06, + "loss": 0.96174103, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.1965332, + "step": 2839, + "time_per_iteration": 2.769007444381714 + }, + { + "auxiliary_loss_clip": 0.01515935, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.33877528, + "balance_loss_mlp": 1.02889264, + "epoch": 0.17075003757703291, + "flos": 25888215987360.0, + "grad_norm": 1.8903206082095976, + "language_loss": 0.84004629, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86571872, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22424316, + "step": 2840, + "time_per_iteration": 2.809943914413452 + }, + { + "auxiliary_loss_clip": 0.01513124, + "auxiliary_loss_mlp": 0.01053228, + "balance_loss_clip": 1.33779192, + "balance_loss_mlp": 1.03115034, + "epoch": 0.17081016082970088, + "flos": 20125567714320.0, + "grad_norm": 1.7503869289728047, + "language_loss": 0.76915109, + "learning_rate": 3.79558971392481e-06, + "loss": 0.79481459, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.22106934, + "step": 2841, + "time_per_iteration": 4.1588640213012695 + }, + { + "auxiliary_loss_clip": 0.01511662, + "auxiliary_loss_mlp": 0.01049734, + "balance_loss_clip": 1.33624232, + "balance_loss_mlp": 1.03036284, + "epoch": 0.17087028408236885, + "flos": 24941914193520.0, + "grad_norm": 1.8141304940146117, + "language_loss": 0.76839435, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79400826, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.19372559, + "step": 2842, + "time_per_iteration": 2.797138214111328 + }, + { + "auxiliary_loss_clip": 0.01498785, + "auxiliary_loss_mlp": 0.0105124, + "balance_loss_clip": 1.32922769, + "balance_loss_mlp": 1.02999723, + "epoch": 0.17093040733503684, + "flos": 19061958703680.0, + "grad_norm": 1.924103001888483, + "language_loss": 0.86343437, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88893461, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.21240234, + "step": 2843, + "time_per_iteration": 2.7552287578582764 + }, + { + "auxiliary_loss_clip": 0.01508251, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.33623302, + "balance_loss_mlp": 1.02871847, + "epoch": 0.1709905305877048, + "flos": 13082888542320.0, + "grad_norm": 1.7210532822947369, + "language_loss": 0.68336153, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7089234, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.1920166, + "step": 2844, + "time_per_iteration": 2.71097731590271 + }, + { + "auxiliary_loss_clip": 0.01509327, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.3350718, + "balance_loss_mlp": 1.03022289, + "epoch": 0.17105065384037277, + "flos": 19214172128880.0, + "grad_norm": 1.7815823734461589, + "language_loss": 0.78539425, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.81099504, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.2052002, + "step": 2845, + "time_per_iteration": 4.14713454246521 + }, + { + "auxiliary_loss_clip": 0.01510446, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.33609748, + "balance_loss_mlp": 1.02849674, + "epoch": 0.17111077709304073, + "flos": 18519807557160.0, + "grad_norm": 2.040760465930267, + "language_loss": 0.77913153, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80471349, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.19250488, + "step": 2846, + "time_per_iteration": 2.732578754425049 + }, + { + "auxiliary_loss_clip": 0.01507522, + "auxiliary_loss_mlp": 0.01048433, + "balance_loss_clip": 1.33596587, + "balance_loss_mlp": 1.02972865, + "epoch": 0.1711709003457087, + "flos": 25088158189800.0, + "grad_norm": 1.8807767658838783, + "language_loss": 0.80357677, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82913625, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.18701172, + "step": 2847, + "time_per_iteration": 2.8579399585723877 + }, + { + "auxiliary_loss_clip": 0.01512209, + "auxiliary_loss_mlp": 0.01056053, + "balance_loss_clip": 1.33490956, + "balance_loss_mlp": 1.0369432, + "epoch": 0.17123102359837666, + "flos": 17570866219920.0, + "grad_norm": 2.235420494509583, + "language_loss": 0.86956596, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89524865, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.19116211, + "step": 2848, + "time_per_iteration": 4.263880252838135 + }, + { + "auxiliary_loss_clip": 0.01502881, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.32881749, + "balance_loss_mlp": 1.02587116, + "epoch": 0.17129114685104463, + "flos": 26178795387000.0, + "grad_norm": 1.6955472282764585, + "language_loss": 0.7505337, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77601993, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.1986084, + "step": 2849, + "time_per_iteration": 2.8671677112579346 + }, + { + "auxiliary_loss_clip": 0.0139255, + "auxiliary_loss_mlp": 0.01025221, + "balance_loss_clip": 1.29967523, + "balance_loss_mlp": 1.02102518, + "epoch": 0.17135127010371262, + "flos": 69286094750160.0, + "grad_norm": 0.8168114607486654, + "language_loss": 0.57532352, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59950119, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.04199219, + "step": 2850, + "time_per_iteration": 4.77133321762085 + }, + { + "auxiliary_loss_clip": 0.015005, + "auxiliary_loss_mlp": 0.01045656, + "balance_loss_clip": 1.33169031, + "balance_loss_mlp": 1.0263437, + "epoch": 0.1714113933563806, + "flos": 23555377901520.0, + "grad_norm": 2.1760317981874273, + "language_loss": 0.80874765, + "learning_rate": 3.793871067220031e-06, + "loss": 0.8342092, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19311523, + "step": 2851, + "time_per_iteration": 2.781707525253296 + }, + { + "auxiliary_loss_clip": 0.01497651, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.32761693, + "balance_loss_mlp": 1.02212489, + "epoch": 0.17147151660904855, + "flos": 21147448312080.0, + "grad_norm": 2.1596379093540796, + "language_loss": 0.9367584, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.96214426, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.18798828, + "step": 2852, + "time_per_iteration": 2.844989776611328 + }, + { + "auxiliary_loss_clip": 0.01512877, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.33438575, + "balance_loss_mlp": 1.03840232, + "epoch": 0.17153163986171652, + "flos": 18629561619000.0, + "grad_norm": 1.6754072822141473, + "language_loss": 0.69404811, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7197668, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.20605469, + "step": 2853, + "time_per_iteration": 2.8791451454162598 + }, + { + "auxiliary_loss_clip": 0.01506605, + "auxiliary_loss_mlp": 0.01050113, + "balance_loss_clip": 1.33127356, + "balance_loss_mlp": 1.03068209, + "epoch": 0.17159176311438448, + "flos": 18227847473280.0, + "grad_norm": 1.8850454467434243, + "language_loss": 0.66654813, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.69211531, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.19433594, + "step": 2854, + "time_per_iteration": 2.7173256874084473 + }, + { + "auxiliary_loss_clip": 0.01498655, + "auxiliary_loss_mlp": 0.01046145, + "balance_loss_clip": 1.32577586, + "balance_loss_mlp": 1.02658272, + "epoch": 0.17165188636705245, + "flos": 20743663140000.0, + "grad_norm": 1.646792624661111, + "language_loss": 0.89415103, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91959906, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.19567871, + "step": 2855, + "time_per_iteration": 2.776582956314087 + }, + { + "auxiliary_loss_clip": 0.01506457, + "auxiliary_loss_mlp": 0.01054535, + "balance_loss_clip": 1.33288562, + "balance_loss_mlp": 1.03484178, + "epoch": 0.17171200961972044, + "flos": 24905302434000.0, + "grad_norm": 2.164807628169864, + "language_loss": 0.83482975, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86043966, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.19689941, + "step": 2856, + "time_per_iteration": 2.748708963394165 + }, + { + "auxiliary_loss_clip": 0.01510202, + "auxiliary_loss_mlp": 0.01055759, + "balance_loss_clip": 1.33509731, + "balance_loss_mlp": 1.03519523, + "epoch": 0.1717721328723884, + "flos": 20162098257120.0, + "grad_norm": 1.949220021001483, + "language_loss": 0.86675447, + "learning_rate": 3.792836613639026e-06, + "loss": 0.89241403, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20556641, + "step": 2857, + "time_per_iteration": 2.90812611579895 + }, + { + "auxiliary_loss_clip": 0.01516455, + "auxiliary_loss_mlp": 0.0104454, + "balance_loss_clip": 1.34138811, + "balance_loss_mlp": 1.02428627, + "epoch": 0.17183225612505637, + "flos": 23366106024840.0, + "grad_norm": 2.103836396547289, + "language_loss": 0.78474706, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.81035703, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20263672, + "step": 2858, + "time_per_iteration": 2.8168580532073975 + }, + { + "auxiliary_loss_clip": 0.01522675, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.33839476, + "balance_loss_mlp": 1.02508712, + "epoch": 0.17189237937772434, + "flos": 18118864970280.0, + "grad_norm": 2.8225146352736448, + "language_loss": 0.77878129, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.80448335, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22424316, + "step": 2859, + "time_per_iteration": 2.7550318241119385 + }, + { + "auxiliary_loss_clip": 0.01510142, + "auxiliary_loss_mlp": 0.01042736, + "balance_loss_clip": 1.33606887, + "balance_loss_mlp": 1.02181482, + "epoch": 0.1719525026303923, + "flos": 23263661467800.0, + "grad_norm": 1.8642396752662647, + "language_loss": 0.76840842, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79393721, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.20922852, + "step": 2860, + "time_per_iteration": 2.84182071685791 + }, + { + "auxiliary_loss_clip": 0.01513653, + "auxiliary_loss_mlp": 0.01044999, + "balance_loss_clip": 1.33739924, + "balance_loss_mlp": 1.02506709, + "epoch": 0.17201262588306027, + "flos": 20814896849400.0, + "grad_norm": 2.3261265796585633, + "language_loss": 0.81641304, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84199953, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.19934082, + "step": 2861, + "time_per_iteration": 2.754657506942749 + }, + { + "auxiliary_loss_clip": 0.01512908, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.33688164, + "balance_loss_mlp": 1.03153467, + "epoch": 0.17207274913572823, + "flos": 20380712996880.0, + "grad_norm": 2.0696416802187985, + "language_loss": 0.85672784, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.88237292, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.20056152, + "step": 2862, + "time_per_iteration": 2.8036768436431885 + }, + { + "auxiliary_loss_clip": 0.01505329, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.33392191, + "balance_loss_mlp": 1.02248549, + "epoch": 0.17213287238839622, + "flos": 26803469367000.0, + "grad_norm": 1.7713964761034893, + "language_loss": 0.78350395, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80897129, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.18908691, + "step": 2863, + "time_per_iteration": 2.814082145690918 + }, + { + "auxiliary_loss_clip": 0.01501738, + "auxiliary_loss_mlp": 0.01052651, + "balance_loss_clip": 1.32800531, + "balance_loss_mlp": 1.03088295, + "epoch": 0.1721929956410642, + "flos": 26036368576560.0, + "grad_norm": 1.7505506706199676, + "language_loss": 0.72648221, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.75202608, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.21765137, + "step": 2864, + "time_per_iteration": 2.8473944664001465 + }, + { + "auxiliary_loss_clip": 0.01519261, + "auxiliary_loss_mlp": 0.01049767, + "balance_loss_clip": 1.34091115, + "balance_loss_mlp": 1.02835655, + "epoch": 0.17225311889373215, + "flos": 22278189587760.0, + "grad_norm": 1.9951862632991426, + "language_loss": 0.73348325, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75917351, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.21411133, + "step": 2865, + "time_per_iteration": 2.7651000022888184 + }, + { + "auxiliary_loss_clip": 0.01514031, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.33906007, + "balance_loss_mlp": 1.03074777, + "epoch": 0.17231324214640012, + "flos": 21292717707720.0, + "grad_norm": 2.2459590961150027, + "language_loss": 0.78933918, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81499672, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.20983887, + "step": 2866, + "time_per_iteration": 2.8096985816955566 + }, + { + "auxiliary_loss_clip": 0.01509468, + "auxiliary_loss_mlp": 0.01052209, + "balance_loss_clip": 1.3322506, + "balance_loss_mlp": 1.03027475, + "epoch": 0.17237336539906808, + "flos": 19685251999440.0, + "grad_norm": 1.6182896899125674, + "language_loss": 0.80184376, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82746059, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.21936035, + "step": 2867, + "time_per_iteration": 2.8797192573547363 + }, + { + "auxiliary_loss_clip": 0.01503823, + "auxiliary_loss_mlp": 0.01047304, + "balance_loss_clip": 1.32789207, + "balance_loss_mlp": 1.02619183, + "epoch": 0.17243348865173605, + "flos": 17534092026960.0, + "grad_norm": 1.649633296228963, + "language_loss": 0.79999, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.82550132, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.2109375, + "step": 2868, + "time_per_iteration": 2.8231751918792725 + }, + { + "auxiliary_loss_clip": 0.01513458, + "auxiliary_loss_mlp": 0.01048703, + "balance_loss_clip": 1.33424735, + "balance_loss_mlp": 1.02847314, + "epoch": 0.17249361190440402, + "flos": 18264459232800.0, + "grad_norm": 1.8416161000016558, + "language_loss": 0.83526689, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86088848, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.20227051, + "step": 2869, + "time_per_iteration": 3.0083024501800537 + }, + { + "auxiliary_loss_clip": 0.01497047, + "auxiliary_loss_mlp": 0.01050921, + "balance_loss_clip": 1.32124162, + "balance_loss_mlp": 1.02943993, + "epoch": 0.172553735157072, + "flos": 21178780984800.0, + "grad_norm": 1.8554819413262988, + "language_loss": 0.77437901, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79985869, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21472168, + "step": 2870, + "time_per_iteration": 2.7398152351379395 + }, + { + "auxiliary_loss_clip": 0.01494863, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_clip": 1.32555866, + "balance_loss_mlp": 1.0263288, + "epoch": 0.17261385840973997, + "flos": 22278392629560.0, + "grad_norm": 1.726598326670829, + "language_loss": 0.7757507, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.80114168, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.17907715, + "step": 2871, + "time_per_iteration": 2.849782705307007 + }, + { + "auxiliary_loss_clip": 0.01504921, + "auxiliary_loss_mlp": 0.01051533, + "balance_loss_clip": 1.32862461, + "balance_loss_mlp": 1.03110063, + "epoch": 0.17267398166240794, + "flos": 27927266613120.0, + "grad_norm": 2.4330167214034177, + "language_loss": 0.74734777, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.77291238, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.2043457, + "step": 2872, + "time_per_iteration": 2.7843894958496094 + }, + { + "auxiliary_loss_clip": 0.01495034, + "auxiliary_loss_mlp": 0.01043266, + "balance_loss_clip": 1.32283545, + "balance_loss_mlp": 1.02207029, + "epoch": 0.1727341049150759, + "flos": 21950226869760.0, + "grad_norm": 1.6208012687874442, + "language_loss": 0.82723689, + "learning_rate": 3.790066109323988e-06, + "loss": 0.85261989, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.21179199, + "step": 2873, + "time_per_iteration": 2.7821898460388184 + }, + { + "auxiliary_loss_clip": 0.0149696, + "auxiliary_loss_mlp": 0.01050093, + "balance_loss_clip": 1.32545185, + "balance_loss_mlp": 1.02888548, + "epoch": 0.17279422816774387, + "flos": 18111677290560.0, + "grad_norm": 2.1385868227963867, + "language_loss": 0.7536546, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77912509, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.21228027, + "step": 2874, + "time_per_iteration": 2.708406448364258 + }, + { + "auxiliary_loss_clip": 0.01502651, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.3270483, + "balance_loss_mlp": 1.02800941, + "epoch": 0.17285435142041183, + "flos": 21840513416280.0, + "grad_norm": 1.7852884674798364, + "language_loss": 0.80998981, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.83551043, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21411133, + "step": 2875, + "time_per_iteration": 2.8459818363189697 + }, + { + "auxiliary_loss_clip": 0.01504001, + "auxiliary_loss_mlp": 0.01051057, + "balance_loss_clip": 1.32634008, + "balance_loss_mlp": 1.03064823, + "epoch": 0.17291447467307983, + "flos": 18372954435480.0, + "grad_norm": 2.2491128264413893, + "language_loss": 0.87782413, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.90337473, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.20397949, + "step": 2876, + "time_per_iteration": 2.717259645462036 + }, + { + "auxiliary_loss_clip": 0.01504842, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_clip": 1.33070183, + "balance_loss_mlp": 1.02857065, + "epoch": 0.1729745979257478, + "flos": 18629277360480.0, + "grad_norm": 1.6957150728979151, + "language_loss": 0.84518266, + "learning_rate": 3.789370767013681e-06, + "loss": 0.87071955, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.20275879, + "step": 2877, + "time_per_iteration": 2.733896017074585 + }, + { + "auxiliary_loss_clip": 0.01502051, + "auxiliary_loss_mlp": 0.01048225, + "balance_loss_clip": 1.32810736, + "balance_loss_mlp": 1.02775621, + "epoch": 0.17303472117841576, + "flos": 23002831014840.0, + "grad_norm": 2.005750311577467, + "language_loss": 0.80025816, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.8257609, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.20471191, + "step": 2878, + "time_per_iteration": 2.8806920051574707 + }, + { + "auxiliary_loss_clip": 0.01496228, + "auxiliary_loss_mlp": 0.01051527, + "balance_loss_clip": 1.32373905, + "balance_loss_mlp": 1.03157103, + "epoch": 0.17309484443108372, + "flos": 25669682464320.0, + "grad_norm": 1.6936062216300247, + "language_loss": 0.70300603, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72848362, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.19958496, + "step": 2879, + "time_per_iteration": 2.800525665283203 + }, + { + "auxiliary_loss_clip": 0.01494298, + "auxiliary_loss_mlp": 0.01055316, + "balance_loss_clip": 1.31930757, + "balance_loss_mlp": 1.03589678, + "epoch": 0.1731549676837517, + "flos": 13556851606440.0, + "grad_norm": 1.850570602024796, + "language_loss": 0.83625853, + "learning_rate": 3.78884854780014e-06, + "loss": 0.86175466, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.1940918, + "step": 2880, + "time_per_iteration": 4.189151287078857 + }, + { + "auxiliary_loss_clip": 0.01502261, + "auxiliary_loss_mlp": 0.01049985, + "balance_loss_clip": 1.32508755, + "balance_loss_mlp": 1.02945709, + "epoch": 0.17321509093641965, + "flos": 22862272188960.0, + "grad_norm": 1.7923573046274566, + "language_loss": 0.81192458, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83744705, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.20532227, + "step": 2881, + "time_per_iteration": 2.825652837753296 + }, + { + "auxiliary_loss_clip": 0.01494707, + "auxiliary_loss_mlp": 0.01047952, + "balance_loss_clip": 1.31932068, + "balance_loss_mlp": 1.02838969, + "epoch": 0.17327521418908762, + "flos": 24358156459200.0, + "grad_norm": 2.017707986890441, + "language_loss": 0.77641141, + "learning_rate": 3.788500062480197e-06, + "loss": 0.8018381, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.19555664, + "step": 2882, + "time_per_iteration": 2.7329249382019043 + }, + { + "auxiliary_loss_clip": 0.0148937, + "auxiliary_loss_mlp": 0.01055365, + "balance_loss_clip": 1.31718659, + "balance_loss_mlp": 1.03593373, + "epoch": 0.1733353374417556, + "flos": 33111679888800.0, + "grad_norm": 1.8452602595303313, + "language_loss": 0.77063549, + "learning_rate": 3.788325718086769e-06, + "loss": 0.79608285, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.19433594, + "step": 2883, + "time_per_iteration": 2.8459200859069824 + }, + { + "auxiliary_loss_clip": 0.01486565, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.31402206, + "balance_loss_mlp": 1.02474856, + "epoch": 0.17339546069442358, + "flos": 24394280918400.0, + "grad_norm": 1.9430650749951328, + "language_loss": 0.85795301, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.88326341, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.19750977, + "step": 2884, + "time_per_iteration": 4.268373727798462 + }, + { + "auxiliary_loss_clip": 0.01491661, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.31759799, + "balance_loss_mlp": 1.0212996, + "epoch": 0.17345558394709154, + "flos": 27459963320040.0, + "grad_norm": 1.6466013843984126, + "language_loss": 0.74696803, + "learning_rate": 3.787976825866055e-06, + "loss": 0.77228963, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.1920166, + "step": 2885, + "time_per_iteration": 2.814075231552124 + }, + { + "auxiliary_loss_clip": 0.01486858, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.31730664, + "balance_loss_mlp": 1.0274229, + "epoch": 0.1735157071997595, + "flos": 24687702903240.0, + "grad_norm": 1.6267430333718966, + "language_loss": 0.70806074, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.73339784, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19421387, + "step": 2886, + "time_per_iteration": 2.851740837097168 + }, + { + "auxiliary_loss_clip": 0.01486678, + "auxiliary_loss_mlp": 0.01045567, + "balance_loss_clip": 1.31129241, + "balance_loss_mlp": 1.02570629, + "epoch": 0.17357583045242747, + "flos": 21693903944760.0, + "grad_norm": 2.3124026800153836, + "language_loss": 0.69889832, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.72422075, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.1986084, + "step": 2887, + "time_per_iteration": 4.334078550338745 + }, + { + "auxiliary_loss_clip": 0.01493215, + "auxiliary_loss_mlp": 0.01045787, + "balance_loss_clip": 1.31996107, + "balance_loss_mlp": 1.02622473, + "epoch": 0.17363595370509544, + "flos": 15379723994040.0, + "grad_norm": 1.8557596388621407, + "language_loss": 0.85439491, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87978488, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.19555664, + "step": 2888, + "time_per_iteration": 2.7770910263061523 + }, + { + "auxiliary_loss_clip": 0.01498814, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.32379651, + "balance_loss_mlp": 1.02598405, + "epoch": 0.1736960769577634, + "flos": 23446232965080.0, + "grad_norm": 2.019420099065322, + "language_loss": 0.78945911, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.81491631, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20910645, + "step": 2889, + "time_per_iteration": 4.301637649536133 + }, + { + "auxiliary_loss_clip": 0.0147936, + "auxiliary_loss_mlp": 0.01042795, + "balance_loss_clip": 1.31063139, + "balance_loss_mlp": 1.02288699, + "epoch": 0.1737562002104314, + "flos": 18592421950800.0, + "grad_norm": 1.9576783810233116, + "language_loss": 0.84146643, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86668801, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19909668, + "step": 2890, + "time_per_iteration": 2.9855399131774902 + }, + { + "auxiliary_loss_clip": 0.01496009, + "auxiliary_loss_mlp": 0.01053263, + "balance_loss_clip": 1.32175589, + "balance_loss_mlp": 1.03206754, + "epoch": 0.17381632346309936, + "flos": 16002408164400.0, + "grad_norm": 1.9417971329864612, + "language_loss": 0.83095634, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.85644901, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.2121582, + "step": 2891, + "time_per_iteration": 2.766779899597168 + }, + { + "auxiliary_loss_clip": 0.01500397, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.32216227, + "balance_loss_mlp": 1.02215707, + "epoch": 0.17387644671576732, + "flos": 13373670983760.0, + "grad_norm": 2.031386307193036, + "language_loss": 0.81564289, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.84107578, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.20727539, + "step": 2892, + "time_per_iteration": 2.755373477935791 + }, + { + "auxiliary_loss_clip": 0.01505611, + "auxiliary_loss_mlp": 0.01056383, + "balance_loss_clip": 1.32977962, + "balance_loss_mlp": 1.03442502, + "epoch": 0.1739365699684353, + "flos": 26621263344960.0, + "grad_norm": 2.1442816285499795, + "language_loss": 0.75125605, + "learning_rate": 3.786578545502627e-06, + "loss": 0.77687597, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21960449, + "step": 2893, + "time_per_iteration": 2.783676862716675 + }, + { + "auxiliary_loss_clip": 0.01496597, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_clip": 1.32137859, + "balance_loss_mlp": 1.0235579, + "epoch": 0.17399669322110325, + "flos": 23373253096200.0, + "grad_norm": 1.8721010959169933, + "language_loss": 0.82770896, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85310984, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.19934082, + "step": 2894, + "time_per_iteration": 2.7508206367492676 + }, + { + "auxiliary_loss_clip": 0.01492587, + "auxiliary_loss_mlp": 0.01049589, + "balance_loss_clip": 1.31762362, + "balance_loss_mlp": 1.02631927, + "epoch": 0.17405681647377122, + "flos": 22059168764400.0, + "grad_norm": 3.2419685383744965, + "language_loss": 0.74157858, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76700026, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.23291016, + "step": 2895, + "time_per_iteration": 2.7410168647766113 + }, + { + "auxiliary_loss_clip": 0.01367202, + "auxiliary_loss_mlp": 0.01015601, + "balance_loss_clip": 1.27026212, + "balance_loss_mlp": 1.01123762, + "epoch": 0.1741169397264392, + "flos": 61472375776800.0, + "grad_norm": 0.8678124831537863, + "language_loss": 0.62830198, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.65213001, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.04370117, + "step": 2896, + "time_per_iteration": 3.3350749015808105 + }, + { + "auxiliary_loss_clip": 0.01490453, + "auxiliary_loss_mlp": 0.0104161, + "balance_loss_clip": 1.31524873, + "balance_loss_mlp": 1.02170205, + "epoch": 0.17417706297910718, + "flos": 27023586616080.0, + "grad_norm": 2.1630408477327507, + "language_loss": 0.75802541, + "learning_rate": 3.785877779175034e-06, + "loss": 0.78334606, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.19909668, + "step": 2897, + "time_per_iteration": 2.8190829753875732 + }, + { + "auxiliary_loss_clip": 0.01487751, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_clip": 1.31762838, + "balance_loss_mlp": 1.028409, + "epoch": 0.17423718623177514, + "flos": 33515180802360.0, + "grad_norm": 1.8241499328492188, + "language_loss": 0.69525051, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.72061008, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.19799805, + "step": 2898, + "time_per_iteration": 2.84183931350708 + }, + { + "auxiliary_loss_clip": 0.01503799, + "auxiliary_loss_mlp": 0.01055843, + "balance_loss_clip": 1.32447779, + "balance_loss_mlp": 1.03524327, + "epoch": 0.1742973094844431, + "flos": 27204371345520.0, + "grad_norm": 2.1685473677516858, + "language_loss": 0.76679027, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.79238665, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.20617676, + "step": 2899, + "time_per_iteration": 2.8437674045562744 + }, + { + "auxiliary_loss_clip": 0.01480838, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.31135511, + "balance_loss_mlp": 1.01875186, + "epoch": 0.17435743273711107, + "flos": 22715865759240.0, + "grad_norm": 1.7986963170762214, + "language_loss": 0.73103762, + "learning_rate": 3.785351493339121e-06, + "loss": 0.756235, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20166016, + "step": 2900, + "time_per_iteration": 2.7680411338806152 + }, + { + "auxiliary_loss_clip": 0.014951, + "auxiliary_loss_mlp": 0.01055495, + "balance_loss_clip": 1.32194769, + "balance_loss_mlp": 1.03587294, + "epoch": 0.17441755598977904, + "flos": 41654019908520.0, + "grad_norm": 1.760179392056556, + "language_loss": 0.69895101, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72445691, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.19616699, + "step": 2901, + "time_per_iteration": 3.151236057281494 + }, + { + "auxiliary_loss_clip": 0.01498398, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_clip": 1.32168996, + "balance_loss_mlp": 1.03283238, + "epoch": 0.174477679242447, + "flos": 26292772718280.0, + "grad_norm": 1.6875278539203313, + "language_loss": 0.76369393, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78920603, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.1998291, + "step": 2902, + "time_per_iteration": 2.931210517883301 + }, + { + "auxiliary_loss_clip": 0.01497758, + "auxiliary_loss_mlp": 0.01056133, + "balance_loss_clip": 1.32277751, + "balance_loss_mlp": 1.03586709, + "epoch": 0.174537802495115, + "flos": 17862948128880.0, + "grad_norm": 2.0152350385669173, + "language_loss": 0.81565589, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.84119475, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20275879, + "step": 2903, + "time_per_iteration": 2.7788894176483154 + }, + { + "auxiliary_loss_clip": 0.01489745, + "auxiliary_loss_mlp": 0.01043083, + "balance_loss_clip": 1.31873918, + "balance_loss_mlp": 1.02374697, + "epoch": 0.17459792574778296, + "flos": 16944405472080.0, + "grad_norm": 1.6305361578399302, + "language_loss": 0.73902625, + "learning_rate": 3.784648831112429e-06, + "loss": 0.76435453, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.19311523, + "step": 2904, + "time_per_iteration": 2.7833199501037598 + }, + { + "auxiliary_loss_clip": 0.01483481, + "auxiliary_loss_mlp": 0.01050699, + "balance_loss_clip": 1.31080914, + "balance_loss_mlp": 1.03194749, + "epoch": 0.17465804900045093, + "flos": 25525509494400.0, + "grad_norm": 1.8854689079749036, + "language_loss": 0.65141004, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.67675185, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.18762207, + "step": 2905, + "time_per_iteration": 2.774430751800537 + }, + { + "auxiliary_loss_clip": 0.01508388, + "auxiliary_loss_mlp": 0.01055102, + "balance_loss_clip": 1.32757163, + "balance_loss_mlp": 1.03403747, + "epoch": 0.1747181722531189, + "flos": 24134506282800.0, + "grad_norm": 1.5889041191470967, + "language_loss": 0.79515249, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.82078743, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21069336, + "step": 2906, + "time_per_iteration": 2.8248772621154785 + }, + { + "auxiliary_loss_clip": 0.01494261, + "auxiliary_loss_mlp": 0.01055287, + "balance_loss_clip": 1.31895852, + "balance_loss_mlp": 1.03562927, + "epoch": 0.17477829550578686, + "flos": 17753315892120.0, + "grad_norm": 1.8213722618537826, + "language_loss": 0.81596851, + "learning_rate": 3.784121123841449e-06, + "loss": 0.84146392, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.19665527, + "step": 2907, + "time_per_iteration": 2.786625385284424 + }, + { + "auxiliary_loss_clip": 0.0148905, + "auxiliary_loss_mlp": 0.0105516, + "balance_loss_clip": 1.31243515, + "balance_loss_mlp": 1.03572893, + "epoch": 0.17483841875845482, + "flos": 15381551370240.0, + "grad_norm": 2.596928049550136, + "language_loss": 0.81864852, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.84409058, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.19445801, + "step": 2908, + "time_per_iteration": 2.749436378479004 + }, + { + "auxiliary_loss_clip": 0.01487497, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.31221974, + "balance_loss_mlp": 1.03283501, + "epoch": 0.17489854201112282, + "flos": 17167446523080.0, + "grad_norm": 2.597851238268183, + "language_loss": 0.80109328, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82650667, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.21020508, + "step": 2909, + "time_per_iteration": 2.7550532817840576 + }, + { + "auxiliary_loss_clip": 0.01489239, + "auxiliary_loss_mlp": 0.01047492, + "balance_loss_clip": 1.31336117, + "balance_loss_mlp": 1.0268445, + "epoch": 0.17495866526379078, + "flos": 19760018636160.0, + "grad_norm": 1.910005167436962, + "language_loss": 0.76737756, + "learning_rate": 3.783592807684017e-06, + "loss": 0.79274487, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20654297, + "step": 2910, + "time_per_iteration": 2.8204727172851562 + }, + { + "auxiliary_loss_clip": 0.01482989, + "auxiliary_loss_mlp": 0.01048731, + "balance_loss_clip": 1.30879998, + "balance_loss_mlp": 1.02832186, + "epoch": 0.17501878851645875, + "flos": 28516912559640.0, + "grad_norm": 1.7464046117625402, + "language_loss": 0.86942834, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89474547, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20422363, + "step": 2911, + "time_per_iteration": 2.9141016006469727 + }, + { + "auxiliary_loss_clip": 0.01484427, + "auxiliary_loss_mlp": 0.01047262, + "balance_loss_clip": 1.30960059, + "balance_loss_mlp": 1.02653122, + "epoch": 0.1750789117691267, + "flos": 17935724955960.0, + "grad_norm": 2.1139084139939586, + "language_loss": 0.90035999, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.92567682, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20715332, + "step": 2912, + "time_per_iteration": 2.778747797012329 + }, + { + "auxiliary_loss_clip": 0.01493032, + "auxiliary_loss_mlp": 0.01042582, + "balance_loss_clip": 1.31638229, + "balance_loss_mlp": 1.0225668, + "epoch": 0.17513903502179468, + "flos": 18263971932480.0, + "grad_norm": 2.625960533753207, + "language_loss": 0.73250997, + "learning_rate": 3.783063882820439e-06, + "loss": 0.75786608, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.20007324, + "step": 2913, + "time_per_iteration": 2.9343440532684326 + }, + { + "auxiliary_loss_clip": 0.01481274, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_clip": 1.3094101, + "balance_loss_mlp": 1.02523947, + "epoch": 0.17519915827446264, + "flos": 20709690923880.0, + "grad_norm": 2.0890756738500085, + "language_loss": 0.69367468, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71893692, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.19702148, + "step": 2914, + "time_per_iteration": 2.7710180282592773 + }, + { + "auxiliary_loss_clip": 0.01478321, + "auxiliary_loss_mlp": 0.01046122, + "balance_loss_clip": 1.30656075, + "balance_loss_mlp": 1.02653635, + "epoch": 0.1752592815271306, + "flos": 20528500110840.0, + "grad_norm": 2.0624461508240213, + "language_loss": 0.93606806, + "learning_rate": 3.782710928163772e-06, + "loss": 0.96131247, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.19592285, + "step": 2915, + "time_per_iteration": 2.8355929851531982 + }, + { + "auxiliary_loss_clip": 0.01468443, + "auxiliary_loss_mlp": 0.0104965, + "balance_loss_clip": 1.29830527, + "balance_loss_mlp": 1.02871644, + "epoch": 0.1753194047797986, + "flos": 21804267132000.0, + "grad_norm": 2.169139470877165, + "language_loss": 0.81177974, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83696067, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.20947266, + "step": 2916, + "time_per_iteration": 2.766028642654419 + }, + { + "auxiliary_loss_clip": 0.01479396, + "auxiliary_loss_mlp": 0.01052683, + "balance_loss_clip": 1.30478907, + "balance_loss_mlp": 1.03254855, + "epoch": 0.17537952803246656, + "flos": 20673322814520.0, + "grad_norm": 1.5710266528522312, + "language_loss": 0.73771924, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76304007, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.20129395, + "step": 2917, + "time_per_iteration": 2.7769861221313477 + }, + { + "auxiliary_loss_clip": 0.01472172, + "auxiliary_loss_mlp": 0.01045034, + "balance_loss_clip": 1.30315423, + "balance_loss_mlp": 1.02566242, + "epoch": 0.17543965128513453, + "flos": 23300354444040.0, + "grad_norm": 1.8696364434976318, + "language_loss": 0.77435637, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79952842, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19372559, + "step": 2918, + "time_per_iteration": 4.204146146774292 + }, + { + "auxiliary_loss_clip": 0.01485245, + "auxiliary_loss_mlp": 0.01040408, + "balance_loss_clip": 1.30771065, + "balance_loss_mlp": 1.02072644, + "epoch": 0.1754997745378025, + "flos": 29101563677880.0, + "grad_norm": 2.307675628388019, + "language_loss": 0.74401593, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76927245, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.19689941, + "step": 2919, + "time_per_iteration": 2.7914443016052246 + }, + { + "auxiliary_loss_clip": 0.0148216, + "auxiliary_loss_mlp": 0.01047563, + "balance_loss_clip": 1.30431688, + "balance_loss_mlp": 1.02779853, + "epoch": 0.17555989779047046, + "flos": 30377858607720.0, + "grad_norm": 2.1101483000928285, + "language_loss": 0.74559855, + "learning_rate": 3.781827358629228e-06, + "loss": 0.77089584, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.19763184, + "step": 2920, + "time_per_iteration": 2.844735622406006 + }, + { + "auxiliary_loss_clip": 0.01462677, + "auxiliary_loss_mlp": 0.01041884, + "balance_loss_clip": 1.29327917, + "balance_loss_mlp": 1.02363312, + "epoch": 0.17562002104313842, + "flos": 23292070338600.0, + "grad_norm": 3.593171126356078, + "language_loss": 0.8052448, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.83029044, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.18237305, + "step": 2921, + "time_per_iteration": 2.8236289024353027 + }, + { + "auxiliary_loss_clip": 0.01483938, + "auxiliary_loss_mlp": 0.01053985, + "balance_loss_clip": 1.3062005, + "balance_loss_mlp": 1.03326607, + "epoch": 0.1756801442958064, + "flos": 24795913847400.0, + "grad_norm": 1.6068645585695913, + "language_loss": 0.87909859, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.90447778, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.20703125, + "step": 2922, + "time_per_iteration": 3.00840425491333 + }, + { + "auxiliary_loss_clip": 0.01472093, + "auxiliary_loss_mlp": 0.010485, + "balance_loss_clip": 1.29803109, + "balance_loss_mlp": 1.0290097, + "epoch": 0.17574026754847438, + "flos": 25776675157680.0, + "grad_norm": 3.9492774237524433, + "language_loss": 0.6241833, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64938927, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.19482422, + "step": 2923, + "time_per_iteration": 4.296422243118286 + }, + { + "auxiliary_loss_clip": 0.01477643, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_clip": 1.30360532, + "balance_loss_mlp": 1.02502668, + "epoch": 0.17580039080114235, + "flos": 17461030941360.0, + "grad_norm": 2.3224746622708707, + "language_loss": 0.81183875, + "learning_rate": 3.78111928675413e-06, + "loss": 0.83706045, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.19506836, + "step": 2924, + "time_per_iteration": 2.8921449184417725 + }, + { + "auxiliary_loss_clip": 0.01478861, + "auxiliary_loss_mlp": 0.0105131, + "balance_loss_clip": 1.30115485, + "balance_loss_mlp": 1.02923226, + "epoch": 0.1758605140538103, + "flos": 14868540045000.0, + "grad_norm": 2.323283206911062, + "language_loss": 0.71300226, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73830396, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.22094727, + "step": 2925, + "time_per_iteration": 4.355324029922485 + }, + { + "auxiliary_loss_clip": 0.01465381, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_clip": 1.29427576, + "balance_loss_mlp": 1.02578914, + "epoch": 0.17592063730647828, + "flos": 23009653219320.0, + "grad_norm": 1.5668869280989284, + "language_loss": 0.71760362, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74270129, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.18615723, + "step": 2926, + "time_per_iteration": 2.9658141136169434 + }, + { + "auxiliary_loss_clip": 0.01480289, + "auxiliary_loss_mlp": 0.01045628, + "balance_loss_clip": 1.30257058, + "balance_loss_mlp": 1.02450371, + "epoch": 0.17598076055914624, + "flos": 20746505725200.0, + "grad_norm": 1.707703550189788, + "language_loss": 0.85134983, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.87660897, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21130371, + "step": 2927, + "time_per_iteration": 2.8344593048095703 + }, + { + "auxiliary_loss_clip": 0.01470731, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.2995441, + "balance_loss_mlp": 1.02667201, + "epoch": 0.1760408838118142, + "flos": 34098816711600.0, + "grad_norm": 2.0596519119789933, + "language_loss": 0.71862876, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74378043, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1776123, + "step": 2928, + "time_per_iteration": 4.444894790649414 + }, + { + "auxiliary_loss_clip": 0.01466389, + "auxiliary_loss_mlp": 0.01035872, + "balance_loss_clip": 1.29738164, + "balance_loss_mlp": 1.01751411, + "epoch": 0.1761010070644822, + "flos": 24173473327200.0, + "grad_norm": 1.7448537900958359, + "language_loss": 0.83334541, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85836804, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.18359375, + "step": 2929, + "time_per_iteration": 2.9154467582702637 + }, + { + "auxiliary_loss_clip": 0.01467893, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.29473972, + "balance_loss_mlp": 1.01902843, + "epoch": 0.17616113031715017, + "flos": 26582133867120.0, + "grad_norm": 1.65483462341755, + "language_loss": 0.79807949, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.82313561, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.18701172, + "step": 2930, + "time_per_iteration": 2.921298027038574 + }, + { + "auxiliary_loss_clip": 0.01474024, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.30010653, + "balance_loss_mlp": 1.0156436, + "epoch": 0.17622125356981813, + "flos": 25672281399360.0, + "grad_norm": 1.9733578706646244, + "language_loss": 0.76890457, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.79400212, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20080566, + "step": 2931, + "time_per_iteration": 2.899199962615967 + }, + { + "auxiliary_loss_clip": 0.01469223, + "auxiliary_loss_mlp": 0.01044279, + "balance_loss_clip": 1.29699838, + "balance_loss_mlp": 1.02595615, + "epoch": 0.1762813768224861, + "flos": 16512901771320.0, + "grad_norm": 4.245483266400243, + "language_loss": 0.75793207, + "learning_rate": 3.779699901503696e-06, + "loss": 0.78306711, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.18322754, + "step": 2932, + "time_per_iteration": 2.796865224838257 + }, + { + "auxiliary_loss_clip": 0.01483075, + "auxiliary_loss_mlp": 0.01048953, + "balance_loss_clip": 1.30172181, + "balance_loss_mlp": 1.02850831, + "epoch": 0.17634150007515406, + "flos": 11214714206160.0, + "grad_norm": 2.5570279141882732, + "language_loss": 0.90492404, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.93024433, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.20471191, + "step": 2933, + "time_per_iteration": 2.7859504222869873 + }, + { + "auxiliary_loss_clip": 0.01463651, + "auxiliary_loss_mlp": 0.01044458, + "balance_loss_clip": 1.29343033, + "balance_loss_mlp": 1.02562332, + "epoch": 0.17640162332782203, + "flos": 23665253788440.0, + "grad_norm": 1.6711447912619064, + "language_loss": 0.87885314, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90393424, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.18811035, + "step": 2934, + "time_per_iteration": 2.953944683074951 + }, + { + "auxiliary_loss_clip": 0.01467748, + "auxiliary_loss_mlp": 0.01044685, + "balance_loss_clip": 1.29822755, + "balance_loss_mlp": 1.02701771, + "epoch": 0.17646174658049, + "flos": 53806223894400.0, + "grad_norm": 1.6820469139769003, + "language_loss": 0.71210468, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73722899, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.17663574, + "step": 2935, + "time_per_iteration": 3.0845112800598145 + }, + { + "auxiliary_loss_clip": 0.0148672, + "auxiliary_loss_mlp": 0.01040573, + "balance_loss_clip": 1.30819535, + "balance_loss_mlp": 1.02072465, + "epoch": 0.17652186983315798, + "flos": 24249295781280.0, + "grad_norm": 2.166918411822685, + "language_loss": 0.70997763, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.73525053, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.19836426, + "step": 2936, + "time_per_iteration": 2.821289300918579 + }, + { + "auxiliary_loss_clip": 0.01478814, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.30811191, + "balance_loss_mlp": 1.01944613, + "epoch": 0.17658199308582595, + "flos": 27460085145120.0, + "grad_norm": 2.007950934123713, + "language_loss": 0.72053599, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.74570322, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.18493652, + "step": 2937, + "time_per_iteration": 2.901578903198242 + }, + { + "auxiliary_loss_clip": 0.01481507, + "auxiliary_loss_mlp": 0.010466, + "balance_loss_clip": 1.30252218, + "balance_loss_mlp": 1.02616787, + "epoch": 0.17664211633849392, + "flos": 22423743241920.0, + "grad_norm": 2.0384924359323824, + "language_loss": 0.75853252, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.7838136, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.2043457, + "step": 2938, + "time_per_iteration": 2.90342378616333 + }, + { + "auxiliary_loss_clip": 0.0147764, + "auxiliary_loss_mlp": 0.01041552, + "balance_loss_clip": 1.30274081, + "balance_loss_mlp": 1.02220488, + "epoch": 0.17670223959116188, + "flos": 24720213218400.0, + "grad_norm": 3.7156694014018745, + "language_loss": 0.70968163, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73487353, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.19348145, + "step": 2939, + "time_per_iteration": 2.870412826538086 + }, + { + "auxiliary_loss_clip": 0.01477754, + "auxiliary_loss_mlp": 0.01043846, + "balance_loss_clip": 1.30518484, + "balance_loss_mlp": 1.02424777, + "epoch": 0.17676236284382985, + "flos": 22531994794440.0, + "grad_norm": 2.7305704123785635, + "language_loss": 0.73995906, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.76517504, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.19592285, + "step": 2940, + "time_per_iteration": 2.859726667404175 + }, + { + "auxiliary_loss_clip": 0.01484997, + "auxiliary_loss_mlp": 0.01049162, + "balance_loss_clip": 1.30925488, + "balance_loss_mlp": 1.02866995, + "epoch": 0.1768224860964978, + "flos": 12388930054200.0, + "grad_norm": 1.9906664830144878, + "language_loss": 0.8636905, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.88903213, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20495605, + "step": 2941, + "time_per_iteration": 2.8219962120056152 + }, + { + "auxiliary_loss_clip": 0.01477092, + "auxiliary_loss_mlp": 0.01047539, + "balance_loss_clip": 1.30226243, + "balance_loss_mlp": 1.0285368, + "epoch": 0.1768826093491658, + "flos": 24358806192960.0, + "grad_norm": 2.1175338064765623, + "language_loss": 0.77747792, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.80272418, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.19018555, + "step": 2942, + "time_per_iteration": 2.970123052597046 + }, + { + "auxiliary_loss_clip": 0.01491208, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.31314647, + "balance_loss_mlp": 1.02879655, + "epoch": 0.17694273260183377, + "flos": 23592395744640.0, + "grad_norm": 1.6703297231317957, + "language_loss": 0.80487561, + "learning_rate": 3.77774119516197e-06, + "loss": 0.83027661, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.2010498, + "step": 2943, + "time_per_iteration": 2.904071569442749 + }, + { + "auxiliary_loss_clip": 0.01493613, + "auxiliary_loss_mlp": 0.01051432, + "balance_loss_clip": 1.31296468, + "balance_loss_mlp": 1.02986765, + "epoch": 0.17700285585450173, + "flos": 26766695174040.0, + "grad_norm": 1.9164687982262862, + "language_loss": 0.8069253, + "learning_rate": 3.777562726341155e-06, + "loss": 0.83237576, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.21569824, + "step": 2944, + "time_per_iteration": 2.9193930625915527 + }, + { + "auxiliary_loss_clip": 0.01475192, + "auxiliary_loss_mlp": 0.01051976, + "balance_loss_clip": 1.29934669, + "balance_loss_mlp": 1.03138852, + "epoch": 0.1770629791071697, + "flos": 42783867800280.0, + "grad_norm": 1.7490139952063628, + "language_loss": 0.74024236, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.76551402, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.20581055, + "step": 2945, + "time_per_iteration": 3.113819122314453 + }, + { + "auxiliary_loss_clip": 0.01487233, + "auxiliary_loss_mlp": 0.01048024, + "balance_loss_clip": 1.31424248, + "balance_loss_mlp": 1.02836633, + "epoch": 0.17712310235983766, + "flos": 17349774370200.0, + "grad_norm": 4.003480909641047, + "language_loss": 0.78051996, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80587256, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.1965332, + "step": 2946, + "time_per_iteration": 2.8394277095794678 + }, + { + "auxiliary_loss_clip": 0.01474072, + "auxiliary_loss_mlp": 0.01056126, + "balance_loss_clip": 1.30129695, + "balance_loss_mlp": 1.0359081, + "epoch": 0.17718322561250563, + "flos": 23883787311480.0, + "grad_norm": 1.7335523140204572, + "language_loss": 0.76444578, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78974777, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20214844, + "step": 2947, + "time_per_iteration": 2.846041202545166 + }, + { + "auxiliary_loss_clip": 0.01480949, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.30685401, + "balance_loss_mlp": 1.0211693, + "epoch": 0.1772433488651736, + "flos": 36473342601960.0, + "grad_norm": 2.0651707947977234, + "language_loss": 0.73060453, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.75582486, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.19934082, + "step": 2948, + "time_per_iteration": 2.983330011367798 + }, + { + "auxiliary_loss_clip": 0.01470723, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.29840004, + "balance_loss_mlp": 1.01909399, + "epoch": 0.1773034721178416, + "flos": 26689979336040.0, + "grad_norm": 1.8667450582442813, + "language_loss": 0.82195455, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84704989, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.19714355, + "step": 2949, + "time_per_iteration": 3.0042195320129395 + }, + { + "auxiliary_loss_clip": 0.01323264, + "auxiliary_loss_mlp": 0.01017955, + "balance_loss_clip": 1.22787356, + "balance_loss_mlp": 1.01247108, + "epoch": 0.17736359537050955, + "flos": 57131941563000.0, + "grad_norm": 0.8720140973838556, + "language_loss": 0.64994252, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67335463, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.05493164, + "step": 2950, + "time_per_iteration": 3.4011473655700684 + }, + { + "auxiliary_loss_clip": 0.01473555, + "auxiliary_loss_mlp": 0.01045428, + "balance_loss_clip": 1.29997253, + "balance_loss_mlp": 1.0256629, + "epoch": 0.17742371862317752, + "flos": 27203884045200.0, + "grad_norm": 1.6038387613207286, + "language_loss": 0.83924001, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.86442983, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.19775391, + "step": 2951, + "time_per_iteration": 2.8545687198638916 + }, + { + "auxiliary_loss_clip": 0.01483929, + "auxiliary_loss_mlp": 0.01050067, + "balance_loss_clip": 1.30518389, + "balance_loss_mlp": 1.03001606, + "epoch": 0.17748384187584548, + "flos": 20964633164640.0, + "grad_norm": 5.640489257346146, + "language_loss": 0.81012154, + "learning_rate": 3.776132549750806e-06, + "loss": 0.8354615, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20056152, + "step": 2952, + "time_per_iteration": 2.879833936691284 + }, + { + "auxiliary_loss_clip": 0.01477884, + "auxiliary_loss_mlp": 0.01050158, + "balance_loss_clip": 1.30353808, + "balance_loss_mlp": 1.02963042, + "epoch": 0.17754396512851345, + "flos": 25015503187800.0, + "grad_norm": 2.151540202636006, + "language_loss": 0.79954362, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82482398, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.20532227, + "step": 2953, + "time_per_iteration": 2.834095001220703 + }, + { + "auxiliary_loss_clip": 0.01480896, + "auxiliary_loss_mlp": 0.01043805, + "balance_loss_clip": 1.30375075, + "balance_loss_mlp": 1.02467227, + "epoch": 0.1776040883811814, + "flos": 32057979318000.0, + "grad_norm": 1.7300683860712898, + "language_loss": 0.88177609, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90702307, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.19140625, + "step": 2954, + "time_per_iteration": 2.871502161026001 + }, + { + "auxiliary_loss_clip": 0.01479855, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.30538154, + "balance_loss_mlp": 1.03487253, + "epoch": 0.17766421163384938, + "flos": 21578383495800.0, + "grad_norm": 2.4659275199195925, + "language_loss": 0.85240221, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87776059, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.21118164, + "step": 2955, + "time_per_iteration": 2.771580457687378 + }, + { + "auxiliary_loss_clip": 0.01474409, + "auxiliary_loss_mlp": 0.01049209, + "balance_loss_clip": 1.30029523, + "balance_loss_mlp": 1.02890766, + "epoch": 0.17772433488651737, + "flos": 22424474192400.0, + "grad_norm": 1.6979105300306148, + "language_loss": 0.71110231, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.7363385, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20324707, + "step": 2956, + "time_per_iteration": 2.986534595489502 + }, + { + "auxiliary_loss_clip": 0.01467937, + "auxiliary_loss_mlp": 0.01048676, + "balance_loss_clip": 1.29629338, + "balance_loss_mlp": 1.02835071, + "epoch": 0.17778445813918534, + "flos": 25634938689360.0, + "grad_norm": 1.7855965235948508, + "language_loss": 0.83687592, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.86204207, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.203125, + "step": 2957, + "time_per_iteration": 4.201056003570557 + }, + { + "auxiliary_loss_clip": 0.01471812, + "auxiliary_loss_mlp": 0.01046227, + "balance_loss_clip": 1.29837656, + "balance_loss_mlp": 1.02753544, + "epoch": 0.1778445813918533, + "flos": 25634288955600.0, + "grad_norm": 1.6894185151767573, + "language_loss": 0.75146794, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77664834, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.18688965, + "step": 2958, + "time_per_iteration": 2.7732996940612793 + }, + { + "auxiliary_loss_clip": 0.01485218, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.30878723, + "balance_loss_mlp": 1.02799714, + "epoch": 0.17790470464452127, + "flos": 22350560331240.0, + "grad_norm": 2.4016939672201505, + "language_loss": 0.80726981, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.83259904, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.19689941, + "step": 2959, + "time_per_iteration": 2.810316801071167 + }, + { + "auxiliary_loss_clip": 0.01487563, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.30911064, + "balance_loss_mlp": 1.02407169, + "epoch": 0.17796482789718923, + "flos": 18769917403080.0, + "grad_norm": 1.6855446917055703, + "language_loss": 0.52037668, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54570508, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21203613, + "step": 2960, + "time_per_iteration": 2.7538106441497803 + }, + { + "auxiliary_loss_clip": 0.01476854, + "auxiliary_loss_mlp": 0.01049706, + "balance_loss_clip": 1.3020978, + "balance_loss_mlp": 1.02978647, + "epoch": 0.1780249511498572, + "flos": 23446232965080.0, + "grad_norm": 1.7471649535718778, + "language_loss": 0.89291811, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91818368, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.19909668, + "step": 2961, + "time_per_iteration": 2.880011558532715 + }, + { + "auxiliary_loss_clip": 0.01480081, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.30343246, + "balance_loss_mlp": 1.03039336, + "epoch": 0.1780850744025252, + "flos": 23372684579160.0, + "grad_norm": 1.7190166501348008, + "language_loss": 0.78984439, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81516576, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21655273, + "step": 2962, + "time_per_iteration": 4.23615837097168 + }, + { + "auxiliary_loss_clip": 0.01480286, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.30414402, + "balance_loss_mlp": 1.02878857, + "epoch": 0.17814519765519315, + "flos": 13775994254880.0, + "grad_norm": 1.6811328150875096, + "language_loss": 0.75099671, + "learning_rate": 3.774159019458203e-06, + "loss": 0.77629894, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.21166992, + "step": 2963, + "time_per_iteration": 2.7573227882385254 + }, + { + "auxiliary_loss_clip": 0.01493038, + "auxiliary_loss_mlp": 0.01047341, + "balance_loss_clip": 1.31193006, + "balance_loss_mlp": 1.02481031, + "epoch": 0.17820532090786112, + "flos": 21980828592000.0, + "grad_norm": 1.706417202612554, + "language_loss": 0.78822845, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.81363225, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.2253418, + "step": 2964, + "time_per_iteration": 4.40155553817749 + }, + { + "auxiliary_loss_clip": 0.01480625, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_clip": 1.30455399, + "balance_loss_mlp": 1.02929091, + "epoch": 0.17826544416052909, + "flos": 24796198105920.0, + "grad_norm": 1.5460030491406969, + "language_loss": 0.81553411, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.84084249, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.20922852, + "step": 2965, + "time_per_iteration": 2.820354461669922 + }, + { + "auxiliary_loss_clip": 0.01483031, + "auxiliary_loss_mlp": 0.01049689, + "balance_loss_clip": 1.30991006, + "balance_loss_mlp": 1.03017402, + "epoch": 0.17832556741319705, + "flos": 13883352423480.0, + "grad_norm": 2.232543397018845, + "language_loss": 0.95588368, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.98121083, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.19506836, + "step": 2966, + "time_per_iteration": 2.829721450805664 + }, + { + "auxiliary_loss_clip": 0.01483165, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.30901074, + "balance_loss_mlp": 1.02919436, + "epoch": 0.17838569066586502, + "flos": 36647792427240.0, + "grad_norm": 2.134234603293605, + "language_loss": 0.72995645, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.75529093, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.2109375, + "step": 2967, + "time_per_iteration": 4.3713836669921875 + }, + { + "auxiliary_loss_clip": 0.01473673, + "auxiliary_loss_mlp": 0.01048447, + "balance_loss_clip": 1.30313754, + "balance_loss_mlp": 1.02840829, + "epoch": 0.17844581391853298, + "flos": 18731031575400.0, + "grad_norm": 1.998841145450323, + "language_loss": 0.77093196, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79615319, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20043945, + "step": 2968, + "time_per_iteration": 2.819463014602661 + }, + { + "auxiliary_loss_clip": 0.01477904, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_clip": 1.30604124, + "balance_loss_mlp": 1.02663767, + "epoch": 0.17850593717120097, + "flos": 27383653565640.0, + "grad_norm": 1.815844107653993, + "language_loss": 0.76536357, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.7906056, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.19665527, + "step": 2969, + "time_per_iteration": 2.8040194511413574 + }, + { + "auxiliary_loss_clip": 0.01322985, + "auxiliary_loss_mlp": 0.01016174, + "balance_loss_clip": 1.23224688, + "balance_loss_mlp": 1.01121473, + "epoch": 0.17856606042386894, + "flos": 67010602314600.0, + "grad_norm": 0.8397136398399154, + "language_loss": 0.68945777, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71284938, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.04956055, + "step": 2970, + "time_per_iteration": 3.3348095417022705 + }, + { + "auxiliary_loss_clip": 0.01484025, + "auxiliary_loss_mlp": 0.01049804, + "balance_loss_clip": 1.30624545, + "balance_loss_mlp": 1.02909732, + "epoch": 0.1786261836765369, + "flos": 36984039250680.0, + "grad_norm": 2.584925097887706, + "language_loss": 0.68034041, + "learning_rate": 3.772718611185505e-06, + "loss": 0.7056787, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20715332, + "step": 2971, + "time_per_iteration": 2.924147605895996 + }, + { + "auxiliary_loss_clip": 0.01475143, + "auxiliary_loss_mlp": 0.01050109, + "balance_loss_clip": 1.3011477, + "balance_loss_mlp": 1.02875876, + "epoch": 0.17868630692920487, + "flos": 24830657622360.0, + "grad_norm": 1.7200894526306458, + "language_loss": 0.89874303, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.92399555, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21350098, + "step": 2972, + "time_per_iteration": 2.9256300926208496 + }, + { + "auxiliary_loss_clip": 0.01481672, + "auxiliary_loss_mlp": 0.0105034, + "balance_loss_clip": 1.30715156, + "balance_loss_mlp": 1.03045547, + "epoch": 0.17874643018187283, + "flos": 16986296318400.0, + "grad_norm": 1.997052364312841, + "language_loss": 0.88678986, + "learning_rate": 3.77235783676401e-06, + "loss": 0.91210997, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.19897461, + "step": 2973, + "time_per_iteration": 2.760465145111084 + }, + { + "auxiliary_loss_clip": 0.01480508, + "auxiliary_loss_mlp": 0.01053318, + "balance_loss_clip": 1.30819821, + "balance_loss_mlp": 1.03221822, + "epoch": 0.1788065534345408, + "flos": 21037125733200.0, + "grad_norm": 1.93800163449576, + "language_loss": 0.76048017, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.7858184, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21105957, + "step": 2974, + "time_per_iteration": 2.7651824951171875 + }, + { + "auxiliary_loss_clip": 0.01478176, + "auxiliary_loss_mlp": 0.01056997, + "balance_loss_clip": 1.30605841, + "balance_loss_mlp": 1.03604031, + "epoch": 0.17886667668720876, + "flos": 23993013464640.0, + "grad_norm": 2.439373999329665, + "language_loss": 0.74933481, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.77468657, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20959473, + "step": 2975, + "time_per_iteration": 2.814206123352051 + }, + { + "auxiliary_loss_clip": 0.01474889, + "auxiliary_loss_mlp": 0.01052386, + "balance_loss_clip": 1.30310452, + "balance_loss_mlp": 1.03243065, + "epoch": 0.17892679993987676, + "flos": 25745058226440.0, + "grad_norm": 1.7008994338288652, + "language_loss": 0.7341637, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75943649, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.19958496, + "step": 2976, + "time_per_iteration": 2.7747256755828857 + }, + { + "auxiliary_loss_clip": 0.0145865, + "auxiliary_loss_mlp": 0.01044774, + "balance_loss_clip": 1.2934866, + "balance_loss_mlp": 1.02781034, + "epoch": 0.17898692319254472, + "flos": 25704832322880.0, + "grad_norm": 1.4769275231892014, + "language_loss": 0.77156687, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79660106, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.16967773, + "step": 2977, + "time_per_iteration": 2.812195062637329 + }, + { + "auxiliary_loss_clip": 0.0148368, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.31327653, + "balance_loss_mlp": 1.02884984, + "epoch": 0.1790470464452127, + "flos": 19322504898120.0, + "grad_norm": 1.8889576452698753, + "language_loss": 0.80094433, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.82625949, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.19006348, + "step": 2978, + "time_per_iteration": 2.866867780685425 + }, + { + "auxiliary_loss_clip": 0.01484186, + "auxiliary_loss_mlp": 0.01051261, + "balance_loss_clip": 1.30822885, + "balance_loss_mlp": 1.03019667, + "epoch": 0.17910716969788065, + "flos": 30050342581680.0, + "grad_norm": 1.4692212897840573, + "language_loss": 0.76382643, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78918093, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21069336, + "step": 2979, + "time_per_iteration": 2.836344003677368 + }, + { + "auxiliary_loss_clip": 0.01470945, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_clip": 1.30186558, + "balance_loss_mlp": 1.02734184, + "epoch": 0.17916729295054862, + "flos": 19432502610120.0, + "grad_norm": 1.8206928294106501, + "language_loss": 0.70020449, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.72538376, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19641113, + "step": 2980, + "time_per_iteration": 2.8142051696777344 + }, + { + "auxiliary_loss_clip": 0.01476781, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_clip": 1.3037591, + "balance_loss_mlp": 1.01995158, + "epoch": 0.17922741620321658, + "flos": 14615709438960.0, + "grad_norm": 2.066887463701619, + "language_loss": 0.71196973, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73715973, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.22265625, + "step": 2981, + "time_per_iteration": 2.7345950603485107 + }, + { + "auxiliary_loss_clip": 0.01490914, + "auxiliary_loss_mlp": 0.0105562, + "balance_loss_clip": 1.31636286, + "balance_loss_mlp": 1.03505683, + "epoch": 0.17928753945588458, + "flos": 17169761199600.0, + "grad_norm": 2.5864910704600153, + "language_loss": 0.82764018, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.85310549, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.20568848, + "step": 2982, + "time_per_iteration": 2.771158218383789 + }, + { + "auxiliary_loss_clip": 0.01471547, + "auxiliary_loss_mlp": 0.0104153, + "balance_loss_clip": 1.30081677, + "balance_loss_mlp": 1.02041745, + "epoch": 0.17934766270855254, + "flos": 31402053882000.0, + "grad_norm": 1.5181364241552449, + "language_loss": 0.83064854, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85577929, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.2109375, + "step": 2983, + "time_per_iteration": 2.8559794425964355 + }, + { + "auxiliary_loss_clip": 0.01485177, + "auxiliary_loss_mlp": 0.01049642, + "balance_loss_clip": 1.30876446, + "balance_loss_mlp": 1.02879262, + "epoch": 0.1794077859612205, + "flos": 20819160727200.0, + "grad_norm": 2.191708876170719, + "language_loss": 0.8570739, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.88242209, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.20837402, + "step": 2984, + "time_per_iteration": 2.8482120037078857 + }, + { + "auxiliary_loss_clip": 0.0147667, + "auxiliary_loss_mlp": 0.01044994, + "balance_loss_clip": 1.30373716, + "balance_loss_mlp": 1.02418017, + "epoch": 0.17946790921388847, + "flos": 28992418741440.0, + "grad_norm": 1.4592105842765826, + "language_loss": 0.89744186, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.9226585, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20812988, + "step": 2985, + "time_per_iteration": 2.895811080932617 + }, + { + "auxiliary_loss_clip": 0.01463911, + "auxiliary_loss_mlp": 0.01041842, + "balance_loss_clip": 1.29686809, + "balance_loss_mlp": 1.02406764, + "epoch": 0.17952803246655644, + "flos": 20741876372160.0, + "grad_norm": 2.0153931323726284, + "language_loss": 0.7033335, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72839105, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.17773438, + "step": 2986, + "time_per_iteration": 2.8075320720672607 + }, + { + "auxiliary_loss_clip": 0.01464526, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.29580605, + "balance_loss_mlp": 1.02618802, + "epoch": 0.1795881557192244, + "flos": 28261767277080.0, + "grad_norm": 2.129192999200559, + "language_loss": 0.77563208, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80073744, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19824219, + "step": 2987, + "time_per_iteration": 2.8505053520202637 + }, + { + "auxiliary_loss_clip": 0.0147955, + "auxiliary_loss_mlp": 0.01043696, + "balance_loss_clip": 1.30675256, + "balance_loss_mlp": 1.02252436, + "epoch": 0.17964827897189237, + "flos": 18556703575200.0, + "grad_norm": 1.8412559579638734, + "language_loss": 0.78519905, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.81043148, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.21179199, + "step": 2988, + "time_per_iteration": 2.743903398513794 + }, + { + "auxiliary_loss_clip": 0.0132523, + "auxiliary_loss_mlp": 0.01004885, + "balance_loss_clip": 1.23633873, + "balance_loss_mlp": 1.00004518, + "epoch": 0.17970840222456036, + "flos": 58178332629000.0, + "grad_norm": 0.7582568331841986, + "language_loss": 0.62711209, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.65041327, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.04833984, + "step": 2989, + "time_per_iteration": 3.2663676738739014 + }, + { + "auxiliary_loss_clip": 0.01471806, + "auxiliary_loss_mlp": 0.01045509, + "balance_loss_clip": 1.30136466, + "balance_loss_mlp": 1.02593529, + "epoch": 0.17976852547722832, + "flos": 20305377843120.0, + "grad_norm": 1.761041414407085, + "language_loss": 0.7099942, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.73516726, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.19580078, + "step": 2990, + "time_per_iteration": 2.789720296859741 + }, + { + "auxiliary_loss_clip": 0.01472585, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.29808855, + "balance_loss_mlp": 1.02135158, + "epoch": 0.1798286487298963, + "flos": 39675766643640.0, + "grad_norm": 2.554774484912695, + "language_loss": 0.69159532, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.71673942, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.20471191, + "step": 2991, + "time_per_iteration": 2.932065010070801 + }, + { + "auxiliary_loss_clip": 0.01467692, + "auxiliary_loss_mlp": 0.01042723, + "balance_loss_clip": 1.29577565, + "balance_loss_mlp": 1.02103901, + "epoch": 0.17988877198256426, + "flos": 25525834361280.0, + "grad_norm": 1.753991264791619, + "language_loss": 0.83106327, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.85616744, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21691895, + "step": 2992, + "time_per_iteration": 2.8057732582092285 + }, + { + "auxiliary_loss_clip": 0.01454617, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_clip": 1.28952157, + "balance_loss_mlp": 1.02679777, + "epoch": 0.17994889523523222, + "flos": 18812214333000.0, + "grad_norm": 1.9429951463486421, + "language_loss": 0.82597274, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.85097378, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18701172, + "step": 2993, + "time_per_iteration": 2.7662105560302734 + }, + { + "auxiliary_loss_clip": 0.01464386, + "auxiliary_loss_mlp": 0.01045275, + "balance_loss_clip": 1.29305291, + "balance_loss_mlp": 1.0250814, + "epoch": 0.18000901848790019, + "flos": 21109252826520.0, + "grad_norm": 1.6187456044048547, + "language_loss": 0.79058385, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.8156805, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.2019043, + "step": 2994, + "time_per_iteration": 2.7685911655426025 + }, + { + "auxiliary_loss_clip": 0.01472631, + "auxiliary_loss_mlp": 0.01042571, + "balance_loss_clip": 1.30196214, + "balance_loss_mlp": 1.0235455, + "epoch": 0.18006914174056818, + "flos": 19651239174960.0, + "grad_norm": 1.940171358956347, + "language_loss": 0.80701602, + "learning_rate": 3.768371587287296e-06, + "loss": 0.83216804, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19018555, + "step": 2995, + "time_per_iteration": 2.7065608501434326 + }, + { + "auxiliary_loss_clip": 0.01471281, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.30010009, + "balance_loss_mlp": 1.0351125, + "epoch": 0.18012926499323614, + "flos": 19504589095080.0, + "grad_norm": 1.533231318788085, + "language_loss": 0.8468591, + "learning_rate": 3.768189622421512e-06, + "loss": 0.87210685, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.18371582, + "step": 2996, + "time_per_iteration": 4.155292749404907 + }, + { + "auxiliary_loss_clip": 0.01453621, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.28684497, + "balance_loss_mlp": 1.02265251, + "epoch": 0.1801893882459041, + "flos": 19469398628160.0, + "grad_norm": 1.8490668475346397, + "language_loss": 0.8845126, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90946132, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18591309, + "step": 2997, + "time_per_iteration": 2.8382155895233154 + }, + { + "auxiliary_loss_clip": 0.01475229, + "auxiliary_loss_mlp": 0.01054728, + "balance_loss_clip": 1.30009413, + "balance_loss_mlp": 1.03514194, + "epoch": 0.18024951149857207, + "flos": 26876327410800.0, + "grad_norm": 1.633525555034455, + "language_loss": 0.85504496, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.88034451, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.19580078, + "step": 2998, + "time_per_iteration": 2.7987654209136963 + }, + { + "auxiliary_loss_clip": 0.01464825, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.29781175, + "balance_loss_mlp": 1.02491212, + "epoch": 0.18030963475124004, + "flos": 30232711037160.0, + "grad_norm": 1.836333555348946, + "language_loss": 0.84849429, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.87357056, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.17895508, + "step": 2999, + "time_per_iteration": 2.8937458992004395 + }, + { + "auxiliary_loss_clip": 0.01461565, + "auxiliary_loss_mlp": 0.01050565, + "balance_loss_clip": 1.29169869, + "balance_loss_mlp": 1.03074086, + "epoch": 0.180369758003908, + "flos": 22312324237320.0, + "grad_norm": 1.703865651666293, + "language_loss": 0.75467622, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77979755, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.19824219, + "step": 3000, + "time_per_iteration": 4.38245701789856 + }, + { + "auxiliary_loss_clip": 0.01461172, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.29324031, + "balance_loss_mlp": 1.03842044, + "epoch": 0.18042988125657597, + "flos": 23737259056680.0, + "grad_norm": 1.7671696340933285, + "language_loss": 0.71269834, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73789161, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19726562, + "step": 3001, + "time_per_iteration": 2.948438882827759 + }, + { + "auxiliary_loss_clip": 0.01470383, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_clip": 1.29659283, + "balance_loss_mlp": 1.02631259, + "epoch": 0.18049000450924396, + "flos": 24098422431960.0, + "grad_norm": 2.001558860743759, + "language_loss": 0.88788772, + "learning_rate": 3.767096425420011e-06, + "loss": 0.91305423, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.19970703, + "step": 3002, + "time_per_iteration": 2.8431637287139893 + }, + { + "auxiliary_loss_clip": 0.01467778, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.29624915, + "balance_loss_mlp": 1.0227623, + "epoch": 0.18055012776191193, + "flos": 22168191875760.0, + "grad_norm": 1.5549527120588849, + "language_loss": 0.80915433, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.83424044, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.18054199, + "step": 3003, + "time_per_iteration": 4.294549226760864 + }, + { + "auxiliary_loss_clip": 0.01462164, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.2899462, + "balance_loss_mlp": 1.03233981, + "epoch": 0.1806102510145799, + "flos": 28919479480920.0, + "grad_norm": 1.861053536925148, + "language_loss": 0.67672747, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.70186579, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.1932373, + "step": 3004, + "time_per_iteration": 2.815420150756836 + }, + { + "auxiliary_loss_clip": 0.0147151, + "auxiliary_loss_mlp": 0.01051153, + "balance_loss_clip": 1.29887247, + "balance_loss_mlp": 1.0315547, + "epoch": 0.18067037426724786, + "flos": 19030341772440.0, + "grad_norm": 1.7011873304859628, + "language_loss": 0.85601652, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.88124323, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.19592285, + "step": 3005, + "time_per_iteration": 2.7953341007232666 + }, + { + "auxiliary_loss_clip": 0.01462596, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.29393268, + "balance_loss_mlp": 1.02309644, + "epoch": 0.18073049751991582, + "flos": 27459354194640.0, + "grad_norm": 1.5046602230770076, + "language_loss": 0.83512527, + "learning_rate": 3.766366287157432e-06, + "loss": 0.86017013, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.18774414, + "step": 3006, + "time_per_iteration": 4.321287393569946 + }, + { + "auxiliary_loss_clip": 0.01466778, + "auxiliary_loss_mlp": 0.01045846, + "balance_loss_clip": 1.29602671, + "balance_loss_mlp": 1.02447188, + "epoch": 0.1807906207725838, + "flos": 28734715132200.0, + "grad_norm": 1.6468964301194315, + "language_loss": 0.77365232, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79877859, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21386719, + "step": 3007, + "time_per_iteration": 2.870522975921631 + }, + { + "auxiliary_loss_clip": 0.01320711, + "auxiliary_loss_mlp": 0.01009752, + "balance_loss_clip": 1.22857523, + "balance_loss_mlp": 1.00479305, + "epoch": 0.18085074402525175, + "flos": 64483961196960.0, + "grad_norm": 0.8154789339131602, + "language_loss": 0.56911832, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59242296, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.04956055, + "step": 3008, + "time_per_iteration": 3.515202283859253 + }, + { + "auxiliary_loss_clip": 0.01465723, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.29481411, + "balance_loss_mlp": 1.03057587, + "epoch": 0.18091086727791975, + "flos": 23482194990840.0, + "grad_norm": 1.8965842847813699, + "language_loss": 0.6832459, + "learning_rate": 3.765817980138021e-06, + "loss": 0.70840991, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.2010498, + "step": 3009, + "time_per_iteration": 2.7853169441223145 + }, + { + "auxiliary_loss_clip": 0.01467199, + "auxiliary_loss_mlp": 0.01044583, + "balance_loss_clip": 1.29570198, + "balance_loss_mlp": 1.02647555, + "epoch": 0.1809709905305877, + "flos": 24175828612080.0, + "grad_norm": 2.5200797342860435, + "language_loss": 0.76106054, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.78617835, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.18115234, + "step": 3010, + "time_per_iteration": 2.887347459793091 + }, + { + "auxiliary_loss_clip": 0.01447178, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.28224564, + "balance_loss_mlp": 1.02213883, + "epoch": 0.18103111378325568, + "flos": 21655667850840.0, + "grad_norm": 1.5569614856076426, + "language_loss": 0.67788619, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.70275176, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.17260742, + "step": 3011, + "time_per_iteration": 2.7769272327423096 + }, + { + "auxiliary_loss_clip": 0.01456777, + "auxiliary_loss_mlp": 0.01046839, + "balance_loss_clip": 1.28795087, + "balance_loss_mlp": 1.02792037, + "epoch": 0.18109123703592364, + "flos": 53696063748960.0, + "grad_norm": 1.8228013632981626, + "language_loss": 0.71680623, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.74184239, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.18920898, + "step": 3012, + "time_per_iteration": 3.039788246154785 + }, + { + "auxiliary_loss_clip": 0.01449817, + "auxiliary_loss_mlp": 0.01046537, + "balance_loss_clip": 1.28415585, + "balance_loss_mlp": 1.02751136, + "epoch": 0.1811513602885916, + "flos": 35852404591080.0, + "grad_norm": 1.9112239313878112, + "language_loss": 0.62731457, + "learning_rate": 3.765085966704609e-06, + "loss": 0.65227807, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19030762, + "step": 3013, + "time_per_iteration": 2.9737210273742676 + }, + { + "auxiliary_loss_clip": 0.01458667, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.29091787, + "balance_loss_mlp": 1.02536929, + "epoch": 0.18121148354125957, + "flos": 23737868182080.0, + "grad_norm": 1.5096581241666642, + "language_loss": 0.76053417, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78555638, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1817627, + "step": 3014, + "time_per_iteration": 2.854559898376465 + }, + { + "auxiliary_loss_clip": 0.01476324, + "auxiliary_loss_mlp": 0.01050679, + "balance_loss_clip": 1.30063498, + "balance_loss_mlp": 1.0304606, + "epoch": 0.18127160679392756, + "flos": 28733902965000.0, + "grad_norm": 1.7513558905184923, + "language_loss": 0.66179383, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68706393, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20202637, + "step": 3015, + "time_per_iteration": 2.8414266109466553 + }, + { + "auxiliary_loss_clip": 0.01456599, + "auxiliary_loss_mlp": 0.01044005, + "balance_loss_clip": 1.28881335, + "balance_loss_mlp": 1.02581382, + "epoch": 0.18133173004659553, + "flos": 20489898541680.0, + "grad_norm": 1.6653417157026995, + "language_loss": 0.78157741, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80658346, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.18200684, + "step": 3016, + "time_per_iteration": 2.731137275695801 + }, + { + "auxiliary_loss_clip": 0.01472196, + "auxiliary_loss_mlp": 0.01051505, + "balance_loss_clip": 1.29810226, + "balance_loss_mlp": 1.03257418, + "epoch": 0.1813918532992635, + "flos": 22856627626920.0, + "grad_norm": 1.7159308963247912, + "language_loss": 0.83238155, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85761851, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.18933105, + "step": 3017, + "time_per_iteration": 2.7825818061828613 + }, + { + "auxiliary_loss_clip": 0.01454182, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.28762615, + "balance_loss_mlp": 1.01616013, + "epoch": 0.18145197655193146, + "flos": 36072075148200.0, + "grad_norm": 1.9514075891958043, + "language_loss": 0.67563498, + "learning_rate": 3.764169443989697e-06, + "loss": 0.7005198, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18139648, + "step": 3018, + "time_per_iteration": 2.9746594429016113 + }, + { + "auxiliary_loss_clip": 0.01461688, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.2886529, + "balance_loss_mlp": 1.01831996, + "epoch": 0.18151209980459942, + "flos": 24029219140560.0, + "grad_norm": 2.295195998427283, + "language_loss": 0.76576787, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.79076117, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.1932373, + "step": 3019, + "time_per_iteration": 2.860973596572876 + }, + { + "auxiliary_loss_clip": 0.0146553, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.29346728, + "balance_loss_mlp": 1.02211094, + "epoch": 0.1815722230572674, + "flos": 23956970222160.0, + "grad_norm": 2.168691706794428, + "language_loss": 0.82247829, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.84756285, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.20837402, + "step": 3020, + "time_per_iteration": 2.9767708778381348 + }, + { + "auxiliary_loss_clip": 0.01458756, + "auxiliary_loss_mlp": 0.01042647, + "balance_loss_clip": 1.28852534, + "balance_loss_mlp": 1.02234542, + "epoch": 0.18163234630993536, + "flos": 24391478941560.0, + "grad_norm": 2.0907473314427842, + "language_loss": 0.7812705, + "learning_rate": 3.763618727535352e-06, + "loss": 0.80628455, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.20300293, + "step": 3021, + "time_per_iteration": 2.8464715480804443 + }, + { + "auxiliary_loss_clip": 0.0144475, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.27622378, + "balance_loss_mlp": 1.02223659, + "epoch": 0.18169246956260335, + "flos": 24686768910960.0, + "grad_norm": 1.6270906522965793, + "language_loss": 0.85229111, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87716269, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20166016, + "step": 3022, + "time_per_iteration": 3.0042548179626465 + }, + { + "auxiliary_loss_clip": 0.01461461, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.28802168, + "balance_loss_mlp": 1.0217737, + "epoch": 0.1817525928152713, + "flos": 24248646047520.0, + "grad_norm": 2.3889996422816298, + "language_loss": 0.69671673, + "learning_rate": 3.763251248837859e-06, + "loss": 0.72176868, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.21960449, + "step": 3023, + "time_per_iteration": 2.8857367038726807 + }, + { + "auxiliary_loss_clip": 0.01454275, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.28403151, + "balance_loss_mlp": 1.02289534, + "epoch": 0.18181271606793928, + "flos": 16476452445240.0, + "grad_norm": 1.7647798246493211, + "language_loss": 0.74477649, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76975524, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.20715332, + "step": 3024, + "time_per_iteration": 2.821971893310547 + }, + { + "auxiliary_loss_clip": 0.01459206, + "auxiliary_loss_mlp": 0.01041514, + "balance_loss_clip": 1.28724158, + "balance_loss_mlp": 1.02240515, + "epoch": 0.18187283932060724, + "flos": 18584746970760.0, + "grad_norm": 2.792135237539353, + "language_loss": 0.88640863, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.91141582, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.19104004, + "step": 3025, + "time_per_iteration": 2.806537389755249 + }, + { + "auxiliary_loss_clip": 0.01455962, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.28662276, + "balance_loss_mlp": 1.02954185, + "epoch": 0.1819329625732752, + "flos": 20271730493880.0, + "grad_norm": 1.7135197730616467, + "language_loss": 0.79184359, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81690681, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20837402, + "step": 3026, + "time_per_iteration": 2.854915142059326 + }, + { + "auxiliary_loss_clip": 0.01466223, + "auxiliary_loss_mlp": 0.01053503, + "balance_loss_clip": 1.2930305, + "balance_loss_mlp": 1.03225958, + "epoch": 0.18199308582594317, + "flos": 25919914135320.0, + "grad_norm": 1.6439525146339957, + "language_loss": 0.76047921, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78567648, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21228027, + "step": 3027, + "time_per_iteration": 2.8483431339263916 + }, + { + "auxiliary_loss_clip": 0.01472538, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.2944752, + "balance_loss_mlp": 1.02623773, + "epoch": 0.18205320907861114, + "flos": 15381754412040.0, + "grad_norm": 2.056432352338872, + "language_loss": 0.85392034, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87911081, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.20275879, + "step": 3028, + "time_per_iteration": 2.8177571296691895 + }, + { + "auxiliary_loss_clip": 0.01471794, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.29920781, + "balance_loss_mlp": 1.0273912, + "epoch": 0.18211333233127913, + "flos": 25629131693880.0, + "grad_norm": 1.5364187958907451, + "language_loss": 0.83112037, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.85631597, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20361328, + "step": 3029, + "time_per_iteration": 2.88161039352417 + }, + { + "auxiliary_loss_clip": 0.01472847, + "auxiliary_loss_mlp": 0.01047162, + "balance_loss_clip": 1.29906738, + "balance_loss_mlp": 1.02624035, + "epoch": 0.1821734555839471, + "flos": 14980162091400.0, + "grad_norm": 1.867769495348933, + "language_loss": 0.78464317, + "learning_rate": 3.761962967588891e-06, + "loss": 0.8098433, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.20935059, + "step": 3030, + "time_per_iteration": 2.9194211959838867 + }, + { + "auxiliary_loss_clip": 0.0147201, + "auxiliary_loss_mlp": 0.01046452, + "balance_loss_clip": 1.29661989, + "balance_loss_mlp": 1.02563751, + "epoch": 0.18223357883661506, + "flos": 20198913058440.0, + "grad_norm": 1.9710403845792002, + "language_loss": 0.8534379, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87862253, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20812988, + "step": 3031, + "time_per_iteration": 2.959146022796631 + }, + { + "auxiliary_loss_clip": 0.01468786, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_clip": 1.2934742, + "balance_loss_mlp": 1.03247905, + "epoch": 0.18229370208928303, + "flos": 15236241366240.0, + "grad_norm": 1.7626434822257242, + "language_loss": 0.80205703, + "learning_rate": 3.76159428580299e-06, + "loss": 0.82726347, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.19396973, + "step": 3032, + "time_per_iteration": 2.8553426265716553 + }, + { + "auxiliary_loss_clip": 0.01479114, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.30035865, + "balance_loss_mlp": 1.0244143, + "epoch": 0.182353825341951, + "flos": 23845591825920.0, + "grad_norm": 2.2512795012463465, + "language_loss": 0.81276715, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83800566, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20324707, + "step": 3033, + "time_per_iteration": 2.97938871383667 + }, + { + "auxiliary_loss_clip": 0.01290728, + "auxiliary_loss_mlp": 0.01015725, + "balance_loss_clip": 1.20050597, + "balance_loss_mlp": 1.01183891, + "epoch": 0.18241394859461896, + "flos": 61204276427400.0, + "grad_norm": 0.8795736997901344, + "language_loss": 0.6351496, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65821409, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.03881836, + "step": 3034, + "time_per_iteration": 4.651507377624512 + }, + { + "auxiliary_loss_clip": 0.01467082, + "auxiliary_loss_mlp": 0.01049664, + "balance_loss_clip": 1.29338813, + "balance_loss_mlp": 1.02932668, + "epoch": 0.18247407184728695, + "flos": 18475398992520.0, + "grad_norm": 2.296230375750623, + "language_loss": 0.80199891, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.82716638, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.20336914, + "step": 3035, + "time_per_iteration": 2.789707660675049 + }, + { + "auxiliary_loss_clip": 0.0145936, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.28926885, + "balance_loss_mlp": 1.02191377, + "epoch": 0.18253419509995492, + "flos": 21799759604040.0, + "grad_norm": 1.7008474340160853, + "language_loss": 0.84800124, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87300909, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19506836, + "step": 3036, + "time_per_iteration": 2.8136816024780273 + }, + { + "auxiliary_loss_clip": 0.01445693, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.27802396, + "balance_loss_mlp": 1.025949, + "epoch": 0.18259431835262288, + "flos": 20153651718240.0, + "grad_norm": 2.048090780210832, + "language_loss": 0.80541605, + "learning_rate": 3.760671412463617e-06, + "loss": 0.83032382, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19152832, + "step": 3037, + "time_per_iteration": 2.8602542877197266 + }, + { + "auxiliary_loss_clip": 0.01456108, + "auxiliary_loss_mlp": 0.01044322, + "balance_loss_clip": 1.28374183, + "balance_loss_mlp": 1.02268553, + "epoch": 0.18265444160529085, + "flos": 16985930843160.0, + "grad_norm": 2.682737189528094, + "language_loss": 0.80099082, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.82599509, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21643066, + "step": 3038, + "time_per_iteration": 2.8077147006988525 + }, + { + "auxiliary_loss_clip": 0.01454302, + "auxiliary_loss_mlp": 0.01051533, + "balance_loss_clip": 1.28521931, + "balance_loss_mlp": 1.03145862, + "epoch": 0.1827145648579588, + "flos": 34429987490040.0, + "grad_norm": 1.9783916170113347, + "language_loss": 0.68046027, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.7055186, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20056152, + "step": 3039, + "time_per_iteration": 4.388702154159546 + }, + { + "auxiliary_loss_clip": 0.01462948, + "auxiliary_loss_mlp": 0.01049064, + "balance_loss_clip": 1.28958845, + "balance_loss_mlp": 1.02851224, + "epoch": 0.18277468811062678, + "flos": 53296298804520.0, + "grad_norm": 1.623131626504505, + "language_loss": 0.73733139, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.76245159, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20556641, + "step": 3040, + "time_per_iteration": 3.111746072769165 + }, + { + "auxiliary_loss_clip": 0.01456042, + "auxiliary_loss_mlp": 0.01050266, + "balance_loss_clip": 1.28656673, + "balance_loss_mlp": 1.02985728, + "epoch": 0.18283481136329474, + "flos": 31657077339480.0, + "grad_norm": 1.7322563111959763, + "language_loss": 0.61096823, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.63603133, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20397949, + "step": 3041, + "time_per_iteration": 2.9434385299682617 + }, + { + "auxiliary_loss_clip": 0.01463767, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_clip": 1.28880548, + "balance_loss_mlp": 1.02899218, + "epoch": 0.18289493461596273, + "flos": 53146481272560.0, + "grad_norm": 1.5546981285536794, + "language_loss": 0.60075235, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62589711, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.21728516, + "step": 3042, + "time_per_iteration": 3.175135374069214 + }, + { + "auxiliary_loss_clip": 0.01458584, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.28836274, + "balance_loss_mlp": 1.02734756, + "epoch": 0.1829550578686307, + "flos": 25594387918920.0, + "grad_norm": 1.823404974994917, + "language_loss": 0.87895119, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.90401185, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.20141602, + "step": 3043, + "time_per_iteration": 4.745075702667236 + }, + { + "auxiliary_loss_clip": 0.01460008, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.28630686, + "balance_loss_mlp": 1.02446842, + "epoch": 0.18301518112129866, + "flos": 22606558389360.0, + "grad_norm": 1.950937429808563, + "language_loss": 0.71190739, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73694617, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.1940918, + "step": 3044, + "time_per_iteration": 3.048006772994995 + }, + { + "auxiliary_loss_clip": 0.01473476, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.29525602, + "balance_loss_mlp": 1.02596521, + "epoch": 0.18307530437396663, + "flos": 34027339352040.0, + "grad_norm": 2.036467399859224, + "language_loss": 0.64469349, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66991282, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.22497559, + "step": 3045, + "time_per_iteration": 4.446218490600586 + }, + { + "auxiliary_loss_clip": 0.01465321, + "auxiliary_loss_mlp": 0.01050354, + "balance_loss_clip": 1.29508877, + "balance_loss_mlp": 1.03068399, + "epoch": 0.1831354276266346, + "flos": 21283662043440.0, + "grad_norm": 2.478025132068673, + "language_loss": 0.79572743, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.82088411, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19677734, + "step": 3046, + "time_per_iteration": 2.7485156059265137 + }, + { + "auxiliary_loss_clip": 0.01465194, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.28824806, + "balance_loss_mlp": 1.02898765, + "epoch": 0.18319555087930256, + "flos": 21037937900400.0, + "grad_norm": 1.9729159852272689, + "language_loss": 0.79453349, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81967711, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.20202637, + "step": 3047, + "time_per_iteration": 2.8099842071533203 + }, + { + "auxiliary_loss_clip": 0.01461164, + "auxiliary_loss_mlp": 0.01042945, + "balance_loss_clip": 1.29212356, + "balance_loss_mlp": 1.02419376, + "epoch": 0.18325567413197055, + "flos": 34389193069440.0, + "grad_norm": 1.4848562893043016, + "language_loss": 0.80844218, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83348334, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.18737793, + "step": 3048, + "time_per_iteration": 2.8682363033294678 + }, + { + "auxiliary_loss_clip": 0.0146663, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.29394341, + "balance_loss_mlp": 1.02206159, + "epoch": 0.18331579738463852, + "flos": 20563365710880.0, + "grad_norm": 2.0447457573798977, + "language_loss": 0.86566126, + "learning_rate": 3.758449708105424e-06, + "loss": 0.89075744, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.20922852, + "step": 3049, + "time_per_iteration": 2.767456293106079 + }, + { + "auxiliary_loss_clip": 0.01483141, + "auxiliary_loss_mlp": 0.01048335, + "balance_loss_clip": 1.30257046, + "balance_loss_mlp": 1.02630508, + "epoch": 0.18337592063730648, + "flos": 19612393955640.0, + "grad_norm": 2.2715897748999208, + "language_loss": 0.78165591, + "learning_rate": 3.75826413248424e-06, + "loss": 0.80697066, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.22021484, + "step": 3050, + "time_per_iteration": 2.71297025680542 + }, + { + "auxiliary_loss_clip": 0.01459972, + "auxiliary_loss_mlp": 0.01049878, + "balance_loss_clip": 1.28886199, + "balance_loss_mlp": 1.0288496, + "epoch": 0.18343604388997445, + "flos": 20855894311800.0, + "grad_norm": 2.024326514438193, + "language_loss": 0.99547648, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.02057493, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.21032715, + "step": 3051, + "time_per_iteration": 2.825070858001709 + }, + { + "auxiliary_loss_clip": 0.01464629, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.29437089, + "balance_loss_mlp": 1.01959658, + "epoch": 0.1834961671426424, + "flos": 24400412780760.0, + "grad_norm": 2.0017944374823267, + "language_loss": 0.86279917, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88784981, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20849609, + "step": 3052, + "time_per_iteration": 2.8316829204559326 + }, + { + "auxiliary_loss_clip": 0.01463112, + "auxiliary_loss_mlp": 0.01041202, + "balance_loss_clip": 1.29343319, + "balance_loss_mlp": 1.02198529, + "epoch": 0.18355629039531038, + "flos": 21256593248520.0, + "grad_norm": 1.8782518656273324, + "language_loss": 0.73396099, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75900412, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.19213867, + "step": 3053, + "time_per_iteration": 2.8413937091827393 + }, + { + "auxiliary_loss_clip": 0.01475808, + "auxiliary_loss_mlp": 0.010467, + "balance_loss_clip": 1.30129802, + "balance_loss_mlp": 1.02581453, + "epoch": 0.18361641364797834, + "flos": 28662222563640.0, + "grad_norm": 1.7371412314032533, + "language_loss": 0.62262386, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64784896, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.2088623, + "step": 3054, + "time_per_iteration": 2.886664867401123 + }, + { + "auxiliary_loss_clip": 0.01470905, + "auxiliary_loss_mlp": 0.01046791, + "balance_loss_clip": 1.29676557, + "balance_loss_mlp": 1.0265733, + "epoch": 0.18367653690064634, + "flos": 20923189010280.0, + "grad_norm": 2.093081219276387, + "language_loss": 0.78414583, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80932283, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20227051, + "step": 3055, + "time_per_iteration": 2.8657286167144775 + }, + { + "auxiliary_loss_clip": 0.01457, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_clip": 1.28910637, + "balance_loss_mlp": 1.02769196, + "epoch": 0.1837366601533143, + "flos": 28771286283360.0, + "grad_norm": 2.0040162492148257, + "language_loss": 0.70027626, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72531462, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.19140625, + "step": 3056, + "time_per_iteration": 2.905750036239624 + }, + { + "auxiliary_loss_clip": 0.01462532, + "auxiliary_loss_mlp": 0.01049784, + "balance_loss_clip": 1.29010916, + "balance_loss_mlp": 1.02918446, + "epoch": 0.18379678340598227, + "flos": 21256390206720.0, + "grad_norm": 1.466251746835921, + "language_loss": 0.81092298, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.8360461, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20593262, + "step": 3057, + "time_per_iteration": 2.7778186798095703 + }, + { + "auxiliary_loss_clip": 0.01487516, + "auxiliary_loss_mlp": 0.01053222, + "balance_loss_clip": 1.3069067, + "balance_loss_mlp": 1.02942789, + "epoch": 0.18385690665865023, + "flos": 20454586249680.0, + "grad_norm": 2.380125660063416, + "language_loss": 0.8225466, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84795392, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.23803711, + "step": 3058, + "time_per_iteration": 2.826622247695923 + }, + { + "auxiliary_loss_clip": 0.01464608, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.28810239, + "balance_loss_mlp": 1.03037763, + "epoch": 0.1839170299113182, + "flos": 26146163246760.0, + "grad_norm": 2.140904317936165, + "language_loss": 0.86445218, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88961124, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.20947266, + "step": 3059, + "time_per_iteration": 2.845811367034912 + }, + { + "auxiliary_loss_clip": 0.01465715, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.29410267, + "balance_loss_mlp": 1.01837826, + "epoch": 0.18397715316398616, + "flos": 31764191857920.0, + "grad_norm": 2.47896264611782, + "language_loss": 0.7289865, + "learning_rate": 3.756404710389396e-06, + "loss": 0.75403053, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20324707, + "step": 3060, + "time_per_iteration": 2.860440254211426 + }, + { + "auxiliary_loss_clip": 0.01469471, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.29476428, + "balance_loss_mlp": 1.02444983, + "epoch": 0.18403727641665413, + "flos": 24617890486440.0, + "grad_norm": 1.5664047508673042, + "language_loss": 0.72946894, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75461459, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.20654297, + "step": 3061, + "time_per_iteration": 2.8099300861358643 + }, + { + "auxiliary_loss_clip": 0.01473556, + "auxiliary_loss_mlp": 0.01048252, + "balance_loss_clip": 1.30254459, + "balance_loss_mlp": 1.0281769, + "epoch": 0.18409739966932212, + "flos": 23445055322640.0, + "grad_norm": 1.7032424943413982, + "language_loss": 0.81786764, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84308565, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20092773, + "step": 3062, + "time_per_iteration": 2.745429515838623 + }, + { + "auxiliary_loss_clip": 0.01481551, + "auxiliary_loss_mlp": 0.01045808, + "balance_loss_clip": 1.30497766, + "balance_loss_mlp": 1.02523232, + "epoch": 0.18415752292199009, + "flos": 21877206392520.0, + "grad_norm": 1.9657787228828587, + "language_loss": 0.73507333, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.76034689, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.20581055, + "step": 3063, + "time_per_iteration": 2.7629101276397705 + }, + { + "auxiliary_loss_clip": 0.01459217, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_clip": 1.28846216, + "balance_loss_mlp": 1.02582288, + "epoch": 0.18421764617465805, + "flos": 25416161516160.0, + "grad_norm": 1.5963752367619901, + "language_loss": 0.66237503, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.68741536, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.18994141, + "step": 3064, + "time_per_iteration": 2.9344675540924072 + }, + { + "auxiliary_loss_clip": 0.01467388, + "auxiliary_loss_mlp": 0.01057824, + "balance_loss_clip": 1.29622626, + "balance_loss_mlp": 1.03565109, + "epoch": 0.18427776942732602, + "flos": 27203802828480.0, + "grad_norm": 1.7800477976612719, + "language_loss": 0.6882652, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.71351737, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.22167969, + "step": 3065, + "time_per_iteration": 2.8739447593688965 + }, + { + "auxiliary_loss_clip": 0.01477297, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_clip": 1.30300283, + "balance_loss_mlp": 1.02734089, + "epoch": 0.18433789267999398, + "flos": 27857779063200.0, + "grad_norm": 3.9071501230889334, + "language_loss": 0.72777349, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75303209, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.21228027, + "step": 3066, + "time_per_iteration": 2.785332679748535 + }, + { + "auxiliary_loss_clip": 0.01469934, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.29621744, + "balance_loss_mlp": 1.02372718, + "epoch": 0.18439801593266195, + "flos": 17861283186120.0, + "grad_norm": 2.1748572501342296, + "language_loss": 0.82784921, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.85298181, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.19592285, + "step": 3067, + "time_per_iteration": 2.7616219520568848 + }, + { + "auxiliary_loss_clip": 0.01295063, + "auxiliary_loss_mlp": 0.01017242, + "balance_loss_clip": 1.20657754, + "balance_loss_mlp": 1.01273561, + "epoch": 0.18445813918532994, + "flos": 56404132683840.0, + "grad_norm": 0.7946205215815406, + "language_loss": 0.59705305, + "learning_rate": 3.754912376956657e-06, + "loss": 0.62017614, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.04516602, + "step": 3068, + "time_per_iteration": 3.1192166805267334 + }, + { + "auxiliary_loss_clip": 0.01464414, + "auxiliary_loss_mlp": 0.01045243, + "balance_loss_clip": 1.29658604, + "balance_loss_mlp": 1.02507257, + "epoch": 0.1845182624379979, + "flos": 20961790579440.0, + "grad_norm": 1.7017694149891158, + "language_loss": 0.7688427, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.79393929, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20166016, + "step": 3069, + "time_per_iteration": 2.763143539428711 + }, + { + "auxiliary_loss_clip": 0.01482225, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.3078196, + "balance_loss_mlp": 1.02352738, + "epoch": 0.18457838569066587, + "flos": 20490020366760.0, + "grad_norm": 2.1595244618282, + "language_loss": 0.85069978, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87596941, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.2121582, + "step": 3070, + "time_per_iteration": 2.774202585220337 + }, + { + "auxiliary_loss_clip": 0.01484559, + "auxiliary_loss_mlp": 0.01045305, + "balance_loss_clip": 1.31147814, + "balance_loss_mlp": 1.02395475, + "epoch": 0.18463850894333383, + "flos": 25015543796160.0, + "grad_norm": 4.433256379582151, + "language_loss": 0.7783832, + "learning_rate": 3.754351653708265e-06, + "loss": 0.80368185, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21337891, + "step": 3071, + "time_per_iteration": 2.799248456954956 + }, + { + "auxiliary_loss_clip": 0.01483238, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.30783415, + "balance_loss_mlp": 1.03404713, + "epoch": 0.1846986321960018, + "flos": 16805064897000.0, + "grad_norm": 2.0317234509195505, + "language_loss": 0.78007972, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.80546343, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.21081543, + "step": 3072, + "time_per_iteration": 2.7283549308776855 + }, + { + "auxiliary_loss_clip": 0.01478725, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.30285728, + "balance_loss_mlp": 1.02741385, + "epoch": 0.18475875544866976, + "flos": 20819323160640.0, + "grad_norm": 2.069151167716857, + "language_loss": 0.86453414, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88980913, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21350098, + "step": 3073, + "time_per_iteration": 4.213237047195435 + }, + { + "auxiliary_loss_clip": 0.01485962, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.31012428, + "balance_loss_mlp": 1.02310717, + "epoch": 0.18481887870133773, + "flos": 22606558389360.0, + "grad_norm": 1.9895595055989848, + "language_loss": 0.9169035, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.94219339, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.19909668, + "step": 3074, + "time_per_iteration": 2.736086845397949 + }, + { + "auxiliary_loss_clip": 0.01475736, + "auxiliary_loss_mlp": 0.01044318, + "balance_loss_clip": 1.30259991, + "balance_loss_mlp": 1.02306342, + "epoch": 0.18487900195400572, + "flos": 29464595037720.0, + "grad_norm": 3.8951933406930155, + "language_loss": 0.65076602, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67596656, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.21264648, + "step": 3075, + "time_per_iteration": 2.877009153366089 + }, + { + "auxiliary_loss_clip": 0.01481663, + "auxiliary_loss_mlp": 0.01053528, + "balance_loss_clip": 1.31157148, + "balance_loss_mlp": 1.03341746, + "epoch": 0.1849391252066737, + "flos": 20633340561120.0, + "grad_norm": 2.077734715793924, + "language_loss": 0.732548, + "learning_rate": 3.753415784551761e-06, + "loss": 0.7579, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.2010498, + "step": 3076, + "time_per_iteration": 2.7977540493011475 + }, + { + "auxiliary_loss_clip": 0.01486427, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_clip": 1.31022227, + "balance_loss_mlp": 1.02359772, + "epoch": 0.18499924845934165, + "flos": 14432610033000.0, + "grad_norm": 9.53572025631618, + "language_loss": 0.81400424, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83930838, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.20385742, + "step": 3077, + "time_per_iteration": 4.18617057800293 + }, + { + "auxiliary_loss_clip": 0.01472704, + "auxiliary_loss_mlp": 0.01042367, + "balance_loss_clip": 1.3034054, + "balance_loss_mlp": 1.02295995, + "epoch": 0.18505937171200962, + "flos": 23732304836760.0, + "grad_norm": 2.41355922498924, + "language_loss": 0.78853893, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81368971, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.19421387, + "step": 3078, + "time_per_iteration": 2.803898334503174 + }, + { + "auxiliary_loss_clip": 0.014833, + "auxiliary_loss_mlp": 0.010444, + "balance_loss_clip": 1.30895066, + "balance_loss_mlp": 1.02508783, + "epoch": 0.18511949496467758, + "flos": 25962982624080.0, + "grad_norm": 1.794172732417748, + "language_loss": 0.77888036, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.80415738, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.19299316, + "step": 3079, + "time_per_iteration": 2.859143018722534 + }, + { + "auxiliary_loss_clip": 0.01467572, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.29694402, + "balance_loss_mlp": 1.02049732, + "epoch": 0.18517961821734555, + "flos": 42421323740760.0, + "grad_norm": 1.707014601694479, + "language_loss": 0.82232714, + "learning_rate": 3.752665892369369e-06, + "loss": 0.8474108, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20300293, + "step": 3080, + "time_per_iteration": 3.0202066898345947 + }, + { + "auxiliary_loss_clip": 0.0148617, + "auxiliary_loss_mlp": 0.01049004, + "balance_loss_clip": 1.30980408, + "balance_loss_mlp": 1.02755857, + "epoch": 0.18523974147001354, + "flos": 24102929959920.0, + "grad_norm": 2.2248885230866473, + "language_loss": 0.74813265, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.77348435, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21435547, + "step": 3081, + "time_per_iteration": 4.305554628372192 + }, + { + "auxiliary_loss_clip": 0.01478278, + "auxiliary_loss_mlp": 0.01049794, + "balance_loss_clip": 1.30593145, + "balance_loss_mlp": 1.02846694, + "epoch": 0.1852998647226815, + "flos": 27380283071760.0, + "grad_norm": 2.061562152227801, + "language_loss": 0.72132874, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74660945, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21313477, + "step": 3082, + "time_per_iteration": 2.8561668395996094 + }, + { + "auxiliary_loss_clip": 0.01486528, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_clip": 1.31161511, + "balance_loss_mlp": 1.02913499, + "epoch": 0.18535998797534947, + "flos": 18337357884960.0, + "grad_norm": 1.9326938652500874, + "language_loss": 0.69682109, + "learning_rate": 3.752102775364407e-06, + "loss": 0.72218424, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20654297, + "step": 3083, + "time_per_iteration": 2.7641077041625977 + }, + { + "auxiliary_loss_clip": 0.01468261, + "auxiliary_loss_mlp": 0.01047901, + "balance_loss_clip": 1.29993272, + "balance_loss_mlp": 1.02769458, + "epoch": 0.18542011122801744, + "flos": 37852122697200.0, + "grad_norm": 2.4375069216764675, + "language_loss": 0.68712902, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71229064, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20202637, + "step": 3084, + "time_per_iteration": 4.238704442977905 + }, + { + "auxiliary_loss_clip": 0.01470587, + "auxiliary_loss_mlp": 0.01048908, + "balance_loss_clip": 1.30042458, + "balance_loss_mlp": 1.02935791, + "epoch": 0.1854802344806854, + "flos": 25190846397000.0, + "grad_norm": 1.610273799441008, + "language_loss": 0.77949846, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.8046934, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19555664, + "step": 3085, + "time_per_iteration": 2.8287301063537598 + }, + { + "auxiliary_loss_clip": 0.01471503, + "auxiliary_loss_mlp": 0.01044861, + "balance_loss_clip": 1.30137956, + "balance_loss_mlp": 1.02442861, + "epoch": 0.18554035773335337, + "flos": 26690060552760.0, + "grad_norm": 1.7215274467939146, + "language_loss": 0.73542368, + "learning_rate": 3.751539060400244e-06, + "loss": 0.76058733, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.2043457, + "step": 3086, + "time_per_iteration": 2.926499366760254 + }, + { + "auxiliary_loss_clip": 0.01468846, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_clip": 1.29684246, + "balance_loss_mlp": 1.02899551, + "epoch": 0.18560048098602133, + "flos": 22352103448920.0, + "grad_norm": 2.6638639923636296, + "language_loss": 0.70160085, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72679055, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21130371, + "step": 3087, + "time_per_iteration": 2.7416694164276123 + }, + { + "auxiliary_loss_clip": 0.01473225, + "auxiliary_loss_mlp": 0.01051096, + "balance_loss_clip": 1.30165577, + "balance_loss_mlp": 1.0304966, + "epoch": 0.18566060423868933, + "flos": 17752991025240.0, + "grad_norm": 2.0386041110890805, + "language_loss": 0.72802591, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.75326914, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20593262, + "step": 3088, + "time_per_iteration": 2.7258918285369873 + }, + { + "auxiliary_loss_clip": 0.01468248, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.29983687, + "balance_loss_mlp": 1.01961493, + "epoch": 0.1857207274913573, + "flos": 24682180166280.0, + "grad_norm": 1.9685845894949345, + "language_loss": 0.92065024, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94572103, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19213867, + "step": 3089, + "time_per_iteration": 2.7602901458740234 + }, + { + "auxiliary_loss_clip": 0.01474148, + "auxiliary_loss_mlp": 0.0104234, + "balance_loss_clip": 1.3045423, + "balance_loss_mlp": 1.02208614, + "epoch": 0.18578085074402526, + "flos": 28153759374720.0, + "grad_norm": 2.7029148858309506, + "language_loss": 0.58488846, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.6100533, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20251465, + "step": 3090, + "time_per_iteration": 2.803818702697754 + }, + { + "auxiliary_loss_clip": 0.01465238, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_clip": 1.2979511, + "balance_loss_mlp": 1.0265739, + "epoch": 0.18584097399669322, + "flos": 23957254480680.0, + "grad_norm": 1.885960062693395, + "language_loss": 0.82292783, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84805298, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20703125, + "step": 3091, + "time_per_iteration": 2.8021202087402344 + }, + { + "auxiliary_loss_clip": 0.01484521, + "auxiliary_loss_mlp": 0.01050777, + "balance_loss_clip": 1.31211829, + "balance_loss_mlp": 1.03048778, + "epoch": 0.18590109724936119, + "flos": 17206129308960.0, + "grad_norm": 2.9801553415039788, + "language_loss": 0.84872925, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.87408221, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20288086, + "step": 3092, + "time_per_iteration": 2.721137762069702 + }, + { + "auxiliary_loss_clip": 0.01486195, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_clip": 1.31222391, + "balance_loss_mlp": 1.02851152, + "epoch": 0.18596122050202915, + "flos": 17238639624120.0, + "grad_norm": 2.358436784414332, + "language_loss": 0.93380249, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95914853, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.19897461, + "step": 3093, + "time_per_iteration": 2.722114324569702 + }, + { + "auxiliary_loss_clip": 0.01480191, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.30927098, + "balance_loss_mlp": 1.02797294, + "epoch": 0.18602134375469712, + "flos": 19024737818760.0, + "grad_norm": 1.6519574426975852, + "language_loss": 0.77389193, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79917121, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.19775391, + "step": 3094, + "time_per_iteration": 2.7472047805786133 + }, + { + "auxiliary_loss_clip": 0.01478781, + "auxiliary_loss_mlp": 0.01049364, + "balance_loss_clip": 1.30745506, + "balance_loss_mlp": 1.03056431, + "epoch": 0.1860814670073651, + "flos": 50958831365640.0, + "grad_norm": 1.6913186004452319, + "language_loss": 0.69870532, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72398674, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.18811035, + "step": 3095, + "time_per_iteration": 3.173912286758423 + }, + { + "auxiliary_loss_clip": 0.0149285, + "auxiliary_loss_mlp": 0.01053283, + "balance_loss_clip": 1.31756926, + "balance_loss_mlp": 1.03302944, + "epoch": 0.18614159026003307, + "flos": 19395728417160.0, + "grad_norm": 2.008352942643896, + "language_loss": 0.8083694, + "learning_rate": 3.749655694397135e-06, + "loss": 0.83383071, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.20251465, + "step": 3096, + "time_per_iteration": 2.8409242630004883 + }, + { + "auxiliary_loss_clip": 0.01490188, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_clip": 1.31752682, + "balance_loss_mlp": 1.03260934, + "epoch": 0.18620171351270104, + "flos": 21803820440040.0, + "grad_norm": 6.421523572387591, + "language_loss": 0.7554251, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.7808553, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.20227051, + "step": 3097, + "time_per_iteration": 2.783604145050049 + }, + { + "auxiliary_loss_clip": 0.01484993, + "auxiliary_loss_mlp": 0.01045866, + "balance_loss_clip": 1.31629801, + "balance_loss_mlp": 1.02672076, + "epoch": 0.186261836765369, + "flos": 16367266900440.0, + "grad_norm": 2.3049507611193016, + "language_loss": 0.67139053, + "learning_rate": 3.749278224802352e-06, + "loss": 0.69669914, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19152832, + "step": 3098, + "time_per_iteration": 2.949800968170166 + }, + { + "auxiliary_loss_clip": 0.01494195, + "auxiliary_loss_mlp": 0.01055422, + "balance_loss_clip": 1.31977177, + "balance_loss_mlp": 1.03146052, + "epoch": 0.18632196001803697, + "flos": 23375608381080.0, + "grad_norm": 1.552720780238757, + "language_loss": 0.69503772, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72053397, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.23974609, + "step": 3099, + "time_per_iteration": 2.77435564994812 + }, + { + "auxiliary_loss_clip": 0.01485826, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.31427538, + "balance_loss_mlp": 1.03704, + "epoch": 0.18638208327070493, + "flos": 22497129194400.0, + "grad_norm": 1.804193080403569, + "language_loss": 0.71796918, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74339908, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.20117188, + "step": 3100, + "time_per_iteration": 2.823758840560913 + }, + { + "auxiliary_loss_clip": 0.01492538, + "auxiliary_loss_mlp": 0.01060979, + "balance_loss_clip": 1.31815386, + "balance_loss_mlp": 1.03985476, + "epoch": 0.18644220652337293, + "flos": 29170766969280.0, + "grad_norm": 1.7732679979641466, + "language_loss": 0.80031705, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82585222, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21118164, + "step": 3101, + "time_per_iteration": 2.7812747955322266 + }, + { + "auxiliary_loss_clip": 0.01483979, + "auxiliary_loss_mlp": 0.0105484, + "balance_loss_clip": 1.31587136, + "balance_loss_mlp": 1.03512287, + "epoch": 0.1865023297760409, + "flos": 24249214564560.0, + "grad_norm": 1.9210535704348213, + "language_loss": 0.77014118, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79552937, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19714355, + "step": 3102, + "time_per_iteration": 2.796862840652466 + }, + { + "auxiliary_loss_clip": 0.01488365, + "auxiliary_loss_mlp": 0.01054639, + "balance_loss_clip": 1.31421304, + "balance_loss_mlp": 1.03407502, + "epoch": 0.18656245302870886, + "flos": 19132542679320.0, + "grad_norm": 2.161358535375477, + "language_loss": 0.76997858, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79540861, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.20544434, + "step": 3103, + "time_per_iteration": 2.850965738296509 + }, + { + "auxiliary_loss_clip": 0.01491988, + "auxiliary_loss_mlp": 0.01053528, + "balance_loss_clip": 1.31816888, + "balance_loss_mlp": 1.03315473, + "epoch": 0.18662257628137682, + "flos": 17790699210480.0, + "grad_norm": 1.6993551185845195, + "language_loss": 0.79456151, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.82001668, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.20373535, + "step": 3104, + "time_per_iteration": 2.7506134510040283 + }, + { + "auxiliary_loss_clip": 0.01492389, + "auxiliary_loss_mlp": 0.01052987, + "balance_loss_clip": 1.32272935, + "balance_loss_mlp": 1.03346086, + "epoch": 0.1866826995340448, + "flos": 24029706440880.0, + "grad_norm": 1.8205072176931285, + "language_loss": 0.84682059, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87227434, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.1953125, + "step": 3105, + "time_per_iteration": 2.8009958267211914 + }, + { + "auxiliary_loss_clip": 0.0149938, + "auxiliary_loss_mlp": 0.01054129, + "balance_loss_clip": 1.32125604, + "balance_loss_mlp": 1.03257561, + "epoch": 0.18674282278671275, + "flos": 26146934805600.0, + "grad_norm": 1.7451857090430287, + "language_loss": 0.87169278, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89722788, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.2154541, + "step": 3106, + "time_per_iteration": 2.7690606117248535 + }, + { + "auxiliary_loss_clip": 0.01501493, + "auxiliary_loss_mlp": 0.01053708, + "balance_loss_clip": 1.32462895, + "balance_loss_mlp": 1.03346658, + "epoch": 0.18680294603938072, + "flos": 19205806806720.0, + "grad_norm": 1.8823447157081297, + "language_loss": 0.77901983, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80457187, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.20239258, + "step": 3107, + "time_per_iteration": 2.72361421585083 + }, + { + "auxiliary_loss_clip": 0.0149233, + "auxiliary_loss_mlp": 0.01063712, + "balance_loss_clip": 1.31734681, + "balance_loss_mlp": 1.04225421, + "epoch": 0.1868630692920487, + "flos": 28550316258720.0, + "grad_norm": 2.032955121638025, + "language_loss": 0.74384153, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76940191, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21472168, + "step": 3108, + "time_per_iteration": 2.7797069549560547 + }, + { + "auxiliary_loss_clip": 0.0149083, + "auxiliary_loss_mlp": 0.01056605, + "balance_loss_clip": 1.31619191, + "balance_loss_mlp": 1.03607678, + "epoch": 0.18692319254471668, + "flos": 17242456809960.0, + "grad_norm": 2.8108845391582395, + "language_loss": 0.74679589, + "learning_rate": 3.747197400772658e-06, + "loss": 0.77227026, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.2052002, + "step": 3109, + "time_per_iteration": 2.7382137775421143 + }, + { + "auxiliary_loss_clip": 0.01492306, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_clip": 1.32110071, + "balance_loss_mlp": 1.02803922, + "epoch": 0.18698331579738464, + "flos": 23190397340400.0, + "grad_norm": 1.8948757009795927, + "language_loss": 0.84578884, + "learning_rate": 3.747007837284772e-06, + "loss": 0.87119818, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20593262, + "step": 3110, + "time_per_iteration": 2.7835590839385986 + }, + { + "auxiliary_loss_clip": 0.01497183, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.3244102, + "balance_loss_mlp": 1.02437794, + "epoch": 0.1870434390500526, + "flos": 25521773525280.0, + "grad_norm": 1.5726239944538773, + "language_loss": 0.84692812, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.8723824, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.23876953, + "step": 3111, + "time_per_iteration": 4.240411281585693 + }, + { + "auxiliary_loss_clip": 0.01492659, + "auxiliary_loss_mlp": 0.01048875, + "balance_loss_clip": 1.32076144, + "balance_loss_mlp": 1.02936029, + "epoch": 0.18710356230272057, + "flos": 19505644912440.0, + "grad_norm": 1.7703619695104924, + "language_loss": 0.76144981, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78686523, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.1953125, + "step": 3112, + "time_per_iteration": 2.807864189147949 + }, + { + "auxiliary_loss_clip": 0.01490684, + "auxiliary_loss_mlp": 0.01057811, + "balance_loss_clip": 1.31710148, + "balance_loss_mlp": 1.03724694, + "epoch": 0.18716368555538854, + "flos": 26766532740600.0, + "grad_norm": 2.420296945282809, + "language_loss": 0.65061224, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.67609715, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.20544434, + "step": 3113, + "time_per_iteration": 2.761282444000244 + }, + { + "auxiliary_loss_clip": 0.01499262, + "auxiliary_loss_mlp": 0.01053276, + "balance_loss_clip": 1.32329917, + "balance_loss_mlp": 1.03320122, + "epoch": 0.1872238088080565, + "flos": 25195029058080.0, + "grad_norm": 2.216481549677647, + "language_loss": 0.82033408, + "learning_rate": 3.746248920938024e-06, + "loss": 0.84585953, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.20056152, + "step": 3114, + "time_per_iteration": 2.7935192584991455 + }, + { + "auxiliary_loss_clip": 0.01498594, + "auxiliary_loss_mlp": 0.01059013, + "balance_loss_clip": 1.32266068, + "balance_loss_mlp": 1.03675699, + "epoch": 0.1872839320607245, + "flos": 24139704152880.0, + "grad_norm": 7.646200372618651, + "language_loss": 0.57114345, + "learning_rate": 3.74605902628851e-06, + "loss": 0.5967195, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.22265625, + "step": 3115, + "time_per_iteration": 2.839106798171997 + }, + { + "auxiliary_loss_clip": 0.01490384, + "auxiliary_loss_mlp": 0.0105817, + "balance_loss_clip": 1.31913161, + "balance_loss_mlp": 1.03778577, + "epoch": 0.18734405531339246, + "flos": 21178131251040.0, + "grad_norm": 1.7857224505830156, + "language_loss": 0.7152859, + "learning_rate": 3.745869065428261e-06, + "loss": 0.74077147, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.20385742, + "step": 3116, + "time_per_iteration": 4.271256923675537 + }, + { + "auxiliary_loss_clip": 0.01479894, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.31213868, + "balance_loss_mlp": 1.02213013, + "epoch": 0.18740417856606043, + "flos": 17242091334720.0, + "grad_norm": 1.883070623807313, + "language_loss": 0.7887944, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81400621, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19165039, + "step": 3117, + "time_per_iteration": 2.7504425048828125 + }, + { + "auxiliary_loss_clip": 0.01482472, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.31384873, + "balance_loss_mlp": 1.03575528, + "epoch": 0.1874643018187284, + "flos": 32563965396960.0, + "grad_norm": 1.7098977988617368, + "language_loss": 0.84102607, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86641252, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20410156, + "step": 3118, + "time_per_iteration": 2.868299722671509 + }, + { + "auxiliary_loss_clip": 0.01495752, + "auxiliary_loss_mlp": 0.01053572, + "balance_loss_clip": 1.32383871, + "balance_loss_mlp": 1.03392649, + "epoch": 0.18752442507139636, + "flos": 23263377209280.0, + "grad_norm": 1.8596144292178916, + "language_loss": 0.76690632, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.79239959, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.19641113, + "step": 3119, + "time_per_iteration": 2.7718334197998047 + }, + { + "auxiliary_loss_clip": 0.01488136, + "auxiliary_loss_mlp": 0.01049905, + "balance_loss_clip": 1.31648147, + "balance_loss_mlp": 1.03034282, + "epoch": 0.18758454832406432, + "flos": 21765787387920.0, + "grad_norm": 1.7396706458563773, + "language_loss": 0.82443881, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84981924, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.19567871, + "step": 3120, + "time_per_iteration": 4.381545782089233 + }, + { + "auxiliary_loss_clip": 0.01482403, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_clip": 1.31307173, + "balance_loss_mlp": 1.0274694, + "epoch": 0.1876446715767323, + "flos": 29576257692480.0, + "grad_norm": 1.8224369446582527, + "language_loss": 0.85160917, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87689054, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.18273926, + "step": 3121, + "time_per_iteration": 2.819309711456299 + }, + { + "auxiliary_loss_clip": 0.01479646, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.31246233, + "balance_loss_mlp": 1.02519321, + "epoch": 0.18770479482940028, + "flos": 30346607151720.0, + "grad_norm": 1.7146278690891288, + "language_loss": 0.70559692, + "learning_rate": 3.744727910244937e-06, + "loss": 0.73083949, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1940918, + "step": 3122, + "time_per_iteration": 2.857999324798584 + }, + { + "auxiliary_loss_clip": 0.01484253, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_clip": 1.31518948, + "balance_loss_mlp": 1.02933013, + "epoch": 0.18776491808206824, + "flos": 14469465442680.0, + "grad_norm": 2.187853381444687, + "language_loss": 0.70639849, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.73174191, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20751953, + "step": 3123, + "time_per_iteration": 4.1201372146606445 + }, + { + "auxiliary_loss_clip": 0.01483568, + "auxiliary_loss_mlp": 0.01042623, + "balance_loss_clip": 1.31523728, + "balance_loss_mlp": 1.02288139, + "epoch": 0.1878250413347362, + "flos": 24503913155160.0, + "grad_norm": 4.0084399216419735, + "language_loss": 0.74514914, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.77041101, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19726562, + "step": 3124, + "time_per_iteration": 2.757377862930298 + }, + { + "auxiliary_loss_clip": 0.01480398, + "auxiliary_loss_mlp": 0.01052794, + "balance_loss_clip": 1.3097192, + "balance_loss_mlp": 1.03133631, + "epoch": 0.18788516458740417, + "flos": 39793561160760.0, + "grad_norm": 1.84684289700445, + "language_loss": 0.80488831, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.83022028, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21472168, + "step": 3125, + "time_per_iteration": 2.925116777420044 + }, + { + "auxiliary_loss_clip": 0.013132, + "auxiliary_loss_mlp": 0.01095825, + "balance_loss_clip": 1.23073888, + "balance_loss_mlp": 1.09160507, + "epoch": 0.18794528784007214, + "flos": 64713766862880.0, + "grad_norm": 0.949474258705472, + "language_loss": 0.6360594, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.66014963, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.04223633, + "step": 3126, + "time_per_iteration": 3.3422434329986572 + }, + { + "auxiliary_loss_clip": 0.01484127, + "auxiliary_loss_mlp": 0.01050169, + "balance_loss_clip": 1.31666732, + "balance_loss_mlp": 1.0299753, + "epoch": 0.1880054110927401, + "flos": 28628047305720.0, + "grad_norm": 1.6193723454076001, + "language_loss": 0.81458795, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83993089, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20202637, + "step": 3127, + "time_per_iteration": 2.829927682876587 + }, + { + "auxiliary_loss_clip": 0.01310742, + "auxiliary_loss_mlp": 0.0104815, + "balance_loss_clip": 1.22696912, + "balance_loss_mlp": 1.04385805, + "epoch": 0.1880655343454081, + "flos": 64504573262640.0, + "grad_norm": 0.7807666213720641, + "language_loss": 0.61921537, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64280427, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.04296875, + "step": 3128, + "time_per_iteration": 3.30153751373291 + }, + { + "auxiliary_loss_clip": 0.01494467, + "auxiliary_loss_mlp": 0.01053459, + "balance_loss_clip": 1.31999636, + "balance_loss_mlp": 1.03353894, + "epoch": 0.18812565759807606, + "flos": 32130877970160.0, + "grad_norm": 2.1579815891281555, + "language_loss": 0.71631765, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.74179691, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.19921875, + "step": 3129, + "time_per_iteration": 2.869832754135132 + }, + { + "auxiliary_loss_clip": 0.01488066, + "auxiliary_loss_mlp": 0.01054829, + "balance_loss_clip": 1.31885719, + "balance_loss_mlp": 1.03425312, + "epoch": 0.18818578085074403, + "flos": 20628548774640.0, + "grad_norm": 1.919799918414421, + "language_loss": 0.85192311, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87735206, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20581055, + "step": 3130, + "time_per_iteration": 2.8753600120544434 + }, + { + "auxiliary_loss_clip": 0.01484841, + "auxiliary_loss_mlp": 0.01055869, + "balance_loss_clip": 1.31229246, + "balance_loss_mlp": 1.03538918, + "epoch": 0.188245904103412, + "flos": 28846459003680.0, + "grad_norm": 1.8619989613547125, + "language_loss": 0.76944858, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.79485571, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.20483398, + "step": 3131, + "time_per_iteration": 2.8148834705352783 + }, + { + "auxiliary_loss_clip": 0.01479234, + "auxiliary_loss_mlp": 0.01055474, + "balance_loss_clip": 1.31245971, + "balance_loss_mlp": 1.03569722, + "epoch": 0.18830602735607996, + "flos": 29425831035120.0, + "grad_norm": 1.8139949098127395, + "language_loss": 0.81273168, + "learning_rate": 3.74282069289017e-06, + "loss": 0.83807874, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19775391, + "step": 3132, + "time_per_iteration": 2.795260429382324 + }, + { + "auxiliary_loss_clip": 0.01498057, + "auxiliary_loss_mlp": 0.01065609, + "balance_loss_clip": 1.3264935, + "balance_loss_mlp": 1.04554582, + "epoch": 0.18836615060874792, + "flos": 28878360193440.0, + "grad_norm": 2.384305678109459, + "language_loss": 0.79685688, + "learning_rate": 3.742629607551614e-06, + "loss": 0.82249355, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.20056152, + "step": 3133, + "time_per_iteration": 2.8058414459228516 + }, + { + "auxiliary_loss_clip": 0.01481458, + "auxiliary_loss_mlp": 0.01067039, + "balance_loss_clip": 1.31249666, + "balance_loss_mlp": 1.04777431, + "epoch": 0.18842627386141592, + "flos": 22606842647880.0, + "grad_norm": 1.7511653124191504, + "language_loss": 0.83054912, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85603416, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19262695, + "step": 3134, + "time_per_iteration": 2.73153018951416 + }, + { + "auxiliary_loss_clip": 0.01483464, + "auxiliary_loss_mlp": 0.0107139, + "balance_loss_clip": 1.31620908, + "balance_loss_mlp": 1.05105257, + "epoch": 0.18848639711408388, + "flos": 24579248308920.0, + "grad_norm": 1.7833487747678451, + "language_loss": 0.83031535, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85586387, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20336914, + "step": 3135, + "time_per_iteration": 2.8521554470062256 + }, + { + "auxiliary_loss_clip": 0.01485805, + "auxiliary_loss_mlp": 0.0105752, + "balance_loss_clip": 1.31511211, + "balance_loss_mlp": 1.03792167, + "epoch": 0.18854652036675185, + "flos": 34173502131600.0, + "grad_norm": 1.8754004016751056, + "language_loss": 0.7807188, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80615199, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.19592285, + "step": 3136, + "time_per_iteration": 2.946592092514038 + }, + { + "auxiliary_loss_clip": 0.01493693, + "auxiliary_loss_mlp": 0.01059197, + "balance_loss_clip": 1.32517791, + "balance_loss_mlp": 1.03964663, + "epoch": 0.1886066436194198, + "flos": 24204724783200.0, + "grad_norm": 1.980881708995818, + "language_loss": 0.81771493, + "learning_rate": 3.741864605462996e-06, + "loss": 0.84324378, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19543457, + "step": 3137, + "time_per_iteration": 2.7542901039123535 + }, + { + "auxiliary_loss_clip": 0.01486372, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.31795645, + "balance_loss_mlp": 1.030164, + "epoch": 0.18866676687208778, + "flos": 21256024731480.0, + "grad_norm": 2.6065145399924394, + "language_loss": 0.80917132, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83454037, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20349121, + "step": 3138, + "time_per_iteration": 2.8312625885009766 + }, + { + "auxiliary_loss_clip": 0.01487888, + "auxiliary_loss_mlp": 0.01068711, + "balance_loss_clip": 1.31417465, + "balance_loss_mlp": 1.0486834, + "epoch": 0.18872689012475574, + "flos": 37316184629760.0, + "grad_norm": 1.9377923469217473, + "language_loss": 0.64089751, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.66646349, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.20007324, + "step": 3139, + "time_per_iteration": 2.9033472537994385 + }, + { + "auxiliary_loss_clip": 0.01478911, + "auxiliary_loss_mlp": 0.01053942, + "balance_loss_clip": 1.31003237, + "balance_loss_mlp": 1.03288925, + "epoch": 0.1887870133774237, + "flos": 21657251576880.0, + "grad_norm": 2.2119222528863776, + "language_loss": 0.71388721, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73921573, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21044922, + "step": 3140, + "time_per_iteration": 2.9298276901245117 + }, + { + "auxiliary_loss_clip": 0.01477212, + "auxiliary_loss_mlp": 0.0105511, + "balance_loss_clip": 1.30765569, + "balance_loss_mlp": 1.03461838, + "epoch": 0.1888471366300917, + "flos": 15928940995200.0, + "grad_norm": 2.3253627670985137, + "language_loss": 0.88071853, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.90604174, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20495605, + "step": 3141, + "time_per_iteration": 2.7003979682922363 + }, + { + "auxiliary_loss_clip": 0.01493598, + "auxiliary_loss_mlp": 0.01056967, + "balance_loss_clip": 1.32008398, + "balance_loss_mlp": 1.03628373, + "epoch": 0.18890725988275966, + "flos": 18556622358480.0, + "grad_norm": 1.9190745513120637, + "language_loss": 0.77133501, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79684067, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20690918, + "step": 3142, + "time_per_iteration": 2.968228340148926 + }, + { + "auxiliary_loss_clip": 0.01483615, + "auxiliary_loss_mlp": 0.01053707, + "balance_loss_clip": 1.31532383, + "balance_loss_mlp": 1.03477669, + "epoch": 0.18896738313542763, + "flos": 28846865087280.0, + "grad_norm": 3.413287153422012, + "language_loss": 0.78845477, + "learning_rate": 3.740715120924971e-06, + "loss": 0.81382799, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.18920898, + "step": 3143, + "time_per_iteration": 2.833970546722412 + }, + { + "auxiliary_loss_clip": 0.01482615, + "auxiliary_loss_mlp": 0.01052444, + "balance_loss_clip": 1.31301069, + "balance_loss_mlp": 1.03168941, + "epoch": 0.1890275063880956, + "flos": 22417042862520.0, + "grad_norm": 2.0490920861102624, + "language_loss": 0.71367753, + "learning_rate": 3.740523309097912e-06, + "loss": 0.7390281, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20751953, + "step": 3144, + "time_per_iteration": 2.7722270488739014 + }, + { + "auxiliary_loss_clip": 0.01488008, + "auxiliary_loss_mlp": 0.01046641, + "balance_loss_clip": 1.31429958, + "balance_loss_mlp": 1.02674496, + "epoch": 0.18908762964076356, + "flos": 24249336389640.0, + "grad_norm": 2.1384369637352276, + "language_loss": 0.73845929, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.76380581, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.19909668, + "step": 3145, + "time_per_iteration": 2.772329092025757 + }, + { + "auxiliary_loss_clip": 0.01477838, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_clip": 1.31082511, + "balance_loss_mlp": 1.02736247, + "epoch": 0.18914775289343153, + "flos": 16987352135760.0, + "grad_norm": 4.790500211133418, + "language_loss": 0.76889974, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79413319, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.18139648, + "step": 3146, + "time_per_iteration": 2.7127318382263184 + }, + { + "auxiliary_loss_clip": 0.01481033, + "auxiliary_loss_mlp": 0.01050806, + "balance_loss_clip": 1.31290889, + "balance_loss_mlp": 1.03120828, + "epoch": 0.1892078761460995, + "flos": 21548837590920.0, + "grad_norm": 1.7462330870082627, + "language_loss": 0.78738081, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.81269926, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.19604492, + "step": 3147, + "time_per_iteration": 2.7783124446868896 + }, + { + "auxiliary_loss_clip": 0.01483202, + "auxiliary_loss_mlp": 0.01048669, + "balance_loss_clip": 1.31499851, + "balance_loss_mlp": 1.02969122, + "epoch": 0.18926799939876748, + "flos": 23006566983960.0, + "grad_norm": 2.9766163479943164, + "language_loss": 0.67438692, + "learning_rate": 3.739755401854267e-06, + "loss": 0.6997056, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.18994141, + "step": 3148, + "time_per_iteration": 2.7375810146331787 + }, + { + "auxiliary_loss_clip": 0.01476764, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.30743468, + "balance_loss_mlp": 1.02409327, + "epoch": 0.18932812265143545, + "flos": 22278108371040.0, + "grad_norm": 2.0086138127404314, + "language_loss": 0.75835335, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78356069, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19873047, + "step": 3149, + "time_per_iteration": 2.7926151752471924 + }, + { + "auxiliary_loss_clip": 0.01469964, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_clip": 1.30750132, + "balance_loss_mlp": 1.02927768, + "epoch": 0.1893882459041034, + "flos": 18629033710320.0, + "grad_norm": 2.21582750851288, + "language_loss": 0.80852211, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83370185, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18725586, + "step": 3150, + "time_per_iteration": 4.157578945159912 + }, + { + "auxiliary_loss_clip": 0.01490757, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.31972635, + "balance_loss_mlp": 1.03060126, + "epoch": 0.18944836915677138, + "flos": 22898112389640.0, + "grad_norm": 2.7015419625553574, + "language_loss": 0.86066723, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.88607609, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.19543457, + "step": 3151, + "time_per_iteration": 2.7440032958984375 + }, + { + "auxiliary_loss_clip": 0.01482762, + "auxiliary_loss_mlp": 0.01047619, + "balance_loss_clip": 1.31307769, + "balance_loss_mlp": 1.02747273, + "epoch": 0.18950849240943934, + "flos": 26802129291120.0, + "grad_norm": 2.0202929365691817, + "language_loss": 0.74809074, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.77339458, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20153809, + "step": 3152, + "time_per_iteration": 2.8253772258758545 + }, + { + "auxiliary_loss_clip": 0.01488497, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_clip": 1.31871915, + "balance_loss_mlp": 1.02703214, + "epoch": 0.1895686156621073, + "flos": 24976454926680.0, + "grad_norm": 2.343258857979576, + "language_loss": 0.75357544, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77893978, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20922852, + "step": 3153, + "time_per_iteration": 2.831923484802246 + }, + { + "auxiliary_loss_clip": 0.01487102, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_clip": 1.31577682, + "balance_loss_mlp": 1.03211176, + "epoch": 0.1896287389147753, + "flos": 21949577136000.0, + "grad_norm": 1.9477708933477473, + "language_loss": 0.79790938, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.82330698, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.20556641, + "step": 3154, + "time_per_iteration": 2.7665557861328125 + }, + { + "auxiliary_loss_clip": 0.01498379, + "auxiliary_loss_mlp": 0.01047692, + "balance_loss_clip": 1.32322729, + "balance_loss_mlp": 1.02661562, + "epoch": 0.18968886216744327, + "flos": 18182626741440.0, + "grad_norm": 2.924925717275809, + "language_loss": 0.73774743, + "learning_rate": 3.738409024548223e-06, + "loss": 0.76320815, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.21044922, + "step": 3155, + "time_per_iteration": 2.687389612197876 + }, + { + "auxiliary_loss_clip": 0.01479974, + "auxiliary_loss_mlp": 0.01052678, + "balance_loss_clip": 1.31234157, + "balance_loss_mlp": 1.03275788, + "epoch": 0.18974898542011123, + "flos": 20417365364760.0, + "grad_norm": 1.7314702043429797, + "language_loss": 0.7398715, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76519811, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19921875, + "step": 3156, + "time_per_iteration": 4.206334829330444 + }, + { + "auxiliary_loss_clip": 0.0149579, + "auxiliary_loss_mlp": 0.01047105, + "balance_loss_clip": 1.32415211, + "balance_loss_mlp": 1.02784073, + "epoch": 0.1898091086727792, + "flos": 23989967837640.0, + "grad_norm": 1.7364491173305323, + "language_loss": 0.68256688, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70799583, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.19262695, + "step": 3157, + "time_per_iteration": 2.79655122756958 + }, + { + "auxiliary_loss_clip": 0.01489582, + "auxiliary_loss_mlp": 0.01046997, + "balance_loss_clip": 1.31947923, + "balance_loss_mlp": 1.0268271, + "epoch": 0.18986923192544716, + "flos": 27642940900920.0, + "grad_norm": 1.906655703484729, + "language_loss": 0.80516303, + "learning_rate": 3.737831016747176e-06, + "loss": 0.8305288, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20153809, + "step": 3158, + "time_per_iteration": 4.30379319190979 + }, + { + "auxiliary_loss_clip": 0.01503882, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_clip": 1.32798147, + "balance_loss_mlp": 1.02806997, + "epoch": 0.18992935517811513, + "flos": 25489709902080.0, + "grad_norm": 1.8603645824052677, + "language_loss": 0.71818495, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74371219, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.20788574, + "step": 3159, + "time_per_iteration": 2.7623159885406494 + }, + { + "auxiliary_loss_clip": 0.01495546, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.32474089, + "balance_loss_mlp": 1.0322659, + "epoch": 0.1899894784307831, + "flos": 17425312565760.0, + "grad_norm": 2.0224782705687647, + "language_loss": 0.85725415, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.88273841, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.2064209, + "step": 3160, + "time_per_iteration": 2.787020444869995 + }, + { + "auxiliary_loss_clip": 0.01480755, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.31596196, + "balance_loss_mlp": 1.03306055, + "epoch": 0.19004960168345109, + "flos": 27498240022320.0, + "grad_norm": 2.2249262474543965, + "language_loss": 0.73588395, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.76120794, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18591309, + "step": 3161, + "time_per_iteration": 2.782959222793579 + }, + { + "auxiliary_loss_clip": 0.01488181, + "auxiliary_loss_mlp": 0.01042506, + "balance_loss_clip": 1.3198241, + "balance_loss_mlp": 1.02377844, + "epoch": 0.19010972493611905, + "flos": 38661845284440.0, + "grad_norm": 1.8254923972161825, + "language_loss": 0.8069793, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83228612, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.1875, + "step": 3162, + "time_per_iteration": 4.3261518478393555 + }, + { + "auxiliary_loss_clip": 0.01497145, + "auxiliary_loss_mlp": 0.01045091, + "balance_loss_clip": 1.32790112, + "balance_loss_mlp": 1.02459908, + "epoch": 0.19016984818878702, + "flos": 19249443812520.0, + "grad_norm": 2.3663056836531573, + "language_loss": 0.75893807, + "learning_rate": 3.73686635253511e-06, + "loss": 0.78436047, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20507812, + "step": 3163, + "time_per_iteration": 2.8714025020599365 + }, + { + "auxiliary_loss_clip": 0.01489696, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.32203376, + "balance_loss_mlp": 1.02509332, + "epoch": 0.19022997144145498, + "flos": 37603962052560.0, + "grad_norm": 1.84091837435043, + "language_loss": 0.74421799, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76955932, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19348145, + "step": 3164, + "time_per_iteration": 2.8516595363616943 + }, + { + "auxiliary_loss_clip": 0.01492484, + "auxiliary_loss_mlp": 0.01044995, + "balance_loss_clip": 1.32539392, + "balance_loss_mlp": 1.02449095, + "epoch": 0.19029009469412295, + "flos": 61541034178320.0, + "grad_norm": 1.5767434771713593, + "language_loss": 0.67013228, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69550705, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20495605, + "step": 3165, + "time_per_iteration": 3.076849937438965 + }, + { + "auxiliary_loss_clip": 0.01497466, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.33021486, + "balance_loss_mlp": 1.0253849, + "epoch": 0.1903502179467909, + "flos": 13958931227400.0, + "grad_norm": 2.2437922416920206, + "language_loss": 0.74175996, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76720428, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21569824, + "step": 3166, + "time_per_iteration": 2.712632894515991 + }, + { + "auxiliary_loss_clip": 0.01311385, + "auxiliary_loss_mlp": 0.01015923, + "balance_loss_clip": 1.2309835, + "balance_loss_mlp": 1.01215637, + "epoch": 0.1904103411994589, + "flos": 66915913954320.0, + "grad_norm": 0.8313686044309218, + "language_loss": 0.50426292, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52753603, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.03759766, + "step": 3167, + "time_per_iteration": 3.284090518951416 + }, + { + "auxiliary_loss_clip": 0.01483281, + "auxiliary_loss_mlp": 0.01052444, + "balance_loss_clip": 1.31888533, + "balance_loss_mlp": 1.03271496, + "epoch": 0.19047046445212687, + "flos": 21913858760400.0, + "grad_norm": 1.7375251483799017, + "language_loss": 0.75066429, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.77602148, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19726562, + "step": 3168, + "time_per_iteration": 2.754237651824951 + }, + { + "auxiliary_loss_clip": 0.01306302, + "auxiliary_loss_mlp": 0.01005134, + "balance_loss_clip": 1.22488904, + "balance_loss_mlp": 1.00084269, + "epoch": 0.19053058770479483, + "flos": 59267954617200.0, + "grad_norm": 0.8756681403737183, + "language_loss": 0.60056311, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62367749, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04296875, + "step": 3169, + "time_per_iteration": 3.1035027503967285 + }, + { + "auxiliary_loss_clip": 0.01505681, + "auxiliary_loss_mlp": 0.01053744, + "balance_loss_clip": 1.33171475, + "balance_loss_mlp": 1.03319216, + "epoch": 0.1905907109574628, + "flos": 23956604746920.0, + "grad_norm": 1.664717222347377, + "language_loss": 0.78474224, + "learning_rate": 3.735513056633436e-06, + "loss": 0.81033647, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20544434, + "step": 3170, + "time_per_iteration": 2.8043506145477295 + }, + { + "auxiliary_loss_clip": 0.01486094, + "auxiliary_loss_mlp": 0.01045121, + "balance_loss_clip": 1.32136929, + "balance_loss_mlp": 1.02516556, + "epoch": 0.19065083421013077, + "flos": 20817049092480.0, + "grad_norm": 1.4942059973588262, + "language_loss": 0.78530836, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.81062055, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19958496, + "step": 3171, + "time_per_iteration": 2.7234108448028564 + }, + { + "auxiliary_loss_clip": 0.01510247, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.33492839, + "balance_loss_mlp": 1.02946746, + "epoch": 0.19071095746279873, + "flos": 31291244002800.0, + "grad_norm": 3.5577738689569576, + "language_loss": 0.78845656, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.81405842, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.2043457, + "step": 3172, + "time_per_iteration": 2.814990997314453 + }, + { + "auxiliary_loss_clip": 0.01492516, + "auxiliary_loss_mlp": 0.01056688, + "balance_loss_clip": 1.32392991, + "balance_loss_mlp": 1.03663731, + "epoch": 0.1907710807154667, + "flos": 14360239289520.0, + "grad_norm": 1.729433393790442, + "language_loss": 0.80241495, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82790703, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20056152, + "step": 3173, + "time_per_iteration": 2.7148613929748535 + }, + { + "auxiliary_loss_clip": 0.01503, + "auxiliary_loss_mlp": 0.01049968, + "balance_loss_clip": 1.33151758, + "balance_loss_mlp": 1.03059697, + "epoch": 0.1908312039681347, + "flos": 26912776736880.0, + "grad_norm": 1.6083845881895602, + "language_loss": 0.79027843, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.81580806, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.19384766, + "step": 3174, + "time_per_iteration": 2.9074509143829346 + }, + { + "auxiliary_loss_clip": 0.01506227, + "auxiliary_loss_mlp": 0.01054688, + "balance_loss_clip": 1.33555436, + "balance_loss_mlp": 1.0341599, + "epoch": 0.19089132722080265, + "flos": 14497874313480.0, + "grad_norm": 1.7152355328037912, + "language_loss": 0.80856073, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.83416986, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20532227, + "step": 3175, + "time_per_iteration": 2.8612937927246094 + }, + { + "auxiliary_loss_clip": 0.01507789, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.33729351, + "balance_loss_mlp": 1.03124428, + "epoch": 0.19095145047347062, + "flos": 13956697767600.0, + "grad_norm": 2.062977467572556, + "language_loss": 0.85712463, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.88271141, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19641113, + "step": 3176, + "time_per_iteration": 2.7530510425567627 + }, + { + "auxiliary_loss_clip": 0.01513862, + "auxiliary_loss_mlp": 0.01052596, + "balance_loss_clip": 1.34029365, + "balance_loss_mlp": 1.03008962, + "epoch": 0.19101157372613858, + "flos": 25307382054960.0, + "grad_norm": 2.6634593218394333, + "language_loss": 0.81341654, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83908117, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.22485352, + "step": 3177, + "time_per_iteration": 2.846738815307617 + }, + { + "auxiliary_loss_clip": 0.01497839, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.3314923, + "balance_loss_mlp": 1.02748156, + "epoch": 0.19107169697880655, + "flos": 20563081452360.0, + "grad_norm": 6.998598660839356, + "language_loss": 0.7557165, + "learning_rate": 3.73396248424356e-06, + "loss": 0.78116608, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19616699, + "step": 3178, + "time_per_iteration": 2.743206262588501 + }, + { + "auxiliary_loss_clip": 0.01506454, + "auxiliary_loss_mlp": 0.01050994, + "balance_loss_clip": 1.33522642, + "balance_loss_mlp": 1.03230178, + "epoch": 0.19113182023147451, + "flos": 22168232484120.0, + "grad_norm": 1.595329715079744, + "language_loss": 0.81329155, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83886594, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.18676758, + "step": 3179, + "time_per_iteration": 2.8471829891204834 + }, + { + "auxiliary_loss_clip": 0.01506679, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_clip": 1.33734381, + "balance_loss_mlp": 1.03374779, + "epoch": 0.19119194348414248, + "flos": 18585599746320.0, + "grad_norm": 2.336415842943708, + "language_loss": 0.80255866, + "learning_rate": 3.733574183478691e-06, + "loss": 0.8281588, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19567871, + "step": 3180, + "time_per_iteration": 2.7707762718200684 + }, + { + "auxiliary_loss_clip": 0.0150106, + "auxiliary_loss_mlp": 0.01055462, + "balance_loss_clip": 1.33245695, + "balance_loss_mlp": 1.03556597, + "epoch": 0.19125206673681047, + "flos": 19031397589800.0, + "grad_norm": 2.0350477033820624, + "language_loss": 0.79546309, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82102823, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19909668, + "step": 3181, + "time_per_iteration": 2.792997121810913 + }, + { + "auxiliary_loss_clip": 0.01503263, + "auxiliary_loss_mlp": 0.01051477, + "balance_loss_clip": 1.33142447, + "balance_loss_mlp": 1.031533, + "epoch": 0.19131218998947844, + "flos": 21695284629000.0, + "grad_norm": 4.483444390827286, + "language_loss": 0.74806893, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.77361631, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.19946289, + "step": 3182, + "time_per_iteration": 2.8178491592407227 + }, + { + "auxiliary_loss_clip": 0.01504213, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.33568192, + "balance_loss_mlp": 1.03176391, + "epoch": 0.1913723132421464, + "flos": 18447152555160.0, + "grad_norm": 1.7579309617452663, + "language_loss": 0.65136635, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67692286, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19677734, + "step": 3183, + "time_per_iteration": 2.7550318241119385 + }, + { + "auxiliary_loss_clip": 0.01513577, + "auxiliary_loss_mlp": 0.01056375, + "balance_loss_clip": 1.34119773, + "balance_loss_mlp": 1.0362289, + "epoch": 0.19143243649481437, + "flos": 27165566734560.0, + "grad_norm": 1.6812528178469228, + "language_loss": 0.73205328, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75775278, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20153809, + "step": 3184, + "time_per_iteration": 2.8777666091918945 + }, + { + "auxiliary_loss_clip": 0.01510842, + "auxiliary_loss_mlp": 0.01048673, + "balance_loss_clip": 1.34017467, + "balance_loss_mlp": 1.02800214, + "epoch": 0.19149255974748233, + "flos": 21723084374400.0, + "grad_norm": 1.803484377293765, + "language_loss": 0.88376445, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90935957, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20654297, + "step": 3185, + "time_per_iteration": 2.744680404663086 + }, + { + "auxiliary_loss_clip": 0.01504706, + "auxiliary_loss_mlp": 0.01047524, + "balance_loss_clip": 1.33622599, + "balance_loss_mlp": 1.02749705, + "epoch": 0.1915526830001503, + "flos": 22968087239880.0, + "grad_norm": 2.0736055642396587, + "language_loss": 0.7306056, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75612795, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20031738, + "step": 3186, + "time_per_iteration": 2.8804423809051514 + }, + { + "auxiliary_loss_clip": 0.0151025, + "auxiliary_loss_mlp": 0.01058232, + "balance_loss_clip": 1.33873653, + "balance_loss_mlp": 1.03557038, + "epoch": 0.1916128062528183, + "flos": 26146366288560.0, + "grad_norm": 1.9559195088295112, + "language_loss": 0.83922124, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86490607, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.2265625, + "step": 3187, + "time_per_iteration": 2.7641565799713135 + }, + { + "auxiliary_loss_clip": 0.01315647, + "auxiliary_loss_mlp": 0.01053388, + "balance_loss_clip": 1.23198724, + "balance_loss_mlp": 1.04888177, + "epoch": 0.19167292950548626, + "flos": 54939621694680.0, + "grad_norm": 3.183685701865006, + "language_loss": 0.55843049, + "learning_rate": 3.732018351516544e-06, + "loss": 0.58212084, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.04516602, + "step": 3188, + "time_per_iteration": 4.7471983432769775 + }, + { + "auxiliary_loss_clip": 0.01505368, + "auxiliary_loss_mlp": 0.01055625, + "balance_loss_clip": 1.33743846, + "balance_loss_mlp": 1.03521657, + "epoch": 0.19173305275815422, + "flos": 29941766162280.0, + "grad_norm": 1.7198231327504123, + "language_loss": 0.70665348, + "learning_rate": 3.731823576891397e-06, + "loss": 0.73226345, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20410156, + "step": 3189, + "time_per_iteration": 2.8050544261932373 + }, + { + "auxiliary_loss_clip": 0.01496365, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_clip": 1.33215165, + "balance_loss_mlp": 1.02478147, + "epoch": 0.1917931760108222, + "flos": 24757677753480.0, + "grad_norm": 1.7831178244566532, + "language_loss": 0.74583781, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.77123892, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18981934, + "step": 3190, + "time_per_iteration": 2.7575793266296387 + }, + { + "auxiliary_loss_clip": 0.01496998, + "auxiliary_loss_mlp": 0.01063552, + "balance_loss_clip": 1.33281612, + "balance_loss_mlp": 1.04413319, + "epoch": 0.19185329926349015, + "flos": 18848216967120.0, + "grad_norm": 1.904055953026011, + "language_loss": 0.84620547, + "learning_rate": 3.73143383063572e-06, + "loss": 0.87181103, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19421387, + "step": 3191, + "time_per_iteration": 2.760284185409546 + }, + { + "auxiliary_loss_clip": 0.01496814, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_clip": 1.33122873, + "balance_loss_mlp": 1.02578497, + "epoch": 0.19191342251615812, + "flos": 22091435429400.0, + "grad_norm": 1.723224400661142, + "language_loss": 0.89970756, + "learning_rate": 3.73123885901997e-06, + "loss": 0.92512387, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19030762, + "step": 3192, + "time_per_iteration": 2.7581918239593506 + }, + { + "auxiliary_loss_clip": 0.01511188, + "auxiliary_loss_mlp": 0.01058325, + "balance_loss_clip": 1.33921099, + "balance_loss_mlp": 1.03647447, + "epoch": 0.19197354576882608, + "flos": 22204072684800.0, + "grad_norm": 1.7272948619571848, + "language_loss": 0.75300092, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77869606, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21850586, + "step": 3193, + "time_per_iteration": 2.7233495712280273 + }, + { + "auxiliary_loss_clip": 0.01511131, + "auxiliary_loss_mlp": 0.01060235, + "balance_loss_clip": 1.33856797, + "balance_loss_mlp": 1.03908765, + "epoch": 0.19203366902149407, + "flos": 24901119772920.0, + "grad_norm": 1.8169218011269836, + "language_loss": 0.75304806, + "learning_rate": 3.730848718849612e-06, + "loss": 0.77876174, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21166992, + "step": 3194, + "time_per_iteration": 2.792604923248291 + }, + { + "auxiliary_loss_clip": 0.01316512, + "auxiliary_loss_mlp": 0.01031452, + "balance_loss_clip": 1.23027158, + "balance_loss_mlp": 1.02692163, + "epoch": 0.19209379227416204, + "flos": 68431192039440.0, + "grad_norm": 0.7811459858218438, + "language_loss": 0.68507814, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70855772, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.04541016, + "step": 3195, + "time_per_iteration": 4.66143798828125 + }, + { + "auxiliary_loss_clip": 0.015039, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_clip": 1.33521962, + "balance_loss_mlp": 1.0264895, + "epoch": 0.19215391552683, + "flos": 22060387015200.0, + "grad_norm": 2.046644811386539, + "language_loss": 0.73143125, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75693119, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19616699, + "step": 3196, + "time_per_iteration": 2.737351655960083 + }, + { + "auxiliary_loss_clip": 0.01516042, + "auxiliary_loss_mlp": 0.01057769, + "balance_loss_clip": 1.34869075, + "balance_loss_mlp": 1.03774226, + "epoch": 0.19221403877949797, + "flos": 20307814344720.0, + "grad_norm": 2.0157456286566724, + "language_loss": 0.83897406, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.86471212, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20043945, + "step": 3197, + "time_per_iteration": 2.8178343772888184 + }, + { + "auxiliary_loss_clip": 0.01499497, + "auxiliary_loss_mlp": 0.01040867, + "balance_loss_clip": 1.33020353, + "balance_loss_mlp": 1.02083945, + "epoch": 0.19227416203216594, + "flos": 23190559773840.0, + "grad_norm": 2.18588380398305, + "language_loss": 0.8085317, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.83393538, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20007324, + "step": 3198, + "time_per_iteration": 4.382335186004639 + }, + { + "auxiliary_loss_clip": 0.01510184, + "auxiliary_loss_mlp": 0.01055887, + "balance_loss_clip": 1.33696413, + "balance_loss_mlp": 1.03471553, + "epoch": 0.1923342852848339, + "flos": 25782279111360.0, + "grad_norm": 4.009310527387856, + "language_loss": 0.78538102, + "learning_rate": 3.729872219959029e-06, + "loss": 0.81104171, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.21166992, + "step": 3199, + "time_per_iteration": 2.8094849586486816 + }, + { + "auxiliary_loss_clip": 0.01504521, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.33488107, + "balance_loss_mlp": 1.0247283, + "epoch": 0.19239440853750187, + "flos": 17132743356480.0, + "grad_norm": 2.4798892204169314, + "language_loss": 0.84115386, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.86664218, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.19592285, + "step": 3200, + "time_per_iteration": 2.753138780593872 + }, + { + "auxiliary_loss_clip": 0.01505123, + "auxiliary_loss_mlp": 0.01052403, + "balance_loss_clip": 1.33755279, + "balance_loss_mlp": 1.03324652, + "epoch": 0.19245453179016986, + "flos": 16439393993760.0, + "grad_norm": 1.822887202144407, + "language_loss": 0.79582119, + "learning_rate": 3.729481161172443e-06, + "loss": 0.82139635, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19165039, + "step": 3201, + "time_per_iteration": 4.256797790527344 + }, + { + "auxiliary_loss_clip": 0.01510071, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.33941615, + "balance_loss_mlp": 1.02878916, + "epoch": 0.19251465504283782, + "flos": 20235159342720.0, + "grad_norm": 2.0162602591534347, + "language_loss": 0.69351149, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71909368, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.19372559, + "step": 3202, + "time_per_iteration": 2.763333320617676 + }, + { + "auxiliary_loss_clip": 0.01496485, + "auxiliary_loss_mlp": 0.01042096, + "balance_loss_clip": 1.33106685, + "balance_loss_mlp": 1.02217627, + "epoch": 0.1925747782955058, + "flos": 19469195586360.0, + "grad_norm": 1.7505899130602636, + "language_loss": 0.91232848, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93771428, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19921875, + "step": 3203, + "time_per_iteration": 2.723979949951172 + }, + { + "auxiliary_loss_clip": 0.01508579, + "auxiliary_loss_mlp": 0.01050492, + "balance_loss_clip": 1.33856022, + "balance_loss_mlp": 1.03007174, + "epoch": 0.19263490154817375, + "flos": 17790130693440.0, + "grad_norm": 2.0498844355059704, + "language_loss": 0.82041132, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.84600204, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20397949, + "step": 3204, + "time_per_iteration": 2.707141637802124 + }, + { + "auxiliary_loss_clip": 0.01497092, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.33029079, + "balance_loss_mlp": 1.02285635, + "epoch": 0.19269502480084172, + "flos": 17461558850040.0, + "grad_norm": 1.7783402901149823, + "language_loss": 0.75666469, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.782058, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19384766, + "step": 3205, + "time_per_iteration": 2.7144672870635986 + }, + { + "auxiliary_loss_clip": 0.01508454, + "auxiliary_loss_mlp": 0.01048569, + "balance_loss_clip": 1.33737409, + "balance_loss_mlp": 1.02835071, + "epoch": 0.19275514805350968, + "flos": 21511900964520.0, + "grad_norm": 2.6997107230884305, + "language_loss": 0.83838964, + "learning_rate": 3.728502366649107e-06, + "loss": 0.86395991, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.20214844, + "step": 3206, + "time_per_iteration": 2.718226432800293 + }, + { + "auxiliary_loss_clip": 0.01325486, + "auxiliary_loss_mlp": 0.01025508, + "balance_loss_clip": 1.24271357, + "balance_loss_mlp": 1.02076364, + "epoch": 0.19281527130617768, + "flos": 47708157946320.0, + "grad_norm": 0.8965405995165998, + "language_loss": 0.60627288, + "learning_rate": 3.728306411079786e-06, + "loss": 0.6297828, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.04736328, + "step": 3207, + "time_per_iteration": 3.126418113708496 + }, + { + "auxiliary_loss_clip": 0.01504776, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_clip": 1.3339994, + "balance_loss_mlp": 1.02544141, + "epoch": 0.19287539455884564, + "flos": 11804928669720.0, + "grad_norm": 2.216642534268996, + "language_loss": 0.75579262, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.78129947, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20458984, + "step": 3208, + "time_per_iteration": 2.78644061088562 + }, + { + "auxiliary_loss_clip": 0.01509858, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.33815551, + "balance_loss_mlp": 1.02613449, + "epoch": 0.1929355178115136, + "flos": 20636345579760.0, + "grad_norm": 1.9876358221119779, + "language_loss": 0.61447716, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.6400519, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21472168, + "step": 3209, + "time_per_iteration": 2.7286412715911865 + }, + { + "auxiliary_loss_clip": 0.0150506, + "auxiliary_loss_mlp": 0.01055582, + "balance_loss_clip": 1.33373916, + "balance_loss_mlp": 1.0347327, + "epoch": 0.19299564106418157, + "flos": 40815441758520.0, + "grad_norm": 1.8802303010721007, + "language_loss": 0.80884922, + "learning_rate": 3.727718151176243e-06, + "loss": 0.83445561, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20861816, + "step": 3210, + "time_per_iteration": 2.9199867248535156 + }, + { + "auxiliary_loss_clip": 0.01491792, + "auxiliary_loss_mlp": 0.01047283, + "balance_loss_clip": 1.32731879, + "balance_loss_mlp": 1.02882934, + "epoch": 0.19305576431684954, + "flos": 11364775388280.0, + "grad_norm": 2.3830441796307547, + "language_loss": 0.82948154, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85487229, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18457031, + "step": 3211, + "time_per_iteration": 2.6971099376678467 + }, + { + "auxiliary_loss_clip": 0.01327808, + "auxiliary_loss_mlp": 0.01015779, + "balance_loss_clip": 1.24458027, + "balance_loss_mlp": 1.01098692, + "epoch": 0.1931158875695175, + "flos": 54524060098200.0, + "grad_norm": 0.9642370452422704, + "language_loss": 0.63701195, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.66044778, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.04785156, + "step": 3212, + "time_per_iteration": 3.1238033771514893 + }, + { + "auxiliary_loss_clip": 0.01499532, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.3335855, + "balance_loss_mlp": 1.02708983, + "epoch": 0.19317601082218547, + "flos": 19833201546840.0, + "grad_norm": 1.5632565085479568, + "language_loss": 0.76478988, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.79024398, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.18786621, + "step": 3213, + "time_per_iteration": 2.7132487297058105 + }, + { + "auxiliary_loss_clip": 0.01508698, + "auxiliary_loss_mlp": 0.01047609, + "balance_loss_clip": 1.33604467, + "balance_loss_mlp": 1.02711737, + "epoch": 0.19323613407485346, + "flos": 13156030844640.0, + "grad_norm": 1.959806135131028, + "language_loss": 0.7129879, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73855102, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.20495605, + "step": 3214, + "time_per_iteration": 2.712752342224121 + }, + { + "auxiliary_loss_clip": 0.01497771, + "auxiliary_loss_mlp": 0.01047336, + "balance_loss_clip": 1.32881284, + "balance_loss_mlp": 1.02715349, + "epoch": 0.19329625732752143, + "flos": 14031261362520.0, + "grad_norm": 2.4645423883786046, + "language_loss": 0.75522959, + "learning_rate": 3.72673640779803e-06, + "loss": 0.78068066, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20178223, + "step": 3215, + "time_per_iteration": 2.7048776149749756 + }, + { + "auxiliary_loss_clip": 0.01492494, + "auxiliary_loss_mlp": 0.01046514, + "balance_loss_clip": 1.32630801, + "balance_loss_mlp": 1.02701139, + "epoch": 0.1933563805801894, + "flos": 23447126349000.0, + "grad_norm": 6.159118570798265, + "language_loss": 0.88404155, + "learning_rate": 3.72653986265854e-06, + "loss": 0.9094317, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19494629, + "step": 3216, + "time_per_iteration": 2.7796576023101807 + }, + { + "auxiliary_loss_clip": 0.01492308, + "auxiliary_loss_mlp": 0.01055201, + "balance_loss_clip": 1.32741618, + "balance_loss_mlp": 1.036116, + "epoch": 0.19341650383285736, + "flos": 20490182800200.0, + "grad_norm": 1.5983203793882115, + "language_loss": 0.80010766, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82558274, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.1907959, + "step": 3217, + "time_per_iteration": 2.727924108505249 + }, + { + "auxiliary_loss_clip": 0.01514552, + "auxiliary_loss_mlp": 0.0105277, + "balance_loss_clip": 1.34019494, + "balance_loss_mlp": 1.03081203, + "epoch": 0.19347662708552532, + "flos": 17863029345600.0, + "grad_norm": 2.8186485015657388, + "language_loss": 0.62060094, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.64627409, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.21948242, + "step": 3218, + "time_per_iteration": 2.7241339683532715 + }, + { + "auxiliary_loss_clip": 0.01509116, + "auxiliary_loss_mlp": 0.0105172, + "balance_loss_clip": 1.33894372, + "balance_loss_mlp": 1.03110838, + "epoch": 0.1935367503381933, + "flos": 18191885447520.0, + "grad_norm": 1.6838534000249459, + "language_loss": 0.80571395, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.83132231, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.20629883, + "step": 3219, + "time_per_iteration": 2.9735147953033447 + }, + { + "auxiliary_loss_clip": 0.01497561, + "auxiliary_loss_mlp": 0.01048594, + "balance_loss_clip": 1.3314786, + "balance_loss_mlp": 1.02911568, + "epoch": 0.19359687359086128, + "flos": 15960882793320.0, + "grad_norm": 2.1603042201890292, + "language_loss": 0.86528081, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.89074248, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19494629, + "step": 3220, + "time_per_iteration": 2.699531078338623 + }, + { + "auxiliary_loss_clip": 0.01481754, + "auxiliary_loss_mlp": 0.01049641, + "balance_loss_clip": 1.32224309, + "balance_loss_mlp": 1.03151476, + "epoch": 0.19365699684352924, + "flos": 21220184530800.0, + "grad_norm": 1.9793693423434038, + "language_loss": 0.84462214, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86993611, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18133545, + "step": 3221, + "time_per_iteration": 2.7761998176574707 + }, + { + "auxiliary_loss_clip": 0.01493585, + "auxiliary_loss_mlp": 0.01053385, + "balance_loss_clip": 1.32852435, + "balance_loss_mlp": 1.03496695, + "epoch": 0.1937171200961972, + "flos": 17315558503920.0, + "grad_norm": 2.0531686438417127, + "language_loss": 0.86229217, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88776183, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18432617, + "step": 3222, + "time_per_iteration": 2.710655689239502 + }, + { + "auxiliary_loss_clip": 0.01499603, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.32939577, + "balance_loss_mlp": 1.02345717, + "epoch": 0.19377724334886517, + "flos": 22640855472360.0, + "grad_norm": 1.6050420867045347, + "language_loss": 0.7842738, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80969727, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19287109, + "step": 3223, + "time_per_iteration": 2.787930727005005 + }, + { + "auxiliary_loss_clip": 0.01496047, + "auxiliary_loss_mlp": 0.01050395, + "balance_loss_clip": 1.33170915, + "balance_loss_mlp": 1.03220367, + "epoch": 0.19383736660153314, + "flos": 15080982314040.0, + "grad_norm": 1.945563833434258, + "language_loss": 0.75500214, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.78046662, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18188477, + "step": 3224, + "time_per_iteration": 2.755913496017456 + }, + { + "auxiliary_loss_clip": 0.01499188, + "auxiliary_loss_mlp": 0.01046332, + "balance_loss_clip": 1.33258867, + "balance_loss_mlp": 1.02793765, + "epoch": 0.1938974898542011, + "flos": 47128281633360.0, + "grad_norm": 2.6884914266252364, + "language_loss": 0.71188927, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73734438, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.18408203, + "step": 3225, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.01492155, + "auxiliary_loss_mlp": 0.01056153, + "balance_loss_clip": 1.32571244, + "balance_loss_mlp": 1.03769946, + "epoch": 0.19395761310686907, + "flos": 25818159920400.0, + "grad_norm": 2.400518492550951, + "language_loss": 0.69029307, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71577615, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18444824, + "step": 3226, + "time_per_iteration": 2.7682528495788574 + }, + { + "auxiliary_loss_clip": 0.0149421, + "auxiliary_loss_mlp": 0.01049256, + "balance_loss_clip": 1.33174229, + "balance_loss_mlp": 1.029217, + "epoch": 0.19401773635953706, + "flos": 23044843686240.0, + "grad_norm": 1.490743023282368, + "language_loss": 0.76323462, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78866929, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20031738, + "step": 3227, + "time_per_iteration": 2.743797779083252 + }, + { + "auxiliary_loss_clip": 0.01498608, + "auxiliary_loss_mlp": 0.01047038, + "balance_loss_clip": 1.32983255, + "balance_loss_mlp": 1.02854824, + "epoch": 0.19407785961220503, + "flos": 15924555292320.0, + "grad_norm": 1.9652990726852697, + "language_loss": 0.69833022, + "learning_rate": 3.724176216414662e-06, + "loss": 0.72378665, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.18493652, + "step": 3228, + "time_per_iteration": 4.191017150878906 + }, + { + "auxiliary_loss_clip": 0.01493033, + "auxiliary_loss_mlp": 0.01046797, + "balance_loss_clip": 1.32565546, + "balance_loss_mlp": 1.02768803, + "epoch": 0.194137982864873, + "flos": 25927304856840.0, + "grad_norm": 1.7587120660032756, + "language_loss": 0.74036932, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76576757, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19104004, + "step": 3229, + "time_per_iteration": 2.8699684143066406 + }, + { + "auxiliary_loss_clip": 0.01493519, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_clip": 1.32693076, + "balance_loss_mlp": 1.02644682, + "epoch": 0.19419810611754096, + "flos": 13265297606160.0, + "grad_norm": 1.8351475062334424, + "language_loss": 0.66210687, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.6874907, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18432617, + "step": 3230, + "time_per_iteration": 2.7431836128234863 + }, + { + "auxiliary_loss_clip": 0.01490752, + "auxiliary_loss_mlp": 0.01043717, + "balance_loss_clip": 1.3252821, + "balance_loss_mlp": 1.02451229, + "epoch": 0.19425822937020892, + "flos": 15709798346760.0, + "grad_norm": 1.9327442252327922, + "language_loss": 0.81999731, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84534204, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19213867, + "step": 3231, + "time_per_iteration": 2.790860891342163 + }, + { + "auxiliary_loss_clip": 0.01504231, + "auxiliary_loss_mlp": 0.01047139, + "balance_loss_clip": 1.33471131, + "balance_loss_mlp": 1.02640867, + "epoch": 0.1943183526228769, + "flos": 23109701883120.0, + "grad_norm": 1.9073609767053654, + "language_loss": 0.87500143, + "learning_rate": 3.72338624150555e-06, + "loss": 0.9005152, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20739746, + "step": 3232, + "time_per_iteration": 2.7195653915405273 + }, + { + "auxiliary_loss_clip": 0.01495654, + "auxiliary_loss_mlp": 0.01047937, + "balance_loss_clip": 1.33231997, + "balance_loss_mlp": 1.02981675, + "epoch": 0.19437847587554485, + "flos": 24717370633200.0, + "grad_norm": 1.7445229840107246, + "language_loss": 0.85159826, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87703419, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18127441, + "step": 3233, + "time_per_iteration": 2.761308193206787 + }, + { + "auxiliary_loss_clip": 0.01503443, + "auxiliary_loss_mlp": 0.01057184, + "balance_loss_clip": 1.33273411, + "balance_loss_mlp": 1.03840899, + "epoch": 0.19443859912821285, + "flos": 23121843782760.0, + "grad_norm": 1.6712074478387333, + "language_loss": 0.89536452, + "learning_rate": 3.722990861915158e-06, + "loss": 0.9209708, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.18762207, + "step": 3234, + "time_per_iteration": 4.195584535598755 + }, + { + "auxiliary_loss_clip": 0.01499644, + "auxiliary_loss_mlp": 0.01048172, + "balance_loss_clip": 1.32819808, + "balance_loss_mlp": 1.02771521, + "epoch": 0.1944987223808808, + "flos": 15088413643920.0, + "grad_norm": 2.2087657809670636, + "language_loss": 0.79297328, + "learning_rate": 3.722793074112234e-06, + "loss": 0.8184514, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.20446777, + "step": 3235, + "time_per_iteration": 2.7698936462402344 + }, + { + "auxiliary_loss_clip": 0.01497475, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_clip": 1.33149278, + "balance_loss_mlp": 1.03375971, + "epoch": 0.19455884563354878, + "flos": 17131078413720.0, + "grad_norm": 2.3320621149200425, + "language_loss": 0.79361045, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81910104, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.17834473, + "step": 3236, + "time_per_iteration": 2.750908136367798 + }, + { + "auxiliary_loss_clip": 0.01493833, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_clip": 1.32965493, + "balance_loss_mlp": 1.02844524, + "epoch": 0.19461896888621674, + "flos": 20198222716320.0, + "grad_norm": 1.641405112360444, + "language_loss": 0.76018715, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78560281, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19287109, + "step": 3237, + "time_per_iteration": 4.2957024574279785 + }, + { + "auxiliary_loss_clip": 0.01497896, + "auxiliary_loss_mlp": 0.01051818, + "balance_loss_clip": 1.32683027, + "balance_loss_mlp": 1.03274477, + "epoch": 0.1946790921388847, + "flos": 25306854146280.0, + "grad_norm": 1.5329294030077985, + "language_loss": 0.75028884, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77578598, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.19055176, + "step": 3238, + "time_per_iteration": 2.773831844329834 + }, + { + "auxiliary_loss_clip": 0.01490074, + "auxiliary_loss_mlp": 0.01044388, + "balance_loss_clip": 1.32522297, + "balance_loss_mlp": 1.02519548, + "epoch": 0.19473921539155267, + "flos": 20198222716320.0, + "grad_norm": 1.8184134628004875, + "language_loss": 0.74013269, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.76547724, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19189453, + "step": 3239, + "time_per_iteration": 4.223737716674805 + }, + { + "auxiliary_loss_clip": 0.01489817, + "auxiliary_loss_mlp": 0.01053905, + "balance_loss_clip": 1.32462478, + "balance_loss_mlp": 1.03522539, + "epoch": 0.19479933864422067, + "flos": 20892587288040.0, + "grad_norm": 1.6224676666292648, + "language_loss": 0.73801863, + "learning_rate": 3.721803155320412e-06, + "loss": 0.76345587, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.18676758, + "step": 3240, + "time_per_iteration": 2.7379767894744873 + }, + { + "auxiliary_loss_clip": 0.01489863, + "auxiliary_loss_mlp": 0.01047889, + "balance_loss_clip": 1.32426393, + "balance_loss_mlp": 1.02856565, + "epoch": 0.19485946189688863, + "flos": 23300354444040.0, + "grad_norm": 1.817424198495978, + "language_loss": 0.66058397, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68596148, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.1932373, + "step": 3241, + "time_per_iteration": 2.8301241397857666 + }, + { + "auxiliary_loss_clip": 0.01490856, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.32573509, + "balance_loss_mlp": 1.02982068, + "epoch": 0.1949195851495566, + "flos": 23300313835680.0, + "grad_norm": 1.341633231470284, + "language_loss": 0.83155888, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85695493, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18933105, + "step": 3242, + "time_per_iteration": 2.7799570560455322 + }, + { + "auxiliary_loss_clip": 0.0134949, + "auxiliary_loss_mlp": 0.01026593, + "balance_loss_clip": 1.26534867, + "balance_loss_mlp": 1.0215621, + "epoch": 0.19497970840222456, + "flos": 64978495718400.0, + "grad_norm": 0.8305623228658621, + "language_loss": 0.57519114, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59895205, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.05029297, + "step": 3243, + "time_per_iteration": 3.2485363483428955 + }, + { + "auxiliary_loss_clip": 0.0148706, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.32067144, + "balance_loss_mlp": 1.02793241, + "epoch": 0.19503983165489253, + "flos": 19649289973680.0, + "grad_norm": 1.7979351912136339, + "language_loss": 0.83766329, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.86300766, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19458008, + "step": 3244, + "time_per_iteration": 2.727308511734009 + }, + { + "auxiliary_loss_clip": 0.0149424, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.32764828, + "balance_loss_mlp": 1.03141344, + "epoch": 0.1950999549075605, + "flos": 21146920403400.0, + "grad_norm": 1.6972807364195142, + "language_loss": 0.76993132, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79537261, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.18469238, + "step": 3245, + "time_per_iteration": 2.7232248783111572 + }, + { + "auxiliary_loss_clip": 0.01483467, + "auxiliary_loss_mlp": 0.01040652, + "balance_loss_clip": 1.31618369, + "balance_loss_mlp": 1.02105379, + "epoch": 0.19516007816022846, + "flos": 20889501052680.0, + "grad_norm": 1.9347779579275428, + "language_loss": 0.84578431, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.8710255, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19604492, + "step": 3246, + "time_per_iteration": 2.735752582550049 + }, + { + "auxiliary_loss_clip": 0.01484317, + "auxiliary_loss_mlp": 0.01046086, + "balance_loss_clip": 1.31619966, + "balance_loss_mlp": 1.02604723, + "epoch": 0.19522020141289645, + "flos": 16915022000640.0, + "grad_norm": 2.644514718339826, + "language_loss": 0.77341443, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.79871845, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20056152, + "step": 3247, + "time_per_iteration": 2.6989948749542236 + }, + { + "auxiliary_loss_clip": 0.01482072, + "auxiliary_loss_mlp": 0.01040769, + "balance_loss_clip": 1.31545758, + "balance_loss_mlp": 1.02250636, + "epoch": 0.19528032466556441, + "flos": 26730570714840.0, + "grad_norm": 1.6548225442671238, + "language_loss": 0.75838852, + "learning_rate": 3.720215890515421e-06, + "loss": 0.7836169, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18261719, + "step": 3248, + "time_per_iteration": 2.822693347930908 + }, + { + "auxiliary_loss_clip": 0.01484929, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_clip": 1.31822681, + "balance_loss_mlp": 1.02736878, + "epoch": 0.19534044791823238, + "flos": 21037775466960.0, + "grad_norm": 1.9202590395773556, + "language_loss": 0.78547037, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.81077671, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.18347168, + "step": 3249, + "time_per_iteration": 2.7379977703094482 + }, + { + "auxiliary_loss_clip": 0.01486853, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.31857455, + "balance_loss_mlp": 1.02618456, + "epoch": 0.19540057117090034, + "flos": 22348895388480.0, + "grad_norm": 1.4492901223350312, + "language_loss": 0.73127967, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75659662, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.18652344, + "step": 3250, + "time_per_iteration": 2.7958428859710693 + }, + { + "auxiliary_loss_clip": 0.0147706, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.31399214, + "balance_loss_mlp": 1.03346777, + "epoch": 0.1954606944235683, + "flos": 20306311835400.0, + "grad_norm": 2.2035099826816262, + "language_loss": 0.79756594, + "learning_rate": 3.719619589699017e-06, + "loss": 0.82285082, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.17980957, + "step": 3251, + "time_per_iteration": 2.719341993331909 + }, + { + "auxiliary_loss_clip": 0.01480191, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.31388927, + "balance_loss_mlp": 1.02204812, + "epoch": 0.19552081767623627, + "flos": 17351236271160.0, + "grad_norm": 21.438772872383364, + "language_loss": 0.84024966, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86546135, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.18920898, + "step": 3252, + "time_per_iteration": 2.711913824081421 + }, + { + "auxiliary_loss_clip": 0.01492332, + "auxiliary_loss_mlp": 0.01047135, + "balance_loss_clip": 1.32067204, + "balance_loss_mlp": 1.02636838, + "epoch": 0.19558094092890424, + "flos": 31983903023400.0, + "grad_norm": 1.734762807194628, + "language_loss": 0.73803204, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76342666, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20776367, + "step": 3253, + "time_per_iteration": 2.9096436500549316 + }, + { + "auxiliary_loss_clip": 0.01488316, + "auxiliary_loss_mlp": 0.01050455, + "balance_loss_clip": 1.31720901, + "balance_loss_mlp": 1.03066671, + "epoch": 0.19564106418157223, + "flos": 22273438409640.0, + "grad_norm": 1.9087751745954187, + "language_loss": 0.77093136, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.79631901, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.19787598, + "step": 3254, + "time_per_iteration": 2.7700374126434326 + }, + { + "auxiliary_loss_clip": 0.013445, + "auxiliary_loss_mlp": 0.01005493, + "balance_loss_clip": 1.26199186, + "balance_loss_mlp": 1.00070095, + "epoch": 0.1957011874342402, + "flos": 54376516634400.0, + "grad_norm": 0.7637309381175369, + "language_loss": 0.55347419, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57697415, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.04785156, + "step": 3255, + "time_per_iteration": 3.2106122970581055 + }, + { + "auxiliary_loss_clip": 0.0149158, + "auxiliary_loss_mlp": 0.01046719, + "balance_loss_clip": 1.32460642, + "balance_loss_mlp": 1.02683473, + "epoch": 0.19576131068690816, + "flos": 16510830744960.0, + "grad_norm": 3.334111453909009, + "language_loss": 0.7123189, + "learning_rate": 3.718624450942688e-06, + "loss": 0.73770189, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19873047, + "step": 3256, + "time_per_iteration": 2.775491714477539 + }, + { + "auxiliary_loss_clip": 0.0147704, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.31087327, + "balance_loss_mlp": 1.02822614, + "epoch": 0.19582143393957613, + "flos": 14723676732960.0, + "grad_norm": 2.1507912118088344, + "language_loss": 0.80930299, + "learning_rate": 3.718425227649987e-06, + "loss": 0.8345499, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1940918, + "step": 3257, + "time_per_iteration": 2.7434134483337402 + }, + { + "auxiliary_loss_clip": 0.01486696, + "auxiliary_loss_mlp": 0.01047547, + "balance_loss_clip": 1.32028818, + "balance_loss_mlp": 1.02893889, + "epoch": 0.1958815571922441, + "flos": 24430852069560.0, + "grad_norm": 1.6801351928423727, + "language_loss": 0.75111508, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77645749, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1862793, + "step": 3258, + "time_per_iteration": 2.823347330093384 + }, + { + "auxiliary_loss_clip": 0.01484277, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_clip": 1.31481647, + "balance_loss_mlp": 1.02233732, + "epoch": 0.19594168044491206, + "flos": 24905749125960.0, + "grad_norm": 1.8923161941417708, + "language_loss": 0.74074554, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.76601094, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19934082, + "step": 3259, + "time_per_iteration": 2.80258846282959 + }, + { + "auxiliary_loss_clip": 0.01490102, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.3189137, + "balance_loss_mlp": 1.02677441, + "epoch": 0.19600180369758005, + "flos": 12060764294400.0, + "grad_norm": 2.4304868627687632, + "language_loss": 0.77195275, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79733038, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.2088623, + "step": 3260, + "time_per_iteration": 2.797100305557251 + }, + { + "auxiliary_loss_clip": 0.01484591, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_clip": 1.31629205, + "balance_loss_mlp": 1.02618074, + "epoch": 0.19606192695024802, + "flos": 20855122752960.0, + "grad_norm": 1.9182573550061401, + "language_loss": 0.83198166, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.85728043, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19128418, + "step": 3261, + "time_per_iteration": 2.768249273300171 + }, + { + "auxiliary_loss_clip": 0.01481592, + "auxiliary_loss_mlp": 0.01050736, + "balance_loss_clip": 1.31473851, + "balance_loss_mlp": 1.03021979, + "epoch": 0.19612205020291598, + "flos": 28481397225840.0, + "grad_norm": 1.6250136571084974, + "language_loss": 0.76695305, + "learning_rate": 3.717428133894807e-06, + "loss": 0.79227626, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.2052002, + "step": 3262, + "time_per_iteration": 2.8685250282287598 + }, + { + "auxiliary_loss_clip": 0.01476905, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.31335211, + "balance_loss_mlp": 1.02767551, + "epoch": 0.19618217345558395, + "flos": 25562121253920.0, + "grad_norm": 1.6329996702901322, + "language_loss": 0.86560875, + "learning_rate": 3.71722851973837e-06, + "loss": 0.89084876, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19433594, + "step": 3263, + "time_per_iteration": 2.817760944366455 + }, + { + "auxiliary_loss_clip": 0.01486157, + "auxiliary_loss_mlp": 0.01051066, + "balance_loss_clip": 1.3177433, + "balance_loss_mlp": 1.03192127, + "epoch": 0.1962422967082519, + "flos": 25269836303160.0, + "grad_norm": 2.097234231446114, + "language_loss": 0.73719597, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76256824, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19128418, + "step": 3264, + "time_per_iteration": 2.7844903469085693 + }, + { + "auxiliary_loss_clip": 0.01476594, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_clip": 1.31521833, + "balance_loss_mlp": 1.03409636, + "epoch": 0.19630241996091988, + "flos": 18812051899560.0, + "grad_norm": 1.8762140521534232, + "language_loss": 0.78899634, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.81429189, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18859863, + "step": 3265, + "time_per_iteration": 2.82381010055542 + }, + { + "auxiliary_loss_clip": 0.01329332, + "auxiliary_loss_mlp": 0.01010427, + "balance_loss_clip": 1.24792254, + "balance_loss_mlp": 1.00594509, + "epoch": 0.19636254321358784, + "flos": 62334489794400.0, + "grad_norm": 0.790256391117155, + "language_loss": 0.53399646, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55739403, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04492188, + "step": 3266, + "time_per_iteration": 3.279691457748413 + }, + { + "auxiliary_loss_clip": 0.01488492, + "auxiliary_loss_mlp": 0.01059909, + "balance_loss_clip": 1.31894934, + "balance_loss_mlp": 1.03893971, + "epoch": 0.19642266646625584, + "flos": 21074306009760.0, + "grad_norm": 1.996600360646256, + "language_loss": 0.80320054, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82868451, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20959473, + "step": 3267, + "time_per_iteration": 4.128955602645874 + }, + { + "auxiliary_loss_clip": 0.01469368, + "auxiliary_loss_mlp": 0.0104693, + "balance_loss_clip": 1.30745673, + "balance_loss_mlp": 1.02718866, + "epoch": 0.1964827897189238, + "flos": 14542810786800.0, + "grad_norm": 2.5304578046648762, + "language_loss": 0.86748236, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89264536, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19763184, + "step": 3268, + "time_per_iteration": 2.733621120452881 + }, + { + "auxiliary_loss_clip": 0.01473984, + "auxiliary_loss_mlp": 0.01046477, + "balance_loss_clip": 1.31141376, + "balance_loss_mlp": 1.02832115, + "epoch": 0.19654291297159177, + "flos": 19249321987440.0, + "grad_norm": 2.473623129611164, + "language_loss": 0.69324523, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71844983, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18164062, + "step": 3269, + "time_per_iteration": 2.7087714672088623 + }, + { + "auxiliary_loss_clip": 0.01480314, + "auxiliary_loss_mlp": 0.01050806, + "balance_loss_clip": 1.31304288, + "balance_loss_mlp": 1.03049266, + "epoch": 0.19660303622425973, + "flos": 25781263902360.0, + "grad_norm": 1.8851079899124472, + "language_loss": 0.80984569, + "learning_rate": 3.715829397778135e-06, + "loss": 0.83515692, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.203125, + "step": 3270, + "time_per_iteration": 2.7857894897460938 + }, + { + "auxiliary_loss_clip": 0.01472061, + "auxiliary_loss_mlp": 0.0104362, + "balance_loss_clip": 1.30747855, + "balance_loss_mlp": 1.02458262, + "epoch": 0.1966631594769277, + "flos": 20600018078760.0, + "grad_norm": 1.933909763588955, + "language_loss": 0.8441484, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86930519, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19042969, + "step": 3271, + "time_per_iteration": 2.7518510818481445 + }, + { + "auxiliary_loss_clip": 0.01469557, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.31084991, + "balance_loss_mlp": 1.031528, + "epoch": 0.19672328272959566, + "flos": 23628479595480.0, + "grad_norm": 2.24431770913162, + "language_loss": 0.79969704, + "learning_rate": 3.715429062953087e-06, + "loss": 0.824898, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18994141, + "step": 3272, + "time_per_iteration": 4.330741882324219 + }, + { + "auxiliary_loss_clip": 0.01476232, + "auxiliary_loss_mlp": 0.01045112, + "balance_loss_clip": 1.31000733, + "balance_loss_mlp": 1.02497768, + "epoch": 0.19678340598226365, + "flos": 23115955570560.0, + "grad_norm": 1.8272550215599153, + "language_loss": 0.80716455, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.83237803, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20141602, + "step": 3273, + "time_per_iteration": 2.793151617050171 + }, + { + "auxiliary_loss_clip": 0.01476466, + "auxiliary_loss_mlp": 0.01047058, + "balance_loss_clip": 1.31006563, + "balance_loss_mlp": 1.02805638, + "epoch": 0.19684352923493162, + "flos": 24540200047800.0, + "grad_norm": 1.6785441079894796, + "language_loss": 0.77978832, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80502355, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18994141, + "step": 3274, + "time_per_iteration": 2.828709602355957 + }, + { + "auxiliary_loss_clip": 0.014799, + "auxiliary_loss_mlp": 0.01049839, + "balance_loss_clip": 1.31315756, + "balance_loss_mlp": 1.02993107, + "epoch": 0.19690365248759958, + "flos": 21801180896640.0, + "grad_norm": 3.2276541592035635, + "language_loss": 0.8130995, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83839685, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19921875, + "step": 3275, + "time_per_iteration": 2.857473850250244 + }, + { + "auxiliary_loss_clip": 0.01474168, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.30851245, + "balance_loss_mlp": 1.02686977, + "epoch": 0.19696377574026755, + "flos": 19060578019440.0, + "grad_norm": 1.9253886852782285, + "language_loss": 0.81258559, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83778077, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18469238, + "step": 3276, + "time_per_iteration": 4.352791786193848 + }, + { + "auxiliary_loss_clip": 0.01467445, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.30383444, + "balance_loss_mlp": 1.01821184, + "epoch": 0.19702389899293551, + "flos": 22825863471240.0, + "grad_norm": 15.220678651337224, + "language_loss": 0.89945287, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.9244988, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18933105, + "step": 3277, + "time_per_iteration": 2.7346858978271484 + }, + { + "auxiliary_loss_clip": 0.01479831, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_clip": 1.3095175, + "balance_loss_mlp": 1.0256989, + "epoch": 0.19708402224560348, + "flos": 22899533682240.0, + "grad_norm": 2.399048925958411, + "language_loss": 0.62934732, + "learning_rate": 3.714226497539239e-06, + "loss": 0.65462518, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.22253418, + "step": 3278, + "time_per_iteration": 2.795851469039917 + }, + { + "auxiliary_loss_clip": 0.0147481, + "auxiliary_loss_mlp": 0.01053527, + "balance_loss_clip": 1.30664957, + "balance_loss_mlp": 1.03271317, + "epoch": 0.19714414549827144, + "flos": 25667570829600.0, + "grad_norm": 2.3960298040731476, + "language_loss": 0.74501646, + "learning_rate": 3.714025842413166e-06, + "loss": 0.77029985, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20812988, + "step": 3279, + "time_per_iteration": 4.300048351287842 + }, + { + "auxiliary_loss_clip": 0.01468133, + "auxiliary_loss_mlp": 0.01047587, + "balance_loss_clip": 1.30066085, + "balance_loss_mlp": 1.02717853, + "epoch": 0.19720426875093944, + "flos": 23921251846560.0, + "grad_norm": 2.641569490222117, + "language_loss": 0.82879555, + "learning_rate": 3.713825122291061e-06, + "loss": 0.85395277, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20422363, + "step": 3280, + "time_per_iteration": 2.7809619903564453 + }, + { + "auxiliary_loss_clip": 0.01477195, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.31040823, + "balance_loss_mlp": 1.02668118, + "epoch": 0.1972643920036074, + "flos": 13886032575240.0, + "grad_norm": 2.750038670986602, + "language_loss": 0.78208965, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80731654, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.18811035, + "step": 3281, + "time_per_iteration": 2.8634703159332275 + }, + { + "auxiliary_loss_clip": 0.01469083, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.30826306, + "balance_loss_mlp": 1.0176084, + "epoch": 0.19732451525627537, + "flos": 19868188971960.0, + "grad_norm": 1.6641575845570655, + "language_loss": 0.79905617, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.82411635, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1932373, + "step": 3282, + "time_per_iteration": 2.727015972137451 + }, + { + "auxiliary_loss_clip": 0.0148628, + "auxiliary_loss_mlp": 0.010444, + "balance_loss_clip": 1.31731606, + "balance_loss_mlp": 1.02449238, + "epoch": 0.19738463850894333, + "flos": 24978932036640.0, + "grad_norm": 2.0153919630126182, + "language_loss": 0.71919, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74449682, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19909668, + "step": 3283, + "time_per_iteration": 2.779944658279419 + }, + { + "auxiliary_loss_clip": 0.01487331, + "auxiliary_loss_mlp": 0.01051131, + "balance_loss_clip": 1.3176527, + "balance_loss_mlp": 1.03137839, + "epoch": 0.1974447617616113, + "flos": 18373441735800.0, + "grad_norm": 2.181060714552813, + "language_loss": 0.79179579, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.8171804, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19750977, + "step": 3284, + "time_per_iteration": 2.715236186981201 + }, + { + "auxiliary_loss_clip": 0.01483645, + "auxiliary_loss_mlp": 0.01047293, + "balance_loss_clip": 1.3139056, + "balance_loss_mlp": 1.02736092, + "epoch": 0.19750488501427926, + "flos": 22898071781280.0, + "grad_norm": 1.992321352919022, + "language_loss": 0.86480957, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.89011896, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19934082, + "step": 3285, + "time_per_iteration": 2.8100719451904297 + }, + { + "auxiliary_loss_clip": 0.01483898, + "auxiliary_loss_mlp": 0.01044499, + "balance_loss_clip": 1.32021141, + "balance_loss_mlp": 1.02515125, + "epoch": 0.19756500826694723, + "flos": 21876516050400.0, + "grad_norm": 2.114720670712303, + "language_loss": 0.8832249, + "learning_rate": 3.712619437068174e-06, + "loss": 0.9085089, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19335938, + "step": 3286, + "time_per_iteration": 2.774064302444458 + }, + { + "auxiliary_loss_clip": 0.0149235, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.32190812, + "balance_loss_mlp": 1.02950847, + "epoch": 0.19762513151961522, + "flos": 15163220889000.0, + "grad_norm": 3.148716880158304, + "language_loss": 0.78045875, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80588698, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.2097168, + "step": 3287, + "time_per_iteration": 2.755659341812134 + }, + { + "auxiliary_loss_clip": 0.01490184, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.32050872, + "balance_loss_mlp": 1.02582359, + "epoch": 0.1976852547722832, + "flos": 16983128866320.0, + "grad_norm": 2.9115513195201497, + "language_loss": 0.81198245, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83734965, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20715332, + "step": 3288, + "time_per_iteration": 2.755798578262329 + }, + { + "auxiliary_loss_clip": 0.01479105, + "auxiliary_loss_mlp": 0.01048201, + "balance_loss_clip": 1.31788969, + "balance_loss_mlp": 1.02786422, + "epoch": 0.19774537802495115, + "flos": 20307773736360.0, + "grad_norm": 1.6120983459031717, + "language_loss": 0.72978902, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75506204, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20324707, + "step": 3289, + "time_per_iteration": 2.7340750694274902 + }, + { + "auxiliary_loss_clip": 0.01486372, + "auxiliary_loss_mlp": 0.01053118, + "balance_loss_clip": 1.32123017, + "balance_loss_mlp": 1.03342438, + "epoch": 0.19780550127761912, + "flos": 27241267363560.0, + "grad_norm": 1.873984895200636, + "language_loss": 0.79483205, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.82022691, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19677734, + "step": 3290, + "time_per_iteration": 2.7958872318267822 + }, + { + "auxiliary_loss_clip": 0.0131039, + "auxiliary_loss_mlp": 0.01007493, + "balance_loss_clip": 1.22982788, + "balance_loss_mlp": 1.00250995, + "epoch": 0.19786562453028708, + "flos": 63567855041760.0, + "grad_norm": 0.9852290586269004, + "language_loss": 0.60324526, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62642407, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04980469, + "step": 3291, + "time_per_iteration": 3.2739450931549072 + }, + { + "auxiliary_loss_clip": 0.01498307, + "auxiliary_loss_mlp": 0.01046236, + "balance_loss_clip": 1.32619357, + "balance_loss_mlp": 1.02452826, + "epoch": 0.19792574778295505, + "flos": 26292285417960.0, + "grad_norm": 1.9679708459717935, + "language_loss": 0.8139832, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83942866, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21716309, + "step": 3292, + "time_per_iteration": 2.7966597080230713 + }, + { + "auxiliary_loss_clip": 0.01491297, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.32845783, + "balance_loss_mlp": 1.0321126, + "epoch": 0.19798587103562304, + "flos": 19942915000320.0, + "grad_norm": 1.8409384968242566, + "language_loss": 0.81603515, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84147078, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20141602, + "step": 3293, + "time_per_iteration": 2.76957106590271 + }, + { + "auxiliary_loss_clip": 0.0150473, + "auxiliary_loss_mlp": 0.01059519, + "balance_loss_clip": 1.33181298, + "balance_loss_mlp": 1.03728628, + "epoch": 0.198045994288291, + "flos": 20125080414000.0, + "grad_norm": 1.9102663327819118, + "language_loss": 0.61475027, + "learning_rate": 3.711008220265093e-06, + "loss": 0.64039278, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.22229004, + "step": 3294, + "time_per_iteration": 2.7843828201293945 + }, + { + "auxiliary_loss_clip": 0.01491219, + "auxiliary_loss_mlp": 0.01050808, + "balance_loss_clip": 1.32441831, + "balance_loss_mlp": 1.03130507, + "epoch": 0.19810611754095897, + "flos": 17972174282040.0, + "grad_norm": 1.8750516429499242, + "language_loss": 0.87630773, + "learning_rate": 3.710806526117251e-06, + "loss": 0.90172803, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19506836, + "step": 3295, + "time_per_iteration": 2.75319766998291 + }, + { + "auxiliary_loss_clip": 0.01490325, + "auxiliary_loss_mlp": 0.01048437, + "balance_loss_clip": 1.32625294, + "balance_loss_mlp": 1.02886295, + "epoch": 0.19816624079362694, + "flos": 15089103986040.0, + "grad_norm": 2.4421517765563627, + "language_loss": 0.80844343, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.83383101, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19555664, + "step": 3296, + "time_per_iteration": 2.746690511703491 + }, + { + "auxiliary_loss_clip": 0.01497817, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.32752228, + "balance_loss_mlp": 1.02995753, + "epoch": 0.1982263640462949, + "flos": 24905789734320.0, + "grad_norm": 1.7539311499245744, + "language_loss": 0.67759395, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70308948, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.21777344, + "step": 3297, + "time_per_iteration": 2.7940831184387207 + }, + { + "auxiliary_loss_clip": 0.01483143, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.32006741, + "balance_loss_mlp": 1.02064693, + "epoch": 0.19828648729896287, + "flos": 20381078472120.0, + "grad_norm": 1.9328997279109108, + "language_loss": 0.81574035, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.8409698, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19165039, + "step": 3298, + "time_per_iteration": 2.8605921268463135 + }, + { + "auxiliary_loss_clip": 0.01502543, + "auxiliary_loss_mlp": 0.01050088, + "balance_loss_clip": 1.32990646, + "balance_loss_mlp": 1.02711606, + "epoch": 0.19834661055163083, + "flos": 18884706901560.0, + "grad_norm": 2.220503013730714, + "language_loss": 0.85590237, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.88142872, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22973633, + "step": 3299, + "time_per_iteration": 2.748708963394165 + }, + { + "auxiliary_loss_clip": 0.01311076, + "auxiliary_loss_mlp": 0.01005834, + "balance_loss_clip": 1.23006392, + "balance_loss_mlp": 1.00118434, + "epoch": 0.19840673380429882, + "flos": 60274583452800.0, + "grad_norm": 0.7539200757344188, + "language_loss": 0.53166991, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55483902, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04638672, + "step": 3300, + "time_per_iteration": 3.2398626804351807 + }, + { + "auxiliary_loss_clip": 0.01492775, + "auxiliary_loss_mlp": 0.01056041, + "balance_loss_clip": 1.32587743, + "balance_loss_mlp": 1.03399944, + "epoch": 0.1984668570569668, + "flos": 19906871757840.0, + "grad_norm": 1.6958601888677913, + "language_loss": 0.74033672, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.76582485, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.22021484, + "step": 3301, + "time_per_iteration": 2.812167167663574 + }, + { + "auxiliary_loss_clip": 0.0149414, + "auxiliary_loss_mlp": 0.01047437, + "balance_loss_clip": 1.32724476, + "balance_loss_mlp": 1.0280292, + "epoch": 0.19852698030963475, + "flos": 15634747451520.0, + "grad_norm": 2.295989563817552, + "language_loss": 0.88521564, + "learning_rate": 3.709392851040235e-06, + "loss": 0.91063142, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1940918, + "step": 3302, + "time_per_iteration": 2.801626682281494 + }, + { + "auxiliary_loss_clip": 0.01495362, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_clip": 1.32676303, + "balance_loss_mlp": 1.03333306, + "epoch": 0.19858710356230272, + "flos": 43150838171040.0, + "grad_norm": 2.109614273743558, + "language_loss": 0.73753667, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76302809, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.2043457, + "step": 3303, + "time_per_iteration": 2.9362478256225586 + }, + { + "auxiliary_loss_clip": 0.01497215, + "auxiliary_loss_mlp": 0.01056507, + "balance_loss_clip": 1.33062243, + "balance_loss_mlp": 1.0358839, + "epoch": 0.19864722681497068, + "flos": 35149755913920.0, + "grad_norm": 1.7068200913041496, + "language_loss": 0.75538409, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.78092134, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20617676, + "step": 3304, + "time_per_iteration": 2.869579553604126 + }, + { + "auxiliary_loss_clip": 0.01485845, + "auxiliary_loss_mlp": 0.01047114, + "balance_loss_clip": 1.32073331, + "balance_loss_mlp": 1.02769446, + "epoch": 0.19870735006763865, + "flos": 19430797059000.0, + "grad_norm": 1.7107900991458485, + "language_loss": 0.86112595, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88645548, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.1940918, + "step": 3305, + "time_per_iteration": 4.177524089813232 + }, + { + "auxiliary_loss_clip": 0.01490657, + "auxiliary_loss_mlp": 0.01042962, + "balance_loss_clip": 1.32046211, + "balance_loss_mlp": 1.02308929, + "epoch": 0.19876747332030664, + "flos": 23552291666160.0, + "grad_norm": 1.622403008638054, + "language_loss": 0.68586648, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.71120262, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19873047, + "step": 3306, + "time_per_iteration": 2.768437385559082 + }, + { + "auxiliary_loss_clip": 0.01492111, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.3262974, + "balance_loss_mlp": 1.02468324, + "epoch": 0.1988275965729746, + "flos": 19834297972560.0, + "grad_norm": 1.6090844138365854, + "language_loss": 0.76670039, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.79205388, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.18554688, + "step": 3307, + "time_per_iteration": 2.873948097229004 + }, + { + "auxiliary_loss_clip": 0.01487342, + "auxiliary_loss_mlp": 0.01051025, + "balance_loss_clip": 1.32211351, + "balance_loss_mlp": 1.032107, + "epoch": 0.19888771982564257, + "flos": 23518887967080.0, + "grad_norm": 2.1016660061323167, + "language_loss": 0.7605406, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78592426, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.18933105, + "step": 3308, + "time_per_iteration": 2.774614095687866 + }, + { + "auxiliary_loss_clip": 0.01492036, + "auxiliary_loss_mlp": 0.01049127, + "balance_loss_clip": 1.32572651, + "balance_loss_mlp": 1.02839625, + "epoch": 0.19894784307831054, + "flos": 18155111254560.0, + "grad_norm": 1.6405931103852314, + "language_loss": 0.75724757, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.78265923, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20727539, + "step": 3309, + "time_per_iteration": 2.7661020755767822 + }, + { + "auxiliary_loss_clip": 0.01478526, + "auxiliary_loss_mlp": 0.01056191, + "balance_loss_clip": 1.31629443, + "balance_loss_mlp": 1.03525817, + "epoch": 0.1990079663309785, + "flos": 24280912712520.0, + "grad_norm": 1.8141584421419032, + "language_loss": 0.87811178, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90345895, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20922852, + "step": 3310, + "time_per_iteration": 2.8459651470184326 + }, + { + "auxiliary_loss_clip": 0.01484235, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.31985223, + "balance_loss_mlp": 1.02401328, + "epoch": 0.19906808958364647, + "flos": 34904478462840.0, + "grad_norm": 2.0506431389421955, + "language_loss": 0.64551973, + "learning_rate": 3.70757060210226e-06, + "loss": 0.67079866, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19628906, + "step": 3311, + "time_per_iteration": 2.904276132583618 + }, + { + "auxiliary_loss_clip": 0.01489498, + "auxiliary_loss_mlp": 0.01054954, + "balance_loss_clip": 1.32056189, + "balance_loss_mlp": 1.03380609, + "epoch": 0.19912821283631443, + "flos": 24030274957920.0, + "grad_norm": 2.219740484517529, + "language_loss": 0.7383008, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76374531, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21154785, + "step": 3312, + "time_per_iteration": 4.247531414031982 + }, + { + "auxiliary_loss_clip": 0.01490012, + "auxiliary_loss_mlp": 0.01057136, + "balance_loss_clip": 1.32449841, + "balance_loss_mlp": 1.03713214, + "epoch": 0.19918833608898243, + "flos": 19863031710240.0, + "grad_norm": 1.8642105206842303, + "language_loss": 0.83339757, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85886902, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20007324, + "step": 3313, + "time_per_iteration": 2.7310492992401123 + }, + { + "auxiliary_loss_clip": 0.01485312, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.32004893, + "balance_loss_mlp": 1.03071308, + "epoch": 0.1992484593416504, + "flos": 29101766719680.0, + "grad_norm": 2.030481958722943, + "language_loss": 0.81375122, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83910203, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1907959, + "step": 3314, + "time_per_iteration": 2.823516368865967 + }, + { + "auxiliary_loss_clip": 0.01472582, + "auxiliary_loss_mlp": 0.01047854, + "balance_loss_clip": 1.31179869, + "balance_loss_mlp": 1.029019, + "epoch": 0.19930858259431836, + "flos": 23300273227320.0, + "grad_norm": 1.5760935117688648, + "language_loss": 0.87677288, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.9019773, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18847656, + "step": 3315, + "time_per_iteration": 4.380500555038452 + }, + { + "auxiliary_loss_clip": 0.01487719, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.32231796, + "balance_loss_mlp": 1.03372073, + "epoch": 0.19936870584698632, + "flos": 25384910060160.0, + "grad_norm": 2.0526141586553814, + "language_loss": 0.71129876, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73670626, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19311523, + "step": 3316, + "time_per_iteration": 2.8508245944976807 + }, + { + "auxiliary_loss_clip": 0.01307851, + "auxiliary_loss_mlp": 0.01002169, + "balance_loss_clip": 1.22796082, + "balance_loss_mlp": 0.99792558, + "epoch": 0.1994288290996543, + "flos": 62185217152320.0, + "grad_norm": 0.8575653139179628, + "language_loss": 0.66387922, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68697935, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.04248047, + "step": 3317, + "time_per_iteration": 4.8241493701934814 + }, + { + "auxiliary_loss_clip": 0.01490708, + "auxiliary_loss_mlp": 0.0105252, + "balance_loss_clip": 1.32136285, + "balance_loss_mlp": 1.03170657, + "epoch": 0.19948895235232225, + "flos": 19030504205880.0, + "grad_norm": 3.5654629246165794, + "language_loss": 0.74514592, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.77057821, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20812988, + "step": 3318, + "time_per_iteration": 2.9274914264678955 + }, + { + "auxiliary_loss_clip": 0.01476538, + "auxiliary_loss_mlp": 0.01052551, + "balance_loss_clip": 1.31422496, + "balance_loss_mlp": 1.03329825, + "epoch": 0.19954907560499022, + "flos": 37822901659200.0, + "grad_norm": 1.855782169520033, + "language_loss": 0.79086107, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81615198, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19274902, + "step": 3319, + "time_per_iteration": 2.9485223293304443 + }, + { + "auxiliary_loss_clip": 0.01484974, + "auxiliary_loss_mlp": 0.01052288, + "balance_loss_clip": 1.31867528, + "balance_loss_mlp": 1.03047228, + "epoch": 0.1996091988576582, + "flos": 49573960016400.0, + "grad_norm": 5.302730108638505, + "language_loss": 0.76297212, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.78834474, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21826172, + "step": 3320, + "time_per_iteration": 3.0816032886505127 + }, + { + "auxiliary_loss_clip": 0.01479995, + "auxiliary_loss_mlp": 0.01046736, + "balance_loss_clip": 1.31534457, + "balance_loss_mlp": 1.02681613, + "epoch": 0.19966932211032618, + "flos": 22640652430560.0, + "grad_norm": 1.5835818206924932, + "language_loss": 0.80657125, + "learning_rate": 3.705539729936701e-06, + "loss": 0.83183855, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19934082, + "step": 3321, + "time_per_iteration": 2.7830748558044434 + }, + { + "auxiliary_loss_clip": 0.01308546, + "auxiliary_loss_mlp": 0.01005697, + "balance_loss_clip": 1.22831774, + "balance_loss_mlp": 1.00142944, + "epoch": 0.19972944536299414, + "flos": 54095277157560.0, + "grad_norm": 0.876341434286015, + "language_loss": 0.65175819, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67490059, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04272461, + "step": 3322, + "time_per_iteration": 3.0900466442108154 + }, + { + "auxiliary_loss_clip": 0.01303682, + "auxiliary_loss_mlp": 0.01011717, + "balance_loss_clip": 1.22306919, + "balance_loss_mlp": 1.00747323, + "epoch": 0.1997895686156621, + "flos": 69368275735560.0, + "grad_norm": 0.7858500796814826, + "language_loss": 0.56924272, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59239662, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04248047, + "step": 3323, + "time_per_iteration": 3.3465123176574707 + }, + { + "auxiliary_loss_clip": 0.01476743, + "auxiliary_loss_mlp": 0.01053758, + "balance_loss_clip": 1.31506753, + "balance_loss_mlp": 1.03233612, + "epoch": 0.19984969186833007, + "flos": 18556744183560.0, + "grad_norm": 1.8392126369452304, + "language_loss": 0.80844736, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.83375239, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.21411133, + "step": 3324, + "time_per_iteration": 2.7344281673431396 + }, + { + "auxiliary_loss_clip": 0.01480457, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.31673908, + "balance_loss_mlp": 1.02391124, + "epoch": 0.19990981512099804, + "flos": 26434955878560.0, + "grad_norm": 1.5368869512021726, + "language_loss": 0.53621292, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56147033, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.21374512, + "step": 3325, + "time_per_iteration": 2.770106315612793 + }, + { + "auxiliary_loss_clip": 0.01480014, + "auxiliary_loss_mlp": 0.01052786, + "balance_loss_clip": 1.31356192, + "balance_loss_mlp": 1.03316426, + "epoch": 0.19996993837366603, + "flos": 16330330274040.0, + "grad_norm": 2.3465237372261436, + "language_loss": 0.86631751, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.89164549, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19628906, + "step": 3326, + "time_per_iteration": 2.763188600540161 + }, + { + "auxiliary_loss_clip": 0.01469961, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_clip": 1.30903244, + "balance_loss_mlp": 1.02555871, + "epoch": 0.200030061626334, + "flos": 20848300548480.0, + "grad_norm": 2.1228639773301827, + "language_loss": 0.72153002, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74667919, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1940918, + "step": 3327, + "time_per_iteration": 2.7237389087677 + }, + { + "auxiliary_loss_clip": 0.01473729, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_clip": 1.3072412, + "balance_loss_mlp": 1.02756202, + "epoch": 0.20009018487900196, + "flos": 23767089220080.0, + "grad_norm": 1.6803735290785353, + "language_loss": 0.77060866, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.79583508, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21350098, + "step": 3328, + "time_per_iteration": 2.781362295150757 + }, + { + "auxiliary_loss_clip": 0.01459795, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.30083656, + "balance_loss_mlp": 1.02828383, + "epoch": 0.20015030813166992, + "flos": 28117513090440.0, + "grad_norm": 1.6889309804831614, + "language_loss": 0.6969856, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.72204876, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18249512, + "step": 3329, + "time_per_iteration": 2.869614601135254 + }, + { + "auxiliary_loss_clip": 0.0147489, + "auxiliary_loss_mlp": 0.01056853, + "balance_loss_clip": 1.30918813, + "balance_loss_mlp": 1.03431082, + "epoch": 0.2002104313843379, + "flos": 26072696077560.0, + "grad_norm": 1.6650126888168502, + "language_loss": 0.81400812, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83932555, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.22546387, + "step": 3330, + "time_per_iteration": 2.810814619064331 + }, + { + "auxiliary_loss_clip": 0.01474327, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.30866063, + "balance_loss_mlp": 1.02784085, + "epoch": 0.20027055463700585, + "flos": 22971863817360.0, + "grad_norm": 2.725058977147656, + "language_loss": 0.77036071, + "learning_rate": 3.703502390349417e-06, + "loss": 0.79557931, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19702148, + "step": 3331, + "time_per_iteration": 2.7420475482940674 + }, + { + "auxiliary_loss_clip": 0.01481281, + "auxiliary_loss_mlp": 0.01056551, + "balance_loss_clip": 1.31218326, + "balance_loss_mlp": 1.03592813, + "epoch": 0.20033067788967382, + "flos": 17170167283200.0, + "grad_norm": 2.840860766388462, + "language_loss": 0.79585987, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.82123816, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20629883, + "step": 3332, + "time_per_iteration": 2.731504201889038 + }, + { + "auxiliary_loss_clip": 0.01292621, + "auxiliary_loss_mlp": 0.01009789, + "balance_loss_clip": 1.2114718, + "balance_loss_mlp": 1.00535417, + "epoch": 0.2003908011423418, + "flos": 60838767957600.0, + "grad_norm": 0.9544415592669984, + "language_loss": 0.61974448, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64276856, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04443359, + "step": 3333, + "time_per_iteration": 3.168365955352783 + }, + { + "auxiliary_loss_clip": 0.01473773, + "auxiliary_loss_mlp": 0.01052446, + "balance_loss_clip": 1.30741751, + "balance_loss_mlp": 1.03231144, + "epoch": 0.20045092439500978, + "flos": 24211506379320.0, + "grad_norm": 2.1625740578247328, + "language_loss": 0.81755602, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.84281826, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20117188, + "step": 3334, + "time_per_iteration": 2.885852813720703 + }, + { + "auxiliary_loss_clip": 0.01480348, + "auxiliary_loss_mlp": 0.01056376, + "balance_loss_clip": 1.31267738, + "balance_loss_mlp": 1.03398824, + "epoch": 0.20051104764767774, + "flos": 29393604978480.0, + "grad_norm": 1.7554436036469216, + "language_loss": 0.74311012, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76847738, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.22375488, + "step": 3335, + "time_per_iteration": 2.811349630355835 + }, + { + "auxiliary_loss_clip": 0.01481612, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_clip": 1.31463814, + "balance_loss_mlp": 1.0463376, + "epoch": 0.2005711709003457, + "flos": 23519091008880.0, + "grad_norm": 1.68840879844233, + "language_loss": 0.79905885, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82453775, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19946289, + "step": 3336, + "time_per_iteration": 2.783770799636841 + }, + { + "auxiliary_loss_clip": 0.01482189, + "auxiliary_loss_mlp": 0.01053028, + "balance_loss_clip": 1.31113601, + "balance_loss_mlp": 1.03096271, + "epoch": 0.20063129415301367, + "flos": 22527852741720.0, + "grad_norm": 2.8698123146243257, + "language_loss": 0.77322698, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79857916, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.2208252, + "step": 3337, + "time_per_iteration": 2.925590991973877 + }, + { + "auxiliary_loss_clip": 0.01482075, + "auxiliary_loss_mlp": 0.01046454, + "balance_loss_clip": 1.3158052, + "balance_loss_mlp": 1.02566385, + "epoch": 0.20069141740568164, + "flos": 25963835399640.0, + "grad_norm": 2.216012463532443, + "language_loss": 0.69168037, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71696568, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20788574, + "step": 3338, + "time_per_iteration": 2.7854599952697754 + }, + { + "auxiliary_loss_clip": 0.01484182, + "auxiliary_loss_mlp": 0.01057092, + "balance_loss_clip": 1.31711173, + "balance_loss_mlp": 1.03669536, + "epoch": 0.2007515406583496, + "flos": 24796035672480.0, + "grad_norm": 1.8924972902753594, + "language_loss": 0.69438672, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71979946, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20385742, + "step": 3339, + "time_per_iteration": 2.8023149967193604 + }, + { + "auxiliary_loss_clip": 0.01492521, + "auxiliary_loss_mlp": 0.01048168, + "balance_loss_clip": 1.32011986, + "balance_loss_mlp": 1.02798605, + "epoch": 0.2008116639110176, + "flos": 37932736937760.0, + "grad_norm": 2.4426294810603495, + "language_loss": 0.66935003, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.69475693, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.20166016, + "step": 3340, + "time_per_iteration": 2.8723278045654297 + }, + { + "auxiliary_loss_clip": 0.01483665, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.31465399, + "balance_loss_mlp": 1.02311981, + "epoch": 0.20087178716368556, + "flos": 20745409299480.0, + "grad_norm": 2.0138280525085634, + "language_loss": 0.74339569, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76867568, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21228027, + "step": 3341, + "time_per_iteration": 2.755364418029785 + }, + { + "auxiliary_loss_clip": 0.01473301, + "auxiliary_loss_mlp": 0.01051525, + "balance_loss_clip": 1.31120849, + "balance_loss_mlp": 1.0315094, + "epoch": 0.20093191041635353, + "flos": 23847622243920.0, + "grad_norm": 1.906142411192417, + "language_loss": 0.72112918, + "learning_rate": 3.70125385615256e-06, + "loss": 0.74637741, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20031738, + "step": 3342, + "time_per_iteration": 2.751474142074585 + }, + { + "auxiliary_loss_clip": 0.01482333, + "auxiliary_loss_mlp": 0.01049559, + "balance_loss_clip": 1.31520784, + "balance_loss_mlp": 1.0298419, + "epoch": 0.2009920336690215, + "flos": 21796226676720.0, + "grad_norm": 4.865379930027461, + "language_loss": 0.7216363, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74695516, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19714355, + "step": 3343, + "time_per_iteration": 4.252162218093872 + }, + { + "auxiliary_loss_clip": 0.01482787, + "auxiliary_loss_mlp": 0.01060259, + "balance_loss_clip": 1.31548584, + "balance_loss_mlp": 1.03906345, + "epoch": 0.20105215692168946, + "flos": 26364818594880.0, + "grad_norm": 1.9198509559528811, + "language_loss": 0.80564559, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83107603, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.21203613, + "step": 3344, + "time_per_iteration": 2.7709109783172607 + }, + { + "auxiliary_loss_clip": 0.01484635, + "auxiliary_loss_mlp": 0.01047406, + "balance_loss_clip": 1.31640983, + "balance_loss_mlp": 1.02752161, + "epoch": 0.20111228017435742, + "flos": 18811970682840.0, + "grad_norm": 2.1791878866146654, + "language_loss": 0.84113657, + "learning_rate": 3.700639264372948e-06, + "loss": 0.86645699, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19873047, + "step": 3345, + "time_per_iteration": 2.7919726371765137 + }, + { + "auxiliary_loss_clip": 0.01464735, + "auxiliary_loss_mlp": 0.01045804, + "balance_loss_clip": 1.30637074, + "balance_loss_mlp": 1.02795792, + "epoch": 0.20117240342702541, + "flos": 19979932843440.0, + "grad_norm": 1.7061118789443321, + "language_loss": 0.68662322, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.71172857, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.17858887, + "step": 3346, + "time_per_iteration": 2.7599377632141113 + }, + { + "auxiliary_loss_clip": 0.01475818, + "auxiliary_loss_mlp": 0.01051973, + "balance_loss_clip": 1.31088448, + "balance_loss_mlp": 1.03185105, + "epoch": 0.20123252667969338, + "flos": 23147125809840.0, + "grad_norm": 1.9859206069269704, + "language_loss": 0.74111164, + "learning_rate": 3.70022921406487e-06, + "loss": 0.76638955, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20129395, + "step": 3347, + "time_per_iteration": 2.81622314453125 + }, + { + "auxiliary_loss_clip": 0.01480364, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_clip": 1.31599689, + "balance_loss_mlp": 1.03084469, + "epoch": 0.20129264993236134, + "flos": 23226887274840.0, + "grad_norm": 1.687161782488772, + "language_loss": 0.86958557, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89488161, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18395996, + "step": 3348, + "time_per_iteration": 2.7557806968688965 + }, + { + "auxiliary_loss_clip": 0.01469228, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.30708027, + "balance_loss_mlp": 1.02361894, + "epoch": 0.2013527731850293, + "flos": 21876475442040.0, + "grad_norm": 1.6497153727702167, + "language_loss": 0.71218908, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73732448, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20715332, + "step": 3349, + "time_per_iteration": 2.757570743560791 + }, + { + "auxiliary_loss_clip": 0.01473895, + "auxiliary_loss_mlp": 0.01046292, + "balance_loss_clip": 1.3102777, + "balance_loss_mlp": 1.02426219, + "epoch": 0.20141289643769728, + "flos": 18045357192720.0, + "grad_norm": 1.8047849647320564, + "language_loss": 0.71635491, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.74155676, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.22033691, + "step": 3350, + "time_per_iteration": 2.6983766555786133 + }, + { + "auxiliary_loss_clip": 0.01478224, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_clip": 1.31272495, + "balance_loss_mlp": 1.02474952, + "epoch": 0.20147301969036524, + "flos": 23956726572000.0, + "grad_norm": 2.213524330189437, + "language_loss": 0.76748151, + "learning_rate": 3.69940833983661e-06, + "loss": 0.79273224, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.2208252, + "step": 3351, + "time_per_iteration": 4.200873136520386 + }, + { + "auxiliary_loss_clip": 0.01482621, + "auxiliary_loss_mlp": 0.01042436, + "balance_loss_clip": 1.31508827, + "balance_loss_mlp": 1.0210731, + "epoch": 0.2015331429430332, + "flos": 25593372709920.0, + "grad_norm": 1.4762475247833509, + "language_loss": 0.8069331, + "learning_rate": 3.699202960155748e-06, + "loss": 0.83218372, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21362305, + "step": 3352, + "time_per_iteration": 2.758707284927368 + }, + { + "auxiliary_loss_clip": 0.01477498, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_clip": 1.31268764, + "balance_loss_mlp": 1.03145504, + "epoch": 0.2015932661957012, + "flos": 26730733148280.0, + "grad_norm": 1.9427110890844226, + "language_loss": 0.80655682, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.83185184, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20556641, + "step": 3353, + "time_per_iteration": 4.286282300949097 + }, + { + "auxiliary_loss_clip": 0.01470417, + "auxiliary_loss_mlp": 0.01042423, + "balance_loss_clip": 1.30953491, + "balance_loss_mlp": 1.02261055, + "epoch": 0.20165338944836916, + "flos": 15637427603280.0, + "grad_norm": 1.6966635178827092, + "language_loss": 0.89892864, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92405701, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19812012, + "step": 3354, + "time_per_iteration": 2.7522332668304443 + }, + { + "auxiliary_loss_clip": 0.01301093, + "auxiliary_loss_mlp": 0.01010538, + "balance_loss_clip": 1.22119844, + "balance_loss_mlp": 1.0053401, + "epoch": 0.20171351270103713, + "flos": 57926111148360.0, + "grad_norm": 0.8243139395875446, + "language_loss": 0.55806804, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.58118439, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.05200195, + "step": 3355, + "time_per_iteration": 4.654572010040283 + }, + { + "auxiliary_loss_clip": 0.01470804, + "auxiliary_loss_mlp": 0.01052491, + "balance_loss_clip": 1.30935419, + "balance_loss_mlp": 1.03282142, + "epoch": 0.2017736359537051, + "flos": 20819363769000.0, + "grad_norm": 1.4779612818392869, + "language_loss": 0.84344596, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86867893, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19677734, + "step": 3356, + "time_per_iteration": 2.8319625854492188 + }, + { + "auxiliary_loss_clip": 0.01500139, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.32699442, + "balance_loss_mlp": 1.02368426, + "epoch": 0.20183375920637306, + "flos": 17096456463840.0, + "grad_norm": 2.560146070289757, + "language_loss": 0.70541602, + "learning_rate": 3.698175095398085e-06, + "loss": 0.73089135, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.23706055, + "step": 3357, + "time_per_iteration": 2.770989179611206 + }, + { + "auxiliary_loss_clip": 0.01478796, + "auxiliary_loss_mlp": 0.01050456, + "balance_loss_clip": 1.31044173, + "balance_loss_mlp": 1.02889132, + "epoch": 0.20189388245904102, + "flos": 18666010945080.0, + "grad_norm": 1.6887894254864533, + "language_loss": 0.71973884, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.74503136, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.21569824, + "step": 3358, + "time_per_iteration": 2.758713483810425 + }, + { + "auxiliary_loss_clip": 0.01467528, + "auxiliary_loss_mlp": 0.01057471, + "balance_loss_clip": 1.3049444, + "balance_loss_mlp": 1.03756332, + "epoch": 0.20195400571170902, + "flos": 16801572578040.0, + "grad_norm": 1.6854607881205887, + "language_loss": 0.82988065, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85513061, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19909668, + "step": 3359, + "time_per_iteration": 2.710090398788452 + }, + { + "auxiliary_loss_clip": 0.01298234, + "auxiliary_loss_mlp": 0.01006, + "balance_loss_clip": 1.21968699, + "balance_loss_mlp": 1.00099337, + "epoch": 0.20201412896437698, + "flos": 67188828717360.0, + "grad_norm": 0.778468750919242, + "language_loss": 0.59033847, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61338085, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.05004883, + "step": 3360, + "time_per_iteration": 3.246459722518921 + }, + { + "auxiliary_loss_clip": 0.01483717, + "auxiliary_loss_mlp": 0.01057148, + "balance_loss_clip": 1.31652141, + "balance_loss_mlp": 1.03647709, + "epoch": 0.20207425221704495, + "flos": 21330060417720.0, + "grad_norm": 2.4872943840154145, + "language_loss": 0.63249719, + "learning_rate": 3.697351644435763e-06, + "loss": 0.65790582, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20703125, + "step": 3361, + "time_per_iteration": 2.722682476043701 + }, + { + "auxiliary_loss_clip": 0.01474294, + "auxiliary_loss_mlp": 0.01052434, + "balance_loss_clip": 1.3109777, + "balance_loss_mlp": 1.03296733, + "epoch": 0.2021343754697129, + "flos": 22532116619520.0, + "grad_norm": 1.9959356597138516, + "language_loss": 0.76082951, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.78609681, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19458008, + "step": 3362, + "time_per_iteration": 2.8627514839172363 + }, + { + "auxiliary_loss_clip": 0.01469351, + "auxiliary_loss_mlp": 0.01049056, + "balance_loss_clip": 1.30582595, + "balance_loss_mlp": 1.02887392, + "epoch": 0.20219449872238088, + "flos": 19067522049000.0, + "grad_norm": 1.4916482440425398, + "language_loss": 0.76898646, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.7941705, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20178223, + "step": 3363, + "time_per_iteration": 2.7125144004821777 + }, + { + "auxiliary_loss_clip": 0.01472941, + "auxiliary_loss_mlp": 0.01056224, + "balance_loss_clip": 1.30775499, + "balance_loss_mlp": 1.03676879, + "epoch": 0.20225462197504884, + "flos": 24722365461480.0, + "grad_norm": 1.5897321070212367, + "language_loss": 0.75190705, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77719873, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19470215, + "step": 3364, + "time_per_iteration": 2.787259578704834 + }, + { + "auxiliary_loss_clip": 0.01487511, + "auxiliary_loss_mlp": 0.01052265, + "balance_loss_clip": 1.31911898, + "balance_loss_mlp": 1.0313201, + "epoch": 0.2023147452277168, + "flos": 22023450388800.0, + "grad_norm": 2.008388276279366, + "language_loss": 0.72495103, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.75034875, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20947266, + "step": 3365, + "time_per_iteration": 2.7527198791503906 + }, + { + "auxiliary_loss_clip": 0.01476791, + "auxiliary_loss_mlp": 0.01050971, + "balance_loss_clip": 1.31250381, + "balance_loss_mlp": 1.03101528, + "epoch": 0.2023748684803848, + "flos": 17749417489560.0, + "grad_norm": 1.9418431037272976, + "language_loss": 0.86139756, + "learning_rate": 3.696320882607286e-06, + "loss": 0.88667518, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19958496, + "step": 3366, + "time_per_iteration": 2.713094711303711 + }, + { + "auxiliary_loss_clip": 0.01476138, + "auxiliary_loss_mlp": 0.01046097, + "balance_loss_clip": 1.31237566, + "balance_loss_mlp": 1.02666628, + "epoch": 0.20243499173305277, + "flos": 31145162439960.0, + "grad_norm": 1.7139249134383712, + "language_loss": 0.69622487, + "learning_rate": 3.696114537236335e-06, + "loss": 0.72144717, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19421387, + "step": 3367, + "time_per_iteration": 2.9578487873077393 + }, + { + "auxiliary_loss_clip": 0.01488191, + "auxiliary_loss_mlp": 0.01047886, + "balance_loss_clip": 1.31703997, + "balance_loss_mlp": 1.0245924, + "epoch": 0.20249511498572073, + "flos": 33845864280480.0, + "grad_norm": 1.7035879488337897, + "language_loss": 0.68930018, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.71466094, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.23278809, + "step": 3368, + "time_per_iteration": 2.859436511993408 + }, + { + "auxiliary_loss_clip": 0.0147298, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.31168401, + "balance_loss_mlp": 1.03051031, + "epoch": 0.2025552382383887, + "flos": 21220671831120.0, + "grad_norm": 1.6998737299436768, + "language_loss": 0.7749846, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.80021977, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20031738, + "step": 3369, + "time_per_iteration": 2.797679901123047 + }, + { + "auxiliary_loss_clip": 0.01483281, + "auxiliary_loss_mlp": 0.01053606, + "balance_loss_clip": 1.31291032, + "balance_loss_mlp": 1.03238714, + "epoch": 0.20261536149105666, + "flos": 14651468422920.0, + "grad_norm": 2.4798158196964204, + "language_loss": 0.65343052, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67879939, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21228027, + "step": 3370, + "time_per_iteration": 2.684781789779663 + }, + { + "auxiliary_loss_clip": 0.01297897, + "auxiliary_loss_mlp": 0.01008007, + "balance_loss_clip": 1.21762741, + "balance_loss_mlp": 1.00347757, + "epoch": 0.20267548474372463, + "flos": 66799053429480.0, + "grad_norm": 0.6778874116495389, + "language_loss": 0.58160853, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.6046676, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04541016, + "step": 3371, + "time_per_iteration": 3.3239340782165527 + }, + { + "auxiliary_loss_clip": 0.01473493, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_clip": 1.30920017, + "balance_loss_mlp": 1.02891064, + "epoch": 0.2027356079963926, + "flos": 24686159785560.0, + "grad_norm": 1.6464726375034342, + "language_loss": 0.91552949, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94076484, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.21142578, + "step": 3372, + "time_per_iteration": 2.819631338119507 + }, + { + "auxiliary_loss_clip": 0.01483316, + "auxiliary_loss_mlp": 0.01053253, + "balance_loss_clip": 1.31434846, + "balance_loss_mlp": 1.03122318, + "epoch": 0.20279573124906058, + "flos": 26397978643800.0, + "grad_norm": 1.643106104398186, + "language_loss": 0.7907514, + "learning_rate": 3.694875114631167e-06, + "loss": 0.81611717, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.22021484, + "step": 3373, + "time_per_iteration": 2.8490076065063477 + }, + { + "auxiliary_loss_clip": 0.01460276, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.29820096, + "balance_loss_mlp": 1.0232048, + "epoch": 0.20285585450172855, + "flos": 33806247502320.0, + "grad_norm": 1.7756009856430601, + "language_loss": 0.71884274, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.74388433, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20654297, + "step": 3374, + "time_per_iteration": 2.866209030151367 + }, + { + "auxiliary_loss_clip": 0.01290531, + "auxiliary_loss_mlp": 0.01006206, + "balance_loss_clip": 1.20794713, + "balance_loss_mlp": 1.00150907, + "epoch": 0.20291597775439651, + "flos": 71181175447800.0, + "grad_norm": 0.969263078672855, + "language_loss": 0.62491333, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64788067, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.046875, + "step": 3375, + "time_per_iteration": 3.1991875171661377 + }, + { + "auxiliary_loss_clip": 0.01467594, + "auxiliary_loss_mlp": 0.01047677, + "balance_loss_clip": 1.30384636, + "balance_loss_mlp": 1.02742386, + "epoch": 0.20297610100706448, + "flos": 19498254190920.0, + "grad_norm": 1.6994978682562725, + "language_loss": 0.82240349, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84755623, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20251465, + "step": 3376, + "time_per_iteration": 2.7655115127563477 + }, + { + "auxiliary_loss_clip": 0.01476811, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.30903268, + "balance_loss_mlp": 1.02692652, + "epoch": 0.20303622425973245, + "flos": 25049475403920.0, + "grad_norm": 1.9890261242051828, + "language_loss": 0.82194108, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.84720743, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.22900391, + "step": 3377, + "time_per_iteration": 2.7752737998962402 + }, + { + "auxiliary_loss_clip": 0.01473836, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_clip": 1.3103143, + "balance_loss_mlp": 1.03281224, + "epoch": 0.2030963475124004, + "flos": 21984889428000.0, + "grad_norm": 1.8594859230392888, + "language_loss": 0.77176416, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79703408, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20336914, + "step": 3378, + "time_per_iteration": 2.827561855316162 + }, + { + "auxiliary_loss_clip": 0.01473482, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_clip": 1.30596673, + "balance_loss_mlp": 1.02687657, + "epoch": 0.2031564707650684, + "flos": 19505157612120.0, + "grad_norm": 1.9217166874657723, + "language_loss": 0.79817587, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.8233856, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20605469, + "step": 3379, + "time_per_iteration": 2.804111957550049 + }, + { + "auxiliary_loss_clip": 0.01467912, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_clip": 1.30659556, + "balance_loss_mlp": 1.02506638, + "epoch": 0.20321659401773637, + "flos": 22752193260240.0, + "grad_norm": 1.6038128704150394, + "language_loss": 0.86961192, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.89473635, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19458008, + "step": 3380, + "time_per_iteration": 2.7985551357269287 + }, + { + "auxiliary_loss_clip": 0.01475632, + "auxiliary_loss_mlp": 0.01048377, + "balance_loss_clip": 1.31140351, + "balance_loss_mlp": 1.02796841, + "epoch": 0.20327671727040433, + "flos": 22461004735200.0, + "grad_norm": 1.8589161569515087, + "language_loss": 0.74935353, + "learning_rate": 3.693218952340186e-06, + "loss": 0.77459359, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20397949, + "step": 3381, + "time_per_iteration": 2.852710008621216 + }, + { + "auxiliary_loss_clip": 0.01476666, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.30849195, + "balance_loss_mlp": 1.03094292, + "epoch": 0.2033368405230723, + "flos": 19539535911840.0, + "grad_norm": 1.8946010895688266, + "language_loss": 0.79518425, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.82046211, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.2019043, + "step": 3382, + "time_per_iteration": 4.168558359146118 + }, + { + "auxiliary_loss_clip": 0.01474753, + "auxiliary_loss_mlp": 0.01048171, + "balance_loss_clip": 1.30614591, + "balance_loss_mlp": 1.02720237, + "epoch": 0.20339696377574026, + "flos": 13813336964880.0, + "grad_norm": 1.8856098556870609, + "language_loss": 0.80309474, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82832402, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.2097168, + "step": 3383, + "time_per_iteration": 2.6978704929351807 + }, + { + "auxiliary_loss_clip": 0.01470322, + "auxiliary_loss_mlp": 0.01041129, + "balance_loss_clip": 1.30604649, + "balance_loss_mlp": 1.02164996, + "epoch": 0.20345708702840823, + "flos": 20344263670800.0, + "grad_norm": 1.8611825700228535, + "language_loss": 0.74355674, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76867127, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19482422, + "step": 3384, + "time_per_iteration": 2.771207332611084 + }, + { + "auxiliary_loss_clip": 0.01486978, + "auxiliary_loss_mlp": 0.01053735, + "balance_loss_clip": 1.31282043, + "balance_loss_mlp": 1.0313952, + "epoch": 0.2035172102810762, + "flos": 20338091200080.0, + "grad_norm": 2.1158942215631247, + "language_loss": 0.77169466, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.7971018, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.22351074, + "step": 3385, + "time_per_iteration": 2.71311092376709 + }, + { + "auxiliary_loss_clip": 0.01475635, + "auxiliary_loss_mlp": 0.01056183, + "balance_loss_clip": 1.30995119, + "balance_loss_mlp": 1.03509521, + "epoch": 0.2035773335337442, + "flos": 23336194644720.0, + "grad_norm": 1.4874034038224755, + "language_loss": 0.68761432, + "learning_rate": 3.692181763924639e-06, + "loss": 0.71293247, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.21069336, + "step": 3386, + "time_per_iteration": 2.762695550918579 + }, + { + "auxiliary_loss_clip": 0.01475606, + "auxiliary_loss_mlp": 0.01053555, + "balance_loss_clip": 1.30796218, + "balance_loss_mlp": 1.03191876, + "epoch": 0.20363745678641215, + "flos": 28336249655280.0, + "grad_norm": 1.3065643042087451, + "language_loss": 0.81181312, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83710468, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21643066, + "step": 3387, + "time_per_iteration": 2.7774317264556885 + }, + { + "auxiliary_loss_clip": 0.01462463, + "auxiliary_loss_mlp": 0.01053545, + "balance_loss_clip": 1.30129468, + "balance_loss_mlp": 1.03336334, + "epoch": 0.20369758003908012, + "flos": 18920222235360.0, + "grad_norm": 2.307726455997236, + "language_loss": 0.79668307, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82184315, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20202637, + "step": 3388, + "time_per_iteration": 2.7144975662231445 + }, + { + "auxiliary_loss_clip": 0.01475813, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_clip": 1.30826485, + "balance_loss_mlp": 1.02764714, + "epoch": 0.20375770329174808, + "flos": 19210964068440.0, + "grad_norm": 1.768485145465893, + "language_loss": 0.72303319, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74827135, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20349121, + "step": 3389, + "time_per_iteration": 2.761946439743042 + }, + { + "auxiliary_loss_clip": 0.01474729, + "auxiliary_loss_mlp": 0.01047719, + "balance_loss_clip": 1.31125212, + "balance_loss_mlp": 1.02692866, + "epoch": 0.20381782654441605, + "flos": 19395769025520.0, + "grad_norm": 1.8745152763050383, + "language_loss": 0.87431592, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89954042, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20788574, + "step": 3390, + "time_per_iteration": 4.27845311164856 + }, + { + "auxiliary_loss_clip": 0.01466564, + "auxiliary_loss_mlp": 0.01052222, + "balance_loss_clip": 1.30229628, + "balance_loss_mlp": 1.03270757, + "epoch": 0.203877949797084, + "flos": 24833215949040.0, + "grad_norm": 1.7786797265811665, + "language_loss": 0.70724308, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73243093, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19519043, + "step": 3391, + "time_per_iteration": 4.477794885635376 + }, + { + "auxiliary_loss_clip": 0.01467696, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.30462885, + "balance_loss_mlp": 1.03591776, + "epoch": 0.20393807304975198, + "flos": 18008176916160.0, + "grad_norm": 2.9307871088854927, + "language_loss": 0.86481464, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.89005637, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20556641, + "step": 3392, + "time_per_iteration": 2.733844757080078 + }, + { + "auxiliary_loss_clip": 0.01479217, + "auxiliary_loss_mlp": 0.01052147, + "balance_loss_clip": 1.31037438, + "balance_loss_mlp": 1.03209615, + "epoch": 0.20399819630241997, + "flos": 24212359154880.0, + "grad_norm": 1.3913834498962083, + "language_loss": 0.80828476, + "learning_rate": 3.69072700532013e-06, + "loss": 0.83359838, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20056152, + "step": 3393, + "time_per_iteration": 4.217505693435669 + }, + { + "auxiliary_loss_clip": 0.01472267, + "auxiliary_loss_mlp": 0.0104382, + "balance_loss_clip": 1.30870318, + "balance_loss_mlp": 1.02386487, + "epoch": 0.20405831955508794, + "flos": 20782021059000.0, + "grad_norm": 1.8173684936317347, + "language_loss": 0.8618378, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88699865, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19958496, + "step": 3394, + "time_per_iteration": 2.7482974529266357 + }, + { + "auxiliary_loss_clip": 0.01468707, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_clip": 1.30369759, + "balance_loss_mlp": 1.02621865, + "epoch": 0.2041184428077559, + "flos": 15491142998640.0, + "grad_norm": 2.035405303607879, + "language_loss": 0.84447736, + "learning_rate": 3.69031078287345e-06, + "loss": 0.8696295, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20288086, + "step": 3395, + "time_per_iteration": 2.81343412399292 + }, + { + "auxiliary_loss_clip": 0.0147735, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.30686474, + "balance_loss_mlp": 1.0238893, + "epoch": 0.20417856606042387, + "flos": 15591678962760.0, + "grad_norm": 2.1673841850502527, + "language_loss": 0.8416816, + "learning_rate": 3.690102575501033e-06, + "loss": 0.86691272, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.21862793, + "step": 3396, + "time_per_iteration": 2.69350266456604 + }, + { + "auxiliary_loss_clip": 0.01468677, + "auxiliary_loss_mlp": 0.01048562, + "balance_loss_clip": 1.30625606, + "balance_loss_mlp": 1.02827311, + "epoch": 0.20423868931309183, + "flos": 24284648681640.0, + "grad_norm": 1.6985151853456806, + "language_loss": 0.77499849, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.8001709, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20300293, + "step": 3397, + "time_per_iteration": 2.752347469329834 + }, + { + "auxiliary_loss_clip": 0.01471342, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_clip": 1.30628002, + "balance_loss_mlp": 1.02960896, + "epoch": 0.2042988125657598, + "flos": 18618231886560.0, + "grad_norm": 2.547872162709216, + "language_loss": 0.87872034, + "learning_rate": 3.689685968497518e-06, + "loss": 0.90391773, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18798828, + "step": 3398, + "time_per_iteration": 2.782951831817627 + }, + { + "auxiliary_loss_clip": 0.01477561, + "auxiliary_loss_mlp": 0.01050525, + "balance_loss_clip": 1.31030202, + "balance_loss_mlp": 1.02927017, + "epoch": 0.2043589358184278, + "flos": 17854826456880.0, + "grad_norm": 1.9099841583087052, + "language_loss": 0.78116572, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80644661, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21240234, + "step": 3399, + "time_per_iteration": 2.699974536895752 + }, + { + "auxiliary_loss_clip": 0.01469976, + "auxiliary_loss_mlp": 0.01045603, + "balance_loss_clip": 1.30277014, + "balance_loss_mlp": 1.02619624, + "epoch": 0.20441905907109575, + "flos": 21440382996600.0, + "grad_norm": 1.981295602615834, + "language_loss": 0.76939559, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.79455137, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.1940918, + "step": 3400, + "time_per_iteration": 2.736293077468872 + }, + { + "auxiliary_loss_clip": 0.01464144, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_clip": 1.30218911, + "balance_loss_mlp": 1.02391601, + "epoch": 0.20447918232376372, + "flos": 27713281226400.0, + "grad_norm": 1.6648231207938264, + "language_loss": 0.79628366, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.82134604, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18188477, + "step": 3401, + "time_per_iteration": 2.8610711097717285 + }, + { + "auxiliary_loss_clip": 0.01471036, + "auxiliary_loss_mlp": 0.01047191, + "balance_loss_clip": 1.30237329, + "balance_loss_mlp": 1.02613854, + "epoch": 0.20453930557643168, + "flos": 30531087241920.0, + "grad_norm": 1.4660511153895086, + "language_loss": 0.68931365, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7144959, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21044922, + "step": 3402, + "time_per_iteration": 2.9279611110687256 + }, + { + "auxiliary_loss_clip": 0.01475779, + "auxiliary_loss_mlp": 0.01045808, + "balance_loss_clip": 1.30796576, + "balance_loss_mlp": 1.02462459, + "epoch": 0.20459942882909965, + "flos": 18992227503600.0, + "grad_norm": 2.0619430364322846, + "language_loss": 0.81178099, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83699685, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21191406, + "step": 3403, + "time_per_iteration": 2.7609636783599854 + }, + { + "auxiliary_loss_clip": 0.01468008, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.30153751, + "balance_loss_mlp": 1.02379835, + "epoch": 0.20465955208176762, + "flos": 20343857587200.0, + "grad_norm": 1.8109753898313041, + "language_loss": 0.84028208, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.86540216, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20214844, + "step": 3404, + "time_per_iteration": 2.7957890033721924 + }, + { + "auxiliary_loss_clip": 0.01467183, + "auxiliary_loss_mlp": 0.0104954, + "balance_loss_clip": 1.29777193, + "balance_loss_mlp": 1.02966762, + "epoch": 0.20471967533443558, + "flos": 21256227773280.0, + "grad_norm": 1.7647340070942505, + "language_loss": 0.86323309, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88840032, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19885254, + "step": 3405, + "time_per_iteration": 2.7127933502197266 + }, + { + "auxiliary_loss_clip": 0.01462766, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.29691672, + "balance_loss_mlp": 1.02130973, + "epoch": 0.20477979858710357, + "flos": 14505630510240.0, + "grad_norm": 2.1297803254889693, + "language_loss": 0.84773338, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.87276864, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19445801, + "step": 3406, + "time_per_iteration": 2.6895813941955566 + }, + { + "auxiliary_loss_clip": 0.01468419, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.30350995, + "balance_loss_mlp": 1.02681494, + "epoch": 0.20483992183977154, + "flos": 11403986082840.0, + "grad_norm": 2.078567391283857, + "language_loss": 0.68293041, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70807123, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.18847656, + "step": 3407, + "time_per_iteration": 2.676331043243408 + }, + { + "auxiliary_loss_clip": 0.01460098, + "auxiliary_loss_mlp": 0.01049498, + "balance_loss_clip": 1.29697561, + "balance_loss_mlp": 1.0297451, + "epoch": 0.2049000450924395, + "flos": 19064679463800.0, + "grad_norm": 2.1217736640636806, + "language_loss": 0.84388888, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86898488, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19750977, + "step": 3408, + "time_per_iteration": 2.740457534790039 + }, + { + "auxiliary_loss_clip": 0.01477229, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.30767894, + "balance_loss_mlp": 1.03556252, + "epoch": 0.20496016834510747, + "flos": 14578650987480.0, + "grad_norm": 2.1604063737395967, + "language_loss": 0.64023936, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.6655677, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20031738, + "step": 3409, + "time_per_iteration": 2.708611011505127 + }, + { + "auxiliary_loss_clip": 0.01465838, + "auxiliary_loss_mlp": 0.0104732, + "balance_loss_clip": 1.30057931, + "balance_loss_mlp": 1.02736473, + "epoch": 0.20502029159777543, + "flos": 22131661332960.0, + "grad_norm": 1.3102325338717264, + "language_loss": 0.80885971, + "learning_rate": 3.687180946553745e-06, + "loss": 0.83399135, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19958496, + "step": 3410, + "time_per_iteration": 2.7446956634521484 + }, + { + "auxiliary_loss_clip": 0.01460737, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_clip": 1.29695356, + "balance_loss_mlp": 1.02897179, + "epoch": 0.2050804148504434, + "flos": 25372565118720.0, + "grad_norm": 2.8373931220271578, + "language_loss": 0.76350707, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78860128, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19726562, + "step": 3411, + "time_per_iteration": 2.7331247329711914 + }, + { + "auxiliary_loss_clip": 0.01464376, + "auxiliary_loss_mlp": 0.01045453, + "balance_loss_clip": 1.29896069, + "balance_loss_mlp": 1.0252111, + "epoch": 0.2051405381031114, + "flos": 23625068493240.0, + "grad_norm": 3.286776377013695, + "language_loss": 0.73933148, + "learning_rate": 3.686762546833722e-06, + "loss": 0.76442969, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20251465, + "step": 3412, + "time_per_iteration": 2.764927864074707 + }, + { + "auxiliary_loss_clip": 0.01467482, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_clip": 1.29808831, + "balance_loss_mlp": 1.02949989, + "epoch": 0.20520066135577936, + "flos": 19569284858520.0, + "grad_norm": 2.1309701970255635, + "language_loss": 0.78059989, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80577266, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20288086, + "step": 3413, + "time_per_iteration": 2.762511968612671 + }, + { + "auxiliary_loss_clip": 0.01458589, + "auxiliary_loss_mlp": 0.01051587, + "balance_loss_clip": 1.29808259, + "balance_loss_mlp": 1.03151202, + "epoch": 0.20526078460844732, + "flos": 17680904540280.0, + "grad_norm": 1.933136036000658, + "language_loss": 0.85077554, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.87587732, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20056152, + "step": 3414, + "time_per_iteration": 2.7717041969299316 + }, + { + "auxiliary_loss_clip": 0.01462252, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_clip": 1.29771018, + "balance_loss_mlp": 1.02206814, + "epoch": 0.2053209078611153, + "flos": 21504469634640.0, + "grad_norm": 1.8708497248631166, + "language_loss": 0.80671918, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.83176661, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.2043457, + "step": 3415, + "time_per_iteration": 2.9080026149749756 + }, + { + "auxiliary_loss_clip": 0.01467638, + "auxiliary_loss_mlp": 0.01044151, + "balance_loss_clip": 1.3030808, + "balance_loss_mlp": 1.02494597, + "epoch": 0.20538103111378325, + "flos": 25668707863680.0, + "grad_norm": 2.008166472519954, + "language_loss": 0.73148286, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75660074, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19213867, + "step": 3416, + "time_per_iteration": 2.7774658203125 + }, + { + "auxiliary_loss_clip": 0.01469857, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.30274487, + "balance_loss_mlp": 1.02437878, + "epoch": 0.20544115436645122, + "flos": 23154191664480.0, + "grad_norm": 2.070726653276217, + "language_loss": 0.79271376, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.81786472, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20874023, + "step": 3417, + "time_per_iteration": 2.7765095233917236 + }, + { + "auxiliary_loss_clip": 0.01466583, + "auxiliary_loss_mlp": 0.01050659, + "balance_loss_clip": 1.30131888, + "balance_loss_mlp": 1.02996492, + "epoch": 0.20550127761911918, + "flos": 19395119291760.0, + "grad_norm": 2.285586041023942, + "language_loss": 0.87454319, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89971566, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20690918, + "step": 3418, + "time_per_iteration": 2.8341939449310303 + }, + { + "auxiliary_loss_clip": 0.01466148, + "auxiliary_loss_mlp": 0.01048689, + "balance_loss_clip": 1.30060375, + "balance_loss_mlp": 1.02794695, + "epoch": 0.20556140087178718, + "flos": 22898274823080.0, + "grad_norm": 2.08878338657752, + "language_loss": 0.62574387, + "learning_rate": 3.685296133421035e-06, + "loss": 0.65089226, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20727539, + "step": 3419, + "time_per_iteration": 2.7661445140838623 + }, + { + "auxiliary_loss_clip": 0.01476509, + "auxiliary_loss_mlp": 0.01056771, + "balance_loss_clip": 1.30719078, + "balance_loss_mlp": 1.03493166, + "epoch": 0.20562152412445514, + "flos": 19794112677360.0, + "grad_norm": 1.7980029194802827, + "language_loss": 0.86608303, + "learning_rate": 3.685086390100674e-06, + "loss": 0.89141577, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.21862793, + "step": 3420, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.01466703, + "auxiliary_loss_mlp": 0.01053276, + "balance_loss_clip": 1.30109239, + "balance_loss_mlp": 1.03396416, + "epoch": 0.2056816473771231, + "flos": 31508031366360.0, + "grad_norm": 2.193313751140844, + "language_loss": 0.71815199, + "learning_rate": 3.684876582881668e-06, + "loss": 0.74335182, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19311523, + "step": 3421, + "time_per_iteration": 4.258956670761108 + }, + { + "auxiliary_loss_clip": 0.0146171, + "auxiliary_loss_mlp": 0.01047506, + "balance_loss_clip": 1.29751539, + "balance_loss_mlp": 1.02640581, + "epoch": 0.20574177062979107, + "flos": 23263620859440.0, + "grad_norm": 1.9698070351767938, + "language_loss": 0.71159333, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73668551, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.21081543, + "step": 3422, + "time_per_iteration": 2.7953145503997803 + }, + { + "auxiliary_loss_clip": 0.01302464, + "auxiliary_loss_mlp": 0.01037751, + "balance_loss_clip": 1.21769452, + "balance_loss_mlp": 1.03357875, + "epoch": 0.20580189388245904, + "flos": 70327044295920.0, + "grad_norm": 0.7460658502563527, + "language_loss": 0.55509126, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57849342, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.04174805, + "step": 3423, + "time_per_iteration": 3.31325101852417 + }, + { + "auxiliary_loss_clip": 0.01474559, + "auxiliary_loss_mlp": 0.0105295, + "balance_loss_clip": 1.30755877, + "balance_loss_mlp": 1.03168321, + "epoch": 0.205862017135127, + "flos": 30744057419640.0, + "grad_norm": 1.740936137536494, + "language_loss": 0.7183404, + "learning_rate": 3.684246777912353e-06, + "loss": 0.74361551, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21289062, + "step": 3424, + "time_per_iteration": 2.796550750732422 + }, + { + "auxiliary_loss_clip": 0.01464767, + "auxiliary_loss_mlp": 0.01054644, + "balance_loss_clip": 1.30117989, + "balance_loss_mlp": 1.03393734, + "epoch": 0.20592214038779497, + "flos": 21329532509040.0, + "grad_norm": 1.4538899534027883, + "language_loss": 0.74982983, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77502394, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20703125, + "step": 3425, + "time_per_iteration": 2.842454433441162 + }, + { + "auxiliary_loss_clip": 0.01458092, + "auxiliary_loss_mlp": 0.01058429, + "balance_loss_clip": 1.29643059, + "balance_loss_mlp": 1.03890252, + "epoch": 0.20598226364046296, + "flos": 22896609880320.0, + "grad_norm": 1.8727701458954629, + "language_loss": 0.88424504, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90941024, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1953125, + "step": 3426, + "time_per_iteration": 2.884300470352173 + }, + { + "auxiliary_loss_clip": 0.01456968, + "auxiliary_loss_mlp": 0.01052973, + "balance_loss_clip": 1.29549813, + "balance_loss_mlp": 1.03317249, + "epoch": 0.20604238689313092, + "flos": 23883868528200.0, + "grad_norm": 1.7233896417720673, + "language_loss": 0.77102864, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.79612803, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19787598, + "step": 3427, + "time_per_iteration": 2.8096330165863037 + }, + { + "auxiliary_loss_clip": 0.01469099, + "auxiliary_loss_mlp": 0.01053481, + "balance_loss_clip": 1.30251205, + "balance_loss_mlp": 1.0328815, + "epoch": 0.2061025101457989, + "flos": 22496601285720.0, + "grad_norm": 1.4445634156883438, + "language_loss": 0.74199677, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76722252, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20617676, + "step": 3428, + "time_per_iteration": 2.728121042251587 + }, + { + "auxiliary_loss_clip": 0.01471678, + "auxiliary_loss_mlp": 0.01051604, + "balance_loss_clip": 1.30301452, + "balance_loss_mlp": 1.03074241, + "epoch": 0.20616263339846685, + "flos": 22783444716240.0, + "grad_norm": 1.6387971852864895, + "language_loss": 0.73694646, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.76217926, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20874023, + "step": 3429, + "time_per_iteration": 4.171383380889893 + }, + { + "auxiliary_loss_clip": 0.01476154, + "auxiliary_loss_mlp": 0.01054408, + "balance_loss_clip": 1.30987513, + "balance_loss_mlp": 1.03374863, + "epoch": 0.20622275665113482, + "flos": 20886536642400.0, + "grad_norm": 1.7925941326768482, + "language_loss": 0.85621697, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.8815226, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20666504, + "step": 3430, + "time_per_iteration": 4.301201105117798 + }, + { + "auxiliary_loss_clip": 0.01469129, + "auxiliary_loss_mlp": 0.01060484, + "balance_loss_clip": 1.30225825, + "balance_loss_mlp": 1.03856111, + "epoch": 0.20628287990380278, + "flos": 19359157266000.0, + "grad_norm": 1.5909936766223334, + "language_loss": 0.69365299, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.7189492, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.21911621, + "step": 3431, + "time_per_iteration": 4.169352293014526 + }, + { + "auxiliary_loss_clip": 0.01298232, + "auxiliary_loss_mlp": 0.01018498, + "balance_loss_clip": 1.21523404, + "balance_loss_mlp": 1.01396775, + "epoch": 0.20634300315647078, + "flos": 71532795858480.0, + "grad_norm": 0.8044778180386067, + "language_loss": 0.60320437, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62637174, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.04541016, + "step": 3432, + "time_per_iteration": 3.3987772464752197 + }, + { + "auxiliary_loss_clip": 0.01469496, + "auxiliary_loss_mlp": 0.0105166, + "balance_loss_clip": 1.30431557, + "balance_loss_mlp": 1.03181112, + "epoch": 0.20640312640913874, + "flos": 21728728936440.0, + "grad_norm": 1.6783932240973252, + "language_loss": 0.7218293, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74704087, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.1986084, + "step": 3433, + "time_per_iteration": 2.750826120376587 + }, + { + "auxiliary_loss_clip": 0.01469981, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_clip": 1.3026166, + "balance_loss_mlp": 1.0313077, + "epoch": 0.2064632496618067, + "flos": 20559020616360.0, + "grad_norm": 1.6432011321588034, + "language_loss": 0.86978352, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89500707, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21057129, + "step": 3434, + "time_per_iteration": 2.7613141536712646 + }, + { + "auxiliary_loss_clip": 0.01472349, + "auxiliary_loss_mlp": 0.01041069, + "balance_loss_clip": 1.30417156, + "balance_loss_mlp": 1.02068448, + "epoch": 0.20652337291447467, + "flos": 29829047690160.0, + "grad_norm": 1.6631199914344883, + "language_loss": 0.69762313, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.72275734, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20397949, + "step": 3435, + "time_per_iteration": 2.784122943878174 + }, + { + "auxiliary_loss_clip": 0.0146436, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.30205226, + "balance_loss_mlp": 1.02273202, + "epoch": 0.20658349616714264, + "flos": 26219264940720.0, + "grad_norm": 1.8507642051860012, + "language_loss": 0.89459765, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91967881, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.21008301, + "step": 3436, + "time_per_iteration": 2.8871982097625732 + }, + { + "auxiliary_loss_clip": 0.01463884, + "auxiliary_loss_mlp": 0.01044378, + "balance_loss_clip": 1.29988515, + "balance_loss_mlp": 1.02380276, + "epoch": 0.2066436194198106, + "flos": 25999919250480.0, + "grad_norm": 1.566781595466881, + "language_loss": 0.76828873, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.79337132, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20556641, + "step": 3437, + "time_per_iteration": 2.8156721591949463 + }, + { + "auxiliary_loss_clip": 0.01462783, + "auxiliary_loss_mlp": 0.01052263, + "balance_loss_clip": 1.29648042, + "balance_loss_mlp": 1.03230774, + "epoch": 0.20670374267247857, + "flos": 21366063051840.0, + "grad_norm": 1.9043358782629707, + "language_loss": 0.77502936, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.80017984, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19958496, + "step": 3438, + "time_per_iteration": 2.8358511924743652 + }, + { + "auxiliary_loss_clip": 0.01291478, + "auxiliary_loss_mlp": 0.01014519, + "balance_loss_clip": 1.21018624, + "balance_loss_mlp": 1.01044166, + "epoch": 0.20676386592514656, + "flos": 66398760576360.0, + "grad_norm": 0.8307013670196868, + "language_loss": 0.67181158, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69487154, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.04077148, + "step": 3439, + "time_per_iteration": 3.313563346862793 + }, + { + "auxiliary_loss_clip": 0.01463151, + "auxiliary_loss_mlp": 0.01050144, + "balance_loss_clip": 1.29548764, + "balance_loss_mlp": 1.02942491, + "epoch": 0.20682398917781453, + "flos": 17279027961120.0, + "grad_norm": 2.069335755836038, + "language_loss": 0.83813709, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86326998, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20715332, + "step": 3440, + "time_per_iteration": 2.7833783626556396 + }, + { + "auxiliary_loss_clip": 0.01469563, + "auxiliary_loss_mlp": 0.01055493, + "balance_loss_clip": 1.30438137, + "balance_loss_mlp": 1.03481066, + "epoch": 0.2068841124304825, + "flos": 18081847127160.0, + "grad_norm": 2.1246398711636982, + "language_loss": 0.85312676, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87837726, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20678711, + "step": 3441, + "time_per_iteration": 2.852372407913208 + }, + { + "auxiliary_loss_clip": 0.01463334, + "auxiliary_loss_mlp": 0.01052901, + "balance_loss_clip": 1.30090749, + "balance_loss_mlp": 1.03156257, + "epoch": 0.20694423568315046, + "flos": 27353092451760.0, + "grad_norm": 1.6361883503632113, + "language_loss": 0.85955906, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88472146, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.21337891, + "step": 3442, + "time_per_iteration": 2.778578281402588 + }, + { + "auxiliary_loss_clip": 0.01474316, + "auxiliary_loss_mlp": 0.01051388, + "balance_loss_clip": 1.30664289, + "balance_loss_mlp": 1.03022885, + "epoch": 0.20700435893581842, + "flos": 20234672042400.0, + "grad_norm": 2.2144578904237266, + "language_loss": 0.72892249, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75417954, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21179199, + "step": 3443, + "time_per_iteration": 2.763693332672119 + }, + { + "auxiliary_loss_clip": 0.01457383, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.2964853, + "balance_loss_mlp": 1.02808142, + "epoch": 0.2070644821884864, + "flos": 20635980104520.0, + "grad_norm": 1.8041104765840479, + "language_loss": 0.8601898, + "learning_rate": 3.680033399147797e-06, + "loss": 0.88523662, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19213867, + "step": 3444, + "time_per_iteration": 2.7975010871887207 + }, + { + "auxiliary_loss_clip": 0.01287522, + "auxiliary_loss_mlp": 0.01001978, + "balance_loss_clip": 1.20626915, + "balance_loss_mlp": 0.99811572, + "epoch": 0.20712460544115438, + "flos": 65955927143160.0, + "grad_norm": 0.6863136675955159, + "language_loss": 0.57141685, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59431183, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.03857422, + "step": 3445, + "time_per_iteration": 3.213911294937134 + }, + { + "auxiliary_loss_clip": 0.01464076, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.30046701, + "balance_loss_mlp": 1.02981257, + "epoch": 0.20718472869382235, + "flos": 19430269150320.0, + "grad_norm": 1.4865074358248715, + "language_loss": 0.78041911, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80556554, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20739746, + "step": 3446, + "time_per_iteration": 2.7468950748443604 + }, + { + "auxiliary_loss_clip": 0.01478429, + "auxiliary_loss_mlp": 0.01057427, + "balance_loss_clip": 1.30528593, + "balance_loss_mlp": 1.03353715, + "epoch": 0.2072448519464903, + "flos": 24504481672200.0, + "grad_norm": 1.9136416040057689, + "language_loss": 0.6224519, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64781046, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.23913574, + "step": 3447, + "time_per_iteration": 2.780214548110962 + }, + { + "auxiliary_loss_clip": 0.01461785, + "auxiliary_loss_mlp": 0.01059449, + "balance_loss_clip": 1.29703724, + "balance_loss_mlp": 1.0382899, + "epoch": 0.20730497519915828, + "flos": 23080886928720.0, + "grad_norm": 1.8786097012240326, + "language_loss": 0.86733204, + "learning_rate": 3.679187663409184e-06, + "loss": 0.89254439, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.21154785, + "step": 3448, + "time_per_iteration": 2.7535455226898193 + }, + { + "auxiliary_loss_clip": 0.01462775, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_clip": 1.29818642, + "balance_loss_mlp": 1.02360356, + "epoch": 0.20736509845182624, + "flos": 21073981142880.0, + "grad_norm": 2.6761293327184665, + "language_loss": 0.76012635, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.78522241, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.2322998, + "step": 3449, + "time_per_iteration": 2.7080276012420654 + }, + { + "auxiliary_loss_clip": 0.01474391, + "auxiliary_loss_mlp": 0.01058362, + "balance_loss_clip": 1.30444169, + "balance_loss_mlp": 1.03691649, + "epoch": 0.2074252217044942, + "flos": 17636983275960.0, + "grad_norm": 1.975331913207097, + "language_loss": 0.76762569, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.79295325, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21435547, + "step": 3450, + "time_per_iteration": 2.8503527641296387 + }, + { + "auxiliary_loss_clip": 0.01469421, + "auxiliary_loss_mlp": 0.01049179, + "balance_loss_clip": 1.30120111, + "balance_loss_mlp": 1.02786398, + "epoch": 0.20748534495716217, + "flos": 23551844974200.0, + "grad_norm": 1.5852740745402625, + "language_loss": 0.8250252, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.85021114, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21313477, + "step": 3451, + "time_per_iteration": 2.7401375770568848 + }, + { + "auxiliary_loss_clip": 0.01287833, + "auxiliary_loss_mlp": 0.01004484, + "balance_loss_clip": 1.20725286, + "balance_loss_mlp": 1.00002599, + "epoch": 0.20754546820983016, + "flos": 52265217090240.0, + "grad_norm": 0.7884331620563346, + "language_loss": 0.56623805, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58916122, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04467773, + "step": 3452, + "time_per_iteration": 3.170027732849121 + }, + { + "auxiliary_loss_clip": 0.0147164, + "auxiliary_loss_mlp": 0.01053524, + "balance_loss_clip": 1.30561662, + "balance_loss_mlp": 1.03133917, + "epoch": 0.20760559146249813, + "flos": 20417243539680.0, + "grad_norm": 2.2054580964139077, + "language_loss": 0.88158262, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.9068343, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.2220459, + "step": 3453, + "time_per_iteration": 2.7686877250671387 + }, + { + "auxiliary_loss_clip": 0.01474121, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_clip": 1.30751777, + "balance_loss_mlp": 1.02210593, + "epoch": 0.2076657147151661, + "flos": 23191331332680.0, + "grad_norm": 1.6547313814689586, + "language_loss": 0.80066526, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82584214, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21472168, + "step": 3454, + "time_per_iteration": 2.7602710723876953 + }, + { + "auxiliary_loss_clip": 0.01469828, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.30385077, + "balance_loss_mlp": 1.0317862, + "epoch": 0.20772583796783406, + "flos": 18297416239920.0, + "grad_norm": 6.0801556217004515, + "language_loss": 0.77203554, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.79725969, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20800781, + "step": 3455, + "time_per_iteration": 2.767341136932373 + }, + { + "auxiliary_loss_clip": 0.01466294, + "auxiliary_loss_mlp": 0.01055663, + "balance_loss_clip": 1.30288851, + "balance_loss_mlp": 1.03531361, + "epoch": 0.20778596122050202, + "flos": 17607274937640.0, + "grad_norm": 1.7654210836205082, + "language_loss": 0.81247002, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.83768958, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20336914, + "step": 3456, + "time_per_iteration": 2.7041070461273193 + }, + { + "auxiliary_loss_clip": 0.01478339, + "auxiliary_loss_mlp": 0.01053158, + "balance_loss_clip": 1.31238508, + "balance_loss_mlp": 1.03192687, + "epoch": 0.20784608447317, + "flos": 23810888659320.0, + "grad_norm": 2.1879011078504105, + "language_loss": 0.78199458, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80730957, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21240234, + "step": 3457, + "time_per_iteration": 2.867870807647705 + }, + { + "auxiliary_loss_clip": 0.01480006, + "auxiliary_loss_mlp": 0.01057861, + "balance_loss_clip": 1.3102535, + "balance_loss_mlp": 1.03504479, + "epoch": 0.20790620772583795, + "flos": 17643764872080.0, + "grad_norm": 1.7776306635849253, + "language_loss": 0.83812869, + "learning_rate": 3.677068867939333e-06, + "loss": 0.86350733, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.22790527, + "step": 3458, + "time_per_iteration": 4.230481147766113 + }, + { + "auxiliary_loss_clip": 0.01470655, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.30797589, + "balance_loss_mlp": 1.01726317, + "epoch": 0.20796633097850595, + "flos": 27679349618640.0, + "grad_norm": 1.7362768717845858, + "language_loss": 0.75926703, + "learning_rate": 3.676856638489272e-06, + "loss": 0.7843554, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20922852, + "step": 3459, + "time_per_iteration": 2.8565444946289062 + }, + { + "auxiliary_loss_clip": 0.01465481, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.30351591, + "balance_loss_mlp": 1.02060366, + "epoch": 0.2080264542311739, + "flos": 19250418413160.0, + "grad_norm": 1.8923717262336979, + "language_loss": 0.77239859, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.7974562, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19689941, + "step": 3460, + "time_per_iteration": 2.832324981689453 + }, + { + "auxiliary_loss_clip": 0.01474, + "auxiliary_loss_mlp": 0.01047477, + "balance_loss_clip": 1.30802155, + "balance_loss_mlp": 1.02640057, + "epoch": 0.20808657748384188, + "flos": 27531562504680.0, + "grad_norm": 2.1720902002315667, + "language_loss": 0.75519043, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.78040516, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21081543, + "step": 3461, + "time_per_iteration": 2.957712173461914 + }, + { + "auxiliary_loss_clip": 0.01477144, + "auxiliary_loss_mlp": 0.01045305, + "balance_loss_clip": 1.30703354, + "balance_loss_mlp": 1.02351403, + "epoch": 0.20814670073650984, + "flos": 26912654911800.0, + "grad_norm": 1.725800470412604, + "language_loss": 0.88546491, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.91068935, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.21777344, + "step": 3462, + "time_per_iteration": 2.8594307899475098 + }, + { + "auxiliary_loss_clip": 0.01302063, + "auxiliary_loss_mlp": 0.01011079, + "balance_loss_clip": 1.22166562, + "balance_loss_mlp": 1.00728786, + "epoch": 0.2082068239891778, + "flos": 70191439689960.0, + "grad_norm": 0.7585498675061741, + "language_loss": 0.59053683, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.61366826, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.0378418, + "step": 3463, + "time_per_iteration": 3.3678252696990967 + }, + { + "auxiliary_loss_clip": 0.01480954, + "auxiliary_loss_mlp": 0.01051986, + "balance_loss_clip": 1.31216407, + "balance_loss_mlp": 1.02949119, + "epoch": 0.20826694724184577, + "flos": 24613464175200.0, + "grad_norm": 2.4861888172447397, + "language_loss": 0.6658445, + "learning_rate": 3.675794537601429e-06, + "loss": 0.69117391, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.22509766, + "step": 3464, + "time_per_iteration": 2.7367310523986816 + }, + { + "auxiliary_loss_clip": 0.01484235, + "auxiliary_loss_mlp": 0.01055136, + "balance_loss_clip": 1.31558609, + "balance_loss_mlp": 1.03125858, + "epoch": 0.20832707049451377, + "flos": 12896256209040.0, + "grad_norm": 2.0335509250084938, + "language_loss": 0.84087944, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.86627316, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.23876953, + "step": 3465, + "time_per_iteration": 2.722588062286377 + }, + { + "auxiliary_loss_clip": 0.01485602, + "auxiliary_loss_mlp": 0.01049675, + "balance_loss_clip": 1.31857896, + "balance_loss_mlp": 1.02771688, + "epoch": 0.20838719374718173, + "flos": 22203747817920.0, + "grad_norm": 2.060280259628516, + "language_loss": 0.82217813, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.8475309, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21960449, + "step": 3466, + "time_per_iteration": 2.7470085620880127 + }, + { + "auxiliary_loss_clip": 0.01480004, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.31613946, + "balance_loss_mlp": 1.02808738, + "epoch": 0.2084473169998497, + "flos": 15162855413760.0, + "grad_norm": 1.9172657956412824, + "language_loss": 0.82411563, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84939551, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19909668, + "step": 3467, + "time_per_iteration": 4.154821395874023 + }, + { + "auxiliary_loss_clip": 0.01467706, + "auxiliary_loss_mlp": 0.01049058, + "balance_loss_clip": 1.30740392, + "balance_loss_mlp": 1.02851856, + "epoch": 0.20850744025251766, + "flos": 17461233983160.0, + "grad_norm": 1.953151734531849, + "language_loss": 0.82345396, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84862161, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.20556641, + "step": 3468, + "time_per_iteration": 2.745957136154175 + }, + { + "auxiliary_loss_clip": 0.01488435, + "auxiliary_loss_mlp": 0.0105605, + "balance_loss_clip": 1.31950581, + "balance_loss_mlp": 1.03297138, + "epoch": 0.20856756350518563, + "flos": 25704020155680.0, + "grad_norm": 1.7402825620692595, + "language_loss": 0.90095043, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.9263953, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.23071289, + "step": 3469, + "time_per_iteration": 4.424795389175415 + }, + { + "auxiliary_loss_clip": 0.01493357, + "auxiliary_loss_mlp": 0.01048195, + "balance_loss_clip": 1.32750952, + "balance_loss_mlp": 1.02571177, + "epoch": 0.2086276867578536, + "flos": 37896206394960.0, + "grad_norm": 2.391830058739834, + "language_loss": 0.7694571, + "learning_rate": 3.674517919597092e-06, + "loss": 0.79487264, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.22485352, + "step": 3470, + "time_per_iteration": 4.332190990447998 + }, + { + "auxiliary_loss_clip": 0.01486059, + "auxiliary_loss_mlp": 0.0105782, + "balance_loss_clip": 1.32146728, + "balance_loss_mlp": 1.0356952, + "epoch": 0.20868781001052156, + "flos": 25562608554240.0, + "grad_norm": 2.4862945864268546, + "language_loss": 0.75940645, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78484517, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.22131348, + "step": 3471, + "time_per_iteration": 2.7937850952148438 + }, + { + "auxiliary_loss_clip": 0.01501193, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.32848692, + "balance_loss_mlp": 1.03336442, + "epoch": 0.20874793326318955, + "flos": 27535257865440.0, + "grad_norm": 1.7824956132759684, + "language_loss": 0.76169491, + "learning_rate": 3.67409187219312e-06, + "loss": 0.78727478, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.23425293, + "step": 3472, + "time_per_iteration": 2.858091354370117 + }, + { + "auxiliary_loss_clip": 0.01484575, + "auxiliary_loss_mlp": 0.01044402, + "balance_loss_clip": 1.31995273, + "balance_loss_mlp": 1.02371955, + "epoch": 0.20880805651585752, + "flos": 18553048822800.0, + "grad_norm": 1.837883688040926, + "language_loss": 0.8461464, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.87143618, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20690918, + "step": 3473, + "time_per_iteration": 2.8444199562072754 + }, + { + "auxiliary_loss_clip": 0.01313621, + "auxiliary_loss_mlp": 0.01016586, + "balance_loss_clip": 1.23019838, + "balance_loss_mlp": 1.0119133, + "epoch": 0.20886817976852548, + "flos": 65962221438960.0, + "grad_norm": 0.883372200092685, + "language_loss": 0.6369521, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.66025418, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.04663086, + "step": 3474, + "time_per_iteration": 3.24546480178833 + }, + { + "auxiliary_loss_clip": 0.01496396, + "auxiliary_loss_mlp": 0.01051326, + "balance_loss_clip": 1.32864475, + "balance_loss_mlp": 1.03064358, + "epoch": 0.20892830302119345, + "flos": 36546160037400.0, + "grad_norm": 2.6427833669354106, + "language_loss": 0.70362395, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72910118, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20678711, + "step": 3475, + "time_per_iteration": 2.8595077991485596 + }, + { + "auxiliary_loss_clip": 0.01492358, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_clip": 1.32361507, + "balance_loss_mlp": 1.03514814, + "epoch": 0.2089884262738614, + "flos": 20961303279120.0, + "grad_norm": 1.4939472499316122, + "language_loss": 0.6991477, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72463173, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.2088623, + "step": 3476, + "time_per_iteration": 2.765068769454956 + }, + { + "auxiliary_loss_clip": 0.01483227, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.31946898, + "balance_loss_mlp": 1.0306561, + "epoch": 0.20904854952652938, + "flos": 22789211103360.0, + "grad_norm": 1.7172386217428042, + "language_loss": 0.89510179, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9204433, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20263672, + "step": 3477, + "time_per_iteration": 2.7397639751434326 + }, + { + "auxiliary_loss_clip": 0.014856, + "auxiliary_loss_mlp": 0.01053595, + "balance_loss_clip": 1.32080746, + "balance_loss_mlp": 1.03239954, + "epoch": 0.20910867277919734, + "flos": 27308399628600.0, + "grad_norm": 2.058296697542287, + "language_loss": 0.68427062, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70966262, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.21191406, + "step": 3478, + "time_per_iteration": 2.7852156162261963 + }, + { + "auxiliary_loss_clip": 0.01486412, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.32094872, + "balance_loss_mlp": 1.02987194, + "epoch": 0.20916879603186533, + "flos": 14323302663120.0, + "grad_norm": 2.1312456650710656, + "language_loss": 0.85398048, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87935454, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.21130371, + "step": 3479, + "time_per_iteration": 2.735102653503418 + }, + { + "auxiliary_loss_clip": 0.01490027, + "auxiliary_loss_mlp": 0.0106089, + "balance_loss_clip": 1.32264352, + "balance_loss_mlp": 1.03943253, + "epoch": 0.2092289192845333, + "flos": 22278108371040.0, + "grad_norm": 2.3207305006149825, + "language_loss": 0.75112104, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.77663028, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.21472168, + "step": 3480, + "time_per_iteration": 2.7986607551574707 + }, + { + "auxiliary_loss_clip": 0.01481534, + "auxiliary_loss_mlp": 0.01059485, + "balance_loss_clip": 1.31942022, + "balance_loss_mlp": 1.0402205, + "epoch": 0.20928904253720126, + "flos": 14834933304120.0, + "grad_norm": 1.9677335301585306, + "language_loss": 0.7602039, + "learning_rate": 3.67217151746346e-06, + "loss": 0.78561407, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19274902, + "step": 3481, + "time_per_iteration": 2.744499921798706 + }, + { + "auxiliary_loss_clip": 0.01483617, + "auxiliary_loss_mlp": 0.01056018, + "balance_loss_clip": 1.31965518, + "balance_loss_mlp": 1.03582358, + "epoch": 0.20934916578986923, + "flos": 23264514243360.0, + "grad_norm": 2.1749054971962503, + "language_loss": 0.8526473, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87804365, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.2019043, + "step": 3482, + "time_per_iteration": 2.7199487686157227 + }, + { + "auxiliary_loss_clip": 0.01478946, + "auxiliary_loss_mlp": 0.01047623, + "balance_loss_clip": 1.31525803, + "balance_loss_mlp": 1.0269165, + "epoch": 0.2094092890425372, + "flos": 32020717824720.0, + "grad_norm": 2.166566218940285, + "language_loss": 0.71245635, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.7377221, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20727539, + "step": 3483, + "time_per_iteration": 2.847949266433716 + }, + { + "auxiliary_loss_clip": 0.01492617, + "auxiliary_loss_mlp": 0.01060519, + "balance_loss_clip": 1.32358432, + "balance_loss_mlp": 1.03933573, + "epoch": 0.20946941229520516, + "flos": 20015407568880.0, + "grad_norm": 1.712615653612104, + "language_loss": 0.75272059, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77825201, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21203613, + "step": 3484, + "time_per_iteration": 2.7363693714141846 + }, + { + "auxiliary_loss_clip": 0.01485928, + "auxiliary_loss_mlp": 0.01054107, + "balance_loss_clip": 1.32149196, + "balance_loss_mlp": 1.03325748, + "epoch": 0.20952953554787315, + "flos": 30746575137960.0, + "grad_norm": 1.6052612060236062, + "language_loss": 0.70608616, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7314865, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20849609, + "step": 3485, + "time_per_iteration": 2.839160442352295 + }, + { + "auxiliary_loss_clip": 0.01487638, + "auxiliary_loss_mlp": 0.01051443, + "balance_loss_clip": 1.32023919, + "balance_loss_mlp": 1.03025913, + "epoch": 0.20958965880054112, + "flos": 27054472596840.0, + "grad_norm": 1.8334151854880962, + "language_loss": 0.83371031, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85910118, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21179199, + "step": 3486, + "time_per_iteration": 2.8135788440704346 + }, + { + "auxiliary_loss_clip": 0.01483968, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.31751025, + "balance_loss_mlp": 1.0389204, + "epoch": 0.20964978205320908, + "flos": 34210804233240.0, + "grad_norm": 1.8001034624991783, + "language_loss": 0.87613457, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.90156567, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20214844, + "step": 3487, + "time_per_iteration": 2.862273931503296 + }, + { + "auxiliary_loss_clip": 0.01476678, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.31164837, + "balance_loss_mlp": 1.02602863, + "epoch": 0.20970990530587705, + "flos": 23482682291160.0, + "grad_norm": 1.951367268864874, + "language_loss": 0.72482455, + "learning_rate": 3.670674357028504e-06, + "loss": 0.75006044, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.2088623, + "step": 3488, + "time_per_iteration": 2.8121497631073 + }, + { + "auxiliary_loss_clip": 0.01480135, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.31641483, + "balance_loss_mlp": 1.02944851, + "epoch": 0.209770028558545, + "flos": 18556053841440.0, + "grad_norm": 2.693987403151334, + "language_loss": 0.8114599, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.83675468, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19897461, + "step": 3489, + "time_per_iteration": 2.695675849914551 + }, + { + "auxiliary_loss_clip": 0.01479699, + "auxiliary_loss_mlp": 0.01045868, + "balance_loss_clip": 1.31405711, + "balance_loss_mlp": 1.02628207, + "epoch": 0.20983015181121298, + "flos": 21622101718320.0, + "grad_norm": 1.8168745471542516, + "language_loss": 0.73422545, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75948107, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19592285, + "step": 3490, + "time_per_iteration": 2.8147780895233154 + }, + { + "auxiliary_loss_clip": 0.01470803, + "auxiliary_loss_mlp": 0.01051692, + "balance_loss_clip": 1.31158376, + "balance_loss_mlp": 1.032058, + "epoch": 0.20989027506388094, + "flos": 16618676213880.0, + "grad_norm": 1.7334818229527647, + "language_loss": 0.70266593, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72789079, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19641113, + "step": 3491, + "time_per_iteration": 2.697681427001953 + }, + { + "auxiliary_loss_clip": 0.01481385, + "auxiliary_loss_mlp": 0.01048051, + "balance_loss_clip": 1.31480122, + "balance_loss_mlp": 1.02743936, + "epoch": 0.20995039831654894, + "flos": 23221405146240.0, + "grad_norm": 3.148421283590143, + "language_loss": 0.79909545, + "learning_rate": 3.669817442854444e-06, + "loss": 0.82438982, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20593262, + "step": 3492, + "time_per_iteration": 2.85760760307312 + }, + { + "auxiliary_loss_clip": 0.01472477, + "auxiliary_loss_mlp": 0.01045409, + "balance_loss_clip": 1.30912614, + "balance_loss_mlp": 1.02498817, + "epoch": 0.2100105215692169, + "flos": 18151700152320.0, + "grad_norm": 1.7568083076499585, + "language_loss": 0.8734206, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89859939, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20410156, + "step": 3493, + "time_per_iteration": 2.7253260612487793 + }, + { + "auxiliary_loss_clip": 0.01468865, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.3084619, + "balance_loss_mlp": 1.02783656, + "epoch": 0.21007064482188487, + "flos": 15965958838320.0, + "grad_norm": 1.735625397592563, + "language_loss": 0.69398355, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.71913958, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18884277, + "step": 3494, + "time_per_iteration": 2.7378222942352295 + }, + { + "auxiliary_loss_clip": 0.01491229, + "auxiliary_loss_mlp": 0.01053346, + "balance_loss_clip": 1.32313633, + "balance_loss_mlp": 1.03349745, + "epoch": 0.21013076807455283, + "flos": 32240794465440.0, + "grad_norm": 1.624763183592814, + "language_loss": 0.79365098, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81909674, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19848633, + "step": 3495, + "time_per_iteration": 2.8582072257995605 + }, + { + "auxiliary_loss_clip": 0.01480763, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.31579995, + "balance_loss_mlp": 1.03262186, + "epoch": 0.2101908913272208, + "flos": 23701987373040.0, + "grad_norm": 1.5632268441178494, + "language_loss": 0.77441943, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79975426, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20092773, + "step": 3496, + "time_per_iteration": 2.803297281265259 + }, + { + "auxiliary_loss_clip": 0.01484677, + "auxiliary_loss_mlp": 0.01050266, + "balance_loss_clip": 1.31846511, + "balance_loss_mlp": 1.02989328, + "epoch": 0.21025101457988876, + "flos": 20380631780160.0, + "grad_norm": 1.7745191675824463, + "language_loss": 0.82477582, + "learning_rate": 3.668744875505915e-06, + "loss": 0.85012525, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20385742, + "step": 3497, + "time_per_iteration": 4.157285451889038 + }, + { + "auxiliary_loss_clip": 0.01484021, + "auxiliary_loss_mlp": 0.01052107, + "balance_loss_clip": 1.31693506, + "balance_loss_mlp": 1.03177011, + "epoch": 0.21031113783255675, + "flos": 25781182685640.0, + "grad_norm": 2.15397415263066, + "language_loss": 0.67930913, + "learning_rate": 3.668530172166741e-06, + "loss": 0.70467043, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.203125, + "step": 3498, + "time_per_iteration": 2.80423903465271 + }, + { + "auxiliary_loss_clip": 0.01487132, + "auxiliary_loss_mlp": 0.01046763, + "balance_loss_clip": 1.31886101, + "balance_loss_mlp": 1.02585399, + "epoch": 0.21037126108522472, + "flos": 22023328563720.0, + "grad_norm": 1.8959521281862957, + "language_loss": 0.81000644, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.83534539, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20922852, + "step": 3499, + "time_per_iteration": 2.7778267860412598 + }, + { + "auxiliary_loss_clip": 0.01477948, + "auxiliary_loss_mlp": 0.0104857, + "balance_loss_clip": 1.31514168, + "balance_loss_mlp": 1.02984285, + "epoch": 0.21043138433789269, + "flos": 25339445678160.0, + "grad_norm": 1.589181250686167, + "language_loss": 0.78405195, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80931717, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18737793, + "step": 3500, + "time_per_iteration": 2.806800365447998 + }, + { + "auxiliary_loss_clip": 0.01479416, + "auxiliary_loss_mlp": 0.01045938, + "balance_loss_clip": 1.31538689, + "balance_loss_mlp": 1.02591097, + "epoch": 0.21049150759056065, + "flos": 25562161862280.0, + "grad_norm": 1.534335461226609, + "language_loss": 0.7413255, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76657903, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20031738, + "step": 3501, + "time_per_iteration": 2.7512569427490234 + }, + { + "auxiliary_loss_clip": 0.01477464, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.31421793, + "balance_loss_mlp": 1.02584267, + "epoch": 0.21055163084322862, + "flos": 24500583269640.0, + "grad_norm": 1.486745919796614, + "language_loss": 0.75975007, + "learning_rate": 3.667670726183183e-06, + "loss": 0.78498083, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19763184, + "step": 3502, + "time_per_iteration": 2.826416254043579 + }, + { + "auxiliary_loss_clip": 0.0147309, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_clip": 1.30951786, + "balance_loss_mlp": 1.02473211, + "epoch": 0.21061175409589658, + "flos": 25744570926120.0, + "grad_norm": 1.6965188232598356, + "language_loss": 0.77772444, + "learning_rate": 3.667455706571316e-06, + "loss": 0.80290675, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20410156, + "step": 3503, + "time_per_iteration": 2.841668128967285 + }, + { + "auxiliary_loss_clip": 0.01483855, + "auxiliary_loss_mlp": 0.01059705, + "balance_loss_clip": 1.31436419, + "balance_loss_mlp": 1.03495693, + "epoch": 0.21067187734856455, + "flos": 18993811229640.0, + "grad_norm": 2.4952538568411975, + "language_loss": 0.78734946, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.81278503, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.24755859, + "step": 3504, + "time_per_iteration": 2.7856979370117188 + }, + { + "auxiliary_loss_clip": 0.01481946, + "auxiliary_loss_mlp": 0.01052673, + "balance_loss_clip": 1.31343555, + "balance_loss_mlp": 1.03156114, + "epoch": 0.21073200060123254, + "flos": 24686809519320.0, + "grad_norm": 1.6748588424468573, + "language_loss": 0.76782626, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79317242, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21118164, + "step": 3505, + "time_per_iteration": 4.321808815002441 + }, + { + "auxiliary_loss_clip": 0.01466748, + "auxiliary_loss_mlp": 0.01051438, + "balance_loss_clip": 1.3053987, + "balance_loss_mlp": 1.03169703, + "epoch": 0.2107921238539005, + "flos": 28555351695360.0, + "grad_norm": 1.6984494396287966, + "language_loss": 0.63322926, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65841115, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19750977, + "step": 3506, + "time_per_iteration": 2.8280930519104004 + }, + { + "auxiliary_loss_clip": 0.01474217, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.31147909, + "balance_loss_mlp": 1.03134561, + "epoch": 0.21085224710656847, + "flos": 25891505264520.0, + "grad_norm": 3.473842622407604, + "language_loss": 0.82054079, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84580302, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20654297, + "step": 3507, + "time_per_iteration": 2.7713468074798584 + }, + { + "auxiliary_loss_clip": 0.01472578, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_clip": 1.30819392, + "balance_loss_mlp": 1.02495933, + "epoch": 0.21091237035923643, + "flos": 14980486958280.0, + "grad_norm": 1.6161127391925652, + "language_loss": 0.75641191, + "learning_rate": 3.666379660223824e-06, + "loss": 0.78159583, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20849609, + "step": 3508, + "time_per_iteration": 4.381594181060791 + }, + { + "auxiliary_loss_clip": 0.01481793, + "auxiliary_loss_mlp": 0.01041704, + "balance_loss_clip": 1.31449914, + "balance_loss_mlp": 1.02085447, + "epoch": 0.2109724936119044, + "flos": 16366860816840.0, + "grad_norm": 2.138995095922596, + "language_loss": 0.85217005, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87740505, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20849609, + "step": 3509, + "time_per_iteration": 4.143529176712036 + }, + { + "auxiliary_loss_clip": 0.01481879, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.3144412, + "balance_loss_mlp": 1.02813244, + "epoch": 0.21103261686457236, + "flos": 31508599883400.0, + "grad_norm": 1.6771263475670304, + "language_loss": 0.67970872, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70502555, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.2166748, + "step": 3510, + "time_per_iteration": 2.906538963317871 + }, + { + "auxiliary_loss_clip": 0.01479719, + "auxiliary_loss_mlp": 0.0105276, + "balance_loss_clip": 1.31332672, + "balance_loss_mlp": 1.03236294, + "epoch": 0.21109274011724033, + "flos": 27349762566240.0, + "grad_norm": 1.7718884395982122, + "language_loss": 0.72917855, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.75450325, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20397949, + "step": 3511, + "time_per_iteration": 2.8324899673461914 + }, + { + "auxiliary_loss_clip": 0.01487304, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.31960773, + "balance_loss_mlp": 1.02217472, + "epoch": 0.21115286336990832, + "flos": 17824712034960.0, + "grad_norm": 2.178506305789626, + "language_loss": 0.69260728, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71792668, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.22460938, + "step": 3512, + "time_per_iteration": 2.8221240043640137 + }, + { + "auxiliary_loss_clip": 0.0147812, + "auxiliary_loss_mlp": 0.01055995, + "balance_loss_clip": 1.31300354, + "balance_loss_mlp": 1.03528786, + "epoch": 0.2112129866225763, + "flos": 27203640395040.0, + "grad_norm": 2.1879853639776488, + "language_loss": 0.73835546, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.76369667, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20703125, + "step": 3513, + "time_per_iteration": 2.9790656566619873 + }, + { + "auxiliary_loss_clip": 0.01472166, + "auxiliary_loss_mlp": 0.01045991, + "balance_loss_clip": 1.309852, + "balance_loss_mlp": 1.02660775, + "epoch": 0.21127310987524425, + "flos": 23736406281120.0, + "grad_norm": 1.7377206267972005, + "language_loss": 0.74385232, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76903391, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19384766, + "step": 3514, + "time_per_iteration": 2.7663490772247314 + }, + { + "auxiliary_loss_clip": 0.0148258, + "auxiliary_loss_mlp": 0.01047662, + "balance_loss_clip": 1.3156209, + "balance_loss_mlp": 1.0283978, + "epoch": 0.21133323312791222, + "flos": 18337073626440.0, + "grad_norm": 1.5840201579375572, + "language_loss": 0.76833248, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.79363489, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19250488, + "step": 3515, + "time_per_iteration": 2.748314380645752 + }, + { + "auxiliary_loss_clip": 0.01486219, + "auxiliary_loss_mlp": 0.01051264, + "balance_loss_clip": 1.31938982, + "balance_loss_mlp": 1.03139138, + "epoch": 0.21139335638058018, + "flos": 17935724955960.0, + "grad_norm": 1.903275306574449, + "language_loss": 0.68376541, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70914018, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19873047, + "step": 3516, + "time_per_iteration": 2.7271533012390137 + }, + { + "auxiliary_loss_clip": 0.01481796, + "auxiliary_loss_mlp": 0.01054003, + "balance_loss_clip": 1.31530154, + "balance_loss_mlp": 1.03169858, + "epoch": 0.21145347963324815, + "flos": 24577542757800.0, + "grad_norm": 2.164363088471645, + "language_loss": 0.85308111, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87843913, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.22290039, + "step": 3517, + "time_per_iteration": 2.7674813270568848 + }, + { + "auxiliary_loss_clip": 0.01478664, + "auxiliary_loss_mlp": 0.01048187, + "balance_loss_clip": 1.31338143, + "balance_loss_mlp": 1.0289346, + "epoch": 0.21151360288591614, + "flos": 35852851283040.0, + "grad_norm": 2.7926477372630205, + "language_loss": 0.62986052, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65512902, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19262695, + "step": 3518, + "time_per_iteration": 2.9316375255584717 + }, + { + "auxiliary_loss_clip": 0.0147681, + "auxiliary_loss_mlp": 0.01056068, + "balance_loss_clip": 1.31267166, + "balance_loss_mlp": 1.03679204, + "epoch": 0.2115737261385841, + "flos": 24646583615760.0, + "grad_norm": 1.7233942496669696, + "language_loss": 0.89264739, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91797614, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19262695, + "step": 3519, + "time_per_iteration": 2.761270761489868 + }, + { + "auxiliary_loss_clip": 0.01483052, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.31669104, + "balance_loss_mlp": 1.03780818, + "epoch": 0.21163384939125207, + "flos": 25232168726280.0, + "grad_norm": 1.6813675069988974, + "language_loss": 0.81665808, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.84207767, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21105957, + "step": 3520, + "time_per_iteration": 2.8270885944366455 + }, + { + "auxiliary_loss_clip": 0.01471324, + "auxiliary_loss_mlp": 0.01055939, + "balance_loss_clip": 1.31100583, + "balance_loss_mlp": 1.03555453, + "epoch": 0.21169397264392004, + "flos": 26073102161160.0, + "grad_norm": 1.6534073021796503, + "language_loss": 0.76871741, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.79399002, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20373535, + "step": 3521, + "time_per_iteration": 2.784001588821411 + }, + { + "auxiliary_loss_clip": 0.01478857, + "auxiliary_loss_mlp": 0.01054422, + "balance_loss_clip": 1.31470859, + "balance_loss_mlp": 1.03609955, + "epoch": 0.211754095896588, + "flos": 23112950551920.0, + "grad_norm": 1.767748705563661, + "language_loss": 0.75948274, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78481561, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18347168, + "step": 3522, + "time_per_iteration": 2.87373948097229 + }, + { + "auxiliary_loss_clip": 0.01472765, + "auxiliary_loss_mlp": 0.01063727, + "balance_loss_clip": 1.30957735, + "balance_loss_mlp": 1.04364002, + "epoch": 0.21181421914925597, + "flos": 27927388438200.0, + "grad_norm": 3.215600940388621, + "language_loss": 0.70345867, + "learning_rate": 3.663142046877374e-06, + "loss": 0.7288236, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.2010498, + "step": 3523, + "time_per_iteration": 2.777376174926758 + }, + { + "auxiliary_loss_clip": 0.01481752, + "auxiliary_loss_mlp": 0.0105653, + "balance_loss_clip": 1.31720996, + "balance_loss_mlp": 1.0372777, + "epoch": 0.21187434240192393, + "flos": 17133068223360.0, + "grad_norm": 2.4390334819657755, + "language_loss": 0.77404273, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.79942554, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19250488, + "step": 3524, + "time_per_iteration": 2.8397045135498047 + }, + { + "auxiliary_loss_clip": 0.0148998, + "auxiliary_loss_mlp": 0.010473, + "balance_loss_clip": 1.32052815, + "balance_loss_mlp": 1.02767801, + "epoch": 0.21193446565459192, + "flos": 22352590749240.0, + "grad_norm": 1.8343954223857224, + "language_loss": 0.82028037, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.84565318, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19641113, + "step": 3525, + "time_per_iteration": 2.834395408630371 + }, + { + "auxiliary_loss_clip": 0.0148001, + "auxiliary_loss_mlp": 0.01044996, + "balance_loss_clip": 1.31775856, + "balance_loss_mlp": 1.02633977, + "epoch": 0.2119945889072599, + "flos": 27205142904360.0, + "grad_norm": 1.7128216957880906, + "language_loss": 0.75528342, + "learning_rate": 3.662492820527356e-06, + "loss": 0.78053349, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18664551, + "step": 3526, + "time_per_iteration": 2.8289079666137695 + }, + { + "auxiliary_loss_clip": 0.01486077, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.32060695, + "balance_loss_mlp": 1.02534437, + "epoch": 0.21205471215992786, + "flos": 20996250095880.0, + "grad_norm": 2.1721264195007013, + "language_loss": 0.77449584, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79981029, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20007324, + "step": 3527, + "time_per_iteration": 2.794687271118164 + }, + { + "auxiliary_loss_clip": 0.01482451, + "auxiliary_loss_mlp": 0.01052623, + "balance_loss_clip": 1.31924331, + "balance_loss_mlp": 1.03061712, + "epoch": 0.21211483541259582, + "flos": 20782630184400.0, + "grad_norm": 1.6408873809029247, + "language_loss": 0.78203797, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80738866, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.22009277, + "step": 3528, + "time_per_iteration": 2.7977981567382812 + }, + { + "auxiliary_loss_clip": 0.01479903, + "auxiliary_loss_mlp": 0.01056478, + "balance_loss_clip": 1.31728268, + "balance_loss_mlp": 1.03703499, + "epoch": 0.21217495866526379, + "flos": 18994745221920.0, + "grad_norm": 1.822043240484313, + "language_loss": 0.81290841, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83827221, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19458008, + "step": 3529, + "time_per_iteration": 2.7983617782592773 + }, + { + "auxiliary_loss_clip": 0.01487724, + "auxiliary_loss_mlp": 0.01050866, + "balance_loss_clip": 1.32143021, + "balance_loss_mlp": 1.03094625, + "epoch": 0.21223508191793175, + "flos": 20672145172080.0, + "grad_norm": 2.187083686504766, + "language_loss": 0.76740658, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.7927925, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19921875, + "step": 3530, + "time_per_iteration": 2.8015756607055664 + }, + { + "auxiliary_loss_clip": 0.01478263, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.31701672, + "balance_loss_mlp": 1.02694774, + "epoch": 0.21229520517059972, + "flos": 21621492592920.0, + "grad_norm": 2.0553818828343675, + "language_loss": 0.82922739, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85447407, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19458008, + "step": 3531, + "time_per_iteration": 2.7592873573303223 + }, + { + "auxiliary_loss_clip": 0.01484678, + "auxiliary_loss_mlp": 0.0104506, + "balance_loss_clip": 1.32073009, + "balance_loss_mlp": 1.02332819, + "epoch": 0.2123553284232677, + "flos": 13995096294960.0, + "grad_norm": 2.1927909739157694, + "language_loss": 0.73479605, + "learning_rate": 3.661192665917977e-06, + "loss": 0.76009351, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.21716309, + "step": 3532, + "time_per_iteration": 2.7410006523132324 + }, + { + "auxiliary_loss_clip": 0.01482272, + "auxiliary_loss_mlp": 0.01042505, + "balance_loss_clip": 1.31919813, + "balance_loss_mlp": 1.02268076, + "epoch": 0.21241545167593567, + "flos": 18301436467560.0, + "grad_norm": 1.5977254963299776, + "language_loss": 0.74292707, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76817489, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19836426, + "step": 3533, + "time_per_iteration": 2.793210744857788 + }, + { + "auxiliary_loss_clip": 0.01491479, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.32494521, + "balance_loss_mlp": 1.0255971, + "epoch": 0.21247557492860364, + "flos": 34719632897400.0, + "grad_norm": 1.8435081482015283, + "language_loss": 0.71076536, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73613578, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19958496, + "step": 3534, + "time_per_iteration": 2.9434216022491455 + }, + { + "auxiliary_loss_clip": 0.01479285, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.31521082, + "balance_loss_mlp": 1.02258027, + "epoch": 0.2125356981812716, + "flos": 22058843897520.0, + "grad_norm": 1.8963607046299593, + "language_loss": 0.72255528, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74777782, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20373535, + "step": 3535, + "time_per_iteration": 2.760881185531616 + }, + { + "auxiliary_loss_clip": 0.01480461, + "auxiliary_loss_mlp": 0.01052337, + "balance_loss_clip": 1.31921995, + "balance_loss_mlp": 1.03222632, + "epoch": 0.21259582143393957, + "flos": 28554214661280.0, + "grad_norm": 1.9811174651219157, + "language_loss": 0.70897794, + "learning_rate": 3.660324636216996e-06, + "loss": 0.73430586, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.20117188, + "step": 3536, + "time_per_iteration": 2.9680018424987793 + }, + { + "auxiliary_loss_clip": 0.01483774, + "auxiliary_loss_mlp": 0.01047838, + "balance_loss_clip": 1.317523, + "balance_loss_mlp": 1.02699995, + "epoch": 0.21265594468660753, + "flos": 20125770756120.0, + "grad_norm": 1.8584843396933688, + "language_loss": 0.88012779, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90544391, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20837402, + "step": 3537, + "time_per_iteration": 4.116831302642822 + }, + { + "auxiliary_loss_clip": 0.01474294, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.31306434, + "balance_loss_mlp": 1.0234623, + "epoch": 0.21271606793927553, + "flos": 23081414837400.0, + "grad_norm": 2.0592390700988483, + "language_loss": 0.80795205, + "learning_rate": 3.659890243575524e-06, + "loss": 0.83313394, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.2043457, + "step": 3538, + "time_per_iteration": 2.8115391731262207 + }, + { + "auxiliary_loss_clip": 0.0146899, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.30745029, + "balance_loss_mlp": 1.02240276, + "epoch": 0.2127761911919435, + "flos": 26392821382080.0, + "grad_norm": 1.945137137753839, + "language_loss": 0.87496775, + "learning_rate": 3.659672952835863e-06, + "loss": 0.90007341, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19177246, + "step": 3539, + "time_per_iteration": 2.854964017868042 + }, + { + "auxiliary_loss_clip": 0.01475771, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_clip": 1.31282318, + "balance_loss_mlp": 1.02642369, + "epoch": 0.21283631444461146, + "flos": 20232885274560.0, + "grad_norm": 2.1296780560371067, + "language_loss": 0.58550119, + "learning_rate": 3.659455599161237e-06, + "loss": 0.61072117, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19787598, + "step": 3540, + "time_per_iteration": 2.7991881370544434 + }, + { + "auxiliary_loss_clip": 0.01479584, + "auxiliary_loss_mlp": 0.01042689, + "balance_loss_clip": 1.31633306, + "balance_loss_mlp": 1.02254272, + "epoch": 0.21289643769727942, + "flos": 13521011405760.0, + "grad_norm": 2.3029268139904793, + "language_loss": 0.75734264, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78256536, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20129395, + "step": 3541, + "time_per_iteration": 2.7508046627044678 + }, + { + "auxiliary_loss_clip": 0.014736, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.31423593, + "balance_loss_mlp": 1.02435434, + "epoch": 0.2129565609499474, + "flos": 24832444390200.0, + "grad_norm": 1.8336000915675115, + "language_loss": 0.69398534, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.719163, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19812012, + "step": 3542, + "time_per_iteration": 2.8205013275146484 + }, + { + "auxiliary_loss_clip": 0.01471085, + "auxiliary_loss_mlp": 0.01046959, + "balance_loss_clip": 1.31055903, + "balance_loss_mlp": 1.02682495, + "epoch": 0.21301668420261535, + "flos": 23664482229600.0, + "grad_norm": 2.764343590927393, + "language_loss": 0.76527423, + "learning_rate": 3.658803160610004e-06, + "loss": 0.79045463, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20141602, + "step": 3543, + "time_per_iteration": 2.7621214389801025 + }, + { + "auxiliary_loss_clip": 0.01474867, + "auxiliary_loss_mlp": 0.01045001, + "balance_loss_clip": 1.31486869, + "balance_loss_mlp": 1.02436614, + "epoch": 0.21307680745528332, + "flos": 16367185683720.0, + "grad_norm": 2.5018758022715675, + "language_loss": 0.66933948, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.69453812, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.20629883, + "step": 3544, + "time_per_iteration": 4.2333879470825195 + }, + { + "auxiliary_loss_clip": 0.01472926, + "auxiliary_loss_mlp": 0.01049807, + "balance_loss_clip": 1.31066084, + "balance_loss_mlp": 1.02983904, + "epoch": 0.2131369307079513, + "flos": 19103768333280.0, + "grad_norm": 1.6465812492449092, + "language_loss": 0.70834029, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.7335676, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19970703, + "step": 3545, + "time_per_iteration": 2.731987476348877 + }, + { + "auxiliary_loss_clip": 0.01481693, + "auxiliary_loss_mlp": 0.01051451, + "balance_loss_clip": 1.31788611, + "balance_loss_mlp": 1.03118515, + "epoch": 0.21319705396061928, + "flos": 30378345908040.0, + "grad_norm": 1.8469620817771788, + "language_loss": 0.72199762, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74732912, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20288086, + "step": 3546, + "time_per_iteration": 2.8726420402526855 + }, + { + "auxiliary_loss_clip": 0.01478961, + "auxiliary_loss_mlp": 0.01045696, + "balance_loss_clip": 1.31556606, + "balance_loss_mlp": 1.02548957, + "epoch": 0.21325717721328724, + "flos": 21760427084400.0, + "grad_norm": 1.727494676316959, + "language_loss": 0.80194545, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82719207, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.20214844, + "step": 3547, + "time_per_iteration": 4.3420939445495605 + }, + { + "auxiliary_loss_clip": 0.01479585, + "auxiliary_loss_mlp": 0.0104702, + "balance_loss_clip": 1.31089449, + "balance_loss_mlp": 1.02599144, + "epoch": 0.2133173004659552, + "flos": 28736786158560.0, + "grad_norm": 2.496817513556581, + "language_loss": 0.74874467, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77401078, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21020508, + "step": 3548, + "time_per_iteration": 4.447422027587891 + }, + { + "auxiliary_loss_clip": 0.01481167, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.31349707, + "balance_loss_mlp": 1.02887511, + "epoch": 0.21337742371862317, + "flos": 16841514223080.0, + "grad_norm": 2.6559065217842637, + "language_loss": 0.74005198, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76537257, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.2199707, + "step": 3549, + "time_per_iteration": 2.792508602142334 + }, + { + "auxiliary_loss_clip": 0.01479324, + "auxiliary_loss_mlp": 0.01045768, + "balance_loss_clip": 1.31557238, + "balance_loss_mlp": 1.02553868, + "epoch": 0.21343754697129114, + "flos": 24430202335800.0, + "grad_norm": 1.6347338402707199, + "language_loss": 0.80877572, + "learning_rate": 3.657278602806357e-06, + "loss": 0.83402669, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20227051, + "step": 3550, + "time_per_iteration": 2.839545726776123 + }, + { + "auxiliary_loss_clip": 0.014668, + "auxiliary_loss_mlp": 0.01051386, + "balance_loss_clip": 1.30770588, + "balance_loss_mlp": 1.03250265, + "epoch": 0.21349767022395913, + "flos": 19282075952760.0, + "grad_norm": 1.6683778831517648, + "language_loss": 0.87962019, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90480202, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18896484, + "step": 3551, + "time_per_iteration": 2.7578322887420654 + }, + { + "auxiliary_loss_clip": 0.01465138, + "auxiliary_loss_mlp": 0.01045926, + "balance_loss_clip": 1.30576634, + "balance_loss_mlp": 1.02579117, + "epoch": 0.2135577934766271, + "flos": 17352007830000.0, + "grad_norm": 1.8739862666406701, + "language_loss": 0.83540475, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86051536, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20129395, + "step": 3552, + "time_per_iteration": 2.7973837852478027 + }, + { + "auxiliary_loss_clip": 0.01468579, + "auxiliary_loss_mlp": 0.01045348, + "balance_loss_clip": 1.30827975, + "balance_loss_mlp": 1.02561843, + "epoch": 0.21361791672929506, + "flos": 24062054322600.0, + "grad_norm": 1.5817109200546664, + "language_loss": 0.76867735, + "learning_rate": 3.656624278062713e-06, + "loss": 0.79381663, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19726562, + "step": 3553, + "time_per_iteration": 2.8559765815734863 + }, + { + "auxiliary_loss_clip": 0.01460438, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_clip": 1.30005217, + "balance_loss_mlp": 1.02439642, + "epoch": 0.21367803998196302, + "flos": 22167217275120.0, + "grad_norm": 1.622705105115909, + "language_loss": 0.72829652, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75332999, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18530273, + "step": 3554, + "time_per_iteration": 2.8120532035827637 + }, + { + "auxiliary_loss_clip": 0.01475838, + "auxiliary_loss_mlp": 0.01042129, + "balance_loss_clip": 1.31365848, + "balance_loss_mlp": 1.0223279, + "epoch": 0.213738163234631, + "flos": 20891815729200.0, + "grad_norm": 1.9007109850198163, + "language_loss": 0.68273377, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.70791346, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19812012, + "step": 3555, + "time_per_iteration": 2.745656728744507 + }, + { + "auxiliary_loss_clip": 0.01474624, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.31013584, + "balance_loss_mlp": 1.0219301, + "epoch": 0.21379828648729896, + "flos": 28409067090720.0, + "grad_norm": 1.6704649645970695, + "language_loss": 0.65372068, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67888719, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.2010498, + "step": 3556, + "time_per_iteration": 2.8464505672454834 + }, + { + "auxiliary_loss_clip": 0.0147163, + "auxiliary_loss_mlp": 0.01048557, + "balance_loss_clip": 1.30773306, + "balance_loss_mlp": 1.02751696, + "epoch": 0.21385840973996692, + "flos": 25484633857080.0, + "grad_norm": 1.7384808598419264, + "language_loss": 0.72868085, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.75388271, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.21044922, + "step": 3557, + "time_per_iteration": 2.9413177967071533 + }, + { + "auxiliary_loss_clip": 0.01482694, + "auxiliary_loss_mlp": 0.01042618, + "balance_loss_clip": 1.31706965, + "balance_loss_mlp": 1.0218873, + "epoch": 0.2139185329926349, + "flos": 28080129772080.0, + "grad_norm": 1.8411515318818275, + "language_loss": 0.67708194, + "learning_rate": 3.655532480546528e-06, + "loss": 0.70233506, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20727539, + "step": 3558, + "time_per_iteration": 2.865267038345337 + }, + { + "auxiliary_loss_clip": 0.01474847, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.30702686, + "balance_loss_mlp": 1.01923704, + "epoch": 0.21397865624530288, + "flos": 19613327947920.0, + "grad_norm": 1.7456535380863998, + "language_loss": 0.80408955, + "learning_rate": 3.655313932676286e-06, + "loss": 0.82923591, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20544434, + "step": 3559, + "time_per_iteration": 2.7525808811187744 + }, + { + "auxiliary_loss_clip": 0.0146615, + "auxiliary_loss_mlp": 0.01048405, + "balance_loss_clip": 1.30460966, + "balance_loss_mlp": 1.03011823, + "epoch": 0.21403877949797084, + "flos": 24686931344400.0, + "grad_norm": 2.007370551745485, + "language_loss": 0.68070686, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70585251, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18273926, + "step": 3560, + "time_per_iteration": 2.8183085918426514 + }, + { + "auxiliary_loss_clip": 0.01477959, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.31373107, + "balance_loss_mlp": 1.01897836, + "epoch": 0.2140989027506388, + "flos": 19865427603480.0, + "grad_norm": 1.9917989300957484, + "language_loss": 0.72974336, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75491166, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19897461, + "step": 3561, + "time_per_iteration": 2.74792742729187 + }, + { + "auxiliary_loss_clip": 0.01478115, + "auxiliary_loss_mlp": 0.01049675, + "balance_loss_clip": 1.31464982, + "balance_loss_mlp": 1.02944493, + "epoch": 0.21415902600330677, + "flos": 19140014617560.0, + "grad_norm": 2.3839702567789667, + "language_loss": 0.7840488, + "learning_rate": 3.654657912480698e-06, + "loss": 0.80932665, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20214844, + "step": 3562, + "time_per_iteration": 2.7236645221710205 + }, + { + "auxiliary_loss_clip": 0.01467028, + "auxiliary_loss_mlp": 0.01048311, + "balance_loss_clip": 1.30704594, + "balance_loss_mlp": 1.02923763, + "epoch": 0.21421914925597474, + "flos": 22277661679080.0, + "grad_norm": 8.504546152290152, + "language_loss": 0.84765005, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.87280345, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.1907959, + "step": 3563, + "time_per_iteration": 2.7250044345855713 + }, + { + "auxiliary_loss_clip": 0.01469338, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.30856597, + "balance_loss_mlp": 1.02333188, + "epoch": 0.2142792725086427, + "flos": 33881663872800.0, + "grad_norm": 1.4064550083769702, + "language_loss": 0.7679261, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.79304171, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1887207, + "step": 3564, + "time_per_iteration": 2.907879590988159 + }, + { + "auxiliary_loss_clip": 0.01470591, + "auxiliary_loss_mlp": 0.01042649, + "balance_loss_clip": 1.31110835, + "balance_loss_mlp": 1.02364731, + "epoch": 0.2143393957613107, + "flos": 19864371786120.0, + "grad_norm": 1.8077271435719848, + "language_loss": 0.88580078, + "learning_rate": 3.654001327581981e-06, + "loss": 0.91093314, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19006348, + "step": 3565, + "time_per_iteration": 2.740658760070801 + }, + { + "auxiliary_loss_clip": 0.01310735, + "auxiliary_loss_mlp": 0.01019412, + "balance_loss_clip": 1.23393607, + "balance_loss_mlp": 1.01481044, + "epoch": 0.21439951901397866, + "flos": 68545088154000.0, + "grad_norm": 0.8869964143032238, + "language_loss": 0.52288777, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54618925, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.04589844, + "step": 3566, + "time_per_iteration": 3.2132675647735596 + }, + { + "auxiliary_loss_clip": 0.01462251, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.30703402, + "balance_loss_mlp": 1.01844096, + "epoch": 0.21445964226664663, + "flos": 19687810326120.0, + "grad_norm": 1.754095521693906, + "language_loss": 0.66947019, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69446504, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18774414, + "step": 3567, + "time_per_iteration": 2.719463586807251 + }, + { + "auxiliary_loss_clip": 0.01463187, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.30780768, + "balance_loss_mlp": 1.02136302, + "epoch": 0.2145197655193146, + "flos": 31114114025760.0, + "grad_norm": 1.4556707678317058, + "language_loss": 0.74813414, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.77315211, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17236328, + "step": 3568, + "time_per_iteration": 2.830249071121216 + }, + { + "auxiliary_loss_clip": 0.01467349, + "auxiliary_loss_mlp": 0.01043039, + "balance_loss_clip": 1.3100462, + "balance_loss_mlp": 1.02490723, + "epoch": 0.21457988877198256, + "flos": 20125648931040.0, + "grad_norm": 1.7883804556132497, + "language_loss": 0.77816403, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.8032679, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18139648, + "step": 3569, + "time_per_iteration": 2.816106081008911 + }, + { + "auxiliary_loss_clip": 0.01486295, + "auxiliary_loss_mlp": 0.01045973, + "balance_loss_clip": 1.3216939, + "balance_loss_mlp": 1.02519453, + "epoch": 0.21464001202465052, + "flos": 18592665600960.0, + "grad_norm": 2.3250601447603585, + "language_loss": 0.7025274, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72785008, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20776367, + "step": 3570, + "time_per_iteration": 2.760021924972534 + }, + { + "auxiliary_loss_clip": 0.01475967, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.31229591, + "balance_loss_mlp": 1.03111458, + "epoch": 0.21470013527731852, + "flos": 21840188549400.0, + "grad_norm": 2.1680002030646217, + "language_loss": 0.79219478, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.8174513, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18591309, + "step": 3571, + "time_per_iteration": 2.7642757892608643 + }, + { + "auxiliary_loss_clip": 0.0147733, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.31774378, + "balance_loss_mlp": 1.02058792, + "epoch": 0.21476025852998648, + "flos": 17607721629600.0, + "grad_norm": 2.137199642272322, + "language_loss": 0.83004856, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85523373, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.20593262, + "step": 3572, + "time_per_iteration": 2.714384078979492 + }, + { + "auxiliary_loss_clip": 0.01481594, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.31762874, + "balance_loss_mlp": 1.02209556, + "epoch": 0.21482038178265445, + "flos": 24833703249360.0, + "grad_norm": 2.123430567595024, + "language_loss": 0.65441012, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67963421, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18725586, + "step": 3573, + "time_per_iteration": 2.773097038269043 + }, + { + "auxiliary_loss_clip": 0.01465526, + "auxiliary_loss_mlp": 0.01047792, + "balance_loss_clip": 1.31090236, + "balance_loss_mlp": 1.0300777, + "epoch": 0.2148805050353224, + "flos": 23263458426000.0, + "grad_norm": 2.36702463856918, + "language_loss": 0.76019049, + "learning_rate": 3.652028186908807e-06, + "loss": 0.78532374, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.17712402, + "step": 3574, + "time_per_iteration": 4.130866765975952 + }, + { + "auxiliary_loss_clip": 0.01470769, + "auxiliary_loss_mlp": 0.01041123, + "balance_loss_clip": 1.31130159, + "balance_loss_mlp": 1.022789, + "epoch": 0.21494062828799038, + "flos": 21325634106480.0, + "grad_norm": 1.7186171153747378, + "language_loss": 0.71988636, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74500531, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18322754, + "step": 3575, + "time_per_iteration": 2.780618667602539 + }, + { + "auxiliary_loss_clip": 0.0146932, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.31128395, + "balance_loss_mlp": 1.02753234, + "epoch": 0.21500075154065834, + "flos": 18847973316960.0, + "grad_norm": 1.9867082547909602, + "language_loss": 0.68457699, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70972675, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18103027, + "step": 3576, + "time_per_iteration": 2.742448568344116 + }, + { + "auxiliary_loss_clip": 0.01481566, + "auxiliary_loss_mlp": 0.01040637, + "balance_loss_clip": 1.31856906, + "balance_loss_mlp": 1.02161157, + "epoch": 0.2150608747933263, + "flos": 18446584038120.0, + "grad_norm": 1.9671545842172018, + "language_loss": 0.89108312, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91630512, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19030762, + "step": 3577, + "time_per_iteration": 2.698246479034424 + }, + { + "auxiliary_loss_clip": 0.01328303, + "auxiliary_loss_mlp": 0.01023905, + "balance_loss_clip": 1.25398338, + "balance_loss_mlp": 1.01994729, + "epoch": 0.2151209980459943, + "flos": 66613233263400.0, + "grad_norm": 0.8041017447031263, + "language_loss": 0.56207585, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58559787, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.03955078, + "step": 3578, + "time_per_iteration": 3.210994005203247 + }, + { + "auxiliary_loss_clip": 0.01475556, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.31655085, + "balance_loss_mlp": 1.02191997, + "epoch": 0.21518112129866226, + "flos": 21580373305440.0, + "grad_norm": 2.477948621719285, + "language_loss": 0.88799071, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.91314781, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18237305, + "step": 3579, + "time_per_iteration": 2.729656934738159 + }, + { + "auxiliary_loss_clip": 0.01477728, + "auxiliary_loss_mlp": 0.01044373, + "balance_loss_clip": 1.31544971, + "balance_loss_mlp": 1.02425027, + "epoch": 0.21524124455133023, + "flos": 20052466020360.0, + "grad_norm": 1.626987092705989, + "language_loss": 0.78422415, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80944514, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20129395, + "step": 3580, + "time_per_iteration": 2.848923921585083 + }, + { + "auxiliary_loss_clip": 0.01464813, + "auxiliary_loss_mlp": 0.01050034, + "balance_loss_clip": 1.30705035, + "balance_loss_mlp": 1.03152084, + "epoch": 0.2153013678039982, + "flos": 23957173263960.0, + "grad_norm": 1.8494971273506668, + "language_loss": 0.73720217, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.76235056, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18505859, + "step": 3581, + "time_per_iteration": 2.7553980350494385 + }, + { + "auxiliary_loss_clip": 0.01468023, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.31005514, + "balance_loss_mlp": 1.01767921, + "epoch": 0.21536149105666616, + "flos": 20599733820240.0, + "grad_norm": 2.492556353263576, + "language_loss": 0.71741062, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.74247789, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.21008301, + "step": 3582, + "time_per_iteration": 2.766793966293335 + }, + { + "auxiliary_loss_clip": 0.01463845, + "auxiliary_loss_mlp": 0.01049684, + "balance_loss_clip": 1.30565214, + "balance_loss_mlp": 1.02784503, + "epoch": 0.21542161430933413, + "flos": 12863948935680.0, + "grad_norm": 2.3761612820397775, + "language_loss": 0.84326142, + "learning_rate": 3.650049971985889e-06, + "loss": 0.8683967, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.21850586, + "step": 3583, + "time_per_iteration": 4.187116622924805 + }, + { + "auxiliary_loss_clip": 0.01477019, + "auxiliary_loss_mlp": 0.01045928, + "balance_loss_clip": 1.31535792, + "balance_loss_mlp": 1.02748621, + "epoch": 0.21548173756200212, + "flos": 26109510878880.0, + "grad_norm": 2.376461064442827, + "language_loss": 0.83248079, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85771024, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18457031, + "step": 3584, + "time_per_iteration": 2.764681100845337 + }, + { + "auxiliary_loss_clip": 0.0146067, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_clip": 1.30445361, + "balance_loss_mlp": 1.03034568, + "epoch": 0.21554186081467008, + "flos": 22168882217880.0, + "grad_norm": 1.8647310038401732, + "language_loss": 0.91013026, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.93523097, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19055176, + "step": 3585, + "time_per_iteration": 4.407034635543823 + }, + { + "auxiliary_loss_clip": 0.01469474, + "auxiliary_loss_mlp": 0.01041681, + "balance_loss_clip": 1.31064868, + "balance_loss_mlp": 1.02365696, + "epoch": 0.21560198406733805, + "flos": 22971985642440.0, + "grad_norm": 1.7140382913351033, + "language_loss": 0.74954093, + "learning_rate": 3.649389440450277e-06, + "loss": 0.77465248, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18029785, + "step": 3586, + "time_per_iteration": 2.763225793838501 + }, + { + "auxiliary_loss_clip": 0.01476526, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_clip": 1.31563449, + "balance_loss_mlp": 1.02885509, + "epoch": 0.215662107320006, + "flos": 22789251711720.0, + "grad_norm": 1.7024289941748456, + "language_loss": 0.8311289, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85636771, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18505859, + "step": 3587, + "time_per_iteration": 4.194044589996338 + }, + { + "auxiliary_loss_clip": 0.01464035, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.30595088, + "balance_loss_mlp": 1.02012575, + "epoch": 0.21572223057267398, + "flos": 30890098374120.0, + "grad_norm": 1.9241606968358276, + "language_loss": 0.76496339, + "learning_rate": 3.648948773354224e-06, + "loss": 0.78999215, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18725586, + "step": 3588, + "time_per_iteration": 2.795321226119995 + }, + { + "auxiliary_loss_clip": 0.01470224, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.31012249, + "balance_loss_mlp": 1.02211857, + "epoch": 0.21578235382534194, + "flos": 26916675139440.0, + "grad_norm": 1.684677705987089, + "language_loss": 0.81373417, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83883834, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18066406, + "step": 3589, + "time_per_iteration": 2.870020627975464 + }, + { + "auxiliary_loss_clip": 0.01468688, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.31129289, + "balance_loss_mlp": 1.02194428, + "epoch": 0.2158424770780099, + "flos": 24431095719720.0, + "grad_norm": 1.881541043512435, + "language_loss": 0.732867, + "learning_rate": 3.648507856144961e-06, + "loss": 0.75795233, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.17907715, + "step": 3590, + "time_per_iteration": 2.8496851921081543 + }, + { + "auxiliary_loss_clip": 0.01472031, + "auxiliary_loss_mlp": 0.01054755, + "balance_loss_clip": 1.31021738, + "balance_loss_mlp": 1.03516901, + "epoch": 0.2159026003306779, + "flos": 23955102237600.0, + "grad_norm": 1.564826407082979, + "language_loss": 0.84182668, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86709452, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19604492, + "step": 3591, + "time_per_iteration": 2.774456739425659 + }, + { + "auxiliary_loss_clip": 0.01475983, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.31428552, + "balance_loss_mlp": 1.02693748, + "epoch": 0.21596272358334587, + "flos": 30046606612560.0, + "grad_norm": 1.7953594652215656, + "language_loss": 0.69461715, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71986914, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.22265625, + "step": 3592, + "time_per_iteration": 2.813307523727417 + }, + { + "auxiliary_loss_clip": 0.01473052, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_clip": 1.31171966, + "balance_loss_mlp": 1.03209651, + "epoch": 0.21602284683601383, + "flos": 20381119080480.0, + "grad_norm": 3.019359975923816, + "language_loss": 0.83932072, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86456454, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19213867, + "step": 3593, + "time_per_iteration": 2.8766705989837646 + }, + { + "auxiliary_loss_clip": 0.01475452, + "auxiliary_loss_mlp": 0.0104836, + "balance_loss_clip": 1.31542206, + "balance_loss_mlp": 1.0290122, + "epoch": 0.2160829700886818, + "flos": 20782264709160.0, + "grad_norm": 2.6056223138339756, + "language_loss": 0.76184255, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.78708071, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19360352, + "step": 3594, + "time_per_iteration": 2.8544323444366455 + }, + { + "auxiliary_loss_clip": 0.01462591, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.3053813, + "balance_loss_mlp": 1.02614212, + "epoch": 0.21614309334134976, + "flos": 22314882564000.0, + "grad_norm": 1.5024396056172291, + "language_loss": 0.80688345, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.83196354, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19274902, + "step": 3595, + "time_per_iteration": 2.820042610168457 + }, + { + "auxiliary_loss_clip": 0.01478713, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.31676114, + "balance_loss_mlp": 1.01897788, + "epoch": 0.21620321659401773, + "flos": 19614261940200.0, + "grad_norm": 2.122547464239818, + "language_loss": 0.78822505, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81339037, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18847656, + "step": 3596, + "time_per_iteration": 2.750668525695801 + }, + { + "auxiliary_loss_clip": 0.01464535, + "auxiliary_loss_mlp": 0.0104399, + "balance_loss_clip": 1.30882823, + "balance_loss_mlp": 1.02606094, + "epoch": 0.2162633398466857, + "flos": 18849354001200.0, + "grad_norm": 1.802786123786011, + "language_loss": 0.83588916, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.86097443, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.17932129, + "step": 3597, + "time_per_iteration": 2.7026758193969727 + }, + { + "auxiliary_loss_clip": 0.01471579, + "auxiliary_loss_mlp": 0.01048752, + "balance_loss_clip": 1.31165624, + "balance_loss_mlp": 1.02915454, + "epoch": 0.21632346309935369, + "flos": 18773247288600.0, + "grad_norm": 1.5160537018421136, + "language_loss": 0.8094517, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.83465499, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19604492, + "step": 3598, + "time_per_iteration": 2.7541120052337646 + }, + { + "auxiliary_loss_clip": 0.01473129, + "auxiliary_loss_mlp": 0.01053146, + "balance_loss_clip": 1.31232095, + "balance_loss_mlp": 1.03230834, + "epoch": 0.21638358635202165, + "flos": 26329952994840.0, + "grad_norm": 2.1170208257518888, + "language_loss": 0.82272685, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.84798956, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20825195, + "step": 3599, + "time_per_iteration": 2.772538423538208 + }, + { + "auxiliary_loss_clip": 0.01464802, + "auxiliary_loss_mlp": 0.01046404, + "balance_loss_clip": 1.30703044, + "balance_loss_mlp": 1.02778316, + "epoch": 0.21644370960468962, + "flos": 20745774774720.0, + "grad_norm": 1.7444125833538666, + "language_loss": 0.7661612, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.7912733, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.1862793, + "step": 3600, + "time_per_iteration": 2.7317721843719482 + }, + { + "auxiliary_loss_clip": 0.01469747, + "auxiliary_loss_mlp": 0.01045846, + "balance_loss_clip": 1.31152034, + "balance_loss_mlp": 1.0285008, + "epoch": 0.21650383285735758, + "flos": 23957660564280.0, + "grad_norm": 1.924731564210769, + "language_loss": 0.80500787, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.83016384, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.17346191, + "step": 3601, + "time_per_iteration": 2.799776792526245 + }, + { + "auxiliary_loss_clip": 0.01462725, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.30285907, + "balance_loss_mlp": 1.02780592, + "epoch": 0.21656395611002555, + "flos": 23701378247640.0, + "grad_norm": 4.289824448154705, + "language_loss": 0.83736193, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.86245507, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18786621, + "step": 3602, + "time_per_iteration": 2.8216214179992676 + }, + { + "auxiliary_loss_clip": 0.01470887, + "auxiliary_loss_mlp": 0.01049004, + "balance_loss_clip": 1.31168652, + "balance_loss_mlp": 1.02971554, + "epoch": 0.2166240793626935, + "flos": 20670723879480.0, + "grad_norm": 2.4284094107058336, + "language_loss": 0.74747807, + "learning_rate": 3.645635802397693e-06, + "loss": 0.772677, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19287109, + "step": 3603, + "time_per_iteration": 2.7399208545684814 + }, + { + "auxiliary_loss_clip": 0.01464426, + "auxiliary_loss_mlp": 0.01045907, + "balance_loss_clip": 1.31097996, + "balance_loss_mlp": 1.02727437, + "epoch": 0.2166842026153615, + "flos": 21585489958800.0, + "grad_norm": 1.9418955162699505, + "language_loss": 0.7436775, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76878083, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.18615723, + "step": 3604, + "time_per_iteration": 2.723184585571289 + }, + { + "auxiliary_loss_clip": 0.01460262, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.30586338, + "balance_loss_mlp": 1.01954556, + "epoch": 0.21674432586802947, + "flos": 25635507206400.0, + "grad_norm": 1.5733997125722523, + "language_loss": 0.80080593, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82578331, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.17932129, + "step": 3605, + "time_per_iteration": 2.7922191619873047 + }, + { + "auxiliary_loss_clip": 0.01336048, + "auxiliary_loss_mlp": 0.0100202, + "balance_loss_clip": 1.2604959, + "balance_loss_mlp": 0.99777585, + "epoch": 0.21680444912069743, + "flos": 56431323303840.0, + "grad_norm": 0.7072620137024157, + "language_loss": 0.58353955, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.6069203, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.04248047, + "step": 3606, + "time_per_iteration": 3.3237979412078857 + }, + { + "auxiliary_loss_clip": 0.01470762, + "auxiliary_loss_mlp": 0.0104887, + "balance_loss_clip": 1.30985093, + "balance_loss_mlp": 1.02909303, + "epoch": 0.2168645723733654, + "flos": 23884315220160.0, + "grad_norm": 1.7472971898086238, + "language_loss": 0.73045981, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75565612, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19763184, + "step": 3607, + "time_per_iteration": 2.7898871898651123 + }, + { + "auxiliary_loss_clip": 0.01470657, + "auxiliary_loss_mlp": 0.01050716, + "balance_loss_clip": 1.30838311, + "balance_loss_mlp": 1.03031921, + "epoch": 0.21692469562603336, + "flos": 16950577942800.0, + "grad_norm": 1.8808226955391651, + "language_loss": 0.77716213, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.80237585, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20410156, + "step": 3608, + "time_per_iteration": 2.7640790939331055 + }, + { + "auxiliary_loss_clip": 0.01468406, + "auxiliary_loss_mlp": 0.01046026, + "balance_loss_clip": 1.30837142, + "balance_loss_mlp": 1.02759683, + "epoch": 0.21698481887870133, + "flos": 25124526299160.0, + "grad_norm": 1.7102380139347135, + "language_loss": 0.7461465, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.77129078, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18432617, + "step": 3609, + "time_per_iteration": 2.743565082550049 + }, + { + "auxiliary_loss_clip": 0.01462867, + "auxiliary_loss_mlp": 0.01056877, + "balance_loss_clip": 1.30315685, + "balance_loss_mlp": 1.03783894, + "epoch": 0.2170449421313693, + "flos": 17899194413160.0, + "grad_norm": 2.395839201782983, + "language_loss": 0.88516235, + "learning_rate": 3.6440849425579e-06, + "loss": 0.9103598, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19030762, + "step": 3610, + "time_per_iteration": 2.743915319442749 + }, + { + "auxiliary_loss_clip": 0.01460502, + "auxiliary_loss_mlp": 0.01041203, + "balance_loss_clip": 1.30340672, + "balance_loss_mlp": 1.02236831, + "epoch": 0.2171050653840373, + "flos": 22643657449200.0, + "grad_norm": 1.5986116080685002, + "language_loss": 0.77790093, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.80291802, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18835449, + "step": 3611, + "time_per_iteration": 2.7146401405334473 + }, + { + "auxiliary_loss_clip": 0.01451786, + "auxiliary_loss_mlp": 0.0105252, + "balance_loss_clip": 1.29565954, + "balance_loss_mlp": 1.03491235, + "epoch": 0.21716518863670525, + "flos": 19504710920160.0, + "grad_norm": 2.102570905062997, + "language_loss": 0.64332598, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.66836905, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.17614746, + "step": 3612, + "time_per_iteration": 2.7378592491149902 + }, + { + "auxiliary_loss_clip": 0.0145837, + "auxiliary_loss_mlp": 0.01044779, + "balance_loss_clip": 1.30274606, + "balance_loss_mlp": 1.02612305, + "epoch": 0.21722531188937322, + "flos": 19796792829120.0, + "grad_norm": 1.7361350482936566, + "language_loss": 0.76578313, + "learning_rate": 3.643419353014776e-06, + "loss": 0.79081464, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18652344, + "step": 3613, + "time_per_iteration": 2.7026185989379883 + }, + { + "auxiliary_loss_clip": 0.0145471, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_clip": 1.29988933, + "balance_loss_mlp": 1.02693295, + "epoch": 0.21728543514204118, + "flos": 13338358691760.0, + "grad_norm": 1.8435531272731174, + "language_loss": 0.71116245, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73617059, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.19165039, + "step": 3614, + "time_per_iteration": 4.1497979164123535 + }, + { + "auxiliary_loss_clip": 0.01455194, + "auxiliary_loss_mlp": 0.01051457, + "balance_loss_clip": 1.29959607, + "balance_loss_mlp": 1.03346884, + "epoch": 0.21734555839470915, + "flos": 15236119541160.0, + "grad_norm": 1.644716971346947, + "language_loss": 0.73381066, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75887716, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18005371, + "step": 3615, + "time_per_iteration": 2.7219083309173584 + }, + { + "auxiliary_loss_clip": 0.01466166, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.30308056, + "balance_loss_mlp": 1.02065825, + "epoch": 0.2174056816473771, + "flos": 19978714592640.0, + "grad_norm": 2.4300403058725455, + "language_loss": 0.90050352, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92557371, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20202637, + "step": 3616, + "time_per_iteration": 2.7535359859466553 + }, + { + "auxiliary_loss_clip": 0.01463906, + "auxiliary_loss_mlp": 0.0104146, + "balance_loss_clip": 1.30554223, + "balance_loss_mlp": 1.02291107, + "epoch": 0.21746580490004508, + "flos": 16691209390800.0, + "grad_norm": 2.295494721042008, + "language_loss": 0.8177014, + "learning_rate": 3.642531027869148e-06, + "loss": 0.84275508, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1854248, + "step": 3617, + "time_per_iteration": 2.7172598838806152 + }, + { + "auxiliary_loss_clip": 0.01459904, + "auxiliary_loss_mlp": 0.01046596, + "balance_loss_clip": 1.30128813, + "balance_loss_mlp": 1.0282495, + "epoch": 0.21752592815271307, + "flos": 25777406108160.0, + "grad_norm": 1.6878498797119499, + "language_loss": 0.76229763, + "learning_rate": 3.642308790849329e-06, + "loss": 0.78736258, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18334961, + "step": 3618, + "time_per_iteration": 2.748704195022583 + }, + { + "auxiliary_loss_clip": 0.01462617, + "auxiliary_loss_mlp": 0.01049307, + "balance_loss_clip": 1.306126, + "balance_loss_mlp": 1.02984047, + "epoch": 0.21758605140538104, + "flos": 11258391820320.0, + "grad_norm": 1.9327203624472815, + "language_loss": 0.69321001, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71832919, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19458008, + "step": 3619, + "time_per_iteration": 2.754300355911255 + }, + { + "auxiliary_loss_clip": 0.0146557, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.30534816, + "balance_loss_mlp": 1.03446138, + "epoch": 0.217646174658049, + "flos": 19247047919280.0, + "grad_norm": 1.645233424843343, + "language_loss": 0.78655219, + "learning_rate": 3.641864129988579e-06, + "loss": 0.81175292, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.20043945, + "step": 3620, + "time_per_iteration": 2.7261312007904053 + }, + { + "auxiliary_loss_clip": 0.01449684, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.29661906, + "balance_loss_mlp": 1.02274227, + "epoch": 0.21770629791071697, + "flos": 21950186261400.0, + "grad_norm": 1.5202956511971122, + "language_loss": 0.80115199, + "learning_rate": 3.641641706164509e-06, + "loss": 0.82606256, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.1862793, + "step": 3621, + "time_per_iteration": 2.7287356853485107 + }, + { + "auxiliary_loss_clip": 0.01455544, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.29944813, + "balance_loss_mlp": 1.02265012, + "epoch": 0.21776642116338493, + "flos": 24942239060400.0, + "grad_norm": 1.552109680143343, + "language_loss": 0.88152653, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90649438, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18579102, + "step": 3622, + "time_per_iteration": 4.1797685623168945 + }, + { + "auxiliary_loss_clip": 0.0146597, + "auxiliary_loss_mlp": 0.01046798, + "balance_loss_clip": 1.30674386, + "balance_loss_mlp": 1.02579331, + "epoch": 0.2178265444160529, + "flos": 17825767852320.0, + "grad_norm": 1.692932593151346, + "language_loss": 0.77350265, + "learning_rate": 3.641196671771152e-06, + "loss": 0.7986303, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.20983887, + "step": 3623, + "time_per_iteration": 2.6900360584259033 + }, + { + "auxiliary_loss_clip": 0.01464962, + "auxiliary_loss_mlp": 0.01054597, + "balance_loss_clip": 1.30580735, + "balance_loss_mlp": 1.0343672, + "epoch": 0.2178866676687209, + "flos": 17717475691440.0, + "grad_norm": 1.9978126738607238, + "language_loss": 0.85027677, + "learning_rate": 3.640974061218741e-06, + "loss": 0.87547243, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.20227051, + "step": 3624, + "time_per_iteration": 2.724658966064453 + }, + { + "auxiliary_loss_clip": 0.01467428, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.31127667, + "balance_loss_mlp": 1.03562033, + "epoch": 0.21794679092138886, + "flos": 16950212467560.0, + "grad_norm": 2.380357237847574, + "language_loss": 0.77761698, + "learning_rate": 3.640751388440429e-06, + "loss": 0.80283999, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19250488, + "step": 3625, + "time_per_iteration": 4.332759141921997 + }, + { + "auxiliary_loss_clip": 0.01316923, + "auxiliary_loss_mlp": 0.01015705, + "balance_loss_clip": 1.2438575, + "balance_loss_mlp": 1.01203346, + "epoch": 0.21800691417405682, + "flos": 63733533461280.0, + "grad_norm": 0.8073929777367744, + "language_loss": 0.60770726, + "learning_rate": 3.64052865344466e-06, + "loss": 0.63103354, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.03662109, + "step": 3626, + "time_per_iteration": 4.741283178329468 + }, + { + "auxiliary_loss_clip": 0.01461478, + "auxiliary_loss_mlp": 0.01047367, + "balance_loss_clip": 1.30275202, + "balance_loss_mlp": 1.02731633, + "epoch": 0.21806703742672479, + "flos": 21621411376200.0, + "grad_norm": 1.8463841374965895, + "language_loss": 0.90740836, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.93249679, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.20056152, + "step": 3627, + "time_per_iteration": 2.7325599193573 + }, + { + "auxiliary_loss_clip": 0.01460778, + "auxiliary_loss_mlp": 0.01040906, + "balance_loss_clip": 1.30451763, + "balance_loss_mlp": 1.02104568, + "epoch": 0.21812716067939275, + "flos": 19359685174680.0, + "grad_norm": 1.7604421269167159, + "language_loss": 0.74120426, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.76622105, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19836426, + "step": 3628, + "time_per_iteration": 2.7838425636291504 + }, + { + "auxiliary_loss_clip": 0.01453299, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.2989434, + "balance_loss_mlp": 1.02167737, + "epoch": 0.21818728393206072, + "flos": 23553063225000.0, + "grad_norm": 1.7206259108602662, + "language_loss": 0.7710948, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79603624, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.19165039, + "step": 3629, + "time_per_iteration": 2.7577860355377197 + }, + { + "auxiliary_loss_clip": 0.01459617, + "auxiliary_loss_mlp": 0.01040599, + "balance_loss_clip": 1.30642068, + "balance_loss_mlp": 1.02283645, + "epoch": 0.21824740718472868, + "flos": 30231614611440.0, + "grad_norm": 1.549855051536349, + "language_loss": 0.72128791, + "learning_rate": 3.63963709145597e-06, + "loss": 0.74629009, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.1776123, + "step": 3630, + "time_per_iteration": 2.851501226425171 + }, + { + "auxiliary_loss_clip": 0.01452234, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.30333805, + "balance_loss_mlp": 1.02702475, + "epoch": 0.21830753043739667, + "flos": 26139503475720.0, + "grad_norm": 1.7262808302542447, + "language_loss": 0.77129233, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.79625154, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.16674805, + "step": 3631, + "time_per_iteration": 2.7813785076141357 + }, + { + "auxiliary_loss_clip": 0.01462764, + "auxiliary_loss_mlp": 0.01044468, + "balance_loss_clip": 1.30633473, + "balance_loss_mlp": 1.02578747, + "epoch": 0.21836765369006464, + "flos": 21725074184040.0, + "grad_norm": 1.9204421880895366, + "language_loss": 0.75085294, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77592528, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18664551, + "step": 3632, + "time_per_iteration": 2.8072195053100586 + }, + { + "auxiliary_loss_clip": 0.01450625, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.29851604, + "balance_loss_mlp": 1.02253985, + "epoch": 0.2184277769427326, + "flos": 19942346483280.0, + "grad_norm": 1.8676737496958853, + "language_loss": 0.83783138, + "learning_rate": 3.638967767095249e-06, + "loss": 0.86273563, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.17260742, + "step": 3633, + "time_per_iteration": 2.817150831222534 + }, + { + "auxiliary_loss_clip": 0.0145428, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.30198383, + "balance_loss_mlp": 1.02750063, + "epoch": 0.21848790019540057, + "flos": 20345197663080.0, + "grad_norm": 1.5854580085523475, + "language_loss": 0.8179363, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.84293222, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.17810059, + "step": 3634, + "time_per_iteration": 2.7469303607940674 + }, + { + "auxiliary_loss_clip": 0.01465974, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.30901515, + "balance_loss_mlp": 1.02448332, + "epoch": 0.21854802344806853, + "flos": 15455099756160.0, + "grad_norm": 1.69061340195668, + "language_loss": 0.75463152, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77971697, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.1809082, + "step": 3635, + "time_per_iteration": 2.8588368892669678 + }, + { + "auxiliary_loss_clip": 0.01451014, + "auxiliary_loss_mlp": 0.01054381, + "balance_loss_clip": 1.29967141, + "balance_loss_mlp": 1.03645158, + "epoch": 0.2186081467007365, + "flos": 16324360845120.0, + "grad_norm": 2.2051066324087065, + "language_loss": 0.88225746, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90731138, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.17919922, + "step": 3636, + "time_per_iteration": 2.7573142051696777 + }, + { + "auxiliary_loss_clip": 0.01454008, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.3002876, + "balance_loss_mlp": 1.03203154, + "epoch": 0.2186682699534045, + "flos": 21694269420000.0, + "grad_norm": 1.929435001315579, + "language_loss": 0.75589561, + "learning_rate": 3.638074464556311e-06, + "loss": 0.7809357, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.1796875, + "step": 3637, + "time_per_iteration": 2.828876495361328 + }, + { + "auxiliary_loss_clip": 0.014633, + "auxiliary_loss_mlp": 0.01041897, + "balance_loss_clip": 1.3061235, + "balance_loss_mlp": 1.02278781, + "epoch": 0.21872839320607246, + "flos": 17742514068360.0, + "grad_norm": 2.6308096906914673, + "language_loss": 0.90076703, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92581898, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19104004, + "step": 3638, + "time_per_iteration": 2.709021806716919 + }, + { + "auxiliary_loss_clip": 0.01453827, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.30106235, + "balance_loss_mlp": 1.02895975, + "epoch": 0.21878851645874042, + "flos": 18655777638360.0, + "grad_norm": 2.2872218982374624, + "language_loss": 0.89765787, + "learning_rate": 3.637627440557275e-06, + "loss": 0.92266697, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.18127441, + "step": 3639, + "time_per_iteration": 2.7157623767852783 + }, + { + "auxiliary_loss_clip": 0.01452868, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.30126333, + "balance_loss_mlp": 1.03193235, + "epoch": 0.2188486397114084, + "flos": 25562892812760.0, + "grad_norm": 1.9625057519203672, + "language_loss": 0.7958107, + "learning_rate": 3.637403835405024e-06, + "loss": 0.82083398, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.17541504, + "step": 3640, + "time_per_iteration": 2.7856638431549072 + }, + { + "auxiliary_loss_clip": 0.01459112, + "auxiliary_loss_mlp": 0.0105613, + "balance_loss_clip": 1.30504262, + "balance_loss_mlp": 1.03731871, + "epoch": 0.21890876296407635, + "flos": 17896230002880.0, + "grad_norm": 2.73648267334645, + "language_loss": 0.72731715, + "learning_rate": 3.637180168162255e-06, + "loss": 0.7524696, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.18835449, + "step": 3641, + "time_per_iteration": 2.7398645877838135 + }, + { + "auxiliary_loss_clip": 0.0146238, + "auxiliary_loss_mlp": 0.0104891, + "balance_loss_clip": 1.31033134, + "balance_loss_mlp": 1.03176785, + "epoch": 0.21896888621674432, + "flos": 17753640759000.0, + "grad_norm": 1.8793316782941987, + "language_loss": 0.81353104, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83864397, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.17150879, + "step": 3642, + "time_per_iteration": 2.730696678161621 + }, + { + "auxiliary_loss_clip": 0.01457396, + "auxiliary_loss_mlp": 0.01050059, + "balance_loss_clip": 1.30276442, + "balance_loss_mlp": 1.0319984, + "epoch": 0.21902900946941228, + "flos": 23081414837400.0, + "grad_norm": 1.6592448835781937, + "language_loss": 0.71647316, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.74154776, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.18066406, + "step": 3643, + "time_per_iteration": 2.8405702114105225 + }, + { + "auxiliary_loss_clip": 0.01456109, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_clip": 1.30178249, + "balance_loss_mlp": 1.02829337, + "epoch": 0.21908913272208028, + "flos": 48187098857520.0, + "grad_norm": 1.797842354275932, + "language_loss": 0.68513978, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.71016508, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.18139648, + "step": 3644, + "time_per_iteration": 3.025390863418579 + }, + { + "auxiliary_loss_clip": 0.01463083, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_clip": 1.30399883, + "balance_loss_mlp": 1.03045726, + "epoch": 0.21914925597474824, + "flos": 22241862086760.0, + "grad_norm": 2.1265438201645592, + "language_loss": 0.7811709, + "learning_rate": 3.636284878455669e-06, + "loss": 0.80628884, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18261719, + "step": 3645, + "time_per_iteration": 2.727445363998413 + }, + { + "auxiliary_loss_clip": 0.01450839, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_clip": 1.30029559, + "balance_loss_mlp": 1.0346247, + "epoch": 0.2192093792274162, + "flos": 22130483690520.0, + "grad_norm": 1.596560872338089, + "language_loss": 0.82854587, + "learning_rate": 3.636060900887582e-06, + "loss": 0.85357285, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.17224121, + "step": 3646, + "time_per_iteration": 2.844465732574463 + }, + { + "auxiliary_loss_clip": 0.01454067, + "auxiliary_loss_mlp": 0.01043051, + "balance_loss_clip": 1.30128431, + "balance_loss_mlp": 1.02568257, + "epoch": 0.21926950248008417, + "flos": 15673714495920.0, + "grad_norm": 1.7445924749450852, + "language_loss": 0.83140695, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85637814, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.17382812, + "step": 3647, + "time_per_iteration": 2.7206692695617676 + }, + { + "auxiliary_loss_clip": 0.01451035, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.29835153, + "balance_loss_mlp": 1.03162503, + "epoch": 0.21932962573275214, + "flos": 30268510629480.0, + "grad_norm": 1.720913057360166, + "language_loss": 0.72555822, + "learning_rate": 3.635612759641123e-06, + "loss": 0.75055599, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.17114258, + "step": 3648, + "time_per_iteration": 2.841129779815674 + }, + { + "auxiliary_loss_clip": 0.01457066, + "auxiliary_loss_mlp": 0.01049247, + "balance_loss_clip": 1.29866982, + "balance_loss_mlp": 1.02881503, + "epoch": 0.2193897489854201, + "flos": 10783738414080.0, + "grad_norm": 2.3223611627142366, + "language_loss": 0.75077122, + "learning_rate": 3.635388595979745e-06, + "loss": 0.77583432, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.20410156, + "step": 3649, + "time_per_iteration": 2.6997547149658203 + }, + { + "auxiliary_loss_clip": 0.01445932, + "auxiliary_loss_mlp": 0.01045765, + "balance_loss_clip": 1.29572535, + "balance_loss_mlp": 1.02905202, + "epoch": 0.21944987223808807, + "flos": 19137740549400.0, + "grad_norm": 1.9459854137371395, + "language_loss": 0.86282355, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88774049, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.16699219, + "step": 3650, + "time_per_iteration": 2.7200746536254883 + }, + { + "auxiliary_loss_clip": 0.01452512, + "auxiliary_loss_mlp": 0.01043746, + "balance_loss_clip": 1.29777312, + "balance_loss_mlp": 1.02531672, + "epoch": 0.21950999549075606, + "flos": 22716271842840.0, + "grad_norm": 2.4736521948912245, + "language_loss": 0.84597641, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.87093902, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.18432617, + "step": 3651, + "time_per_iteration": 2.798266649246216 + }, + { + "auxiliary_loss_clip": 0.01453581, + "auxiliary_loss_mlp": 0.01042393, + "balance_loss_clip": 1.2990272, + "balance_loss_mlp": 1.02506018, + "epoch": 0.21957011874342403, + "flos": 10564920632520.0, + "grad_norm": 2.0464803788676185, + "language_loss": 0.74872094, + "learning_rate": 3.634715732945027e-06, + "loss": 0.77368069, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.17346191, + "step": 3652, + "time_per_iteration": 4.086323022842407 + }, + { + "auxiliary_loss_clip": 0.01332633, + "auxiliary_loss_mlp": 0.01003053, + "balance_loss_clip": 1.25495386, + "balance_loss_mlp": 0.99921429, + "epoch": 0.219630241996092, + "flos": 65761335571320.0, + "grad_norm": 0.7797074606511212, + "language_loss": 0.51606244, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53941929, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.03833008, + "step": 3653, + "time_per_iteration": 3.273761034011841 + }, + { + "auxiliary_loss_clip": 0.01462423, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_clip": 1.30746448, + "balance_loss_mlp": 1.03074694, + "epoch": 0.21969036524875996, + "flos": 23701865547960.0, + "grad_norm": 2.6479745857500423, + "language_loss": 0.76078868, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.78590345, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18310547, + "step": 3654, + "time_per_iteration": 2.798802375793457 + }, + { + "auxiliary_loss_clip": 0.01463864, + "auxiliary_loss_mlp": 0.01046091, + "balance_loss_clip": 1.30618441, + "balance_loss_mlp": 1.0272795, + "epoch": 0.21975048850142792, + "flos": 19644985487520.0, + "grad_norm": 1.9804854212542258, + "language_loss": 0.7300573, + "learning_rate": 3.634042312013064e-06, + "loss": 0.75515687, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18823242, + "step": 3655, + "time_per_iteration": 2.7958600521087646 + }, + { + "auxiliary_loss_clip": 0.01457288, + "auxiliary_loss_mlp": 0.01047531, + "balance_loss_clip": 1.3025949, + "balance_loss_mlp": 1.02989948, + "epoch": 0.21981061175409589, + "flos": 22452761238120.0, + "grad_norm": 1.4987819396257296, + "language_loss": 0.80981034, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83485848, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.1763916, + "step": 3656, + "time_per_iteration": 2.7764482498168945 + }, + { + "auxiliary_loss_clip": 0.01462901, + "auxiliary_loss_mlp": 0.0104376, + "balance_loss_clip": 1.30751979, + "balance_loss_mlp": 1.02635586, + "epoch": 0.21987073500676388, + "flos": 18155720379960.0, + "grad_norm": 2.1312164853326014, + "language_loss": 0.85434926, + "learning_rate": 3.63359305489566e-06, + "loss": 0.87941587, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.17407227, + "step": 3657, + "time_per_iteration": 2.7229886054992676 + }, + { + "auxiliary_loss_clip": 0.01459471, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_clip": 1.30112612, + "balance_loss_mlp": 1.02426434, + "epoch": 0.21993085825943184, + "flos": 25631243328600.0, + "grad_norm": 1.8348104968624128, + "language_loss": 0.81035316, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.83537185, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18127441, + "step": 3658, + "time_per_iteration": 2.841444969177246 + }, + { + "auxiliary_loss_clip": 0.01320235, + "auxiliary_loss_mlp": 0.01000008, + "balance_loss_clip": 1.24567533, + "balance_loss_mlp": 0.99650306, + "epoch": 0.2199909815120998, + "flos": 70941339516960.0, + "grad_norm": 0.7843100375820864, + "language_loss": 0.58301997, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60622239, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.03515625, + "step": 3659, + "time_per_iteration": 3.3444902896881104 + }, + { + "auxiliary_loss_clip": 0.01455149, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.2997309, + "balance_loss_mlp": 1.0294106, + "epoch": 0.22005110476476777, + "flos": 21548553332400.0, + "grad_norm": 2.0973629654836214, + "language_loss": 0.74436665, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76938725, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17504883, + "step": 3660, + "time_per_iteration": 4.175879001617432 + }, + { + "auxiliary_loss_clip": 0.01460154, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.30171204, + "balance_loss_mlp": 1.0223031, + "epoch": 0.22011122801743574, + "flos": 22059615456360.0, + "grad_norm": 1.6105241095670069, + "language_loss": 0.81007206, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83508813, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19152832, + "step": 3661, + "time_per_iteration": 2.7498860359191895 + }, + { + "auxiliary_loss_clip": 0.01460622, + "auxiliary_loss_mlp": 0.01042678, + "balance_loss_clip": 1.30484402, + "balance_loss_mlp": 1.02611423, + "epoch": 0.2201713512701037, + "flos": 26693228004840.0, + "grad_norm": 1.883613447674147, + "language_loss": 0.74074942, + "learning_rate": 3.632468828196102e-06, + "loss": 0.76578248, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.16558838, + "step": 3662, + "time_per_iteration": 2.7892441749572754 + }, + { + "auxiliary_loss_clip": 0.01455356, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_clip": 1.3019563, + "balance_loss_mlp": 1.03560257, + "epoch": 0.22023147452277167, + "flos": 22167095450040.0, + "grad_norm": 1.5262790862747153, + "language_loss": 0.78789759, + "learning_rate": 3.632243797111929e-06, + "loss": 0.81297696, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.16992188, + "step": 3663, + "time_per_iteration": 4.262408971786499 + }, + { + "auxiliary_loss_clip": 0.01465216, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_clip": 1.30633593, + "balance_loss_mlp": 1.03790402, + "epoch": 0.22029159777543966, + "flos": 22527730916640.0, + "grad_norm": 1.7076712720933502, + "language_loss": 0.80283511, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82805246, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18603516, + "step": 3664, + "time_per_iteration": 2.8114395141601562 + }, + { + "auxiliary_loss_clip": 0.01467353, + "auxiliary_loss_mlp": 0.01056509, + "balance_loss_clip": 1.30516052, + "balance_loss_mlp": 1.03608799, + "epoch": 0.22035172102810763, + "flos": 13046398607880.0, + "grad_norm": 2.210721770409989, + "language_loss": 0.77399206, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.79923069, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.2043457, + "step": 3665, + "time_per_iteration": 4.2790422439575195 + }, + { + "auxiliary_loss_clip": 0.01458576, + "auxiliary_loss_mlp": 0.01058175, + "balance_loss_clip": 1.30209279, + "balance_loss_mlp": 1.04001927, + "epoch": 0.2204118442807756, + "flos": 12169503147240.0, + "grad_norm": 2.3167450451604807, + "language_loss": 0.97950119, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00466871, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18164062, + "step": 3666, + "time_per_iteration": 2.8141069412231445 + }, + { + "auxiliary_loss_clip": 0.01457315, + "auxiliary_loss_mlp": 0.01053093, + "balance_loss_clip": 1.30186415, + "balance_loss_mlp": 1.03457987, + "epoch": 0.22047196753344356, + "flos": 40115473233120.0, + "grad_norm": 1.6968151973427423, + "language_loss": 0.80798781, + "learning_rate": 3.631343053912122e-06, + "loss": 0.83309197, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18518066, + "step": 3667, + "time_per_iteration": 3.0152225494384766 + }, + { + "auxiliary_loss_clip": 0.01464744, + "auxiliary_loss_mlp": 0.01060064, + "balance_loss_clip": 1.3050127, + "balance_loss_mlp": 1.03929734, + "epoch": 0.22053209078611152, + "flos": 20705630087880.0, + "grad_norm": 1.8239235669329106, + "language_loss": 0.77580023, + "learning_rate": 3.631117713439087e-06, + "loss": 0.8010484, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.2076416, + "step": 3668, + "time_per_iteration": 2.8474080562591553 + }, + { + "auxiliary_loss_clip": 0.01454541, + "auxiliary_loss_mlp": 0.01046537, + "balance_loss_clip": 1.29870176, + "balance_loss_mlp": 1.02767766, + "epoch": 0.2205922140387795, + "flos": 24721553294280.0, + "grad_norm": 1.51993512545566, + "language_loss": 0.71396583, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7389766, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.1887207, + "step": 3669, + "time_per_iteration": 2.835170030593872 + }, + { + "auxiliary_loss_clip": 0.01459691, + "auxiliary_loss_mlp": 0.01042589, + "balance_loss_clip": 1.30277121, + "balance_loss_mlp": 1.02437353, + "epoch": 0.22065233729144745, + "flos": 23482519857720.0, + "grad_norm": 1.9108891745397107, + "language_loss": 0.85472286, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87974566, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18237305, + "step": 3670, + "time_per_iteration": 2.7905213832855225 + }, + { + "auxiliary_loss_clip": 0.01468325, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.30923176, + "balance_loss_mlp": 1.02927661, + "epoch": 0.22071246054411545, + "flos": 35232359964120.0, + "grad_norm": 2.028630001120021, + "language_loss": 0.7696408, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79480755, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19091797, + "step": 3671, + "time_per_iteration": 2.849010705947876 + }, + { + "auxiliary_loss_clip": 0.01456332, + "auxiliary_loss_mlp": 0.01041611, + "balance_loss_clip": 1.29998493, + "balance_loss_mlp": 1.02299047, + "epoch": 0.2207725837967834, + "flos": 18154867604400.0, + "grad_norm": 3.7047113716522526, + "language_loss": 0.81729591, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.84227532, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18615723, + "step": 3672, + "time_per_iteration": 2.786776065826416 + }, + { + "auxiliary_loss_clip": 0.01459375, + "auxiliary_loss_mlp": 0.01054811, + "balance_loss_clip": 1.30354404, + "balance_loss_mlp": 1.0373702, + "epoch": 0.22083270704945138, + "flos": 20483807287680.0, + "grad_norm": 1.8993647086994825, + "language_loss": 0.7426694, + "learning_rate": 3.629990083462682e-06, + "loss": 0.76781124, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.17431641, + "step": 3673, + "time_per_iteration": 2.74326491355896 + }, + { + "auxiliary_loss_clip": 0.01452887, + "auxiliary_loss_mlp": 0.01043151, + "balance_loss_clip": 1.29933131, + "balance_loss_mlp": 1.02485204, + "epoch": 0.22089283030211934, + "flos": 34131773718720.0, + "grad_norm": 1.8433344151254307, + "language_loss": 0.7693212, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.7942816, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.18310547, + "step": 3674, + "time_per_iteration": 2.919182300567627 + }, + { + "auxiliary_loss_clip": 0.01448158, + "auxiliary_loss_mlp": 0.01046596, + "balance_loss_clip": 1.29283261, + "balance_loss_mlp": 1.02654493, + "epoch": 0.2209529535547873, + "flos": 18081765910440.0, + "grad_norm": 1.7647412381503693, + "language_loss": 0.75585151, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.78079909, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.20056152, + "step": 3675, + "time_per_iteration": 2.74368953704834 + }, + { + "auxiliary_loss_clip": 0.01455288, + "auxiliary_loss_mlp": 0.01046307, + "balance_loss_clip": 1.29885256, + "balance_loss_mlp": 1.02797306, + "epoch": 0.22101307680745527, + "flos": 27241104930120.0, + "grad_norm": 1.8093549322304, + "language_loss": 0.80588448, + "learning_rate": 3.629312763695772e-06, + "loss": 0.83090043, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18322754, + "step": 3676, + "time_per_iteration": 2.847910165786743 + }, + { + "auxiliary_loss_clip": 0.01451009, + "auxiliary_loss_mlp": 0.01047503, + "balance_loss_clip": 1.29340768, + "balance_loss_mlp": 1.02872729, + "epoch": 0.22107320006012326, + "flos": 16547401896120.0, + "grad_norm": 2.1278284235816973, + "language_loss": 0.76022452, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.78520966, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18786621, + "step": 3677, + "time_per_iteration": 2.7354543209075928 + }, + { + "auxiliary_loss_clip": 0.01449033, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.29365122, + "balance_loss_mlp": 1.02409554, + "epoch": 0.22113332331279123, + "flos": 22059574848000.0, + "grad_norm": 1.7010699873858683, + "language_loss": 0.83507472, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85998046, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.17443848, + "step": 3678, + "time_per_iteration": 2.7638847827911377 + }, + { + "auxiliary_loss_clip": 0.01447189, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.29366255, + "balance_loss_mlp": 1.02656865, + "epoch": 0.2211934465654592, + "flos": 26618177109600.0, + "grad_norm": 1.692910491056371, + "language_loss": 0.89071429, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91562748, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.17541504, + "step": 3679, + "time_per_iteration": 2.8175337314605713 + }, + { + "auxiliary_loss_clip": 0.01459903, + "auxiliary_loss_mlp": 0.01046463, + "balance_loss_clip": 1.30302501, + "balance_loss_mlp": 1.02823615, + "epoch": 0.22125356981812716, + "flos": 16364424315240.0, + "grad_norm": 2.3020854257275385, + "language_loss": 0.87005353, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.89511716, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18212891, + "step": 3680, + "time_per_iteration": 2.7069571018218994 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.29409802, + "balance_loss_mlp": 1.02285695, + "epoch": 0.22131369307079513, + "flos": 21655911501000.0, + "grad_norm": 7.933563515880159, + "language_loss": 0.81282938, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83770049, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.18615723, + "step": 3681, + "time_per_iteration": 2.7696585655212402 + }, + { + "auxiliary_loss_clip": 0.01439184, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.29074788, + "balance_loss_mlp": 1.02131581, + "epoch": 0.2213738163234631, + "flos": 19614261940200.0, + "grad_norm": 3.1047633019607663, + "language_loss": 0.79782158, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82259929, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.17272949, + "step": 3682, + "time_per_iteration": 2.767300605773926 + }, + { + "auxiliary_loss_clip": 0.01445229, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_clip": 1.28977418, + "balance_loss_mlp": 1.0224781, + "epoch": 0.22143393957613106, + "flos": 23628723245640.0, + "grad_norm": 1.5628274772219242, + "language_loss": 0.77487838, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79975355, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.19799805, + "step": 3683, + "time_per_iteration": 2.7606844902038574 + }, + { + "auxiliary_loss_clip": 0.01450334, + "auxiliary_loss_mlp": 0.01046162, + "balance_loss_clip": 1.29189444, + "balance_loss_mlp": 1.02730298, + "epoch": 0.22149406282879905, + "flos": 26183505956760.0, + "grad_norm": 2.1967345733236305, + "language_loss": 0.73170102, + "learning_rate": 3.627503859796234e-06, + "loss": 0.75666595, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18859863, + "step": 3684, + "time_per_iteration": 2.7912485599517822 + }, + { + "auxiliary_loss_clip": 0.01446066, + "auxiliary_loss_mlp": 0.01053388, + "balance_loss_clip": 1.29150343, + "balance_loss_mlp": 1.03324175, + "epoch": 0.221554186081467, + "flos": 14542932611880.0, + "grad_norm": 2.3859304833059234, + "language_loss": 0.8074441, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.83243859, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.20141602, + "step": 3685, + "time_per_iteration": 2.769615888595581 + }, + { + "auxiliary_loss_clip": 0.01435891, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.28637838, + "balance_loss_mlp": 1.02672064, + "epoch": 0.22161430933413498, + "flos": 22243567637880.0, + "grad_norm": 1.412242186653802, + "language_loss": 0.87402129, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89881027, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.16259766, + "step": 3686, + "time_per_iteration": 2.815150022506714 + }, + { + "auxiliary_loss_clip": 0.01443248, + "auxiliary_loss_mlp": 0.01045287, + "balance_loss_clip": 1.28974319, + "balance_loss_mlp": 1.02642798, + "epoch": 0.22167443258680294, + "flos": 23481707690520.0, + "grad_norm": 2.1331078614424293, + "language_loss": 0.77956569, + "learning_rate": 3.626824502298707e-06, + "loss": 0.80445111, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.18847656, + "step": 3687, + "time_per_iteration": 2.7737717628479004 + }, + { + "auxiliary_loss_clip": 0.01458221, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.29792523, + "balance_loss_mlp": 1.03155661, + "epoch": 0.2217345558394709, + "flos": 23226278149440.0, + "grad_norm": 1.7351840207256573, + "language_loss": 0.85261941, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87771237, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19506836, + "step": 3688, + "time_per_iteration": 2.768242120742798 + }, + { + "auxiliary_loss_clip": 0.01454754, + "auxiliary_loss_mlp": 0.01044669, + "balance_loss_clip": 1.29558468, + "balance_loss_mlp": 1.02552414, + "epoch": 0.22179467909213887, + "flos": 20015894869200.0, + "grad_norm": 1.7831152306650357, + "language_loss": 0.81739688, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.84239113, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19152832, + "step": 3689, + "time_per_iteration": 2.761138677597046 + }, + { + "auxiliary_loss_clip": 0.01448534, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.2951082, + "balance_loss_mlp": 1.03072882, + "epoch": 0.22185480234480687, + "flos": 19687850934480.0, + "grad_norm": 2.3905661798807247, + "language_loss": 0.70241451, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72738391, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.17675781, + "step": 3690, + "time_per_iteration": 2.781386375427246 + }, + { + "auxiliary_loss_clip": 0.01453428, + "auxiliary_loss_mlp": 0.01050734, + "balance_loss_clip": 1.29604959, + "balance_loss_mlp": 1.03081393, + "epoch": 0.22191492559747483, + "flos": 21986391937320.0, + "grad_norm": 2.189375450603204, + "language_loss": 0.72748321, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.75252485, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19909668, + "step": 3691, + "time_per_iteration": 4.2307610511779785 + }, + { + "auxiliary_loss_clip": 0.01447036, + "auxiliary_loss_mlp": 0.01053384, + "balance_loss_clip": 1.29370713, + "balance_loss_mlp": 1.03509736, + "epoch": 0.2219750488501428, + "flos": 23227333966800.0, + "grad_norm": 2.2632169988876703, + "language_loss": 0.71695709, + "learning_rate": 3.625691006130477e-06, + "loss": 0.7419613, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.18273926, + "step": 3692, + "time_per_iteration": 2.782571315765381 + }, + { + "auxiliary_loss_clip": 0.01455708, + "auxiliary_loss_mlp": 0.01063282, + "balance_loss_clip": 1.29772496, + "balance_loss_mlp": 1.04509115, + "epoch": 0.22203517210281076, + "flos": 22458568233600.0, + "grad_norm": 1.5678215438084757, + "language_loss": 0.87556553, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.90075541, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18200684, + "step": 3693, + "time_per_iteration": 2.780731439590454 + }, + { + "auxiliary_loss_clip": 0.01439413, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_clip": 1.28938758, + "balance_loss_mlp": 1.03405452, + "epoch": 0.22209529535547873, + "flos": 17568795193560.0, + "grad_norm": 1.8364684578789863, + "language_loss": 0.85700405, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.88190901, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.17028809, + "step": 3694, + "time_per_iteration": 2.7307188510894775 + }, + { + "auxiliary_loss_clip": 0.01457829, + "auxiliary_loss_mlp": 0.01049261, + "balance_loss_clip": 1.29769492, + "balance_loss_mlp": 1.03016341, + "epoch": 0.2221554186081467, + "flos": 21473827304040.0, + "grad_norm": 1.7759953546893674, + "language_loss": 0.70012665, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.72519749, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19091797, + "step": 3695, + "time_per_iteration": 2.786212921142578 + }, + { + "auxiliary_loss_clip": 0.01438613, + "auxiliary_loss_mlp": 0.01051861, + "balance_loss_clip": 1.28922248, + "balance_loss_mlp": 1.03552914, + "epoch": 0.22221554186081466, + "flos": 27679309010280.0, + "grad_norm": 1.3615574877560832, + "language_loss": 0.71850002, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.74340475, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.16320801, + "step": 3696, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.01448097, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_clip": 1.29525375, + "balance_loss_mlp": 1.02836764, + "epoch": 0.22227566511348265, + "flos": 25964606958480.0, + "grad_norm": 1.7424291109799737, + "language_loss": 0.88308394, + "learning_rate": 3.624555968803217e-06, + "loss": 0.90802753, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.17907715, + "step": 3697, + "time_per_iteration": 2.8018951416015625 + }, + { + "auxiliary_loss_clip": 0.01433723, + "auxiliary_loss_mlp": 0.01047288, + "balance_loss_clip": 1.28497946, + "balance_loss_mlp": 1.03102839, + "epoch": 0.22233578836615062, + "flos": 39211143502320.0, + "grad_norm": 1.5729178052494048, + "language_loss": 0.65963519, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68444526, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.16259766, + "step": 3698, + "time_per_iteration": 2.9483070373535156 + }, + { + "auxiliary_loss_clip": 0.01449073, + "auxiliary_loss_mlp": 0.01044664, + "balance_loss_clip": 1.29273391, + "balance_loss_mlp": 1.02649689, + "epoch": 0.22239591161881858, + "flos": 36290730496320.0, + "grad_norm": 1.707559157694449, + "language_loss": 0.82596546, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.85090286, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18188477, + "step": 3699, + "time_per_iteration": 4.407181739807129 + }, + { + "auxiliary_loss_clip": 0.01442359, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_clip": 1.28979564, + "balance_loss_mlp": 1.02822876, + "epoch": 0.22245603487148655, + "flos": 19724422085640.0, + "grad_norm": 1.5868224663387738, + "language_loss": 0.80138558, + "learning_rate": 3.62387420709809e-06, + "loss": 0.82627851, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.18725586, + "step": 3700, + "time_per_iteration": 2.8238444328308105 + }, + { + "auxiliary_loss_clip": 0.01454105, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_clip": 1.29754996, + "balance_loss_mlp": 1.02698576, + "epoch": 0.2225161581241545, + "flos": 46288647666000.0, + "grad_norm": 1.8712752141516922, + "language_loss": 0.72579753, + "learning_rate": 3.623646830029943e-06, + "loss": 0.75080144, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.19299316, + "step": 3701, + "time_per_iteration": 2.9633874893188477 + }, + { + "auxiliary_loss_clip": 0.01442483, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.28916407, + "balance_loss_mlp": 1.02126241, + "epoch": 0.22257628137682248, + "flos": 23701500072720.0, + "grad_norm": 1.6381294194039815, + "language_loss": 0.80288029, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82769763, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.17993164, + "step": 3702, + "time_per_iteration": 5.62881064414978 + }, + { + "auxiliary_loss_clip": 0.01430636, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.28278041, + "balance_loss_mlp": 1.02511406, + "epoch": 0.22263640462949044, + "flos": 19358669965680.0, + "grad_norm": 1.6740368239539023, + "language_loss": 0.7777366, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80247891, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.18469238, + "step": 3703, + "time_per_iteration": 2.8044683933258057 + }, + { + "auxiliary_loss_clip": 0.0144399, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.28930473, + "balance_loss_mlp": 1.01799572, + "epoch": 0.22269652788215843, + "flos": 20780802808200.0, + "grad_norm": 1.820480120920461, + "language_loss": 0.74894875, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.77376473, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.19604492, + "step": 3704, + "time_per_iteration": 2.744102954864502 + }, + { + "auxiliary_loss_clip": 0.01432443, + "auxiliary_loss_mlp": 0.01046987, + "balance_loss_clip": 1.28566849, + "balance_loss_mlp": 1.03004718, + "epoch": 0.2227566511348264, + "flos": 47967265866960.0, + "grad_norm": 1.674619610688538, + "language_loss": 0.64430642, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66910076, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.16955566, + "step": 3705, + "time_per_iteration": 2.958357334136963 + }, + { + "auxiliary_loss_clip": 0.01303782, + "auxiliary_loss_mlp": 0.01010074, + "balance_loss_clip": 1.22849727, + "balance_loss_mlp": 1.00518692, + "epoch": 0.22281677438749437, + "flos": 66232009358280.0, + "grad_norm": 1.3734144400390893, + "language_loss": 0.65267676, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67581528, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.04882812, + "step": 3706, + "time_per_iteration": 3.1770105361938477 + }, + { + "auxiliary_loss_clip": 0.01442547, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.29004526, + "balance_loss_mlp": 1.02114475, + "epoch": 0.22287689764016233, + "flos": 21876637875480.0, + "grad_norm": 1.7033270807037733, + "language_loss": 0.80663347, + "learning_rate": 3.622281274977141e-06, + "loss": 0.83144742, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.17712402, + "step": 3707, + "time_per_iteration": 2.7983152866363525 + }, + { + "auxiliary_loss_clip": 0.01437471, + "auxiliary_loss_mlp": 0.01039775, + "balance_loss_clip": 1.28734875, + "balance_loss_mlp": 1.02186966, + "epoch": 0.2229370208928303, + "flos": 27678212584560.0, + "grad_norm": 1.8979145060464506, + "language_loss": 0.79341471, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.81818712, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.17907715, + "step": 3708, + "time_per_iteration": 2.799553632736206 + }, + { + "auxiliary_loss_clip": 0.01443741, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.28880596, + "balance_loss_mlp": 1.02057338, + "epoch": 0.22299714414549826, + "flos": 30160868202360.0, + "grad_norm": 1.8804714418003468, + "language_loss": 0.8092798, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.83411276, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18981934, + "step": 3709, + "time_per_iteration": 2.8676319122314453 + }, + { + "auxiliary_loss_clip": 0.01449258, + "auxiliary_loss_mlp": 0.01049145, + "balance_loss_clip": 1.29412305, + "balance_loss_mlp": 1.03001189, + "epoch": 0.22305726739816625, + "flos": 23147613110160.0, + "grad_norm": 1.727414409501543, + "language_loss": 0.69047207, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.71545613, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.19116211, + "step": 3710, + "time_per_iteration": 2.7860920429229736 + }, + { + "auxiliary_loss_clip": 0.0144975, + "auxiliary_loss_mlp": 0.01048989, + "balance_loss_clip": 1.29304898, + "balance_loss_mlp": 1.02946281, + "epoch": 0.22311739065083422, + "flos": 19176788810520.0, + "grad_norm": 2.121973432857275, + "language_loss": 0.90806782, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.93305528, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.1953125, + "step": 3711, + "time_per_iteration": 2.781405448913574 + }, + { + "auxiliary_loss_clip": 0.01443568, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_clip": 1.29085588, + "balance_loss_mlp": 1.02418363, + "epoch": 0.22317751390350218, + "flos": 13620653985960.0, + "grad_norm": 2.543508581709898, + "language_loss": 0.89726663, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.92215252, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.20825195, + "step": 3712, + "time_per_iteration": 2.6583609580993652 + }, + { + "auxiliary_loss_clip": 0.01441785, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_clip": 1.2925359, + "balance_loss_mlp": 1.02891636, + "epoch": 0.22323763715617015, + "flos": 11031858450360.0, + "grad_norm": 2.66482034495128, + "language_loss": 0.75259662, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77749658, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.19311523, + "step": 3713, + "time_per_iteration": 2.7320334911346436 + }, + { + "auxiliary_loss_clip": 0.014443, + "auxiliary_loss_mlp": 0.01040826, + "balance_loss_clip": 1.29277229, + "balance_loss_mlp": 1.02263427, + "epoch": 0.22329776040883811, + "flos": 41358648722400.0, + "grad_norm": 1.916560665525574, + "language_loss": 0.62262768, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64747894, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.1817627, + "step": 3714, + "time_per_iteration": 2.9257664680480957 + }, + { + "auxiliary_loss_clip": 0.01442068, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.29007173, + "balance_loss_mlp": 1.01907325, + "epoch": 0.22335788366150608, + "flos": 25125013599480.0, + "grad_norm": 2.213223675677594, + "language_loss": 0.78912079, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81391001, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.17785645, + "step": 3715, + "time_per_iteration": 2.8243296146392822 + }, + { + "auxiliary_loss_clip": 0.01441386, + "auxiliary_loss_mlp": 0.01048824, + "balance_loss_clip": 1.29006124, + "balance_loss_mlp": 1.02965498, + "epoch": 0.22341800691417404, + "flos": 16987880044440.0, + "grad_norm": 2.3567782840025275, + "language_loss": 0.77696437, + "learning_rate": 3.620228790579645e-06, + "loss": 0.80186647, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.19165039, + "step": 3716, + "time_per_iteration": 2.7719309329986572 + }, + { + "auxiliary_loss_clip": 0.01444054, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.29373813, + "balance_loss_mlp": 1.02617574, + "epoch": 0.22347813016684204, + "flos": 14140771774200.0, + "grad_norm": 2.247987855833141, + "language_loss": 0.78733128, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81221575, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.18225098, + "step": 3717, + "time_per_iteration": 2.7169439792633057 + }, + { + "auxiliary_loss_clip": 0.01452987, + "auxiliary_loss_mlp": 0.01050389, + "balance_loss_clip": 1.29778767, + "balance_loss_mlp": 1.03110075, + "epoch": 0.22353825341951, + "flos": 23587969433400.0, + "grad_norm": 1.8503573361902492, + "language_loss": 0.67993271, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70496649, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.19274902, + "step": 3718, + "time_per_iteration": 2.756166458129883 + }, + { + "auxiliary_loss_clip": 0.01446486, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.29114842, + "balance_loss_mlp": 1.02104247, + "epoch": 0.22359837667217797, + "flos": 29830103507520.0, + "grad_norm": 1.4843259669544135, + "language_loss": 0.80840021, + "learning_rate": 3.619543522896045e-06, + "loss": 0.83326858, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.19311523, + "step": 3719, + "time_per_iteration": 2.8538341522216797 + }, + { + "auxiliary_loss_clip": 0.01454693, + "auxiliary_loss_mlp": 0.01050457, + "balance_loss_clip": 1.2964797, + "balance_loss_mlp": 1.03042948, + "epoch": 0.22365849992484593, + "flos": 17607599804520.0, + "grad_norm": 1.6595088052086306, + "language_loss": 0.86758524, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.89263672, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.20031738, + "step": 3720, + "time_per_iteration": 2.7822840213775635 + }, + { + "auxiliary_loss_clip": 0.01441343, + "auxiliary_loss_mlp": 0.01050502, + "balance_loss_clip": 1.29109073, + "balance_loss_mlp": 1.03190494, + "epoch": 0.2237186231775139, + "flos": 22716109409400.0, + "grad_norm": 1.7405722256059097, + "language_loss": 0.74659681, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77151519, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.18603516, + "step": 3721, + "time_per_iteration": 2.751300573348999 + }, + { + "auxiliary_loss_clip": 0.01452654, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.29573298, + "balance_loss_mlp": 1.02428472, + "epoch": 0.22377874643018186, + "flos": 13375579576680.0, + "grad_norm": 3.8828419019342606, + "language_loss": 0.79250449, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81746817, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.19421387, + "step": 3722, + "time_per_iteration": 2.7661378383636475 + }, + { + "auxiliary_loss_clip": 0.01439333, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.28874063, + "balance_loss_mlp": 1.0246172, + "epoch": 0.22383886968284986, + "flos": 17899519280040.0, + "grad_norm": 2.1116867609866463, + "language_loss": 0.83125359, + "learning_rate": 3.618628972906178e-06, + "loss": 0.8560636, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.17053223, + "step": 3723, + "time_per_iteration": 2.723998785018921 + }, + { + "auxiliary_loss_clip": 0.0145578, + "auxiliary_loss_mlp": 0.01048679, + "balance_loss_clip": 1.29864872, + "balance_loss_mlp": 1.02964139, + "epoch": 0.22389899293551782, + "flos": 23884518261960.0, + "grad_norm": 1.800133015503167, + "language_loss": 0.8500669, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.87511152, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19055176, + "step": 3724, + "time_per_iteration": 2.7639076709747314 + }, + { + "auxiliary_loss_clip": 0.01444035, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.29015207, + "balance_loss_mlp": 1.0225594, + "epoch": 0.2239591161881858, + "flos": 27278285206680.0, + "grad_norm": 3.319502150754995, + "language_loss": 0.79517287, + "learning_rate": 3.618171329605121e-06, + "loss": 0.82001776, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.17907715, + "step": 3725, + "time_per_iteration": 2.8509931564331055 + }, + { + "auxiliary_loss_clip": 0.01441608, + "auxiliary_loss_mlp": 0.01042469, + "balance_loss_clip": 1.28994632, + "balance_loss_mlp": 1.02437317, + "epoch": 0.22401923944085375, + "flos": 22241983911840.0, + "grad_norm": 2.2309086049108604, + "language_loss": 0.7747997, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79964042, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.1809082, + "step": 3726, + "time_per_iteration": 2.7176148891448975 + }, + { + "auxiliary_loss_clip": 0.01461689, + "auxiliary_loss_mlp": 0.01054136, + "balance_loss_clip": 1.30155063, + "balance_loss_mlp": 1.03345287, + "epoch": 0.22407936269352172, + "flos": 12056987716920.0, + "grad_norm": 2.2670375986985447, + "language_loss": 0.73250997, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.75766826, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.20690918, + "step": 3727, + "time_per_iteration": 2.7950756549835205 + }, + { + "auxiliary_loss_clip": 0.01452126, + "auxiliary_loss_mlp": 0.01049701, + "balance_loss_clip": 1.29429805, + "balance_loss_mlp": 1.0287559, + "epoch": 0.22413948594618968, + "flos": 19358182665360.0, + "grad_norm": 2.338559823221403, + "language_loss": 0.87973928, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.9047575, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.20947266, + "step": 3728, + "time_per_iteration": 2.7329959869384766 + }, + { + "auxiliary_loss_clip": 0.01449615, + "auxiliary_loss_mlp": 0.01048734, + "balance_loss_clip": 1.29615569, + "balance_loss_mlp": 1.02898145, + "epoch": 0.22419960919885765, + "flos": 24175300703400.0, + "grad_norm": 1.9993300240961052, + "language_loss": 0.80476141, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82974488, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.19750977, + "step": 3729, + "time_per_iteration": 2.8059592247009277 + }, + { + "auxiliary_loss_clip": 0.01441474, + "auxiliary_loss_mlp": 0.01045794, + "balance_loss_clip": 1.29120874, + "balance_loss_mlp": 1.02822232, + "epoch": 0.22425973245152564, + "flos": 27384465732840.0, + "grad_norm": 1.5571030410633877, + "language_loss": 0.8720268, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.89689946, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.17578125, + "step": 3730, + "time_per_iteration": 4.245139122009277 + }, + { + "auxiliary_loss_clip": 0.01442508, + "auxiliary_loss_mlp": 0.01041245, + "balance_loss_clip": 1.29172516, + "balance_loss_mlp": 1.02351832, + "epoch": 0.2243198557041936, + "flos": 13739666753880.0, + "grad_norm": 1.6649243410342947, + "language_loss": 0.73141116, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75624871, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.17724609, + "step": 3731, + "time_per_iteration": 2.7081422805786133 + }, + { + "auxiliary_loss_clip": 0.0145413, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.29898739, + "balance_loss_mlp": 1.02520108, + "epoch": 0.22437997895686157, + "flos": 19534662908640.0, + "grad_norm": 1.6368386135176727, + "language_loss": 0.75383431, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77880871, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18103027, + "step": 3732, + "time_per_iteration": 2.7805254459381104 + }, + { + "auxiliary_loss_clip": 0.01449383, + "auxiliary_loss_mlp": 0.01055444, + "balance_loss_clip": 1.29640186, + "balance_loss_mlp": 1.03738427, + "epoch": 0.22444010220952954, + "flos": 23701540681080.0, + "grad_norm": 2.2055380452888786, + "language_loss": 0.88476717, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90981549, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18066406, + "step": 3733, + "time_per_iteration": 2.76360821723938 + }, + { + "auxiliary_loss_clip": 0.01445477, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_clip": 1.29246318, + "balance_loss_mlp": 1.02502167, + "epoch": 0.2245002254621975, + "flos": 22388065474680.0, + "grad_norm": 1.6013875738449068, + "language_loss": 0.85061044, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.87550545, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18981934, + "step": 3734, + "time_per_iteration": 2.7781498432159424 + }, + { + "auxiliary_loss_clip": 0.0144537, + "auxiliary_loss_mlp": 0.01048359, + "balance_loss_clip": 1.29186296, + "balance_loss_mlp": 1.02833152, + "epoch": 0.22456034871486547, + "flos": 26947885987080.0, + "grad_norm": 1.5187118779161446, + "language_loss": 0.76829469, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79323196, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.20031738, + "step": 3735, + "time_per_iteration": 2.896089553833008 + }, + { + "auxiliary_loss_clip": 0.01433188, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_clip": 1.28619874, + "balance_loss_mlp": 1.02633905, + "epoch": 0.22462047196753343, + "flos": 28989129464280.0, + "grad_norm": 1.6607223442188739, + "language_loss": 0.84798753, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.87275624, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.17346191, + "step": 3736, + "time_per_iteration": 2.861661434173584 + }, + { + "auxiliary_loss_clip": 0.01451229, + "auxiliary_loss_mlp": 0.01040056, + "balance_loss_clip": 1.29766488, + "balance_loss_mlp": 1.0221988, + "epoch": 0.22468059522020142, + "flos": 20016179127720.0, + "grad_norm": 1.5865754477644356, + "language_loss": 0.86710656, + "learning_rate": 3.615420317888586e-06, + "loss": 0.89201939, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.1784668, + "step": 3737, + "time_per_iteration": 2.812300205230713 + }, + { + "auxiliary_loss_clip": 0.01451713, + "auxiliary_loss_mlp": 0.01048077, + "balance_loss_clip": 1.29607511, + "balance_loss_mlp": 1.02890861, + "epoch": 0.2247407184728694, + "flos": 29320178417640.0, + "grad_norm": 1.7525183399705486, + "language_loss": 0.7931363, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81813419, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.19177246, + "step": 3738, + "time_per_iteration": 4.224346399307251 + }, + { + "auxiliary_loss_clip": 0.01447471, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.29375148, + "balance_loss_mlp": 1.02509332, + "epoch": 0.22480084172553735, + "flos": 22315572906120.0, + "grad_norm": 1.8735534091178478, + "language_loss": 0.76633948, + "learning_rate": 3.614960957933224e-06, + "loss": 0.79123676, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.17163086, + "step": 3739, + "time_per_iteration": 2.8035168647766113 + }, + { + "auxiliary_loss_clip": 0.01442566, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.28840697, + "balance_loss_mlp": 1.02024531, + "epoch": 0.22486096497820532, + "flos": 25596540162000.0, + "grad_norm": 1.7330827610890862, + "language_loss": 0.74598759, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.77080417, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18847656, + "step": 3740, + "time_per_iteration": 2.8348309993743896 + }, + { + "auxiliary_loss_clip": 0.01443211, + "auxiliary_loss_mlp": 0.01041416, + "balance_loss_clip": 1.29108882, + "balance_loss_mlp": 1.02283168, + "epoch": 0.22492108823087328, + "flos": 17644373997480.0, + "grad_norm": 2.2324608050630323, + "language_loss": 0.7571578, + "learning_rate": 3.614501353019939e-06, + "loss": 0.782004, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.18591309, + "step": 3741, + "time_per_iteration": 5.826469898223877 + }, + { + "auxiliary_loss_clip": 0.01447099, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.29662025, + "balance_loss_mlp": 1.02267456, + "epoch": 0.22498121148354125, + "flos": 16039019923920.0, + "grad_norm": 2.016327675268558, + "language_loss": 0.87424189, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89911306, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.17346191, + "step": 3742, + "time_per_iteration": 2.712141513824463 + }, + { + "auxiliary_loss_clip": 0.0143807, + "auxiliary_loss_mlp": 0.0104627, + "balance_loss_clip": 1.2880857, + "balance_loss_mlp": 1.0278523, + "epoch": 0.22504133473620924, + "flos": 24029016098760.0, + "grad_norm": 1.6369186212826101, + "language_loss": 0.81649232, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84133571, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.18408203, + "step": 3743, + "time_per_iteration": 2.7858200073242188 + }, + { + "auxiliary_loss_clip": 0.01454327, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.30122423, + "balance_loss_mlp": 1.02054334, + "epoch": 0.2251014579888772, + "flos": 16768371920760.0, + "grad_norm": 2.2997836542936705, + "language_loss": 0.63464081, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65957272, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18347168, + "step": 3744, + "time_per_iteration": 2.7112817764282227 + }, + { + "auxiliary_loss_clip": 0.01442933, + "auxiliary_loss_mlp": 0.01039745, + "balance_loss_clip": 1.28942251, + "balance_loss_mlp": 1.02138734, + "epoch": 0.22516158124154517, + "flos": 13995055686600.0, + "grad_norm": 2.8934855938372914, + "language_loss": 0.7692951, + "learning_rate": 3.613581408598489e-06, + "loss": 0.79412186, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.18347168, + "step": 3745, + "time_per_iteration": 2.748375177383423 + }, + { + "auxiliary_loss_clip": 0.01443378, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.29281425, + "balance_loss_mlp": 1.02008319, + "epoch": 0.22522170449421314, + "flos": 14393805422040.0, + "grad_norm": 1.7900251333270847, + "language_loss": 0.81176281, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83657277, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.17541504, + "step": 3746, + "time_per_iteration": 2.7315890789031982 + }, + { + "auxiliary_loss_clip": 0.01443291, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_clip": 1.28931355, + "balance_loss_mlp": 1.02618074, + "epoch": 0.2252818277468811, + "flos": 23810848050960.0, + "grad_norm": 2.331502599221298, + "language_loss": 0.86291945, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88779652, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.18249512, + "step": 3747, + "time_per_iteration": 2.8310561180114746 + }, + { + "auxiliary_loss_clip": 0.01451731, + "auxiliary_loss_mlp": 0.01039315, + "balance_loss_clip": 1.29699814, + "balance_loss_mlp": 1.02222037, + "epoch": 0.22534195099954907, + "flos": 24723502495560.0, + "grad_norm": 1.70083167093869, + "language_loss": 0.76563096, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.79054135, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.17089844, + "step": 3748, + "time_per_iteration": 2.8986785411834717 + }, + { + "auxiliary_loss_clip": 0.01452751, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_clip": 1.29944253, + "balance_loss_mlp": 1.03094268, + "epoch": 0.22540207425221703, + "flos": 21037247558280.0, + "grad_norm": 1.7644827109376613, + "language_loss": 0.80486727, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.82987881, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.17480469, + "step": 3749, + "time_per_iteration": 2.781789779663086 + }, + { + "auxiliary_loss_clip": 0.01440566, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.29054987, + "balance_loss_mlp": 1.02560663, + "epoch": 0.22546219750488503, + "flos": 19395038075040.0, + "grad_norm": 1.6592506247117687, + "language_loss": 0.7976197, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82245195, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.17028809, + "step": 3750, + "time_per_iteration": 2.7790396213531494 + }, + { + "auxiliary_loss_clip": 0.01455177, + "auxiliary_loss_mlp": 0.01042588, + "balance_loss_clip": 1.29997659, + "balance_loss_mlp": 1.02380037, + "epoch": 0.225522320757553, + "flos": 25198196510160.0, + "grad_norm": 3.224268233291794, + "language_loss": 0.82373047, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84870815, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18774414, + "step": 3751, + "time_per_iteration": 2.8249425888061523 + }, + { + "auxiliary_loss_clip": 0.01453749, + "auxiliary_loss_mlp": 0.0104472, + "balance_loss_clip": 1.30153489, + "balance_loss_mlp": 1.02621841, + "epoch": 0.22558244401022096, + "flos": 17167284089640.0, + "grad_norm": 2.1073207144816752, + "language_loss": 0.83737075, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86235547, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.18505859, + "step": 3752, + "time_per_iteration": 2.8015689849853516 + }, + { + "auxiliary_loss_clip": 0.01446525, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.29529977, + "balance_loss_mlp": 1.01720893, + "epoch": 0.22564256726288892, + "flos": 15235144940520.0, + "grad_norm": 1.672417364960009, + "language_loss": 0.78933203, + "learning_rate": 3.611738583330375e-06, + "loss": 0.81413484, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.16540527, + "step": 3753, + "time_per_iteration": 2.7452259063720703 + }, + { + "auxiliary_loss_clip": 0.01455616, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.30284739, + "balance_loss_mlp": 1.02261353, + "epoch": 0.2257026905155569, + "flos": 34575338102400.0, + "grad_norm": 1.919823208830622, + "language_loss": 0.7906968, + "learning_rate": 3.611507955052295e-06, + "loss": 0.81566739, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.18811035, + "step": 3754, + "time_per_iteration": 2.9379236698150635 + }, + { + "auxiliary_loss_clip": 0.01452221, + "auxiliary_loss_mlp": 0.01040949, + "balance_loss_clip": 1.3019352, + "balance_loss_mlp": 1.0229249, + "epoch": 0.22576281376822485, + "flos": 19943280475560.0, + "grad_norm": 1.8586638133567164, + "language_loss": 0.70080024, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72573197, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.18017578, + "step": 3755, + "time_per_iteration": 2.7699458599090576 + }, + { + "auxiliary_loss_clip": 0.01455246, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_clip": 1.29919803, + "balance_loss_mlp": 1.02878761, + "epoch": 0.22582293702089282, + "flos": 24606601362360.0, + "grad_norm": 2.1631648843054982, + "language_loss": 0.7748068, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79983282, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18566895, + "step": 3756, + "time_per_iteration": 2.776413679122925 + }, + { + "auxiliary_loss_clip": 0.01461187, + "auxiliary_loss_mlp": 0.01041863, + "balance_loss_clip": 1.30528021, + "balance_loss_mlp": 1.02320695, + "epoch": 0.2258830602735608, + "flos": 23040133116480.0, + "grad_norm": 2.009607846683892, + "language_loss": 0.82785153, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.85288209, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18652344, + "step": 3757, + "time_per_iteration": 2.8088386058807373 + }, + { + "auxiliary_loss_clip": 0.0145781, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.30261731, + "balance_loss_mlp": 1.02422953, + "epoch": 0.22594318352622877, + "flos": 22162831572240.0, + "grad_norm": 1.7891246951090591, + "language_loss": 0.73679465, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.76182103, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.20593262, + "step": 3758, + "time_per_iteration": 2.7108469009399414 + }, + { + "auxiliary_loss_clip": 0.01452744, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.29773569, + "balance_loss_mlp": 1.02527595, + "epoch": 0.22600330677889674, + "flos": 20599124694840.0, + "grad_norm": 2.0108172165373253, + "language_loss": 0.77020729, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79518187, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.19421387, + "step": 3759, + "time_per_iteration": 2.754304885864258 + }, + { + "auxiliary_loss_clip": 0.0145219, + "auxiliary_loss_mlp": 0.01038671, + "balance_loss_clip": 1.2960676, + "balance_loss_mlp": 1.02031291, + "epoch": 0.2260634300315647, + "flos": 35666178341400.0, + "grad_norm": 1.5842883721734604, + "language_loss": 0.78431904, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80922765, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18359375, + "step": 3760, + "time_per_iteration": 2.867302894592285 + }, + { + "auxiliary_loss_clip": 0.0132139, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.24856031, + "balance_loss_mlp": 1.02799118, + "epoch": 0.22612355328423267, + "flos": 72103924392840.0, + "grad_norm": 0.9608584178075174, + "language_loss": 0.60071516, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62424695, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.0378418, + "step": 3761, + "time_per_iteration": 3.2066538333892822 + }, + { + "auxiliary_loss_clip": 0.01456248, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.29877925, + "balance_loss_mlp": 1.02194285, + "epoch": 0.22618367653690064, + "flos": 22788926844840.0, + "grad_norm": 2.0959313764254355, + "language_loss": 0.77418804, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79916286, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19274902, + "step": 3762, + "time_per_iteration": 2.803018569946289 + }, + { + "auxiliary_loss_clip": 0.01462201, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.30690372, + "balance_loss_mlp": 1.02098536, + "epoch": 0.22624379978956863, + "flos": 20453043132000.0, + "grad_norm": 1.9139216411808828, + "language_loss": 0.79590231, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.82092744, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.19299316, + "step": 3763, + "time_per_iteration": 2.73289155960083 + }, + { + "auxiliary_loss_clip": 0.01464104, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.30658293, + "balance_loss_mlp": 1.02455664, + "epoch": 0.2263039230422366, + "flos": 17498901560040.0, + "grad_norm": 1.5444928741042196, + "language_loss": 0.91458511, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93967611, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.20410156, + "step": 3764, + "time_per_iteration": 2.758043050765991 + }, + { + "auxiliary_loss_clip": 0.01450079, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.29890525, + "balance_loss_mlp": 1.0328207, + "epoch": 0.22636404629490456, + "flos": 28335640529880.0, + "grad_norm": 1.6682708116130465, + "language_loss": 0.75525928, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.78027523, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.18688965, + "step": 3765, + "time_per_iteration": 2.8711273670196533 + }, + { + "auxiliary_loss_clip": 0.01454821, + "auxiliary_loss_mlp": 0.01049555, + "balance_loss_clip": 1.30254912, + "balance_loss_mlp": 1.03066039, + "epoch": 0.22642416954757252, + "flos": 17493053956200.0, + "grad_norm": 1.8407761085641734, + "language_loss": 0.90064085, + "learning_rate": 3.608735651752494e-06, + "loss": 0.92568469, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.18896484, + "step": 3766, + "time_per_iteration": 2.784853219985962 + }, + { + "auxiliary_loss_clip": 0.01449439, + "auxiliary_loss_mlp": 0.01044862, + "balance_loss_clip": 1.30071092, + "balance_loss_mlp": 1.02588379, + "epoch": 0.2264842928002405, + "flos": 24389286090120.0, + "grad_norm": 1.6039996854099876, + "language_loss": 0.74937743, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.77432042, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.18994141, + "step": 3767, + "time_per_iteration": 2.7602062225341797 + }, + { + "auxiliary_loss_clip": 0.01457594, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.30262947, + "balance_loss_mlp": 1.03152454, + "epoch": 0.22654441605290845, + "flos": 19835556831720.0, + "grad_norm": 1.5152821695483463, + "language_loss": 0.72386742, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.748954, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.19543457, + "step": 3768, + "time_per_iteration": 2.761603832244873 + }, + { + "auxiliary_loss_clip": 0.01457454, + "auxiliary_loss_mlp": 0.01061143, + "balance_loss_clip": 1.30625534, + "balance_loss_mlp": 1.04202175, + "epoch": 0.22660453930557642, + "flos": 27460410012000.0, + "grad_norm": 1.666532434417783, + "language_loss": 0.78373563, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80892158, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.19116211, + "step": 3769, + "time_per_iteration": 4.331127882003784 + }, + { + "auxiliary_loss_clip": 0.01459943, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.30298543, + "balance_loss_mlp": 1.02798676, + "epoch": 0.2266646625582444, + "flos": 23993500764960.0, + "grad_norm": 1.8218238759896055, + "language_loss": 0.68796879, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.71304381, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19592285, + "step": 3770, + "time_per_iteration": 2.7791106700897217 + }, + { + "auxiliary_loss_clip": 0.01464769, + "auxiliary_loss_mlp": 0.01047265, + "balance_loss_clip": 1.3086648, + "balance_loss_mlp": 1.02902544, + "epoch": 0.22672478581091238, + "flos": 26033241732840.0, + "grad_norm": 1.6237742678626412, + "language_loss": 0.80565226, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.83077264, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18249512, + "step": 3771, + "time_per_iteration": 2.8131139278411865 + }, + { + "auxiliary_loss_clip": 0.01445883, + "auxiliary_loss_mlp": 0.01053251, + "balance_loss_clip": 1.29612815, + "balance_loss_mlp": 1.03453481, + "epoch": 0.22678490906358034, + "flos": 23847094335240.0, + "grad_norm": 1.6747675246660287, + "language_loss": 0.78847158, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81346291, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.18713379, + "step": 3772, + "time_per_iteration": 2.8117847442626953 + }, + { + "auxiliary_loss_clip": 0.01315272, + "auxiliary_loss_mlp": 0.01013646, + "balance_loss_clip": 1.2437613, + "balance_loss_mlp": 1.01059389, + "epoch": 0.2268450323162483, + "flos": 65065323038040.0, + "grad_norm": 0.6822612555899743, + "language_loss": 0.54342186, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56671107, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.03051758, + "step": 3773, + "time_per_iteration": 3.3584091663360596 + }, + { + "auxiliary_loss_clip": 0.01448954, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.29751825, + "balance_loss_mlp": 1.02131915, + "epoch": 0.22690515556891627, + "flos": 22530898368720.0, + "grad_norm": 1.584709978320924, + "language_loss": 0.70365769, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.7285459, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.18554688, + "step": 3774, + "time_per_iteration": 2.782848358154297 + }, + { + "auxiliary_loss_clip": 0.0145282, + "auxiliary_loss_mlp": 0.01045455, + "balance_loss_clip": 1.30096245, + "balance_loss_mlp": 1.0269897, + "epoch": 0.22696527882158424, + "flos": 18228050515080.0, + "grad_norm": 2.0472620706580034, + "language_loss": 0.7532168, + "learning_rate": 3.606650658627658e-06, + "loss": 0.77819955, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.18469238, + "step": 3775, + "time_per_iteration": 2.722069501876831 + }, + { + "auxiliary_loss_clip": 0.01448029, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_clip": 1.29583645, + "balance_loss_mlp": 1.03060031, + "epoch": 0.22702540207425223, + "flos": 17023679636760.0, + "grad_norm": 1.7819684822854929, + "language_loss": 0.82537258, + "learning_rate": 3.606418687985928e-06, + "loss": 0.85034049, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.18139648, + "step": 3776, + "time_per_iteration": 4.180495738983154 + }, + { + "auxiliary_loss_clip": 0.01460407, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.30513883, + "balance_loss_mlp": 1.02687275, + "epoch": 0.2270855253269202, + "flos": 21330872584920.0, + "grad_norm": 1.8084639983110975, + "language_loss": 0.83105314, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85611463, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18884277, + "step": 3777, + "time_per_iteration": 2.827446937561035 + }, + { + "auxiliary_loss_clip": 0.01458468, + "auxiliary_loss_mlp": 0.01043814, + "balance_loss_clip": 1.30494046, + "balance_loss_mlp": 1.02446651, + "epoch": 0.22714564857958816, + "flos": 23555783985120.0, + "grad_norm": 1.6969421707945276, + "language_loss": 0.73507345, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.76009625, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.19348145, + "step": 3778, + "time_per_iteration": 2.7932136058807373 + }, + { + "auxiliary_loss_clip": 0.01460556, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.30431759, + "balance_loss_mlp": 1.01982665, + "epoch": 0.22720577183225613, + "flos": 25994843205480.0, + "grad_norm": 3.425155724688736, + "language_loss": 0.64239877, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66739595, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.19348145, + "step": 3779, + "time_per_iteration": 4.335829734802246 + }, + { + "auxiliary_loss_clip": 0.01451442, + "auxiliary_loss_mlp": 0.01040483, + "balance_loss_clip": 1.30148828, + "balance_loss_mlp": 1.02270913, + "epoch": 0.2272658950849241, + "flos": 20819038902120.0, + "grad_norm": 1.6509678749458703, + "language_loss": 0.70812964, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.73304886, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1776123, + "step": 3780, + "time_per_iteration": 4.340451955795288 + }, + { + "auxiliary_loss_clip": 0.01462926, + "auxiliary_loss_mlp": 0.01046679, + "balance_loss_clip": 1.31086636, + "balance_loss_mlp": 1.02694941, + "epoch": 0.22732601833759206, + "flos": 23913901733400.0, + "grad_norm": 1.789428220151949, + "language_loss": 0.89677805, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.92187411, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.19726562, + "step": 3781, + "time_per_iteration": 2.7889840602874756 + }, + { + "auxiliary_loss_clip": 0.01453998, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.3001008, + "balance_loss_mlp": 1.02189434, + "epoch": 0.22738614159026002, + "flos": 15928656736680.0, + "grad_norm": 2.045732118652896, + "language_loss": 0.74596393, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.77091587, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.19299316, + "step": 3782, + "time_per_iteration": 2.755202293395996 + }, + { + "auxiliary_loss_clip": 0.01445805, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.29439187, + "balance_loss_mlp": 1.02446651, + "epoch": 0.22744626484292801, + "flos": 24210694212120.0, + "grad_norm": 1.4043255510860377, + "language_loss": 0.83085895, + "learning_rate": 3.604793188351095e-06, + "loss": 0.85574257, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.18066406, + "step": 3783, + "time_per_iteration": 2.8178839683532715 + }, + { + "auxiliary_loss_clip": 0.01447947, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_clip": 1.295681, + "balance_loss_mlp": 1.02553296, + "epoch": 0.22750638809559598, + "flos": 24797172706560.0, + "grad_norm": 1.747362935150019, + "language_loss": 0.76022875, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78515023, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.18676758, + "step": 3784, + "time_per_iteration": 2.766746759414673 + }, + { + "auxiliary_loss_clip": 0.01452606, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.30058444, + "balance_loss_mlp": 1.01828933, + "epoch": 0.22756651134826394, + "flos": 22241577828240.0, + "grad_norm": 1.4896370146558735, + "language_loss": 0.70834994, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73324639, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.1875, + "step": 3785, + "time_per_iteration": 2.793565511703491 + }, + { + "auxiliary_loss_clip": 0.0131034, + "auxiliary_loss_mlp": 0.01003941, + "balance_loss_clip": 1.2391057, + "balance_loss_mlp": 1.00118732, + "epoch": 0.2276266346009319, + "flos": 62723672938080.0, + "grad_norm": 0.8127279945696873, + "language_loss": 0.61866266, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.64180547, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.02758789, + "step": 3786, + "time_per_iteration": 3.202559471130371 + }, + { + "auxiliary_loss_clip": 0.01461207, + "auxiliary_loss_mlp": 0.0104959, + "balance_loss_clip": 1.30728197, + "balance_loss_mlp": 1.0273819, + "epoch": 0.22768675785359987, + "flos": 18617176069200.0, + "grad_norm": 3.3507072891285694, + "language_loss": 0.87625271, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.90136069, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.22180176, + "step": 3787, + "time_per_iteration": 2.765307664871216 + }, + { + "auxiliary_loss_clip": 0.01449026, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.29809308, + "balance_loss_mlp": 1.02318633, + "epoch": 0.22774688110626784, + "flos": 26875880718840.0, + "grad_norm": 1.2535037655619785, + "language_loss": 0.7264939, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.75139248, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.17651367, + "step": 3788, + "time_per_iteration": 2.825446128845215 + }, + { + "auxiliary_loss_clip": 0.01450606, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.30048394, + "balance_loss_mlp": 1.02692246, + "epoch": 0.2278070043589358, + "flos": 15557016404520.0, + "grad_norm": 2.686812837034025, + "language_loss": 0.67946863, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.70442176, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.1776123, + "step": 3789, + "time_per_iteration": 2.758995294570923 + }, + { + "auxiliary_loss_clip": 0.01454428, + "auxiliary_loss_mlp": 0.01047645, + "balance_loss_clip": 1.30327392, + "balance_loss_mlp": 1.02879846, + "epoch": 0.2278671276116038, + "flos": 22421631607200.0, + "grad_norm": 1.7743386087203694, + "language_loss": 0.75773066, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78275144, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.18835449, + "step": 3790, + "time_per_iteration": 2.7861523628234863 + }, + { + "auxiliary_loss_clip": 0.01442538, + "auxiliary_loss_mlp": 0.01044492, + "balance_loss_clip": 1.29185164, + "balance_loss_mlp": 1.02556133, + "epoch": 0.22792725086427176, + "flos": 20636264363040.0, + "grad_norm": 1.9254408634189877, + "language_loss": 0.91072768, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93559796, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.18920898, + "step": 3791, + "time_per_iteration": 2.7614376544952393 + }, + { + "auxiliary_loss_clip": 0.01455405, + "auxiliary_loss_mlp": 0.01038391, + "balance_loss_clip": 1.3010428, + "balance_loss_mlp": 1.01999724, + "epoch": 0.22798737411693973, + "flos": 31434970280760.0, + "grad_norm": 1.8366947460130822, + "language_loss": 0.82534236, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85028034, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18395996, + "step": 3792, + "time_per_iteration": 3.0590078830718994 + }, + { + "auxiliary_loss_clip": 0.0131388, + "auxiliary_loss_mlp": 0.01008347, + "balance_loss_clip": 1.2434094, + "balance_loss_mlp": 1.00505686, + "epoch": 0.2280474973696077, + "flos": 52408699718040.0, + "grad_norm": 1.1543469198852963, + "language_loss": 0.65683234, + "learning_rate": 3.602465874182981e-06, + "loss": 0.68005455, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.03295898, + "step": 3793, + "time_per_iteration": 2.998979330062866 + }, + { + "auxiliary_loss_clip": 0.01468586, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.31060576, + "balance_loss_mlp": 1.02480507, + "epoch": 0.22810762062227566, + "flos": 26401714612920.0, + "grad_norm": 1.9588818703339774, + "language_loss": 0.77376723, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79890823, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.20727539, + "step": 3794, + "time_per_iteration": 2.7969632148742676 + }, + { + "auxiliary_loss_clip": 0.01452535, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.29973674, + "balance_loss_mlp": 1.0205766, + "epoch": 0.22816774387494362, + "flos": 25635872681640.0, + "grad_norm": 2.582943578818178, + "language_loss": 0.81366003, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.838579, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.18774414, + "step": 3795, + "time_per_iteration": 2.760556221008301 + }, + { + "auxiliary_loss_clip": 0.01446, + "auxiliary_loss_mlp": 0.01057713, + "balance_loss_clip": 1.29506922, + "balance_loss_mlp": 1.03872323, + "epoch": 0.22822786712761162, + "flos": 22456212948720.0, + "grad_norm": 1.5464367219801474, + "language_loss": 0.77140969, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.7964468, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.18994141, + "step": 3796, + "time_per_iteration": 2.809187173843384 + }, + { + "auxiliary_loss_clip": 0.01453866, + "auxiliary_loss_mlp": 0.01046526, + "balance_loss_clip": 1.30095136, + "balance_loss_mlp": 1.02789342, + "epoch": 0.22828799038027958, + "flos": 12206277340200.0, + "grad_norm": 2.185999230628004, + "language_loss": 0.95776993, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98277384, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.18640137, + "step": 3797, + "time_per_iteration": 2.7383298873901367 + }, + { + "auxiliary_loss_clip": 0.01455121, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.30196428, + "balance_loss_mlp": 1.02402925, + "epoch": 0.22834811363294755, + "flos": 22090135961880.0, + "grad_norm": 1.497350682545597, + "language_loss": 0.81708413, + "learning_rate": 3.601299937834666e-06, + "loss": 0.84206271, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18688965, + "step": 3798, + "time_per_iteration": 2.7410876750946045 + }, + { + "auxiliary_loss_clip": 0.01457326, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.30283546, + "balance_loss_mlp": 1.02331805, + "epoch": 0.2284082368856155, + "flos": 24865685655840.0, + "grad_norm": 1.9254953377295276, + "language_loss": 0.79103506, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81603324, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.19189453, + "step": 3799, + "time_per_iteration": 2.779233455657959 + }, + { + "auxiliary_loss_clip": 0.01451612, + "auxiliary_loss_mlp": 0.01051539, + "balance_loss_clip": 1.30017221, + "balance_loss_mlp": 1.03252554, + "epoch": 0.22846836013828348, + "flos": 23297877334080.0, + "grad_norm": 1.478146381030263, + "language_loss": 0.7496596, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.7746911, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.19018555, + "step": 3800, + "time_per_iteration": 2.7434940338134766 + }, + { + "auxiliary_loss_clip": 0.01450387, + "auxiliary_loss_mlp": 0.01046277, + "balance_loss_clip": 1.29910064, + "balance_loss_mlp": 1.02896762, + "epoch": 0.22852848339095144, + "flos": 27421889659560.0, + "grad_norm": 1.551343595683299, + "language_loss": 0.64175415, + "learning_rate": 3.600599647297484e-06, + "loss": 0.66672075, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.17333984, + "step": 3801, + "time_per_iteration": 2.8146212100982666 + }, + { + "auxiliary_loss_clip": 0.01446854, + "auxiliary_loss_mlp": 0.0103882, + "balance_loss_clip": 1.298491, + "balance_loss_mlp": 1.02192831, + "epoch": 0.2285886066436194, + "flos": 26326541892600.0, + "grad_norm": 2.194169584872062, + "language_loss": 0.81976223, + "learning_rate": 3.60036609571682e-06, + "loss": 0.84461904, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.16906738, + "step": 3802, + "time_per_iteration": 2.7495057582855225 + }, + { + "auxiliary_loss_clip": 0.01452321, + "auxiliary_loss_mlp": 0.01052283, + "balance_loss_clip": 1.29858625, + "balance_loss_mlp": 1.0338769, + "epoch": 0.2286487298962874, + "flos": 29722582905480.0, + "grad_norm": 1.6202376679602775, + "language_loss": 0.79215074, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81719673, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.18395996, + "step": 3803, + "time_per_iteration": 2.816054344177246 + }, + { + "auxiliary_loss_clip": 0.01451127, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_clip": 1.29601514, + "balance_loss_mlp": 1.02759027, + "epoch": 0.22870885314895537, + "flos": 21292067973960.0, + "grad_norm": 1.6186863538376814, + "language_loss": 0.85948014, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.8844499, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18273926, + "step": 3804, + "time_per_iteration": 2.7181220054626465 + }, + { + "auxiliary_loss_clip": 0.01453798, + "auxiliary_loss_mlp": 0.01041205, + "balance_loss_clip": 1.29888868, + "balance_loss_mlp": 1.02265573, + "epoch": 0.22876897640162333, + "flos": 14943590940240.0, + "grad_norm": 1.9694170597351697, + "language_loss": 0.77204406, + "learning_rate": 3.59966507689401e-06, + "loss": 0.79699409, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1854248, + "step": 3805, + "time_per_iteration": 2.7437212467193604 + }, + { + "auxiliary_loss_clip": 0.01458841, + "auxiliary_loss_mlp": 0.01046003, + "balance_loss_clip": 1.3014164, + "balance_loss_mlp": 1.02719164, + "epoch": 0.2288290996542913, + "flos": 18118499495040.0, + "grad_norm": 2.0732929973199394, + "language_loss": 0.79431927, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81936771, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18811035, + "step": 3806, + "time_per_iteration": 2.742187976837158 + }, + { + "auxiliary_loss_clip": 0.01453109, + "auxiliary_loss_mlp": 0.01051018, + "balance_loss_clip": 1.29989839, + "balance_loss_mlp": 1.03293395, + "epoch": 0.22888922290695926, + "flos": 39862358368560.0, + "grad_norm": 1.8801635476019998, + "language_loss": 0.7014457, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.72648698, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.1809082, + "step": 3807, + "time_per_iteration": 4.285046815872192 + }, + { + "auxiliary_loss_clip": 0.01458076, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.30197716, + "balance_loss_mlp": 1.03057337, + "epoch": 0.22894934615962723, + "flos": 23408971471800.0, + "grad_norm": 2.0058259453247946, + "language_loss": 0.65928066, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.68435621, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18908691, + "step": 3808, + "time_per_iteration": 2.7897119522094727 + }, + { + "auxiliary_loss_clip": 0.01455206, + "auxiliary_loss_mlp": 0.01051418, + "balance_loss_clip": 1.30019283, + "balance_loss_mlp": 1.03309536, + "epoch": 0.22900946941229522, + "flos": 18847607841720.0, + "grad_norm": 1.844320498375341, + "language_loss": 0.75330532, + "learning_rate": 3.598729535939222e-06, + "loss": 0.77837157, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18322754, + "step": 3809, + "time_per_iteration": 2.7450833320617676 + }, + { + "auxiliary_loss_clip": 0.01447127, + "auxiliary_loss_mlp": 0.01045473, + "balance_loss_clip": 1.29609144, + "balance_loss_mlp": 1.02859306, + "epoch": 0.22906959266496318, + "flos": 22934521107360.0, + "grad_norm": 1.5699369326035497, + "language_loss": 0.81938571, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.84431171, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.16870117, + "step": 3810, + "time_per_iteration": 2.7664034366607666 + }, + { + "auxiliary_loss_clip": 0.01449506, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.29778957, + "balance_loss_mlp": 1.02030325, + "epoch": 0.22912971591763115, + "flos": 19359279091080.0, + "grad_norm": 1.814533427709706, + "language_loss": 0.79199988, + "learning_rate": 3.598261401682441e-06, + "loss": 0.81687415, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.17626953, + "step": 3811, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.01445988, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_clip": 1.29170859, + "balance_loss_mlp": 1.02614605, + "epoch": 0.22918983917029911, + "flos": 19937757738600.0, + "grad_norm": 2.248517461340722, + "language_loss": 0.83188486, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.85678673, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18078613, + "step": 3812, + "time_per_iteration": 2.785769462585449 + }, + { + "auxiliary_loss_clip": 0.01462585, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_clip": 1.3041563, + "balance_loss_mlp": 1.02625442, + "epoch": 0.22924996242296708, + "flos": 16695270226800.0, + "grad_norm": 2.562014838119363, + "language_loss": 0.84019661, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.86526752, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18249512, + "step": 3813, + "time_per_iteration": 2.7274131774902344 + }, + { + "auxiliary_loss_clip": 0.01449739, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.29425192, + "balance_loss_mlp": 1.0235604, + "epoch": 0.22931008567563504, + "flos": 33042639030840.0, + "grad_norm": 1.78916837234862, + "language_loss": 0.70667589, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.73159134, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18249512, + "step": 3814, + "time_per_iteration": 2.844202756881714 + }, + { + "auxiliary_loss_clip": 0.01447102, + "auxiliary_loss_mlp": 0.010411, + "balance_loss_clip": 1.29640627, + "balance_loss_mlp": 1.02243161, + "epoch": 0.229370208928303, + "flos": 23335869777840.0, + "grad_norm": 2.051909820262529, + "language_loss": 0.67311591, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69799793, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.18664551, + "step": 3815, + "time_per_iteration": 4.190938234329224 + }, + { + "auxiliary_loss_clip": 0.01455373, + "auxiliary_loss_mlp": 0.01045498, + "balance_loss_clip": 1.30119777, + "balance_loss_mlp": 1.02721119, + "epoch": 0.229430332180971, + "flos": 28622565177120.0, + "grad_norm": 1.5699148155784617, + "language_loss": 0.83508885, + "learning_rate": 3.597090005586848e-06, + "loss": 0.86009765, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.18273926, + "step": 3816, + "time_per_iteration": 2.827446460723877 + }, + { + "auxiliary_loss_clip": 0.01449936, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.29721749, + "balance_loss_mlp": 1.02214384, + "epoch": 0.22949045543363897, + "flos": 17242659851760.0, + "grad_norm": 2.045067776401494, + "language_loss": 0.8800385, + "learning_rate": 3.596855544646742e-06, + "loss": 0.90494674, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.18737793, + "step": 3817, + "time_per_iteration": 4.3390045166015625 + }, + { + "auxiliary_loss_clip": 0.01454124, + "auxiliary_loss_mlp": 0.01049243, + "balance_loss_clip": 1.29971027, + "balance_loss_mlp": 1.03086114, + "epoch": 0.22955057868630693, + "flos": 27495072570240.0, + "grad_norm": 1.527604566535942, + "language_loss": 0.74618268, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77121627, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.18395996, + "step": 3818, + "time_per_iteration": 2.7812702655792236 + }, + { + "auxiliary_loss_clip": 0.01447198, + "auxiliary_loss_mlp": 0.01044303, + "balance_loss_clip": 1.29330659, + "balance_loss_mlp": 1.02389431, + "epoch": 0.2296107019389749, + "flos": 23481585865440.0, + "grad_norm": 1.5640674748119843, + "language_loss": 0.74645168, + "learning_rate": 3.596386441116659e-06, + "loss": 0.77136672, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.20410156, + "step": 3819, + "time_per_iteration": 4.260051727294922 + }, + { + "auxiliary_loss_clip": 0.01452742, + "auxiliary_loss_mlp": 0.01044636, + "balance_loss_clip": 1.29752374, + "balance_loss_mlp": 1.02705252, + "epoch": 0.22967082519164286, + "flos": 31291447044600.0, + "grad_norm": 2.003985484596605, + "language_loss": 0.81354302, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83851683, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.17565918, + "step": 3820, + "time_per_iteration": 2.8210830688476562 + }, + { + "auxiliary_loss_clip": 0.01462016, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.30333471, + "balance_loss_mlp": 1.02239418, + "epoch": 0.22973094844431083, + "flos": 14646432986280.0, + "grad_norm": 1.8528741766468155, + "language_loss": 0.69581741, + "learning_rate": 3.595917095446042e-06, + "loss": 0.72086298, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.20153809, + "step": 3821, + "time_per_iteration": 2.6890652179718018 + }, + { + "auxiliary_loss_clip": 0.01450008, + "auxiliary_loss_mlp": 0.01037526, + "balance_loss_clip": 1.29714704, + "balance_loss_mlp": 1.01883423, + "epoch": 0.2297910716969788, + "flos": 22829477615280.0, + "grad_norm": 1.6844697446085903, + "language_loss": 0.82970148, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85457683, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.18701172, + "step": 3822, + "time_per_iteration": 2.8333895206451416 + }, + { + "auxiliary_loss_clip": 0.01449997, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.29779065, + "balance_loss_mlp": 1.01995993, + "epoch": 0.2298511949496468, + "flos": 23044193952480.0, + "grad_norm": 1.5936874514170078, + "language_loss": 0.66580546, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.69069958, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.19470215, + "step": 3823, + "time_per_iteration": 2.7801132202148438 + }, + { + "auxiliary_loss_clip": 0.01327236, + "auxiliary_loss_mlp": 0.01015776, + "balance_loss_clip": 1.25301671, + "balance_loss_mlp": 1.01181841, + "epoch": 0.22991131820231475, + "flos": 66906046122840.0, + "grad_norm": 0.7945223437176424, + "language_loss": 0.56829846, + "learning_rate": 3.595212623082357e-06, + "loss": 0.59172857, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.03955078, + "step": 3824, + "time_per_iteration": 3.3320209980010986 + }, + { + "auxiliary_loss_clip": 0.01439996, + "auxiliary_loss_mlp": 0.01043751, + "balance_loss_clip": 1.28974915, + "balance_loss_mlp": 1.02584612, + "epoch": 0.22997144145498272, + "flos": 17890788482640.0, + "grad_norm": 2.0298211888664217, + "language_loss": 0.73592454, + "learning_rate": 3.594977677968009e-06, + "loss": 0.76076204, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.17907715, + "step": 3825, + "time_per_iteration": 2.693667411804199 + }, + { + "auxiliary_loss_clip": 0.01452756, + "auxiliary_loss_mlp": 0.01046546, + "balance_loss_clip": 1.29859328, + "balance_loss_mlp": 1.02746081, + "epoch": 0.23003156470765068, + "flos": 24681692865960.0, + "grad_norm": 1.7259144763826797, + "language_loss": 0.88232625, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.90731931, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.19067383, + "step": 3826, + "time_per_iteration": 2.7556138038635254 + }, + { + "auxiliary_loss_clip": 0.0145555, + "auxiliary_loss_mlp": 0.0104373, + "balance_loss_clip": 1.29846239, + "balance_loss_mlp": 1.0240252, + "epoch": 0.23009168796031865, + "flos": 15818252941080.0, + "grad_norm": 2.269762766119256, + "language_loss": 0.81375194, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8387447, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19702148, + "step": 3827, + "time_per_iteration": 2.688682794570923 + }, + { + "auxiliary_loss_clip": 0.01444902, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.29390454, + "balance_loss_mlp": 1.02376199, + "epoch": 0.2301518112129866, + "flos": 16216677809640.0, + "grad_norm": 1.8562224978128803, + "language_loss": 0.87224126, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.89710498, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.17712402, + "step": 3828, + "time_per_iteration": 2.861807107925415 + }, + { + "auxiliary_loss_clip": 0.01441572, + "auxiliary_loss_mlp": 0.01058181, + "balance_loss_clip": 1.2881453, + "balance_loss_mlp": 1.03895283, + "epoch": 0.2302119344656546, + "flos": 20600424162360.0, + "grad_norm": 1.8666933533773318, + "language_loss": 0.70821989, + "learning_rate": 3.594037292782607e-06, + "loss": 0.73321736, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.19238281, + "step": 3829, + "time_per_iteration": 2.92604923248291 + }, + { + "auxiliary_loss_clip": 0.01438284, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.28864872, + "balance_loss_mlp": 1.02720189, + "epoch": 0.23027205771832257, + "flos": 26802535374720.0, + "grad_norm": 2.182449229290467, + "language_loss": 0.84554589, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.8703692, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.16845703, + "step": 3830, + "time_per_iteration": 2.768376350402832 + }, + { + "auxiliary_loss_clip": 0.01444556, + "auxiliary_loss_mlp": 0.01060166, + "balance_loss_clip": 1.2927779, + "balance_loss_mlp": 1.04110408, + "epoch": 0.23033218097099054, + "flos": 43881124160160.0, + "grad_norm": 1.8699087974590825, + "language_loss": 0.67952341, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.70457065, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.19067383, + "step": 3831, + "time_per_iteration": 2.937859296798706 + }, + { + "auxiliary_loss_clip": 0.01445334, + "auxiliary_loss_mlp": 0.01057758, + "balance_loss_clip": 1.2916038, + "balance_loss_mlp": 1.03873253, + "epoch": 0.2303923042236585, + "flos": 26073264594600.0, + "grad_norm": 2.172923121974061, + "language_loss": 0.75164217, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77667308, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.19018555, + "step": 3832, + "time_per_iteration": 2.745784044265747 + }, + { + "auxiliary_loss_clip": 0.01449708, + "auxiliary_loss_mlp": 0.01052773, + "balance_loss_clip": 1.29522276, + "balance_loss_mlp": 1.03350854, + "epoch": 0.23045242747632647, + "flos": 18300989775600.0, + "grad_norm": 1.8469857018161853, + "language_loss": 0.87718225, + "learning_rate": 3.593095940460389e-06, + "loss": 0.90220702, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.19274902, + "step": 3833, + "time_per_iteration": 2.707432746887207 + }, + { + "auxiliary_loss_clip": 0.01453439, + "auxiliary_loss_mlp": 0.01050272, + "balance_loss_clip": 1.29857743, + "balance_loss_mlp": 1.0308764, + "epoch": 0.23051255072899443, + "flos": 25526078011440.0, + "grad_norm": 1.9376386863555233, + "language_loss": 0.75695312, + "learning_rate": 3.592860451331624e-06, + "loss": 0.78199023, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.19384766, + "step": 3834, + "time_per_iteration": 2.8240976333618164 + }, + { + "auxiliary_loss_clip": 0.01446887, + "auxiliary_loss_mlp": 0.01055741, + "balance_loss_clip": 1.29403436, + "balance_loss_mlp": 1.03671491, + "epoch": 0.2305726739816624, + "flos": 21220184530800.0, + "grad_norm": 1.947384586445308, + "language_loss": 0.86353695, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88856322, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.19006348, + "step": 3835, + "time_per_iteration": 2.765500783920288 + }, + { + "auxiliary_loss_clip": 0.01459869, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.30137324, + "balance_loss_mlp": 1.03200305, + "epoch": 0.2306327972343304, + "flos": 23336357078160.0, + "grad_norm": 2.5739738419856804, + "language_loss": 0.82706219, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.85217357, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19262695, + "step": 3836, + "time_per_iteration": 2.7683494091033936 + }, + { + "auxiliary_loss_clip": 0.01456094, + "auxiliary_loss_mlp": 0.0105344, + "balance_loss_clip": 1.30324054, + "balance_loss_mlp": 1.03510571, + "epoch": 0.23069292048699835, + "flos": 20671495438320.0, + "grad_norm": 1.5452365905967214, + "language_loss": 0.79534733, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.82044268, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.18334961, + "step": 3837, + "time_per_iteration": 2.8907933235168457 + }, + { + "auxiliary_loss_clip": 0.01327418, + "auxiliary_loss_mlp": 0.01024101, + "balance_loss_clip": 1.25509453, + "balance_loss_mlp": 1.02081096, + "epoch": 0.23075304373966632, + "flos": 70468902589320.0, + "grad_norm": 0.9154463625142677, + "language_loss": 0.65431809, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67783332, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.03295898, + "step": 3838, + "time_per_iteration": 3.1516458988189697 + }, + { + "auxiliary_loss_clip": 0.01444956, + "auxiliary_loss_mlp": 0.01045989, + "balance_loss_clip": 1.29407799, + "balance_loss_mlp": 1.02823925, + "epoch": 0.23081316699233428, + "flos": 16622371574640.0, + "grad_norm": 2.13695633468476, + "language_loss": 0.75539839, + "learning_rate": 3.591682099845058e-06, + "loss": 0.78030789, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.17736816, + "step": 3839, + "time_per_iteration": 2.677297592163086 + }, + { + "auxiliary_loss_clip": 0.01455992, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.30219829, + "balance_loss_mlp": 1.02631044, + "epoch": 0.23087329024500225, + "flos": 13302071799120.0, + "grad_norm": 3.1296577898520264, + "language_loss": 0.69509196, + "learning_rate": 3.591446248441752e-06, + "loss": 0.72009325, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.17810059, + "step": 3840, + "time_per_iteration": 2.7019712924957275 + }, + { + "auxiliary_loss_clip": 0.01447735, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.29605198, + "balance_loss_mlp": 1.02553618, + "epoch": 0.23093341349767021, + "flos": 17790293126880.0, + "grad_norm": 1.9706100054597429, + "language_loss": 0.79868996, + "learning_rate": 3.591210336690645e-06, + "loss": 0.82362807, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.20532227, + "step": 3841, + "time_per_iteration": 2.682953119277954 + }, + { + "auxiliary_loss_clip": 0.01451178, + "auxiliary_loss_mlp": 0.01040719, + "balance_loss_clip": 1.29812503, + "balance_loss_mlp": 1.02292073, + "epoch": 0.23099353675033818, + "flos": 23993419548240.0, + "grad_norm": 1.8173233652628458, + "language_loss": 0.83323807, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85815704, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.17797852, + "step": 3842, + "time_per_iteration": 2.7531187534332275 + }, + { + "auxiliary_loss_clip": 0.01447672, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.29673433, + "balance_loss_mlp": 1.02134371, + "epoch": 0.23105366000300617, + "flos": 36002506381560.0, + "grad_norm": 1.6636840925896257, + "language_loss": 0.66619039, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.69107378, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.19317627, + "step": 3843, + "time_per_iteration": 2.9184486865997314 + }, + { + "auxiliary_loss_clip": 0.01434832, + "auxiliary_loss_mlp": 0.01046823, + "balance_loss_clip": 1.28713393, + "balance_loss_mlp": 1.02798748, + "epoch": 0.23111378325567414, + "flos": 31251180532680.0, + "grad_norm": 2.1362649079719858, + "language_loss": 0.77163815, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79645467, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.18847656, + "step": 3844, + "time_per_iteration": 2.8355581760406494 + }, + { + "auxiliary_loss_clip": 0.01447105, + "auxiliary_loss_mlp": 0.01039242, + "balance_loss_clip": 1.29580522, + "balance_loss_mlp": 1.01982284, + "epoch": 0.2311739065083421, + "flos": 19212913269720.0, + "grad_norm": 1.6146424087658247, + "language_loss": 0.78166354, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80652702, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.19421387, + "step": 3845, + "time_per_iteration": 4.129903554916382 + }, + { + "auxiliary_loss_clip": 0.01436353, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.29081106, + "balance_loss_mlp": 1.01921439, + "epoch": 0.23123402976101007, + "flos": 23364441082080.0, + "grad_norm": 2.0458932869684525, + "language_loss": 0.75733429, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78205788, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.16796875, + "step": 3846, + "time_per_iteration": 2.6994214057922363 + }, + { + "auxiliary_loss_clip": 0.01448522, + "auxiliary_loss_mlp": 0.01046727, + "balance_loss_clip": 1.29691648, + "balance_loss_mlp": 1.02927446, + "epoch": 0.23129415301367803, + "flos": 13739341887000.0, + "grad_norm": 1.798939489368334, + "language_loss": 0.69750953, + "learning_rate": 3.589793599381304e-06, + "loss": 0.72246194, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.17443848, + "step": 3847, + "time_per_iteration": 2.723637104034424 + }, + { + "auxiliary_loss_clip": 0.01327061, + "auxiliary_loss_mlp": 0.0102287, + "balance_loss_clip": 1.25633311, + "balance_loss_mlp": 1.01950836, + "epoch": 0.231354276266346, + "flos": 69752545267680.0, + "grad_norm": 0.789578110245081, + "language_loss": 0.61018109, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63368046, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.03369141, + "step": 3848, + "time_per_iteration": 3.2123770713806152 + }, + { + "auxiliary_loss_clip": 0.01445976, + "auxiliary_loss_mlp": 0.01044633, + "balance_loss_clip": 1.29369688, + "balance_loss_mlp": 1.02583396, + "epoch": 0.231414399519014, + "flos": 18839567386440.0, + "grad_norm": 2.046672048489429, + "language_loss": 0.78849578, + "learning_rate": 3.589320871234923e-06, + "loss": 0.81340188, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.18798828, + "step": 3849, + "time_per_iteration": 2.8086001873016357 + }, + { + "auxiliary_loss_clip": 0.01449968, + "auxiliary_loss_mlp": 0.01044953, + "balance_loss_clip": 1.29876041, + "balance_loss_mlp": 1.02658248, + "epoch": 0.23147452277168196, + "flos": 36142090606800.0, + "grad_norm": 1.9400858238003524, + "language_loss": 0.71670961, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.74165881, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.18383789, + "step": 3850, + "time_per_iteration": 2.907825469970703 + }, + { + "auxiliary_loss_clip": 0.01445724, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.29528809, + "balance_loss_mlp": 1.01838899, + "epoch": 0.23153464602434992, + "flos": 20817739434600.0, + "grad_norm": 1.6620478323940393, + "language_loss": 0.76895386, + "learning_rate": 3.588847902019718e-06, + "loss": 0.7937789, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.18395996, + "step": 3851, + "time_per_iteration": 2.7148571014404297 + }, + { + "auxiliary_loss_clip": 0.01442762, + "auxiliary_loss_mlp": 0.01043608, + "balance_loss_clip": 1.29315686, + "balance_loss_mlp": 1.02466559, + "epoch": 0.2315947692770179, + "flos": 19944214467840.0, + "grad_norm": 1.6372565275767366, + "language_loss": 0.70066702, + "learning_rate": 3.588611327033723e-06, + "loss": 0.72553074, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.18933105, + "step": 3852, + "time_per_iteration": 2.7649364471435547 + }, + { + "auxiliary_loss_clip": 0.01453321, + "auxiliary_loss_mlp": 0.01039611, + "balance_loss_clip": 1.30112565, + "balance_loss_mlp": 1.02130067, + "epoch": 0.23165489252968585, + "flos": 12858872890680.0, + "grad_norm": 2.494202030439462, + "language_loss": 0.67619622, + "learning_rate": 3.588374691807428e-06, + "loss": 0.7011255, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.18310547, + "step": 3853, + "time_per_iteration": 2.6695687770843506 + }, + { + "auxiliary_loss_clip": 0.01461035, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.30840135, + "balance_loss_mlp": 1.02198744, + "epoch": 0.23171501578235382, + "flos": 30634465791240.0, + "grad_norm": 1.7658850217960085, + "language_loss": 0.80661845, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.83163714, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.18847656, + "step": 3854, + "time_per_iteration": 4.201350212097168 + }, + { + "auxiliary_loss_clip": 0.01461725, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.30389237, + "balance_loss_mlp": 1.02433085, + "epoch": 0.23177513903502178, + "flos": 23848190760960.0, + "grad_norm": 2.1442088726074084, + "language_loss": 0.66527522, + "learning_rate": 3.587901240669831e-06, + "loss": 0.6903246, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18884277, + "step": 3855, + "time_per_iteration": 2.8027076721191406 + }, + { + "auxiliary_loss_clip": 0.01448624, + "auxiliary_loss_mlp": 0.01043361, + "balance_loss_clip": 1.29521501, + "balance_loss_mlp": 1.02429903, + "epoch": 0.23183526228768978, + "flos": 29576338909200.0, + "grad_norm": 2.2314139150420407, + "language_loss": 0.70894635, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73386621, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.19055176, + "step": 3856, + "time_per_iteration": 4.31064248085022 + }, + { + "auxiliary_loss_clip": 0.01447639, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.29694891, + "balance_loss_mlp": 1.02282608, + "epoch": 0.23189538554035774, + "flos": 34465299782040.0, + "grad_norm": 1.6678921187588962, + "language_loss": 0.78108817, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.80596811, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.17529297, + "step": 3857, + "time_per_iteration": 2.8256711959838867 + }, + { + "auxiliary_loss_clip": 0.01465769, + "auxiliary_loss_mlp": 0.01057247, + "balance_loss_clip": 1.31009173, + "balance_loss_mlp": 1.03743458, + "epoch": 0.2319555087930257, + "flos": 18008298741240.0, + "grad_norm": 2.2877841061626523, + "language_loss": 0.9170875, + "learning_rate": 3.587190612385584e-06, + "loss": 0.94231766, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.19812012, + "step": 3858, + "time_per_iteration": 4.17481541633606 + }, + { + "auxiliary_loss_clip": 0.01449294, + "auxiliary_loss_mlp": 0.01050607, + "balance_loss_clip": 1.29983807, + "balance_loss_mlp": 1.03172469, + "epoch": 0.23201563204569367, + "flos": 23148425277360.0, + "grad_norm": 1.774955785721005, + "language_loss": 0.76654875, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.79154778, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.18884277, + "step": 3859, + "time_per_iteration": 2.776277780532837 + }, + { + "auxiliary_loss_clip": 0.01452264, + "auxiliary_loss_mlp": 0.01039129, + "balance_loss_clip": 1.29995477, + "balance_loss_mlp": 1.02235579, + "epoch": 0.23207575529836164, + "flos": 20672916730920.0, + "grad_norm": 1.649104058310393, + "language_loss": 0.84294683, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86786079, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.16772461, + "step": 3860, + "time_per_iteration": 2.7238335609436035 + }, + { + "auxiliary_loss_clip": 0.01458085, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.30792212, + "balance_loss_mlp": 1.03090835, + "epoch": 0.2321358785510296, + "flos": 16476574270320.0, + "grad_norm": 1.8826175850846927, + "language_loss": 0.83088547, + "learning_rate": 3.586479442423508e-06, + "loss": 0.8559671, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.19165039, + "step": 3861, + "time_per_iteration": 2.713386297225952 + }, + { + "auxiliary_loss_clip": 0.01450549, + "auxiliary_loss_mlp": 0.01051705, + "balance_loss_clip": 1.29997373, + "balance_loss_mlp": 1.03377569, + "epoch": 0.2321960018036976, + "flos": 21621330159480.0, + "grad_norm": 1.4485063267342777, + "language_loss": 0.86330831, + "learning_rate": 3.586242265438576e-06, + "loss": 0.88833082, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.17932129, + "step": 3862, + "time_per_iteration": 2.716756820678711 + }, + { + "auxiliary_loss_clip": 0.01447892, + "auxiliary_loss_mlp": 0.0104579, + "balance_loss_clip": 1.29960155, + "balance_loss_mlp": 1.02917206, + "epoch": 0.23225612505636556, + "flos": 22276362211560.0, + "grad_norm": 1.3908445600697927, + "language_loss": 0.75650114, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.78143799, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.16625977, + "step": 3863, + "time_per_iteration": 2.771038293838501 + }, + { + "auxiliary_loss_clip": 0.01447483, + "auxiliary_loss_mlp": 0.01047873, + "balance_loss_clip": 1.30077267, + "balance_loss_mlp": 1.03058767, + "epoch": 0.23231624830903352, + "flos": 17056311777000.0, + "grad_norm": 1.7294565957859103, + "language_loss": 0.74923313, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77418673, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.17285156, + "step": 3864, + "time_per_iteration": 2.665130376815796 + }, + { + "auxiliary_loss_clip": 0.01452562, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.30220509, + "balance_loss_mlp": 1.02611589, + "epoch": 0.2323763715617015, + "flos": 34646531203440.0, + "grad_norm": 2.9632777772852568, + "language_loss": 0.70773804, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.73271024, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.18554688, + "step": 3865, + "time_per_iteration": 2.842142343521118 + }, + { + "auxiliary_loss_clip": 0.01469962, + "auxiliary_loss_mlp": 0.01055023, + "balance_loss_clip": 1.3120575, + "balance_loss_mlp": 1.03450727, + "epoch": 0.23243649481436945, + "flos": 25556557908600.0, + "grad_norm": 1.9925384105098343, + "language_loss": 0.95329571, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97854555, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.20507812, + "step": 3866, + "time_per_iteration": 2.7361807823181152 + }, + { + "auxiliary_loss_clip": 0.01451751, + "auxiliary_loss_mlp": 0.01046322, + "balance_loss_clip": 1.30171776, + "balance_loss_mlp": 1.02932239, + "epoch": 0.23249661806703742, + "flos": 20488111773840.0, + "grad_norm": 2.535005570056678, + "language_loss": 0.73104572, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75602639, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.16992188, + "step": 3867, + "time_per_iteration": 2.7688746452331543 + }, + { + "auxiliary_loss_clip": 0.01448964, + "auxiliary_loss_mlp": 0.01047869, + "balance_loss_clip": 1.29589701, + "balance_loss_mlp": 1.02915311, + "epoch": 0.23255674131970538, + "flos": 20381565772440.0, + "grad_norm": 2.0081462368933236, + "language_loss": 0.82808006, + "learning_rate": 3.584817940684145e-06, + "loss": 0.85304838, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.18713379, + "step": 3868, + "time_per_iteration": 2.7585842609405518 + }, + { + "auxiliary_loss_clip": 0.01442483, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.29469097, + "balance_loss_mlp": 1.021451, + "epoch": 0.23261686457237338, + "flos": 17060575654800.0, + "grad_norm": 1.5604973246239293, + "language_loss": 0.73117679, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75599861, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.18249512, + "step": 3869, + "time_per_iteration": 2.6919467449188232 + }, + { + "auxiliary_loss_clip": 0.01449154, + "auxiliary_loss_mlp": 0.01050384, + "balance_loss_clip": 1.29868317, + "balance_loss_mlp": 1.03201413, + "epoch": 0.23267698782504134, + "flos": 29176005447720.0, + "grad_norm": 1.9096397685682511, + "language_loss": 0.79880571, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.82380116, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.18359375, + "step": 3870, + "time_per_iteration": 2.8013882637023926 + }, + { + "auxiliary_loss_clip": 0.01455486, + "auxiliary_loss_mlp": 0.01051971, + "balance_loss_clip": 1.3020575, + "balance_loss_mlp": 1.03283811, + "epoch": 0.2327371110777093, + "flos": 21179105851680.0, + "grad_norm": 1.9476203308661797, + "language_loss": 0.71505779, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.74013233, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.19128418, + "step": 3871, + "time_per_iteration": 2.6980228424072266 + }, + { + "auxiliary_loss_clip": 0.01453709, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_clip": 1.29990673, + "balance_loss_mlp": 1.03353393, + "epoch": 0.23279723433037727, + "flos": 24868487632680.0, + "grad_norm": 1.9642350822346664, + "language_loss": 0.69327664, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71835101, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.20178223, + "step": 3872, + "time_per_iteration": 2.8314156532287598 + }, + { + "auxiliary_loss_clip": 0.01460955, + "auxiliary_loss_mlp": 0.01046689, + "balance_loss_clip": 1.30370414, + "balance_loss_mlp": 1.02691233, + "epoch": 0.23285735758304524, + "flos": 38807805022200.0, + "grad_norm": 1.6465559672175765, + "language_loss": 0.78269231, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80776882, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19775391, + "step": 3873, + "time_per_iteration": 2.894169330596924 + }, + { + "auxiliary_loss_clip": 0.01307706, + "auxiliary_loss_mlp": 0.01003166, + "balance_loss_clip": 1.23696351, + "balance_loss_mlp": 0.9990654, + "epoch": 0.2329174808357132, + "flos": 53958291867360.0, + "grad_norm": 0.8624316773105367, + "language_loss": 0.6044147, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62752342, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.04101562, + "step": 3874, + "time_per_iteration": 3.1514995098114014 + }, + { + "auxiliary_loss_clip": 0.01456373, + "auxiliary_loss_mlp": 0.01058007, + "balance_loss_clip": 1.30494893, + "balance_loss_mlp": 1.03803921, + "epoch": 0.23297760408838117, + "flos": 21221118523080.0, + "grad_norm": 2.1453354781028042, + "language_loss": 0.81425184, + "learning_rate": 3.583153494218927e-06, + "loss": 0.8393957, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.19970703, + "step": 3875, + "time_per_iteration": 2.7069811820983887 + }, + { + "auxiliary_loss_clip": 0.01453221, + "auxiliary_loss_mlp": 0.01047695, + "balance_loss_clip": 1.3031199, + "balance_loss_mlp": 1.03021932, + "epoch": 0.23303772734104916, + "flos": 28408945265640.0, + "grad_norm": 1.7124612443581244, + "language_loss": 0.61325264, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.6382618, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.17468262, + "step": 3876, + "time_per_iteration": 2.919996976852417 + }, + { + "auxiliary_loss_clip": 0.01454664, + "auxiliary_loss_mlp": 0.01050303, + "balance_loss_clip": 1.30326104, + "balance_loss_mlp": 1.0313251, + "epoch": 0.23309785059371713, + "flos": 24319879756920.0, + "grad_norm": 1.8204696611747038, + "language_loss": 0.70746702, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.73251671, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.18994141, + "step": 3877, + "time_per_iteration": 2.759692907333374 + }, + { + "auxiliary_loss_clip": 0.01449131, + "auxiliary_loss_mlp": 0.0105261, + "balance_loss_clip": 1.29810703, + "balance_loss_mlp": 1.0333339, + "epoch": 0.2331579738463851, + "flos": 15996601168920.0, + "grad_norm": 3.4947308047258927, + "language_loss": 0.81393844, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83895588, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.19262695, + "step": 3878, + "time_per_iteration": 2.6784002780914307 + }, + { + "auxiliary_loss_clip": 0.01463377, + "auxiliary_loss_mlp": 0.01055076, + "balance_loss_clip": 1.30775619, + "balance_loss_mlp": 1.03370178, + "epoch": 0.23321809709905306, + "flos": 36433563390360.0, + "grad_norm": 1.576932683835602, + "language_loss": 0.75209367, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.77727818, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.21362305, + "step": 3879, + "time_per_iteration": 2.8461720943450928 + }, + { + "auxiliary_loss_clip": 0.01446015, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.29403293, + "balance_loss_mlp": 1.02571976, + "epoch": 0.23327822035172102, + "flos": 21329776159200.0, + "grad_norm": 1.974241308643436, + "language_loss": 0.90313196, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92803097, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.1817627, + "step": 3880, + "time_per_iteration": 2.69362473487854 + }, + { + "auxiliary_loss_clip": 0.0145774, + "auxiliary_loss_mlp": 0.01053012, + "balance_loss_clip": 1.3032198, + "balance_loss_mlp": 1.0348922, + "epoch": 0.233338343604389, + "flos": 19176585768720.0, + "grad_norm": 1.804057519116518, + "language_loss": 0.72137249, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74647993, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.18103027, + "step": 3881, + "time_per_iteration": 2.734072685241699 + }, + { + "auxiliary_loss_clip": 0.01453724, + "auxiliary_loss_mlp": 0.01043562, + "balance_loss_clip": 1.30336189, + "balance_loss_mlp": 1.025406, + "epoch": 0.23339846685705698, + "flos": 26914279246200.0, + "grad_norm": 1.6894528853670887, + "language_loss": 0.68114114, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70611399, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.18139648, + "step": 3882, + "time_per_iteration": 2.792644500732422 + }, + { + "auxiliary_loss_clip": 0.01457748, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.30402541, + "balance_loss_mlp": 1.0264647, + "epoch": 0.23345859010972494, + "flos": 32349573926640.0, + "grad_norm": 2.13410265191827, + "language_loss": 0.76464248, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78967011, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.18554688, + "step": 3883, + "time_per_iteration": 2.8494961261749268 + }, + { + "auxiliary_loss_clip": 0.01307346, + "auxiliary_loss_mlp": 0.01010506, + "balance_loss_clip": 1.23654842, + "balance_loss_mlp": 1.00683451, + "epoch": 0.2335187133623929, + "flos": 58501073849760.0, + "grad_norm": 0.8615775020421177, + "language_loss": 0.59100819, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61418676, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.03662109, + "step": 3884, + "time_per_iteration": 4.8107991218566895 + }, + { + "auxiliary_loss_clip": 0.01453946, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.30189407, + "balance_loss_mlp": 1.01759815, + "epoch": 0.23357883661506088, + "flos": 24508177032960.0, + "grad_norm": 1.606394172867464, + "language_loss": 0.80363703, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82853162, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.17919922, + "step": 3885, + "time_per_iteration": 2.779754400253296 + }, + { + "auxiliary_loss_clip": 0.01448873, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.29876852, + "balance_loss_mlp": 1.0238198, + "epoch": 0.23363895986772884, + "flos": 18952610725440.0, + "grad_norm": 2.701939836302365, + "language_loss": 0.87833941, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90324873, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.18237305, + "step": 3886, + "time_per_iteration": 2.7310705184936523 + }, + { + "auxiliary_loss_clip": 0.0145706, + "auxiliary_loss_mlp": 0.01045829, + "balance_loss_clip": 1.30460036, + "balance_loss_mlp": 1.02739894, + "epoch": 0.2336990831203968, + "flos": 31693201798680.0, + "grad_norm": 1.797282967808537, + "language_loss": 0.7358889, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.76091778, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.18432617, + "step": 3887, + "time_per_iteration": 2.8457977771759033 + }, + { + "auxiliary_loss_clip": 0.01456441, + "auxiliary_loss_mlp": 0.01043083, + "balance_loss_clip": 1.30368876, + "balance_loss_mlp": 1.02402127, + "epoch": 0.23375920637306477, + "flos": 27715595902920.0, + "grad_norm": 1.8161684080080223, + "language_loss": 0.8429178, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86791301, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.19055176, + "step": 3888, + "time_per_iteration": 2.768156051635742 + }, + { + "auxiliary_loss_clip": 0.01453211, + "auxiliary_loss_mlp": 0.01058857, + "balance_loss_clip": 1.3030827, + "balance_loss_mlp": 1.03196311, + "epoch": 0.23381932962573276, + "flos": 17680092373080.0, + "grad_norm": 2.9010219454279538, + "language_loss": 0.87431669, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89943731, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.26916504, + "step": 3889, + "time_per_iteration": 2.7223312854766846 + }, + { + "auxiliary_loss_clip": 0.01457279, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.30554187, + "balance_loss_mlp": 1.02130485, + "epoch": 0.23387945287840073, + "flos": 14394739414320.0, + "grad_norm": 2.894610694733116, + "language_loss": 0.77338219, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79835033, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.18237305, + "step": 3890, + "time_per_iteration": 2.699575185775757 + }, + { + "auxiliary_loss_clip": 0.01453405, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.30196786, + "balance_loss_mlp": 1.02418137, + "epoch": 0.2339395761310687, + "flos": 46106157385440.0, + "grad_norm": 1.690942967774174, + "language_loss": 0.73054123, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75551355, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.19677734, + "step": 3891, + "time_per_iteration": 2.988023281097412 + }, + { + "auxiliary_loss_clip": 0.01446318, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_clip": 1.29731011, + "balance_loss_mlp": 1.03144002, + "epoch": 0.23399969938373666, + "flos": 22387090874040.0, + "grad_norm": 1.5189273486070716, + "language_loss": 0.82651502, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.85147661, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.18408203, + "step": 3892, + "time_per_iteration": 2.7648606300354004 + }, + { + "auxiliary_loss_clip": 0.0146262, + "auxiliary_loss_mlp": 0.01050772, + "balance_loss_clip": 1.30771041, + "balance_loss_mlp": 1.03159106, + "epoch": 0.23405982263640462, + "flos": 43516509074280.0, + "grad_norm": 1.549518709030316, + "language_loss": 0.6480791, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67321301, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.19177246, + "step": 3893, + "time_per_iteration": 4.352790117263794 + }, + { + "auxiliary_loss_clip": 0.01458744, + "auxiliary_loss_mlp": 0.01042357, + "balance_loss_clip": 1.31085157, + "balance_loss_mlp": 1.02299762, + "epoch": 0.2341199458890726, + "flos": 22569784196400.0, + "grad_norm": 2.620331315266292, + "language_loss": 0.79695076, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.82196182, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.19360352, + "step": 3894, + "time_per_iteration": 2.7545228004455566 + }, + { + "auxiliary_loss_clip": 0.01447064, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.29824817, + "balance_loss_mlp": 1.02282023, + "epoch": 0.23418006914174055, + "flos": 25640055342720.0, + "grad_norm": 1.8440284320752416, + "language_loss": 0.82328606, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.8481617, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.17675781, + "step": 3895, + "time_per_iteration": 4.3030853271484375 + }, + { + "auxiliary_loss_clip": 0.01456925, + "auxiliary_loss_mlp": 0.01049089, + "balance_loss_clip": 1.30657291, + "balance_loss_mlp": 1.03001523, + "epoch": 0.23424019239440855, + "flos": 13548973584600.0, + "grad_norm": 1.8471143235372334, + "language_loss": 0.80863422, + "learning_rate": 3.578142517422292e-06, + "loss": 0.83369434, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.19055176, + "step": 3896, + "time_per_iteration": 2.769068956375122 + }, + { + "auxiliary_loss_clip": 0.01461536, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.30646193, + "balance_loss_mlp": 1.02827144, + "epoch": 0.2343003156470765, + "flos": 22424677234200.0, + "grad_norm": 1.5568616840702243, + "language_loss": 0.83389151, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85899651, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.20690918, + "step": 3897, + "time_per_iteration": 2.8292236328125 + }, + { + "auxiliary_loss_clip": 0.01468076, + "auxiliary_loss_mlp": 0.0105215, + "balance_loss_clip": 1.31329036, + "balance_loss_mlp": 1.03279018, + "epoch": 0.23436043889974448, + "flos": 14794585575480.0, + "grad_norm": 1.705457449743873, + "language_loss": 0.79550076, + "learning_rate": 3.577663903820705e-06, + "loss": 0.82070303, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.19348145, + "step": 3898, + "time_per_iteration": 4.114223480224609 + }, + { + "auxiliary_loss_clip": 0.01451264, + "auxiliary_loss_mlp": 0.01047975, + "balance_loss_clip": 1.30479431, + "balance_loss_mlp": 1.02989125, + "epoch": 0.23442056215241244, + "flos": 22970808000000.0, + "grad_norm": 2.7290872300978557, + "language_loss": 0.73780161, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76279402, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.18078613, + "step": 3899, + "time_per_iteration": 2.7129547595977783 + }, + { + "auxiliary_loss_clip": 0.01465971, + "auxiliary_loss_mlp": 0.01049357, + "balance_loss_clip": 1.31283641, + "balance_loss_mlp": 1.03010464, + "epoch": 0.2344806854050804, + "flos": 23076582442560.0, + "grad_norm": 2.3438258389847357, + "language_loss": 0.75349319, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77864653, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.19250488, + "step": 3900, + "time_per_iteration": 2.7682671546936035 + }, + { + "auxiliary_loss_clip": 0.01460293, + "auxiliary_loss_mlp": 0.01047153, + "balance_loss_clip": 1.3092804, + "balance_loss_mlp": 1.02828169, + "epoch": 0.23454080865774837, + "flos": 16331548524840.0, + "grad_norm": 1.7819025914485642, + "language_loss": 0.66880023, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69387472, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.1887207, + "step": 3901, + "time_per_iteration": 2.6952009201049805 + }, + { + "auxiliary_loss_clip": 0.01305798, + "auxiliary_loss_mlp": 0.01018103, + "balance_loss_clip": 1.23764372, + "balance_loss_mlp": 1.01419282, + "epoch": 0.23460093191041637, + "flos": 67775469645240.0, + "grad_norm": 0.8739968258088626, + "language_loss": 0.5869925, + "learning_rate": 3.576705958788091e-06, + "loss": 0.61023152, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.0390625, + "step": 3902, + "time_per_iteration": 3.19254994392395 + }, + { + "auxiliary_loss_clip": 0.01468386, + "auxiliary_loss_mlp": 0.01051085, + "balance_loss_clip": 1.31808269, + "balance_loss_mlp": 1.03072453, + "epoch": 0.23466105516308433, + "flos": 20081605841640.0, + "grad_norm": 1.8311826663055832, + "language_loss": 0.80481529, + "learning_rate": 3.576466323035108e-06, + "loss": 0.83000994, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.20373535, + "step": 3903, + "time_per_iteration": 2.7250804901123047 + }, + { + "auxiliary_loss_clip": 0.01459674, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_clip": 1.30782688, + "balance_loss_mlp": 1.03431451, + "epoch": 0.2347211784157523, + "flos": 24541174648440.0, + "grad_norm": 1.8944957829524443, + "language_loss": 0.82608765, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.85122401, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.1965332, + "step": 3904, + "time_per_iteration": 2.791750431060791 + }, + { + "auxiliary_loss_clip": 0.01460881, + "auxiliary_loss_mlp": 0.01053846, + "balance_loss_clip": 1.31128955, + "balance_loss_mlp": 1.03491521, + "epoch": 0.23478130166842026, + "flos": 23810279533920.0, + "grad_norm": 1.9259479174292922, + "language_loss": 0.71775168, + "learning_rate": 3.57598687219895e-06, + "loss": 0.74289894, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.18920898, + "step": 3905, + "time_per_iteration": 2.7556040287017822 + }, + { + "auxiliary_loss_clip": 0.01457273, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.3092494, + "balance_loss_mlp": 1.02772343, + "epoch": 0.23484142492108823, + "flos": 24098463040320.0, + "grad_norm": 1.8329715824675987, + "language_loss": 0.71555483, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.74058068, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.17578125, + "step": 3906, + "time_per_iteration": 2.8276772499084473 + }, + { + "auxiliary_loss_clip": 0.01465159, + "auxiliary_loss_mlp": 0.01042296, + "balance_loss_clip": 1.30815935, + "balance_loss_mlp": 1.02211404, + "epoch": 0.2349015481737562, + "flos": 29101807328040.0, + "grad_norm": 2.9812126379443815, + "language_loss": 0.73835784, + "learning_rate": 3.575507182316473e-06, + "loss": 0.76343238, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.20202637, + "step": 3907, + "time_per_iteration": 2.8164021968841553 + }, + { + "auxiliary_loss_clip": 0.01465776, + "auxiliary_loss_mlp": 0.01057205, + "balance_loss_clip": 1.31169057, + "balance_loss_mlp": 1.03549743, + "epoch": 0.23496167142642416, + "flos": 18920953185840.0, + "grad_norm": 1.6358031860788302, + "language_loss": 0.72909951, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75432932, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.21716309, + "step": 3908, + "time_per_iteration": 2.7596311569213867 + }, + { + "auxiliary_loss_clip": 0.01292534, + "auxiliary_loss_mlp": 0.01037562, + "balance_loss_clip": 1.22103095, + "balance_loss_mlp": 1.03365159, + "epoch": 0.23502179467909215, + "flos": 55881212918760.0, + "grad_norm": 1.1475651823957422, + "language_loss": 0.73494452, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75824547, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.0390625, + "step": 3909, + "time_per_iteration": 3.0369069576263428 + }, + { + "auxiliary_loss_clip": 0.01454977, + "auxiliary_loss_mlp": 0.01048234, + "balance_loss_clip": 1.30404663, + "balance_loss_mlp": 1.02972126, + "epoch": 0.23508191793176011, + "flos": 23406697403640.0, + "grad_norm": 1.5116617040915639, + "language_loss": 0.88326424, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.90829635, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.18505859, + "step": 3910, + "time_per_iteration": 2.8552491664886475 + }, + { + "auxiliary_loss_clip": 0.01460185, + "auxiliary_loss_mlp": 0.01045303, + "balance_loss_clip": 1.30965304, + "balance_loss_mlp": 1.02638435, + "epoch": 0.23514204118442808, + "flos": 20052506628720.0, + "grad_norm": 2.0169636346758653, + "language_loss": 0.76757646, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.79263133, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.18920898, + "step": 3911, + "time_per_iteration": 2.7780842781066895 + }, + { + "auxiliary_loss_clip": 0.01453845, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.30831981, + "balance_loss_mlp": 1.03431773, + "epoch": 0.23520216443709605, + "flos": 21585693000600.0, + "grad_norm": 2.5736014923202966, + "language_loss": 0.81694216, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.84199274, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.16906738, + "step": 3912, + "time_per_iteration": 2.836601734161377 + }, + { + "auxiliary_loss_clip": 0.01456858, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.30973279, + "balance_loss_mlp": 1.02506924, + "epoch": 0.235262287689764, + "flos": 23191087682520.0, + "grad_norm": 2.0532839743737967, + "language_loss": 0.71949756, + "learning_rate": 3.574066679118909e-06, + "loss": 0.74450886, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1920166, + "step": 3913, + "time_per_iteration": 2.7317349910736084 + }, + { + "auxiliary_loss_clip": 0.0147244, + "auxiliary_loss_mlp": 0.01054774, + "balance_loss_clip": 1.31814361, + "balance_loss_mlp": 1.03475833, + "epoch": 0.23532241094243198, + "flos": 23190437948760.0, + "grad_norm": 1.7932454274156904, + "language_loss": 0.76424253, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78951466, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.20007324, + "step": 3914, + "time_per_iteration": 2.812565565109253 + }, + { + "auxiliary_loss_clip": 0.0146488, + "auxiliary_loss_mlp": 0.01047491, + "balance_loss_clip": 1.31388319, + "balance_loss_mlp": 1.02889442, + "epoch": 0.23538253419509997, + "flos": 17023760853480.0, + "grad_norm": 2.1979882341658428, + "language_loss": 0.89953494, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.92465866, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.18579102, + "step": 3915, + "time_per_iteration": 2.690775156021118 + }, + { + "auxiliary_loss_clip": 0.01294638, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.22669339, + "balance_loss_mlp": 1.02576005, + "epoch": 0.23544265744776793, + "flos": 63461616926040.0, + "grad_norm": 0.8076110990568313, + "language_loss": 0.59468317, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61792481, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.03759766, + "step": 3916, + "time_per_iteration": 3.2068018913269043 + }, + { + "auxiliary_loss_clip": 0.01294563, + "auxiliary_loss_mlp": 0.01011439, + "balance_loss_clip": 1.22591913, + "balance_loss_mlp": 1.00793421, + "epoch": 0.2355027807004359, + "flos": 70532559516600.0, + "grad_norm": 0.8593099868726084, + "language_loss": 0.49453038, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.5175904, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03515625, + "step": 3917, + "time_per_iteration": 3.2513484954833984 + }, + { + "auxiliary_loss_clip": 0.01467924, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.31458521, + "balance_loss_mlp": 1.02874207, + "epoch": 0.23556290395310386, + "flos": 21439327179240.0, + "grad_norm": 2.0407823059618204, + "language_loss": 0.76608491, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.79123664, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.18505859, + "step": 3918, + "time_per_iteration": 2.742124319076538 + }, + { + "auxiliary_loss_clip": 0.01469555, + "auxiliary_loss_mlp": 0.01042816, + "balance_loss_clip": 1.31451201, + "balance_loss_mlp": 1.0238378, + "epoch": 0.23562302720577183, + "flos": 18190951455240.0, + "grad_norm": 1.9868833620370296, + "language_loss": 0.69328785, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71841151, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18981934, + "step": 3919, + "time_per_iteration": 2.7084343433380127 + }, + { + "auxiliary_loss_clip": 0.01454619, + "auxiliary_loss_mlp": 0.01046297, + "balance_loss_clip": 1.3079617, + "balance_loss_mlp": 1.02735424, + "epoch": 0.2356831504584398, + "flos": 33737206644360.0, + "grad_norm": 1.6449851171237528, + "language_loss": 0.70426857, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72927773, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.1895752, + "step": 3920, + "time_per_iteration": 2.8520290851593018 + }, + { + "auxiliary_loss_clip": 0.01462042, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.31331742, + "balance_loss_mlp": 1.02291667, + "epoch": 0.23574327371110776, + "flos": 24937731532440.0, + "grad_norm": 1.642538444440816, + "language_loss": 0.77403998, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79907656, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.18713379, + "step": 3921, + "time_per_iteration": 2.785158157348633 + }, + { + "auxiliary_loss_clip": 0.01464765, + "auxiliary_loss_mlp": 0.01052519, + "balance_loss_clip": 1.31597018, + "balance_loss_mlp": 1.03393435, + "epoch": 0.23580339696377575, + "flos": 17826661236240.0, + "grad_norm": 2.89279939606385, + "language_loss": 0.74757862, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77275151, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.18591309, + "step": 3922, + "time_per_iteration": 2.7040011882781982 + }, + { + "auxiliary_loss_clip": 0.01459329, + "auxiliary_loss_mlp": 0.01040464, + "balance_loss_clip": 1.30946445, + "balance_loss_mlp": 1.02142596, + "epoch": 0.23586352021644372, + "flos": 26292244809600.0, + "grad_norm": 1.870885490621023, + "language_loss": 0.80576295, + "learning_rate": 3.571661066327956e-06, + "loss": 0.83076084, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.19042969, + "step": 3923, + "time_per_iteration": 4.1597747802734375 + }, + { + "auxiliary_loss_clip": 0.01461629, + "auxiliary_loss_mlp": 0.01043899, + "balance_loss_clip": 1.31250775, + "balance_loss_mlp": 1.02545774, + "epoch": 0.23592364346911168, + "flos": 14250810094560.0, + "grad_norm": 1.6865005405510247, + "language_loss": 0.7496779, + "learning_rate": 3.571420177111754e-06, + "loss": 0.77473319, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.18444824, + "step": 3924, + "time_per_iteration": 2.7082765102386475 + }, + { + "auxiliary_loss_clip": 0.01465127, + "auxiliary_loss_mlp": 0.01042116, + "balance_loss_clip": 1.31742883, + "balance_loss_mlp": 1.02334058, + "epoch": 0.23598376672177965, + "flos": 18592543775880.0, + "grad_norm": 1.5257489884823585, + "language_loss": 0.82569212, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.85076451, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.18786621, + "step": 3925, + "time_per_iteration": 2.721569538116455 + }, + { + "auxiliary_loss_clip": 0.01472722, + "auxiliary_loss_mlp": 0.01046583, + "balance_loss_clip": 1.32010674, + "balance_loss_mlp": 1.02671123, + "epoch": 0.2360438899744476, + "flos": 22680837725760.0, + "grad_norm": 2.2095126734132235, + "language_loss": 0.59643006, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.62162316, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.19873047, + "step": 3926, + "time_per_iteration": 2.784794807434082 + }, + { + "auxiliary_loss_clip": 0.01445322, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.3004539, + "balance_loss_mlp": 1.02287352, + "epoch": 0.23610401322711558, + "flos": 29576176475760.0, + "grad_norm": 1.9227041608408215, + "language_loss": 0.72285497, + "learning_rate": 3.570697151969235e-06, + "loss": 0.74770886, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.17163086, + "step": 3927, + "time_per_iteration": 2.905442476272583 + }, + { + "auxiliary_loss_clip": 0.01454548, + "auxiliary_loss_mlp": 0.01042311, + "balance_loss_clip": 1.30733073, + "balance_loss_mlp": 1.02562141, + "epoch": 0.23616413647978354, + "flos": 17863069953960.0, + "grad_norm": 1.7843831460631256, + "language_loss": 0.75583786, + "learning_rate": 3.570456024454221e-06, + "loss": 0.78080642, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.16699219, + "step": 3928, + "time_per_iteration": 2.798574924468994 + }, + { + "auxiliary_loss_clip": 0.01461152, + "auxiliary_loss_mlp": 0.01043867, + "balance_loss_clip": 1.31052232, + "balance_loss_mlp": 1.02406597, + "epoch": 0.23622425973245154, + "flos": 11038396396320.0, + "grad_norm": 2.2224654158568486, + "language_loss": 0.82032764, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.8453778, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.19787598, + "step": 3929, + "time_per_iteration": 2.6793978214263916 + }, + { + "auxiliary_loss_clip": 0.01473761, + "auxiliary_loss_mlp": 0.01047299, + "balance_loss_clip": 1.31898415, + "balance_loss_mlp": 1.0267117, + "epoch": 0.2362843829851195, + "flos": 23409296338680.0, + "grad_norm": 1.886966893584247, + "language_loss": 0.72277087, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74798143, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.20593262, + "step": 3930, + "time_per_iteration": 2.793156385421753 + }, + { + "auxiliary_loss_clip": 0.01459645, + "auxiliary_loss_mlp": 0.01041438, + "balance_loss_clip": 1.3114475, + "balance_loss_mlp": 1.02294898, + "epoch": 0.23634450623778747, + "flos": 39537075802320.0, + "grad_norm": 2.8049187874220296, + "language_loss": 0.74565029, + "learning_rate": 3.569732284634665e-06, + "loss": 0.77066112, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.18469238, + "step": 3931, + "time_per_iteration": 4.3905861377716064 + }, + { + "auxiliary_loss_clip": 0.01463563, + "auxiliary_loss_mlp": 0.0104928, + "balance_loss_clip": 1.31357908, + "balance_loss_mlp": 1.02888346, + "epoch": 0.23640462949045543, + "flos": 24212562196680.0, + "grad_norm": 1.9045957148476527, + "language_loss": 0.80943131, + "learning_rate": 3.569490918967136e-06, + "loss": 0.8345598, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.20385742, + "step": 3932, + "time_per_iteration": 2.7843077182769775 + }, + { + "auxiliary_loss_clip": 0.01458224, + "auxiliary_loss_mlp": 0.01040441, + "balance_loss_clip": 1.31293404, + "balance_loss_mlp": 1.02413332, + "epoch": 0.2364647527431234, + "flos": 26183302914960.0, + "grad_norm": 1.4161058907389834, + "language_loss": 0.85626066, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.88124728, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.16308594, + "step": 3933, + "time_per_iteration": 2.85469913482666 + }, + { + "auxiliary_loss_clip": 0.01469818, + "auxiliary_loss_mlp": 0.01047794, + "balance_loss_clip": 1.31800139, + "balance_loss_mlp": 1.02868462, + "epoch": 0.23652487599579136, + "flos": 22642033114800.0, + "grad_norm": 4.512690375073433, + "language_loss": 0.83146036, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85663646, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.19116211, + "step": 3934, + "time_per_iteration": 4.278799533843994 + }, + { + "auxiliary_loss_clip": 0.01467404, + "auxiliary_loss_mlp": 0.01048592, + "balance_loss_clip": 1.3185699, + "balance_loss_mlp": 1.03036451, + "epoch": 0.23658499924845935, + "flos": 21767289897240.0, + "grad_norm": 1.5852041566953476, + "language_loss": 0.78881752, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.81397748, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.18225098, + "step": 3935, + "time_per_iteration": 2.732269287109375 + }, + { + "auxiliary_loss_clip": 0.0144753, + "auxiliary_loss_mlp": 0.01042086, + "balance_loss_clip": 1.3031857, + "balance_loss_mlp": 1.02549195, + "epoch": 0.23664512250112732, + "flos": 21804064090200.0, + "grad_norm": 1.6322738652384996, + "language_loss": 0.7986697, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.82356584, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.16577148, + "step": 3936, + "time_per_iteration": 4.084080457687378 + }, + { + "auxiliary_loss_clip": 0.01460334, + "auxiliary_loss_mlp": 0.01047526, + "balance_loss_clip": 1.31156766, + "balance_loss_mlp": 1.02966809, + "epoch": 0.23670524575379528, + "flos": 22643007715440.0, + "grad_norm": 1.5024286103066895, + "language_loss": 0.79392982, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81900841, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.17871094, + "step": 3937, + "time_per_iteration": 2.8074424266815186 + }, + { + "auxiliary_loss_clip": 0.01457021, + "auxiliary_loss_mlp": 0.01047487, + "balance_loss_clip": 1.31355882, + "balance_loss_mlp": 1.03075039, + "epoch": 0.23676536900646325, + "flos": 16729080009480.0, + "grad_norm": 2.0042430024597646, + "language_loss": 0.86024725, + "learning_rate": 3.568041475462147e-06, + "loss": 0.88529235, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.1673584, + "step": 3938, + "time_per_iteration": 2.695302963256836 + }, + { + "auxiliary_loss_clip": 0.01456624, + "auxiliary_loss_mlp": 0.01056224, + "balance_loss_clip": 1.31138206, + "balance_loss_mlp": 1.03848577, + "epoch": 0.23682549225913122, + "flos": 11138038976520.0, + "grad_norm": 2.102035322697402, + "language_loss": 0.93937552, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.964504, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.17749023, + "step": 3939, + "time_per_iteration": 2.6841540336608887 + }, + { + "auxiliary_loss_clip": 0.01466827, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.31610608, + "balance_loss_mlp": 1.02584982, + "epoch": 0.23688561551179918, + "flos": 22563611725680.0, + "grad_norm": 1.7337253873873244, + "language_loss": 0.82369518, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84881115, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.18920898, + "step": 3940, + "time_per_iteration": 2.7278411388397217 + }, + { + "auxiliary_loss_clip": 0.01463583, + "auxiliary_loss_mlp": 0.01047912, + "balance_loss_clip": 1.30942655, + "balance_loss_mlp": 1.02925587, + "epoch": 0.23694573876446715, + "flos": 18519482690280.0, + "grad_norm": 2.0783208397712425, + "language_loss": 0.89370626, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91882122, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.18640137, + "step": 3941, + "time_per_iteration": 2.7394185066223145 + }, + { + "auxiliary_loss_clip": 0.01463618, + "auxiliary_loss_mlp": 0.01051031, + "balance_loss_clip": 1.30965948, + "balance_loss_mlp": 1.03102779, + "epoch": 0.23700586201713514, + "flos": 15339254440320.0, + "grad_norm": 1.9815164885434695, + "language_loss": 0.84738302, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.87252951, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.20007324, + "step": 3942, + "time_per_iteration": 2.6763265132904053 + }, + { + "auxiliary_loss_clip": 0.01465917, + "auxiliary_loss_mlp": 0.01045283, + "balance_loss_clip": 1.31390572, + "balance_loss_mlp": 1.02574468, + "epoch": 0.2370659852698031, + "flos": 23952422085840.0, + "grad_norm": 1.9792111756111486, + "language_loss": 0.81695592, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.8420679, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.19543457, + "step": 3943, + "time_per_iteration": 2.7497212886810303 + }, + { + "auxiliary_loss_clip": 0.01469649, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.31464279, + "balance_loss_mlp": 1.02595532, + "epoch": 0.23712610852247107, + "flos": 15335640296280.0, + "grad_norm": 2.2420691113073636, + "language_loss": 0.67791331, + "learning_rate": 3.566589891386959e-06, + "loss": 0.70305711, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18786621, + "step": 3944, + "time_per_iteration": 2.725094795227051 + }, + { + "auxiliary_loss_clip": 0.01456462, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_clip": 1.30600524, + "balance_loss_mlp": 1.02589941, + "epoch": 0.23718623177513903, + "flos": 19687323025800.0, + "grad_norm": 1.9965282983025123, + "language_loss": 0.75787568, + "learning_rate": 3.566347752735866e-06, + "loss": 0.78288621, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.18688965, + "step": 3945, + "time_per_iteration": 2.7764155864715576 + }, + { + "auxiliary_loss_clip": 0.014569, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.30749965, + "balance_loss_mlp": 1.01911211, + "epoch": 0.237246355027807, + "flos": 24978810211560.0, + "grad_norm": 1.6206322042374588, + "language_loss": 0.64182675, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.66676688, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.18005371, + "step": 3946, + "time_per_iteration": 2.7911837100982666 + }, + { + "auxiliary_loss_clip": 0.0145764, + "auxiliary_loss_mlp": 0.01049534, + "balance_loss_clip": 1.30851007, + "balance_loss_mlp": 1.02970982, + "epoch": 0.23730647828047496, + "flos": 15381632586960.0, + "grad_norm": 2.1986125595936326, + "language_loss": 0.77095354, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79602528, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.19812012, + "step": 3947, + "time_per_iteration": 2.780684232711792 + }, + { + "auxiliary_loss_clip": 0.0145971, + "auxiliary_loss_mlp": 0.0104656, + "balance_loss_clip": 1.3092469, + "balance_loss_mlp": 1.02789164, + "epoch": 0.23736660153314296, + "flos": 28156967435160.0, + "grad_norm": 1.419850596526973, + "language_loss": 0.80815703, + "learning_rate": 3.565620980442944e-06, + "loss": 0.83321977, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.18664551, + "step": 3948, + "time_per_iteration": 2.8628528118133545 + }, + { + "auxiliary_loss_clip": 0.01457228, + "auxiliary_loss_mlp": 0.0104752, + "balance_loss_clip": 1.30703211, + "balance_loss_mlp": 1.02812457, + "epoch": 0.23742672478581092, + "flos": 22091110562520.0, + "grad_norm": 2.1905097476763054, + "language_loss": 0.80153918, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.8265866, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.19396973, + "step": 3949, + "time_per_iteration": 2.747187852859497 + }, + { + "auxiliary_loss_clip": 0.01458964, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.30702496, + "balance_loss_mlp": 1.02109063, + "epoch": 0.2374868480384789, + "flos": 19541891196720.0, + "grad_norm": 1.6849026593119945, + "language_loss": 0.73579174, + "learning_rate": 3.565136168723163e-06, + "loss": 0.76077676, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.18457031, + "step": 3950, + "time_per_iteration": 2.7932348251342773 + }, + { + "auxiliary_loss_clip": 0.01444708, + "auxiliary_loss_mlp": 0.01039384, + "balance_loss_clip": 1.29722285, + "balance_loss_mlp": 1.02202773, + "epoch": 0.23754697129114685, + "flos": 19426614397920.0, + "grad_norm": 2.6652402249021616, + "language_loss": 0.731718, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75655895, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.17358398, + "step": 3951, + "time_per_iteration": 2.7409725189208984 + }, + { + "auxiliary_loss_clip": 0.01454865, + "auxiliary_loss_mlp": 0.01041845, + "balance_loss_clip": 1.30525136, + "balance_loss_mlp": 1.02267587, + "epoch": 0.23760709454381482, + "flos": 19505847954240.0, + "grad_norm": 2.932689612792765, + "language_loss": 0.74313295, + "learning_rate": 3.564651119602903e-06, + "loss": 0.76810002, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.19165039, + "step": 3952, + "time_per_iteration": 2.7709176540374756 + }, + { + "auxiliary_loss_clip": 0.01458204, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.30939317, + "balance_loss_mlp": 1.02199244, + "epoch": 0.23766721779648278, + "flos": 27642412992240.0, + "grad_norm": 2.741305586323538, + "language_loss": 0.71154332, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73652828, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.18310547, + "step": 3953, + "time_per_iteration": 2.841535806655884 + }, + { + "auxiliary_loss_clip": 0.01455395, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_clip": 1.30478477, + "balance_loss_mlp": 1.02718973, + "epoch": 0.23772734104915075, + "flos": 23409621205560.0, + "grad_norm": 1.8273975305574035, + "language_loss": 0.81588984, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.8409096, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.19396973, + "step": 3954, + "time_per_iteration": 2.7439279556274414 + }, + { + "auxiliary_loss_clip": 0.01451012, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.30173743, + "balance_loss_mlp": 1.024441, + "epoch": 0.23778746430181874, + "flos": 15709717130040.0, + "grad_norm": 2.4720140408995466, + "language_loss": 0.66439676, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68934935, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.19824219, + "step": 3955, + "time_per_iteration": 2.7467989921569824 + }, + { + "auxiliary_loss_clip": 0.01445561, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_clip": 1.29716468, + "balance_loss_mlp": 1.03186667, + "epoch": 0.2378475875544867, + "flos": 19431081317520.0, + "grad_norm": 1.426485885386631, + "language_loss": 0.83906746, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86402905, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.18713379, + "step": 3956, + "time_per_iteration": 2.725924015045166 + }, + { + "auxiliary_loss_clip": 0.01441857, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.29727364, + "balance_loss_mlp": 1.01736629, + "epoch": 0.23790771080715467, + "flos": 22273072934400.0, + "grad_norm": 2.034884869082207, + "language_loss": 0.85451221, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.87929332, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.18884277, + "step": 3957, + "time_per_iteration": 2.79533314704895 + }, + { + "auxiliary_loss_clip": 0.01447082, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.29814291, + "balance_loss_mlp": 1.02262092, + "epoch": 0.23796783405982264, + "flos": 20052100545120.0, + "grad_norm": 2.0542756277481016, + "language_loss": 0.70962822, + "learning_rate": 3.563194548575151e-06, + "loss": 0.73451269, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.1875, + "step": 3958, + "time_per_iteration": 2.7329487800598145 + }, + { + "auxiliary_loss_clip": 0.01457077, + "auxiliary_loss_mlp": 0.01042577, + "balance_loss_clip": 1.30652666, + "balance_loss_mlp": 1.02225208, + "epoch": 0.2380279573124906, + "flos": 14249957319000.0, + "grad_norm": 2.7415489124380326, + "language_loss": 0.66422093, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68921745, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.203125, + "step": 3959, + "time_per_iteration": 2.752979278564453 + }, + { + "auxiliary_loss_clip": 0.01448924, + "auxiliary_loss_mlp": 0.0104172, + "balance_loss_clip": 1.30152035, + "balance_loss_mlp": 1.02029788, + "epoch": 0.23808808056515857, + "flos": 21184222505040.0, + "grad_norm": 1.858752196421501, + "language_loss": 0.72736561, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.75227201, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.21447754, + "step": 3960, + "time_per_iteration": 2.732199192047119 + }, + { + "auxiliary_loss_clip": 0.01451215, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.30177236, + "balance_loss_mlp": 1.02292585, + "epoch": 0.23814820381782653, + "flos": 22533416087040.0, + "grad_norm": 1.764575910335429, + "language_loss": 0.74676991, + "learning_rate": 3.562465462704307e-06, + "loss": 0.77170622, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.19482422, + "step": 3961, + "time_per_iteration": 2.757981538772583 + }, + { + "auxiliary_loss_clip": 0.01462252, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_clip": 1.30973458, + "balance_loss_mlp": 1.02660215, + "epoch": 0.23820832707049452, + "flos": 22308750701640.0, + "grad_norm": 2.118633141691254, + "language_loss": 0.66278654, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68788731, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.21203613, + "step": 3962, + "time_per_iteration": 4.216936826705933 + }, + { + "auxiliary_loss_clip": 0.01450419, + "auxiliary_loss_mlp": 0.01046167, + "balance_loss_clip": 1.30061698, + "balance_loss_mlp": 1.02447057, + "epoch": 0.2382684503231625, + "flos": 24869746491840.0, + "grad_norm": 1.6486708718893057, + "language_loss": 0.75368708, + "learning_rate": 3.561979109197483e-06, + "loss": 0.77865291, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.21691895, + "step": 3963, + "time_per_iteration": 2.7657203674316406 + }, + { + "auxiliary_loss_clip": 0.01461559, + "auxiliary_loss_mlp": 0.01044382, + "balance_loss_clip": 1.30992317, + "balance_loss_mlp": 1.02405691, + "epoch": 0.23832857357583045, + "flos": 21876637875480.0, + "grad_norm": 2.015431360980043, + "language_loss": 0.77367479, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79873425, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.20336914, + "step": 3964, + "time_per_iteration": 2.771993398666382 + }, + { + "auxiliary_loss_clip": 0.01446716, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.2982173, + "balance_loss_mlp": 1.02182317, + "epoch": 0.23838869682849842, + "flos": 21293042574600.0, + "grad_norm": 2.9408464078921726, + "language_loss": 0.72472847, + "learning_rate": 3.561492518769045e-06, + "loss": 0.74962384, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.21008301, + "step": 3965, + "time_per_iteration": 2.746572494506836 + }, + { + "auxiliary_loss_clip": 0.01446576, + "auxiliary_loss_mlp": 0.0104517, + "balance_loss_clip": 1.30119491, + "balance_loss_mlp": 1.02517867, + "epoch": 0.23844882008116638, + "flos": 16184979661680.0, + "grad_norm": 1.807080724338232, + "language_loss": 0.78666991, + "learning_rate": 3.561249134732282e-06, + "loss": 0.81158739, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.19995117, + "step": 3966, + "time_per_iteration": 2.7436747550964355 + }, + { + "auxiliary_loss_clip": 0.01453973, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_clip": 1.30575728, + "balance_loss_mlp": 1.02587128, + "epoch": 0.23850894333383435, + "flos": 21074752701720.0, + "grad_norm": 1.6053188949353854, + "language_loss": 0.68808335, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71307349, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.19152832, + "step": 3967, + "time_per_iteration": 2.771390438079834 + }, + { + "auxiliary_loss_clip": 0.01455596, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.30760777, + "balance_loss_mlp": 1.03180873, + "epoch": 0.23856906658650234, + "flos": 17206048092240.0, + "grad_norm": 1.8251313955373238, + "language_loss": 0.68017232, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70524561, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.19946289, + "step": 3968, + "time_per_iteration": 2.735034227371216 + }, + { + "auxiliary_loss_clip": 0.01451045, + "auxiliary_loss_mlp": 0.01046462, + "balance_loss_clip": 1.30224705, + "balance_loss_mlp": 1.02605319, + "epoch": 0.2386291898391703, + "flos": 29499866721360.0, + "grad_norm": 1.8825901017989877, + "language_loss": 0.7662878, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.79126287, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.20422363, + "step": 3969, + "time_per_iteration": 2.8361191749572754 + }, + { + "auxiliary_loss_clip": 0.01452169, + "auxiliary_loss_mlp": 0.010485, + "balance_loss_clip": 1.30696368, + "balance_loss_mlp": 1.02639842, + "epoch": 0.23868931309183827, + "flos": 21147448312080.0, + "grad_norm": 2.093216308950207, + "language_loss": 0.766729, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.79173577, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.22119141, + "step": 3970, + "time_per_iteration": 4.237342596054077 + }, + { + "auxiliary_loss_clip": 0.01460469, + "auxiliary_loss_mlp": 0.01051096, + "balance_loss_clip": 1.30971622, + "balance_loss_mlp": 1.03060365, + "epoch": 0.23874943634450624, + "flos": 25664037902280.0, + "grad_norm": 1.825838784456282, + "language_loss": 0.85808051, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.88319612, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.20483398, + "step": 3971, + "time_per_iteration": 2.7763261795043945 + }, + { + "auxiliary_loss_clip": 0.01320753, + "auxiliary_loss_mlp": 0.0100886, + "balance_loss_clip": 1.25192344, + "balance_loss_mlp": 1.00523591, + "epoch": 0.2388095595971742, + "flos": 59002430575680.0, + "grad_norm": 0.7411739143387788, + "language_loss": 0.62796754, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.65126371, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03613281, + "step": 3972, + "time_per_iteration": 4.907954692840576 + }, + { + "auxiliary_loss_clip": 0.01462068, + "auxiliary_loss_mlp": 0.0104259, + "balance_loss_clip": 1.31489015, + "balance_loss_mlp": 1.02442312, + "epoch": 0.23886968284984217, + "flos": 16804740030120.0, + "grad_norm": 1.9196751899460969, + "language_loss": 0.82657695, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.85162354, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.18164062, + "step": 3973, + "time_per_iteration": 2.7715048789978027 + }, + { + "auxiliary_loss_clip": 0.01463864, + "auxiliary_loss_mlp": 0.01049366, + "balance_loss_clip": 1.31705308, + "balance_loss_mlp": 1.02942204, + "epoch": 0.23892980610251013, + "flos": 22387781216160.0, + "grad_norm": 1.5779198379952124, + "language_loss": 0.79558957, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.82072186, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.19934082, + "step": 3974, + "time_per_iteration": 4.266826629638672 + }, + { + "auxiliary_loss_clip": 0.01464887, + "auxiliary_loss_mlp": 0.0105261, + "balance_loss_clip": 1.3161664, + "balance_loss_mlp": 1.03322709, + "epoch": 0.23898992935517813, + "flos": 12827052917640.0, + "grad_norm": 1.821988078931304, + "language_loss": 0.84917772, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.87435263, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.19396973, + "step": 3975, + "time_per_iteration": 2.703301191329956 + }, + { + "auxiliary_loss_clip": 0.01457903, + "auxiliary_loss_mlp": 0.01051624, + "balance_loss_clip": 1.31248212, + "balance_loss_mlp": 1.03395689, + "epoch": 0.2390500526078461, + "flos": 22350722764680.0, + "grad_norm": 2.0008620728347535, + "language_loss": 0.8393271, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.86442238, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.17675781, + "step": 3976, + "time_per_iteration": 2.8811841011047363 + }, + { + "auxiliary_loss_clip": 0.01450952, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.30707574, + "balance_loss_mlp": 1.03697968, + "epoch": 0.23911017586051406, + "flos": 22639921480080.0, + "grad_norm": 1.641077338122867, + "language_loss": 0.74265701, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76769966, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.16320801, + "step": 3977, + "time_per_iteration": 2.7695605754852295 + }, + { + "auxiliary_loss_clip": 0.01457137, + "auxiliary_loss_mlp": 0.0105946, + "balance_loss_clip": 1.30835688, + "balance_loss_mlp": 1.04037499, + "epoch": 0.23917029911318202, + "flos": 23657741241840.0, + "grad_norm": 1.6702671083756886, + "language_loss": 0.72070605, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74587202, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.19091797, + "step": 3978, + "time_per_iteration": 2.801469564437866 + }, + { + "auxiliary_loss_clip": 0.01469707, + "auxiliary_loss_mlp": 0.01065947, + "balance_loss_clip": 1.31886184, + "balance_loss_mlp": 1.04776728, + "epoch": 0.23923042236585, + "flos": 22788723803040.0, + "grad_norm": 2.0300036899535203, + "language_loss": 0.79463363, + "learning_rate": 3.558079758168997e-06, + "loss": 0.81999016, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.1817627, + "step": 3979, + "time_per_iteration": 2.762540578842163 + }, + { + "auxiliary_loss_clip": 0.01455519, + "auxiliary_loss_mlp": 0.0106444, + "balance_loss_clip": 1.30930734, + "balance_loss_mlp": 1.04686844, + "epoch": 0.23929054561851795, + "flos": 28153637549640.0, + "grad_norm": 1.7244346498948036, + "language_loss": 0.82234216, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84754175, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.17572021, + "step": 3980, + "time_per_iteration": 2.8265347480773926 + }, + { + "auxiliary_loss_clip": 0.01458789, + "auxiliary_loss_mlp": 0.01060258, + "balance_loss_clip": 1.3135891, + "balance_loss_mlp": 1.0417093, + "epoch": 0.23935066887118592, + "flos": 21691183184640.0, + "grad_norm": 1.700024075354103, + "language_loss": 0.84120524, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86639571, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.1854248, + "step": 3981, + "time_per_iteration": 2.7472383975982666 + }, + { + "auxiliary_loss_clip": 0.01470818, + "auxiliary_loss_mlp": 0.01062208, + "balance_loss_clip": 1.31990826, + "balance_loss_mlp": 1.04389799, + "epoch": 0.2394107921238539, + "flos": 32128400860200.0, + "grad_norm": 1.8325774092974498, + "language_loss": 0.77028179, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.7956121, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.18334961, + "step": 3982, + "time_per_iteration": 2.8806753158569336 + }, + { + "auxiliary_loss_clip": 0.01452659, + "auxiliary_loss_mlp": 0.01073103, + "balance_loss_clip": 1.30607879, + "balance_loss_mlp": 1.05493617, + "epoch": 0.23947091537652188, + "flos": 17023598420040.0, + "grad_norm": 1.6915904787577847, + "language_loss": 0.78252745, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80778509, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.18151855, + "step": 3983, + "time_per_iteration": 2.7415647506713867 + }, + { + "auxiliary_loss_clip": 0.01460045, + "auxiliary_loss_mlp": 0.01067614, + "balance_loss_clip": 1.3132869, + "balance_loss_mlp": 1.04874325, + "epoch": 0.23953103862918984, + "flos": 20598312527640.0, + "grad_norm": 1.74158919386979, + "language_loss": 0.73463124, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75990778, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.18884277, + "step": 3984, + "time_per_iteration": 2.762986898422241 + }, + { + "auxiliary_loss_clip": 0.01457671, + "auxiliary_loss_mlp": 0.01057242, + "balance_loss_clip": 1.3079617, + "balance_loss_mlp": 1.03856158, + "epoch": 0.2395911618818578, + "flos": 20709325448640.0, + "grad_norm": 1.959267667677097, + "language_loss": 0.78824896, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81339812, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.18688965, + "step": 3985, + "time_per_iteration": 2.7511801719665527 + }, + { + "auxiliary_loss_clip": 0.01465775, + "auxiliary_loss_mlp": 0.01059875, + "balance_loss_clip": 1.31699908, + "balance_loss_mlp": 1.04044342, + "epoch": 0.23965128513452577, + "flos": 27059345600040.0, + "grad_norm": 1.725297095498047, + "language_loss": 0.7350539, + "learning_rate": 3.556369033716254e-06, + "loss": 0.76031041, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.19445801, + "step": 3986, + "time_per_iteration": 2.8016271591186523 + }, + { + "auxiliary_loss_clip": 0.01472626, + "auxiliary_loss_mlp": 0.01059915, + "balance_loss_clip": 1.31966805, + "balance_loss_mlp": 1.04185462, + "epoch": 0.23971140838719374, + "flos": 23148953186040.0, + "grad_norm": 1.7063981950750158, + "language_loss": 0.88284707, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90817243, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.18054199, + "step": 3987, + "time_per_iteration": 2.7855093479156494 + }, + { + "auxiliary_loss_clip": 0.01453454, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.31249607, + "balance_loss_mlp": 1.02760088, + "epoch": 0.23977153163986173, + "flos": 18038291338080.0, + "grad_norm": 2.497595284648895, + "language_loss": 0.83576715, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.86075157, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17382812, + "step": 3988, + "time_per_iteration": 2.703427314758301 + }, + { + "auxiliary_loss_clip": 0.01454903, + "auxiliary_loss_mlp": 0.01048258, + "balance_loss_clip": 1.30697644, + "balance_loss_mlp": 1.02936351, + "epoch": 0.2398316548925297, + "flos": 18117890369640.0, + "grad_norm": 1.568573083137435, + "language_loss": 0.85382962, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87886125, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.18908691, + "step": 3989, + "time_per_iteration": 2.769392490386963 + }, + { + "auxiliary_loss_clip": 0.01452733, + "auxiliary_loss_mlp": 0.0105174, + "balance_loss_clip": 1.30821812, + "balance_loss_mlp": 1.03372765, + "epoch": 0.23989177814519766, + "flos": 12571542159840.0, + "grad_norm": 1.9977071130188013, + "language_loss": 0.84712332, + "learning_rate": 3.555390178293477e-06, + "loss": 0.87216806, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.18029785, + "step": 3990, + "time_per_iteration": 2.7891414165496826 + }, + { + "auxiliary_loss_clip": 0.01457373, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.31073344, + "balance_loss_mlp": 1.03063452, + "epoch": 0.23995190139786562, + "flos": 25270039344960.0, + "grad_norm": 1.3552298617217815, + "language_loss": 0.75998813, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.78503799, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.1697998, + "step": 3991, + "time_per_iteration": 2.854356050491333 + }, + { + "auxiliary_loss_clip": 0.01327694, + "auxiliary_loss_mlp": 0.01005231, + "balance_loss_clip": 1.25997806, + "balance_loss_mlp": 1.00196505, + "epoch": 0.2400120246505336, + "flos": 61974219803040.0, + "grad_norm": 0.8922431796624563, + "language_loss": 0.63824463, + "learning_rate": 3.554900396661656e-06, + "loss": 0.66157389, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.03271484, + "step": 3992, + "time_per_iteration": 3.199181079864502 + }, + { + "auxiliary_loss_clip": 0.01324099, + "auxiliary_loss_mlp": 0.01004634, + "balance_loss_clip": 1.25641298, + "balance_loss_mlp": 1.00146329, + "epoch": 0.24007214790320155, + "flos": 66723677667360.0, + "grad_norm": 0.7510329390144024, + "language_loss": 0.62913764, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65242499, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.03173828, + "step": 3993, + "time_per_iteration": 3.3324131965637207 + }, + { + "auxiliary_loss_clip": 0.01474893, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.3260752, + "balance_loss_mlp": 1.02742219, + "epoch": 0.24013227115586952, + "flos": 25814220909480.0, + "grad_norm": 1.655403867659087, + "language_loss": 0.77328205, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79850709, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.2019043, + "step": 3994, + "time_per_iteration": 2.841585874557495 + }, + { + "auxiliary_loss_clip": 0.01465148, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.31684363, + "balance_loss_mlp": 1.03149974, + "epoch": 0.2401923944085375, + "flos": 25563095854560.0, + "grad_norm": 1.5799420257164642, + "language_loss": 0.78367966, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80884916, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.20288086, + "step": 3995, + "time_per_iteration": 2.7725319862365723 + }, + { + "auxiliary_loss_clip": 0.01316299, + "auxiliary_loss_mlp": 0.01004752, + "balance_loss_clip": 1.24877763, + "balance_loss_mlp": 1.00148547, + "epoch": 0.24025251766120548, + "flos": 54957002072760.0, + "grad_norm": 0.9467204306922218, + "language_loss": 0.63477337, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.6579839, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.03271484, + "step": 3996, + "time_per_iteration": 3.2950797080993652 + }, + { + "auxiliary_loss_clip": 0.0146624, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_clip": 1.31492281, + "balance_loss_mlp": 1.02810073, + "epoch": 0.24031264091387344, + "flos": 20635939496160.0, + "grad_norm": 3.1658048390402387, + "language_loss": 0.70378458, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72891462, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.18664551, + "step": 3997, + "time_per_iteration": 2.7574331760406494 + }, + { + "auxiliary_loss_clip": 0.01454553, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_clip": 1.30729795, + "balance_loss_mlp": 1.02679801, + "epoch": 0.2403727641665414, + "flos": 20891125387080.0, + "grad_norm": 2.098668995854664, + "language_loss": 0.86932641, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89433551, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.19555664, + "step": 3998, + "time_per_iteration": 2.743126630783081 + }, + { + "auxiliary_loss_clip": 0.01463138, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.30988312, + "balance_loss_mlp": 1.02199423, + "epoch": 0.24043288741920937, + "flos": 22825173129120.0, + "grad_norm": 1.6555308349669993, + "language_loss": 0.75933039, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78436708, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.18554688, + "step": 3999, + "time_per_iteration": 2.781587839126587 + }, + { + "auxiliary_loss_clip": 0.01453369, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.30912089, + "balance_loss_mlp": 1.02096295, + "epoch": 0.24049301067187734, + "flos": 27964812364920.0, + "grad_norm": 1.9350513576984623, + "language_loss": 0.7279759, + "learning_rate": 3.552938912398679e-06, + "loss": 0.75289458, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17529297, + "step": 4000, + "time_per_iteration": 2.7920615673065186 + }, + { + "auxiliary_loss_clip": 0.01466879, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.31484699, + "balance_loss_mlp": 1.02697587, + "epoch": 0.24055313392454533, + "flos": 27456836476320.0, + "grad_norm": 1.6437419658172563, + "language_loss": 0.66997468, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.69509792, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.18457031, + "step": 4001, + "time_per_iteration": 4.151069641113281 + }, + { + "auxiliary_loss_clip": 0.01461574, + "auxiliary_loss_mlp": 0.01042483, + "balance_loss_clip": 1.31330407, + "balance_loss_mlp": 1.02319479, + "epoch": 0.2406132571772133, + "flos": 25562121253920.0, + "grad_norm": 1.7258611481002835, + "language_loss": 0.82904166, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85408223, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.19287109, + "step": 4002, + "time_per_iteration": 2.8323252201080322 + }, + { + "auxiliary_loss_clip": 0.01450551, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.30350542, + "balance_loss_mlp": 1.0251348, + "epoch": 0.24067338042988126, + "flos": 24796929056400.0, + "grad_norm": 1.8942137826859273, + "language_loss": 0.82685852, + "learning_rate": 3.552202383898897e-06, + "loss": 0.85180962, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.1940918, + "step": 4003, + "time_per_iteration": 2.7614827156066895 + }, + { + "auxiliary_loss_clip": 0.01466362, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.31674051, + "balance_loss_mlp": 1.01817119, + "epoch": 0.24073350368254923, + "flos": 21182801212440.0, + "grad_norm": 2.1932114239078238, + "language_loss": 0.87566048, + "learning_rate": 3.551956756667215e-06, + "loss": 0.90069592, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.19006348, + "step": 4004, + "time_per_iteration": 2.847776174545288 + }, + { + "auxiliary_loss_clip": 0.01457258, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_clip": 1.30455327, + "balance_loss_mlp": 1.03057718, + "epoch": 0.2407936269352172, + "flos": 22499728129440.0, + "grad_norm": 1.7944776969206733, + "language_loss": 0.77781045, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80287814, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.18920898, + "step": 4005, + "time_per_iteration": 2.839165210723877 + }, + { + "auxiliary_loss_clip": 0.01445395, + "auxiliary_loss_mlp": 0.01043249, + "balance_loss_clip": 1.2997309, + "balance_loss_mlp": 1.02419949, + "epoch": 0.24085375018788516, + "flos": 18555688366200.0, + "grad_norm": 1.9204636813300475, + "language_loss": 0.79084069, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81572711, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.19055176, + "step": 4006, + "time_per_iteration": 2.871455669403076 + }, + { + "auxiliary_loss_clip": 0.01463011, + "auxiliary_loss_mlp": 0.01051261, + "balance_loss_clip": 1.3063935, + "balance_loss_mlp": 1.03030443, + "epoch": 0.24091387344055312, + "flos": 24175909828800.0, + "grad_norm": 1.6935328313506954, + "language_loss": 0.71718085, + "learning_rate": 3.551219521907302e-06, + "loss": 0.74232352, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.2097168, + "step": 4007, + "time_per_iteration": 2.7980592250823975 + }, + { + "auxiliary_loss_clip": 0.01449798, + "auxiliary_loss_mlp": 0.0105262, + "balance_loss_clip": 1.30389464, + "balance_loss_mlp": 1.03472638, + "epoch": 0.24097399669322112, + "flos": 11040061339080.0, + "grad_norm": 1.8042857747180008, + "language_loss": 0.76212251, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78714669, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.17895508, + "step": 4008, + "time_per_iteration": 2.7351150512695312 + }, + { + "auxiliary_loss_clip": 0.01453625, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.30538213, + "balance_loss_mlp": 1.0284276, + "epoch": 0.24103411994588908, + "flos": 17169517549440.0, + "grad_norm": 2.34195781373466, + "language_loss": 0.75700748, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.78201926, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.19128418, + "step": 4009, + "time_per_iteration": 4.205803394317627 + }, + { + "auxiliary_loss_clip": 0.01451638, + "auxiliary_loss_mlp": 0.01053489, + "balance_loss_clip": 1.30421698, + "balance_loss_mlp": 1.0351429, + "epoch": 0.24109424319855705, + "flos": 20672876122560.0, + "grad_norm": 1.8755156466049667, + "language_loss": 0.80200315, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82705438, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.18334961, + "step": 4010, + "time_per_iteration": 2.74863338470459 + }, + { + "auxiliary_loss_clip": 0.01456309, + "auxiliary_loss_mlp": 0.01050919, + "balance_loss_clip": 1.30609727, + "balance_loss_mlp": 1.03097546, + "epoch": 0.241154366451225, + "flos": 28187406723960.0, + "grad_norm": 1.839251404728184, + "language_loss": 0.70799363, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.7330659, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.19909668, + "step": 4011, + "time_per_iteration": 4.430984020233154 + }, + { + "auxiliary_loss_clip": 0.01459733, + "auxiliary_loss_mlp": 0.01047913, + "balance_loss_clip": 1.30937922, + "balance_loss_mlp": 1.02786219, + "epoch": 0.24121448970389298, + "flos": 21694797328680.0, + "grad_norm": 1.497210673597674, + "language_loss": 0.69022334, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71529973, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.20043945, + "step": 4012, + "time_per_iteration": 4.220163583755493 + }, + { + "auxiliary_loss_clip": 0.01457829, + "auxiliary_loss_mlp": 0.01050723, + "balance_loss_clip": 1.30811155, + "balance_loss_mlp": 1.02937222, + "epoch": 0.24127461295656094, + "flos": 39683482232040.0, + "grad_norm": 1.8589746301220802, + "language_loss": 0.73547453, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.76056004, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.21350098, + "step": 4013, + "time_per_iteration": 2.930607318878174 + }, + { + "auxiliary_loss_clip": 0.01463181, + "auxiliary_loss_mlp": 0.01046924, + "balance_loss_clip": 1.31197655, + "balance_loss_mlp": 1.02814877, + "epoch": 0.2413347362092289, + "flos": 19140623742960.0, + "grad_norm": 1.7835922364869525, + "language_loss": 0.88427269, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90937376, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.18762207, + "step": 4014, + "time_per_iteration": 2.721841812133789 + }, + { + "auxiliary_loss_clip": 0.01466101, + "auxiliary_loss_mlp": 0.01056893, + "balance_loss_clip": 1.3115952, + "balance_loss_mlp": 1.03742599, + "epoch": 0.2413948594618969, + "flos": 26944637318280.0, + "grad_norm": 2.018797662759572, + "language_loss": 0.94985175, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97508174, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.19470215, + "step": 4015, + "time_per_iteration": 2.7971484661102295 + }, + { + "auxiliary_loss_clip": 0.01461547, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.3095665, + "balance_loss_mlp": 1.02940762, + "epoch": 0.24145498271456486, + "flos": 25233549410520.0, + "grad_norm": 1.5591623333322997, + "language_loss": 0.82783663, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.85295218, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.20617676, + "step": 4016, + "time_per_iteration": 2.7904176712036133 + }, + { + "auxiliary_loss_clip": 0.01452681, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.30703032, + "balance_loss_mlp": 1.02719367, + "epoch": 0.24151510596723283, + "flos": 40669238370600.0, + "grad_norm": 1.866570320155512, + "language_loss": 0.69181734, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71679997, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.18371582, + "step": 4017, + "time_per_iteration": 3.0228323936462402 + }, + { + "auxiliary_loss_clip": 0.01462639, + "auxiliary_loss_mlp": 0.01053496, + "balance_loss_clip": 1.30901623, + "balance_loss_mlp": 1.03314745, + "epoch": 0.2415752292199008, + "flos": 18150766160040.0, + "grad_norm": 1.6069301233354292, + "language_loss": 0.84786379, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87302518, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.20324707, + "step": 4018, + "time_per_iteration": 2.744779586791992 + }, + { + "auxiliary_loss_clip": 0.01296304, + "auxiliary_loss_mlp": 0.01011975, + "balance_loss_clip": 1.22731566, + "balance_loss_mlp": 1.00882769, + "epoch": 0.24163535247256876, + "flos": 67303090307160.0, + "grad_norm": 0.8262832854223774, + "language_loss": 0.60680062, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62988341, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.03149414, + "step": 4019, + "time_per_iteration": 3.3090832233428955 + }, + { + "auxiliary_loss_clip": 0.0145392, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_clip": 1.30320549, + "balance_loss_mlp": 1.02757835, + "epoch": 0.24169547572523672, + "flos": 24934523472000.0, + "grad_norm": 1.788499433753286, + "language_loss": 0.73786688, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.76285332, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.17150879, + "step": 4020, + "time_per_iteration": 2.880194902420044 + }, + { + "auxiliary_loss_clip": 0.01459973, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.31090307, + "balance_loss_mlp": 1.02785122, + "epoch": 0.24175559897790472, + "flos": 18732209217840.0, + "grad_norm": 4.178790751560353, + "language_loss": 0.82133543, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84641325, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.19958496, + "step": 4021, + "time_per_iteration": 2.745553493499756 + }, + { + "auxiliary_loss_clip": 0.01465643, + "auxiliary_loss_mlp": 0.01058065, + "balance_loss_clip": 1.31246161, + "balance_loss_mlp": 1.03671479, + "epoch": 0.24181572223057268, + "flos": 23044478211000.0, + "grad_norm": 2.1608382070251326, + "language_loss": 0.7661674, + "learning_rate": 3.547525412122378e-06, + "loss": 0.79140449, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.21350098, + "step": 4022, + "time_per_iteration": 2.796767234802246 + }, + { + "auxiliary_loss_clip": 0.01472197, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.31544995, + "balance_loss_mlp": 1.03074837, + "epoch": 0.24187584548324065, + "flos": 20381037863760.0, + "grad_norm": 1.9345738802679895, + "language_loss": 0.75345194, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77868515, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.20385742, + "step": 4023, + "time_per_iteration": 2.7249319553375244 + }, + { + "auxiliary_loss_clip": 0.01460581, + "auxiliary_loss_mlp": 0.01058996, + "balance_loss_clip": 1.31113648, + "balance_loss_mlp": 1.03964829, + "epoch": 0.2419359687359086, + "flos": 21402512377920.0, + "grad_norm": 1.884619411428118, + "language_loss": 0.82629949, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.85149527, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.19335938, + "step": 4024, + "time_per_iteration": 2.7650845050811768 + }, + { + "auxiliary_loss_clip": 0.01457557, + "auxiliary_loss_mlp": 0.01053284, + "balance_loss_clip": 1.30820215, + "balance_loss_mlp": 1.0344727, + "epoch": 0.24199609198857658, + "flos": 18374010252840.0, + "grad_norm": 1.8242090607766492, + "language_loss": 0.85881352, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88392192, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.18823242, + "step": 4025, + "time_per_iteration": 2.7346231937408447 + }, + { + "auxiliary_loss_clip": 0.01467021, + "auxiliary_loss_mlp": 0.01060448, + "balance_loss_clip": 1.31153798, + "balance_loss_mlp": 1.04108846, + "epoch": 0.24205621524124454, + "flos": 19468627069320.0, + "grad_norm": 1.8684057757838304, + "language_loss": 0.71570683, + "learning_rate": 3.546538084949365e-06, + "loss": 0.74098152, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.19360352, + "step": 4026, + "time_per_iteration": 2.727620840072632 + }, + { + "auxiliary_loss_clip": 0.01455596, + "auxiliary_loss_mlp": 0.01054763, + "balance_loss_clip": 1.30932999, + "balance_loss_mlp": 1.03759646, + "epoch": 0.2421163384939125, + "flos": 14980527566640.0, + "grad_norm": 2.1808913180640306, + "language_loss": 0.64461952, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66972315, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.17163086, + "step": 4027, + "time_per_iteration": 2.7310450077056885 + }, + { + "auxiliary_loss_clip": 0.01466685, + "auxiliary_loss_mlp": 0.01054688, + "balance_loss_clip": 1.31260991, + "balance_loss_mlp": 1.03638971, + "epoch": 0.2421764617465805, + "flos": 18666823112280.0, + "grad_norm": 2.0304259272150094, + "language_loss": 0.70459324, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72980696, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.18286133, + "step": 4028, + "time_per_iteration": 2.8325302600860596 + }, + { + "auxiliary_loss_clip": 0.01287219, + "auxiliary_loss_mlp": 0.01005633, + "balance_loss_clip": 1.21942401, + "balance_loss_mlp": 1.00184178, + "epoch": 0.24223658499924847, + "flos": 64363486528080.0, + "grad_norm": 0.9634893188462851, + "language_loss": 0.55315197, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57608056, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.0378418, + "step": 4029, + "time_per_iteration": 3.2767887115478516 + }, + { + "auxiliary_loss_clip": 0.01463263, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_clip": 1.31364977, + "balance_loss_mlp": 1.02392125, + "epoch": 0.24229670825191643, + "flos": 25780735993680.0, + "grad_norm": 2.207955093337531, + "language_loss": 0.7416501, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76671875, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.19677734, + "step": 4030, + "time_per_iteration": 2.7898013591766357 + }, + { + "auxiliary_loss_clip": 0.01454852, + "auxiliary_loss_mlp": 0.01056094, + "balance_loss_clip": 1.30552053, + "balance_loss_mlp": 1.03715158, + "epoch": 0.2423568315045844, + "flos": 20671901521920.0, + "grad_norm": 7.447043006500463, + "language_loss": 0.77234852, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.79745793, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.18933105, + "step": 4031, + "time_per_iteration": 2.875418186187744 + }, + { + "auxiliary_loss_clip": 0.01468322, + "auxiliary_loss_mlp": 0.01061248, + "balance_loss_clip": 1.31199169, + "balance_loss_mlp": 1.04273462, + "epoch": 0.24241695475725236, + "flos": 22421631607200.0, + "grad_norm": 3.7708671160400806, + "language_loss": 0.65587771, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68117344, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18518066, + "step": 4032, + "time_per_iteration": 2.843071222305298 + }, + { + "auxiliary_loss_clip": 0.01457387, + "auxiliary_loss_mlp": 0.01049512, + "balance_loss_clip": 1.30909073, + "balance_loss_mlp": 1.03121305, + "epoch": 0.24247707800992033, + "flos": 17133474306960.0, + "grad_norm": 1.9475612401700404, + "language_loss": 0.81667835, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.84174728, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.18273926, + "step": 4033, + "time_per_iteration": 2.735360622406006 + }, + { + "auxiliary_loss_clip": 0.01457696, + "auxiliary_loss_mlp": 0.01043932, + "balance_loss_clip": 1.30970621, + "balance_loss_mlp": 1.02560997, + "epoch": 0.2425372012625883, + "flos": 31620100104720.0, + "grad_norm": 1.9044657969789314, + "language_loss": 0.69605541, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.7210716, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.18310547, + "step": 4034, + "time_per_iteration": 2.83703351020813 + }, + { + "auxiliary_loss_clip": 0.0146171, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_clip": 1.31172025, + "balance_loss_mlp": 1.02262831, + "epoch": 0.24259732451525629, + "flos": 16330370882400.0, + "grad_norm": 2.4640290456518072, + "language_loss": 0.96198767, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98702407, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.19299316, + "step": 4035, + "time_per_iteration": 2.6977639198303223 + }, + { + "auxiliary_loss_clip": 0.01446112, + "auxiliary_loss_mlp": 0.010445, + "balance_loss_clip": 1.30068636, + "balance_loss_mlp": 1.02756011, + "epoch": 0.24265744776792425, + "flos": 22861784888640.0, + "grad_norm": 1.8355172220970575, + "language_loss": 0.78391117, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80881727, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.16943359, + "step": 4036, + "time_per_iteration": 2.7789084911346436 + }, + { + "auxiliary_loss_clip": 0.01463677, + "auxiliary_loss_mlp": 0.01047347, + "balance_loss_clip": 1.31445312, + "balance_loss_mlp": 1.02828515, + "epoch": 0.24271757102059222, + "flos": 21876597267120.0, + "grad_norm": 1.6137688553718725, + "language_loss": 0.74678266, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.77189285, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 1.49121094, + "router_z_loss_mlp": 0.19055176, + "step": 4037, + "time_per_iteration": 2.737698554992676 + }, + { + "auxiliary_loss_clip": 0.01458728, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.30746508, + "balance_loss_mlp": 1.0203284, + "epoch": 0.24277769427326018, + "flos": 19213319353320.0, + "grad_norm": 1.9803186937384225, + "language_loss": 0.76685572, + "learning_rate": 3.543570475921171e-06, + "loss": 0.79184318, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.19689941, + "step": 4038, + "time_per_iteration": 2.732332468032837 + }, + { + "auxiliary_loss_clip": 0.01456963, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.30704045, + "balance_loss_mlp": 1.02327621, + "epoch": 0.24283781752592815, + "flos": 19504589095080.0, + "grad_norm": 3.127773979134222, + "language_loss": 0.72380239, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74879444, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.1895752, + "step": 4039, + "time_per_iteration": 4.1942219734191895 + }, + { + "auxiliary_loss_clip": 0.01461076, + "auxiliary_loss_mlp": 0.01038871, + "balance_loss_clip": 1.3114326, + "balance_loss_mlp": 1.02039313, + "epoch": 0.2428979407785961, + "flos": 19907074799640.0, + "grad_norm": 2.052597809230513, + "language_loss": 0.7799291, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80492854, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.18481445, + "step": 4040, + "time_per_iteration": 2.785151720046997 + }, + { + "auxiliary_loss_clip": 0.01454656, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.3085494, + "balance_loss_mlp": 1.01865721, + "epoch": 0.2429580640312641, + "flos": 24721065993960.0, + "grad_norm": 1.7966532891619356, + "language_loss": 0.79920936, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82412028, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.17773438, + "step": 4041, + "time_per_iteration": 2.8174221515655518 + }, + { + "auxiliary_loss_clip": 0.01460577, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.31123543, + "balance_loss_mlp": 1.02884054, + "epoch": 0.24301818728393207, + "flos": 25635953898360.0, + "grad_norm": 2.2671833694136914, + "language_loss": 0.77279341, + "learning_rate": 3.542579399075957e-06, + "loss": 0.79787308, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.18566895, + "step": 4042, + "time_per_iteration": 2.7934932708740234 + }, + { + "auxiliary_loss_clip": 0.0145672, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.31001329, + "balance_loss_mlp": 1.01825213, + "epoch": 0.24307831053660003, + "flos": 26147097239040.0, + "grad_norm": 1.8683662889995047, + "language_loss": 0.81905037, + "learning_rate": 3.542331483604246e-06, + "loss": 0.8439796, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.17956543, + "step": 4043, + "time_per_iteration": 2.8175597190856934 + }, + { + "auxiliary_loss_clip": 0.01471722, + "auxiliary_loss_mlp": 0.01040337, + "balance_loss_clip": 1.31696033, + "balance_loss_mlp": 1.02007174, + "epoch": 0.243138433789268, + "flos": 14976101255400.0, + "grad_norm": 2.3616676893203286, + "language_loss": 0.73222387, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75734442, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.20251465, + "step": 4044, + "time_per_iteration": 2.733524799346924 + }, + { + "auxiliary_loss_clip": 0.01464342, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.31553245, + "balance_loss_mlp": 1.02265596, + "epoch": 0.24319855704193596, + "flos": 25197140692800.0, + "grad_norm": 1.6787242131836206, + "language_loss": 0.83217418, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85723788, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.19384766, + "step": 4045, + "time_per_iteration": 2.748530387878418 + }, + { + "auxiliary_loss_clip": 0.01465514, + "auxiliary_loss_mlp": 0.01048318, + "balance_loss_clip": 1.31570101, + "balance_loss_mlp": 1.02948356, + "epoch": 0.24325868029460393, + "flos": 22132311066720.0, + "grad_norm": 1.9724030768545313, + "language_loss": 0.86680603, + "learning_rate": 3.541587386314541e-06, + "loss": 0.89194441, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.18835449, + "step": 4046, + "time_per_iteration": 2.779355049133301 + }, + { + "auxiliary_loss_clip": 0.01463436, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_clip": 1.31737518, + "balance_loss_mlp": 1.02586746, + "epoch": 0.2433188035472719, + "flos": 23586791790960.0, + "grad_norm": 1.8405319099464854, + "language_loss": 0.72984946, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7549386, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.19628906, + "step": 4047, + "time_per_iteration": 4.249172687530518 + }, + { + "auxiliary_loss_clip": 0.01463964, + "auxiliary_loss_mlp": 0.01048122, + "balance_loss_clip": 1.3145206, + "balance_loss_mlp": 1.02755868, + "epoch": 0.2433789267999399, + "flos": 24468275996280.0, + "grad_norm": 3.39751222523333, + "language_loss": 0.73692119, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.76204205, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.20556641, + "step": 4048, + "time_per_iteration": 2.783456325531006 + }, + { + "auxiliary_loss_clip": 0.01470225, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.32193983, + "balance_loss_mlp": 1.02822733, + "epoch": 0.24343905005260785, + "flos": 16732369286640.0, + "grad_norm": 2.9125392952264, + "language_loss": 0.73627508, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.76144409, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.18457031, + "step": 4049, + "time_per_iteration": 2.7537918090820312 + }, + { + "auxiliary_loss_clip": 0.01461184, + "auxiliary_loss_mlp": 0.01050578, + "balance_loss_clip": 1.31389308, + "balance_loss_mlp": 1.03167105, + "epoch": 0.24349917330527582, + "flos": 20048080317480.0, + "grad_norm": 1.8210053998131233, + "language_loss": 0.74242711, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.76754475, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.18896484, + "step": 4050, + "time_per_iteration": 4.346787929534912 + }, + { + "auxiliary_loss_clip": 0.01462484, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.31633127, + "balance_loss_mlp": 1.02601385, + "epoch": 0.24355929655794378, + "flos": 17425028307240.0, + "grad_norm": 2.543986467928481, + "language_loss": 0.75030613, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77537149, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.18029785, + "step": 4051, + "time_per_iteration": 4.259458541870117 + }, + { + "auxiliary_loss_clip": 0.01468856, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.31918144, + "balance_loss_mlp": 1.0255295, + "epoch": 0.24361941981061175, + "flos": 25416445774680.0, + "grad_norm": 2.1127922823403575, + "language_loss": 0.70749205, + "learning_rate": 3.540097613646296e-06, + "loss": 0.73262668, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.1907959, + "step": 4052, + "time_per_iteration": 2.856039524078369 + }, + { + "auxiliary_loss_clip": 0.01472062, + "auxiliary_loss_mlp": 0.01052329, + "balance_loss_clip": 1.32419872, + "balance_loss_mlp": 1.0328145, + "epoch": 0.2436795430632797, + "flos": 22826025904680.0, + "grad_norm": 1.5568336348245557, + "language_loss": 0.81262672, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83787066, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.19519043, + "step": 4053, + "time_per_iteration": 2.7497291564941406 + }, + { + "auxiliary_loss_clip": 0.01477183, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.32536125, + "balance_loss_mlp": 1.02136838, + "epoch": 0.2437396663159477, + "flos": 15161880813120.0, + "grad_norm": 1.5498215288421078, + "language_loss": 0.78209096, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80726671, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.19018555, + "step": 4054, + "time_per_iteration": 2.7294435501098633 + }, + { + "auxiliary_loss_clip": 0.01468472, + "auxiliary_loss_mlp": 0.01047779, + "balance_loss_clip": 1.31984556, + "balance_loss_mlp": 1.0300889, + "epoch": 0.24379978956861567, + "flos": 22096267824240.0, + "grad_norm": 1.566464947324253, + "language_loss": 0.84090173, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86606431, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.17675781, + "step": 4055, + "time_per_iteration": 2.736164093017578 + }, + { + "auxiliary_loss_clip": 0.01478229, + "auxiliary_loss_mlp": 0.01046291, + "balance_loss_clip": 1.32393765, + "balance_loss_mlp": 1.02606118, + "epoch": 0.24385991282128364, + "flos": 31474099758600.0, + "grad_norm": 3.047376389725406, + "language_loss": 0.55674112, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.58198637, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.20239258, + "step": 4056, + "time_per_iteration": 2.8154940605163574 + }, + { + "auxiliary_loss_clip": 0.01478415, + "auxiliary_loss_mlp": 0.01044864, + "balance_loss_clip": 1.3269639, + "balance_loss_mlp": 1.02575517, + "epoch": 0.2439200360739516, + "flos": 23843683233000.0, + "grad_norm": 2.3249226120618474, + "language_loss": 0.7965349, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82176775, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.19116211, + "step": 4057, + "time_per_iteration": 2.7476205825805664 + }, + { + "auxiliary_loss_clip": 0.01474687, + "auxiliary_loss_mlp": 0.0104471, + "balance_loss_clip": 1.32656324, + "balance_loss_mlp": 1.02610183, + "epoch": 0.24398015932661957, + "flos": 19174433525640.0, + "grad_norm": 1.6524309837219526, + "language_loss": 0.79552889, + "learning_rate": 3.538605738554673e-06, + "loss": 0.82072288, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.18603516, + "step": 4058, + "time_per_iteration": 2.7253942489624023 + }, + { + "auxiliary_loss_clip": 0.01476635, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.32254684, + "balance_loss_mlp": 1.02790022, + "epoch": 0.24404028257928753, + "flos": 25267643451720.0, + "grad_norm": 1.673120470374549, + "language_loss": 0.85788447, + "learning_rate": 3.538356888446756e-06, + "loss": 0.88311863, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18859863, + "step": 4059, + "time_per_iteration": 2.7769336700439453 + }, + { + "auxiliary_loss_clip": 0.01471865, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.32545638, + "balance_loss_mlp": 1.02134275, + "epoch": 0.2441004058319555, + "flos": 26473070147400.0, + "grad_norm": 1.5494494010177156, + "language_loss": 0.74243796, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76754969, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.1796875, + "step": 4060, + "time_per_iteration": 2.817918539047241 + }, + { + "auxiliary_loss_clip": 0.01485861, + "auxiliary_loss_mlp": 0.01057733, + "balance_loss_clip": 1.33149421, + "balance_loss_mlp": 1.03641868, + "epoch": 0.2441605290846235, + "flos": 26766167265360.0, + "grad_norm": 1.8038213062392345, + "language_loss": 0.73809689, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.76353282, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.21313477, + "step": 4061, + "time_per_iteration": 2.8263638019561768 + }, + { + "auxiliary_loss_clip": 0.01478195, + "auxiliary_loss_mlp": 0.01047326, + "balance_loss_clip": 1.33131003, + "balance_loss_mlp": 1.02963555, + "epoch": 0.24422065233729146, + "flos": 21110836552560.0, + "grad_norm": 1.854457046451737, + "language_loss": 0.76725888, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.79251409, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.17700195, + "step": 4062, + "time_per_iteration": 2.739762306213379 + }, + { + "auxiliary_loss_clip": 0.01474177, + "auxiliary_loss_mlp": 0.01044286, + "balance_loss_clip": 1.33010173, + "balance_loss_mlp": 1.02622533, + "epoch": 0.24428077558995942, + "flos": 25268415010560.0, + "grad_norm": 1.7374815826256977, + "language_loss": 0.85398966, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87917429, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.18054199, + "step": 4063, + "time_per_iteration": 2.833216905593872 + }, + { + "auxiliary_loss_clip": 0.01481093, + "auxiliary_loss_mlp": 0.01041072, + "balance_loss_clip": 1.32860112, + "balance_loss_mlp": 1.02217698, + "epoch": 0.24434089884262739, + "flos": 20490142191840.0, + "grad_norm": 1.9590336892261577, + "language_loss": 0.68471551, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70993721, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.18884277, + "step": 4064, + "time_per_iteration": 2.7875678539276123 + }, + { + "auxiliary_loss_clip": 0.01485977, + "auxiliary_loss_mlp": 0.0104608, + "balance_loss_clip": 1.3320601, + "balance_loss_mlp": 1.02734017, + "epoch": 0.24440102209529535, + "flos": 23627017694520.0, + "grad_norm": 1.449246226159988, + "language_loss": 0.70453036, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72985101, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1875, + "step": 4065, + "time_per_iteration": 2.8907110691070557 + }, + { + "auxiliary_loss_clip": 0.01474726, + "auxiliary_loss_mlp": 0.01055938, + "balance_loss_clip": 1.32242, + "balance_loss_mlp": 1.03452849, + "epoch": 0.24446114534796332, + "flos": 20559142441440.0, + "grad_norm": 1.8399849186490387, + "language_loss": 0.84094763, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86625433, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.2142334, + "step": 4066, + "time_per_iteration": 2.8307299613952637 + }, + { + "auxiliary_loss_clip": 0.01310907, + "auxiliary_loss_mlp": 0.01023938, + "balance_loss_clip": 1.24181533, + "balance_loss_mlp": 1.02048111, + "epoch": 0.24452126860063128, + "flos": 60402959770680.0, + "grad_norm": 0.8390929806841317, + "language_loss": 0.52263546, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54598391, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.03466797, + "step": 4067, + "time_per_iteration": 3.1693036556243896 + }, + { + "auxiliary_loss_clip": 0.01471619, + "auxiliary_loss_mlp": 0.01043032, + "balance_loss_clip": 1.32087445, + "balance_loss_mlp": 1.0238986, + "epoch": 0.24458139185329927, + "flos": 15125796962280.0, + "grad_norm": 2.7151857158339254, + "language_loss": 0.72793281, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.7530793, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.19128418, + "step": 4068, + "time_per_iteration": 2.7504968643188477 + }, + { + "auxiliary_loss_clip": 0.01474702, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.32821679, + "balance_loss_mlp": 1.02705121, + "epoch": 0.24464151510596724, + "flos": 28004226101280.0, + "grad_norm": 1.4983387067681704, + "language_loss": 0.78240001, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.80761635, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.19885254, + "step": 4069, + "time_per_iteration": 2.7827060222625732 + }, + { + "auxiliary_loss_clip": 0.01474711, + "auxiliary_loss_mlp": 0.01046246, + "balance_loss_clip": 1.32801485, + "balance_loss_mlp": 1.02725577, + "epoch": 0.2447016383586352, + "flos": 19797767429760.0, + "grad_norm": 1.8355129561397363, + "language_loss": 0.80640739, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.83161694, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.18981934, + "step": 4070, + "time_per_iteration": 2.8045122623443604 + }, + { + "auxiliary_loss_clip": 0.01476796, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.32811284, + "balance_loss_mlp": 1.02571034, + "epoch": 0.24476176161130317, + "flos": 26072899119360.0, + "grad_norm": 1.3970519507434302, + "language_loss": 0.84371221, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86891323, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.17602539, + "step": 4071, + "time_per_iteration": 2.8160574436187744 + }, + { + "auxiliary_loss_clip": 0.01481791, + "auxiliary_loss_mlp": 0.01049118, + "balance_loss_clip": 1.32659984, + "balance_loss_mlp": 1.02766061, + "epoch": 0.24482188486397113, + "flos": 18848216967120.0, + "grad_norm": 1.6645100860084079, + "language_loss": 0.80314291, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82845199, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.21447754, + "step": 4072, + "time_per_iteration": 2.7782142162323 + }, + { + "auxiliary_loss_clip": 0.01468004, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.32367253, + "balance_loss_mlp": 1.02946377, + "epoch": 0.2448820081166391, + "flos": 21256796290320.0, + "grad_norm": 1.8656577210547236, + "language_loss": 0.70042777, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72557688, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.17437744, + "step": 4073, + "time_per_iteration": 2.850351572036743 + }, + { + "auxiliary_loss_clip": 0.01465914, + "auxiliary_loss_mlp": 0.01044454, + "balance_loss_clip": 1.32080793, + "balance_loss_mlp": 1.02743101, + "epoch": 0.2449421313693071, + "flos": 23955792579720.0, + "grad_norm": 2.125471805047199, + "language_loss": 0.67880148, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.70390517, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.17016602, + "step": 4074, + "time_per_iteration": 2.757362127304077 + }, + { + "auxiliary_loss_clip": 0.01293496, + "auxiliary_loss_mlp": 0.01009687, + "balance_loss_clip": 1.22448504, + "balance_loss_mlp": 1.00584841, + "epoch": 0.24500225462197506, + "flos": 60701904492480.0, + "grad_norm": 0.9323345794520914, + "language_loss": 0.68761098, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.71064281, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.03833008, + "step": 4075, + "time_per_iteration": 3.342278003692627 + }, + { + "auxiliary_loss_clip": 0.01466154, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.32077801, + "balance_loss_mlp": 1.02480686, + "epoch": 0.24506237787464302, + "flos": 26289524049480.0, + "grad_norm": 2.651520898576517, + "language_loss": 0.79923749, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.82432646, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.17932129, + "step": 4076, + "time_per_iteration": 2.9423060417175293 + }, + { + "auxiliary_loss_clip": 0.01479897, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.32766068, + "balance_loss_mlp": 1.02118993, + "epoch": 0.245122501127311, + "flos": 20556381072960.0, + "grad_norm": 1.983258118892128, + "language_loss": 0.82592821, + "learning_rate": 3.533867620434151e-06, + "loss": 0.8511349, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.19580078, + "step": 4077, + "time_per_iteration": 4.177792072296143 + }, + { + "auxiliary_loss_clip": 0.01480447, + "auxiliary_loss_mlp": 0.0104639, + "balance_loss_clip": 1.33116615, + "balance_loss_mlp": 1.02582645, + "epoch": 0.24518262437997895, + "flos": 29138622129360.0, + "grad_norm": 1.8272079065662556, + "language_loss": 0.6246689, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64993727, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.20568848, + "step": 4078, + "time_per_iteration": 2.8307721614837646 + }, + { + "auxiliary_loss_clip": 0.01468749, + "auxiliary_loss_mlp": 0.01040826, + "balance_loss_clip": 1.32358241, + "balance_loss_mlp": 1.02331424, + "epoch": 0.24524274763264692, + "flos": 23482154382480.0, + "grad_norm": 1.4305410211745528, + "language_loss": 0.75789136, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.78298712, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.1751709, + "step": 4079, + "time_per_iteration": 2.7715108394622803 + }, + { + "auxiliary_loss_clip": 0.01463632, + "auxiliary_loss_mlp": 0.01038082, + "balance_loss_clip": 1.31911874, + "balance_loss_mlp": 1.01960492, + "epoch": 0.24530287088531488, + "flos": 17205560791920.0, + "grad_norm": 1.6813402121315433, + "language_loss": 0.75133425, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77635139, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.18469238, + "step": 4080, + "time_per_iteration": 2.7487003803253174 + }, + { + "auxiliary_loss_clip": 0.01467553, + "auxiliary_loss_mlp": 0.01042689, + "balance_loss_clip": 1.32288396, + "balance_loss_mlp": 1.02416408, + "epoch": 0.24536299413798288, + "flos": 14871707497080.0, + "grad_norm": 1.7954949175535613, + "language_loss": 0.83983696, + "learning_rate": 3.532867444142186e-06, + "loss": 0.86493945, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.18530273, + "step": 4081, + "time_per_iteration": 2.76971697807312 + }, + { + "auxiliary_loss_clip": 0.01464636, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.32055068, + "balance_loss_mlp": 1.02466154, + "epoch": 0.24542311739065084, + "flos": 35268849898560.0, + "grad_norm": 1.820252074602588, + "language_loss": 0.73973686, + "learning_rate": 3.532617254729267e-06, + "loss": 0.76479816, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.16845703, + "step": 4082, + "time_per_iteration": 2.8461389541625977 + }, + { + "auxiliary_loss_clip": 0.01466585, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.3229295, + "balance_loss_mlp": 1.03015339, + "epoch": 0.2454832406433188, + "flos": 21507637086720.0, + "grad_norm": 1.5060523289073902, + "language_loss": 0.72514981, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.75028193, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.16467285, + "step": 4083, + "time_per_iteration": 2.7891175746917725 + }, + { + "auxiliary_loss_clip": 0.01471857, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_clip": 1.32383192, + "balance_loss_mlp": 1.02656674, + "epoch": 0.24554336389598677, + "flos": 14760735184440.0, + "grad_norm": 2.0096099361493036, + "language_loss": 0.75317001, + "learning_rate": 3.532116701561919e-06, + "loss": 0.77834761, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.19335938, + "step": 4084, + "time_per_iteration": 2.835313558578491 + }, + { + "auxiliary_loss_clip": 0.01460194, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.3155638, + "balance_loss_mlp": 1.02364028, + "epoch": 0.24560348714865474, + "flos": 14980608783360.0, + "grad_norm": 1.8193606589822828, + "language_loss": 0.85540128, + "learning_rate": 3.531866337826471e-06, + "loss": 0.88041592, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.1763916, + "step": 4085, + "time_per_iteration": 2.791508436203003 + }, + { + "auxiliary_loss_clip": 0.01468119, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.32242942, + "balance_loss_mlp": 1.02762341, + "epoch": 0.2456636104013227, + "flos": 22680594075600.0, + "grad_norm": 2.6902284797675122, + "language_loss": 0.79283142, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81796598, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.17700195, + "step": 4086, + "time_per_iteration": 4.318992614746094 + }, + { + "auxiliary_loss_clip": 0.01460756, + "auxiliary_loss_mlp": 0.01043151, + "balance_loss_clip": 1.31771553, + "balance_loss_mlp": 1.02591336, + "epoch": 0.2457237336539907, + "flos": 27423676427400.0, + "grad_norm": 1.6543460351520123, + "language_loss": 0.75601321, + "learning_rate": 3.531365436099496e-06, + "loss": 0.78105223, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.17236328, + "step": 4087, + "time_per_iteration": 2.871030807495117 + }, + { + "auxiliary_loss_clip": 0.01474184, + "auxiliary_loss_mlp": 0.01044395, + "balance_loss_clip": 1.32919157, + "balance_loss_mlp": 1.02616787, + "epoch": 0.24578385690665866, + "flos": 20417243539680.0, + "grad_norm": 2.752086383176156, + "language_loss": 0.79952836, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.82471418, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.18225098, + "step": 4088, + "time_per_iteration": 2.8029682636260986 + }, + { + "auxiliary_loss_clip": 0.01452079, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.31014776, + "balance_loss_mlp": 1.0238713, + "epoch": 0.24584398015932662, + "flos": 23920764546240.0, + "grad_norm": 1.787446714942016, + "language_loss": 0.77631974, + "learning_rate": 3.5308643020944e-06, + "loss": 0.80125487, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.17565918, + "step": 4089, + "time_per_iteration": 4.381560325622559 + }, + { + "auxiliary_loss_clip": 0.0146078, + "auxiliary_loss_mlp": 0.0104883, + "balance_loss_clip": 1.31409252, + "balance_loss_mlp": 1.03130603, + "epoch": 0.2459041034119946, + "flos": 41503349601000.0, + "grad_norm": 1.7894694229857848, + "language_loss": 0.81308943, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83818549, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.1751709, + "step": 4090, + "time_per_iteration": 4.371319055557251 + }, + { + "auxiliary_loss_clip": 0.01465972, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_clip": 1.32000422, + "balance_loss_mlp": 1.03219664, + "epoch": 0.24596422666466256, + "flos": 19941453099360.0, + "grad_norm": 1.810125889132285, + "language_loss": 0.73584509, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.7610153, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.18859863, + "step": 4091, + "time_per_iteration": 2.7811360359191895 + }, + { + "auxiliary_loss_clip": 0.01467128, + "auxiliary_loss_mlp": 0.01043228, + "balance_loss_clip": 1.32358265, + "balance_loss_mlp": 1.02537024, + "epoch": 0.24602434991733052, + "flos": 21549527933040.0, + "grad_norm": 2.052816593196117, + "language_loss": 0.77390021, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79900384, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1784668, + "step": 4092, + "time_per_iteration": 2.787189483642578 + }, + { + "auxiliary_loss_clip": 0.01472813, + "auxiliary_loss_mlp": 0.01043979, + "balance_loss_clip": 1.3236165, + "balance_loss_mlp": 1.02569258, + "epoch": 0.24608447316999849, + "flos": 23190275515320.0, + "grad_norm": 2.5279620847489706, + "language_loss": 0.82086539, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.84603328, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.1829834, + "step": 4093, + "time_per_iteration": 2.741206407546997 + }, + { + "auxiliary_loss_clip": 0.01476184, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_clip": 1.32656968, + "balance_loss_mlp": 1.02899265, + "epoch": 0.24614459642266648, + "flos": 19646163129960.0, + "grad_norm": 1.9697751748973829, + "language_loss": 0.87657142, + "learning_rate": 3.529610451363797e-06, + "loss": 0.90180689, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.18371582, + "step": 4094, + "time_per_iteration": 2.8419454097747803 + }, + { + "auxiliary_loss_clip": 0.01299389, + "auxiliary_loss_mlp": 0.01014556, + "balance_loss_clip": 1.23431933, + "balance_loss_mlp": 1.01112294, + "epoch": 0.24620471967533444, + "flos": 61753412211840.0, + "grad_norm": 0.7562783461622822, + "language_loss": 0.57526994, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.5984093, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.03442383, + "step": 4095, + "time_per_iteration": 3.323329448699951 + }, + { + "auxiliary_loss_clip": 0.01298817, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_clip": 1.23456144, + "balance_loss_mlp": 1.02057636, + "epoch": 0.2462648429280024, + "flos": 69170046392520.0, + "grad_norm": 0.6455754020417764, + "language_loss": 0.56224793, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58547759, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03564453, + "step": 4096, + "time_per_iteration": 3.3401975631713867 + }, + { + "auxiliary_loss_clip": 0.01468764, + "auxiliary_loss_mlp": 0.01040998, + "balance_loss_clip": 1.32348704, + "balance_loss_mlp": 1.02325928, + "epoch": 0.24632496618067037, + "flos": 29465488421640.0, + "grad_norm": 1.5301013642222987, + "language_loss": 0.77756315, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.8026607, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.17749023, + "step": 4097, + "time_per_iteration": 2.8215866088867188 + }, + { + "auxiliary_loss_clip": 0.01473329, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_clip": 1.32439995, + "balance_loss_mlp": 1.02536237, + "epoch": 0.24638508943333834, + "flos": 24321828958200.0, + "grad_norm": 1.7911688136037056, + "language_loss": 0.76508296, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.79027218, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.20227051, + "step": 4098, + "time_per_iteration": 2.81781268119812 + }, + { + "auxiliary_loss_clip": 0.01468737, + "auxiliary_loss_mlp": 0.0104954, + "balance_loss_clip": 1.32192492, + "balance_loss_mlp": 1.03260064, + "epoch": 0.2464452126860063, + "flos": 26618705018280.0, + "grad_norm": 2.037838094992291, + "language_loss": 0.68091995, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70610273, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.16955566, + "step": 4099, + "time_per_iteration": 2.8063602447509766 + }, + { + "auxiliary_loss_clip": 0.01460087, + "auxiliary_loss_mlp": 0.01043877, + "balance_loss_clip": 1.31964207, + "balance_loss_mlp": 1.02673507, + "epoch": 0.24650533593867427, + "flos": 31218710825880.0, + "grad_norm": 1.8631167959894883, + "language_loss": 0.66052002, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68555963, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.17126465, + "step": 4100, + "time_per_iteration": 2.821176528930664 + }, + { + "auxiliary_loss_clip": 0.01300421, + "auxiliary_loss_mlp": 0.01004949, + "balance_loss_clip": 1.23585713, + "balance_loss_mlp": 1.00137317, + "epoch": 0.24656545919134226, + "flos": 68509044911520.0, + "grad_norm": 0.708415940015705, + "language_loss": 0.61561453, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63866824, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.03564453, + "step": 4101, + "time_per_iteration": 3.4101274013519287 + }, + { + "auxiliary_loss_clip": 0.01464212, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.32172108, + "balance_loss_mlp": 1.0230583, + "epoch": 0.24662558244401023, + "flos": 20089037171520.0, + "grad_norm": 1.6426081248915567, + "language_loss": 0.73657429, + "learning_rate": 3.527601274535012e-06, + "loss": 0.7616275, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.18054199, + "step": 4102, + "time_per_iteration": 2.761986494064331 + }, + { + "auxiliary_loss_clip": 0.01469665, + "auxiliary_loss_mlp": 0.01042852, + "balance_loss_clip": 1.32286429, + "balance_loss_mlp": 1.02462459, + "epoch": 0.2466857056966782, + "flos": 30707567485200.0, + "grad_norm": 1.9593431100440983, + "language_loss": 0.76271737, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78784251, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.18225098, + "step": 4103, + "time_per_iteration": 2.835420846939087 + }, + { + "auxiliary_loss_clip": 0.01476288, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_clip": 1.32901323, + "balance_loss_mlp": 1.02453482, + "epoch": 0.24674582894934616, + "flos": 22533659737200.0, + "grad_norm": 1.9627891137902598, + "language_loss": 0.78427172, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80947006, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.18994141, + "step": 4104, + "time_per_iteration": 2.8348171710968018 + }, + { + "auxiliary_loss_clip": 0.01459704, + "auxiliary_loss_mlp": 0.01039955, + "balance_loss_clip": 1.31583548, + "balance_loss_mlp": 1.02027297, + "epoch": 0.24680595220201412, + "flos": 20709203623560.0, + "grad_norm": 1.9754602845130218, + "language_loss": 0.83588648, + "learning_rate": 3.526846877170133e-06, + "loss": 0.86088312, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.19665527, + "step": 4105, + "time_per_iteration": 2.8434386253356934 + }, + { + "auxiliary_loss_clip": 0.01470028, + "auxiliary_loss_mlp": 0.01045989, + "balance_loss_clip": 1.32623029, + "balance_loss_mlp": 1.02814341, + "epoch": 0.2468660754546821, + "flos": 21835681021440.0, + "grad_norm": 1.8641198855661574, + "language_loss": 0.76905572, + "learning_rate": 3.52659529557275e-06, + "loss": 0.79421592, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.17858887, + "step": 4106, + "time_per_iteration": 2.823352098464966 + }, + { + "auxiliary_loss_clip": 0.01468624, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_clip": 1.32363081, + "balance_loss_mlp": 1.02376509, + "epoch": 0.24692619870735008, + "flos": 15271878525120.0, + "grad_norm": 2.506477299864314, + "language_loss": 0.73029226, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.75541317, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.19702148, + "step": 4107, + "time_per_iteration": 2.7284486293792725 + }, + { + "auxiliary_loss_clip": 0.01475118, + "auxiliary_loss_mlp": 0.01047076, + "balance_loss_clip": 1.32963824, + "balance_loss_mlp": 1.02793074, + "epoch": 0.24698632196001805, + "flos": 29686214796120.0, + "grad_norm": 1.7232886505098926, + "language_loss": 0.65672731, + "learning_rate": 3.526091958721587e-06, + "loss": 0.68194926, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.19152832, + "step": 4108, + "time_per_iteration": 2.8547134399414062 + }, + { + "auxiliary_loss_clip": 0.01472019, + "auxiliary_loss_mlp": 0.01051971, + "balance_loss_clip": 1.32392538, + "balance_loss_mlp": 1.03232551, + "epoch": 0.247046445212686, + "flos": 39172704366600.0, + "grad_norm": 1.664317799751782, + "language_loss": 0.72634763, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.75158745, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.19641113, + "step": 4109, + "time_per_iteration": 2.9712424278259277 + }, + { + "auxiliary_loss_clip": 0.01471422, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.3239013, + "balance_loss_mlp": 1.02749109, + "epoch": 0.24710656846535398, + "flos": 23003521356960.0, + "grad_norm": 1.7371994437302487, + "language_loss": 0.7970202, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.822191, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.18151855, + "step": 4110, + "time_per_iteration": 2.7974960803985596 + }, + { + "auxiliary_loss_clip": 0.01480653, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.33246756, + "balance_loss_mlp": 1.02124929, + "epoch": 0.24716669171802194, + "flos": 26438488805880.0, + "grad_norm": 1.9867852578511769, + "language_loss": 0.80551255, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83071834, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.18688965, + "step": 4111, + "time_per_iteration": 2.8286590576171875 + }, + { + "auxiliary_loss_clip": 0.01470305, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.32395089, + "balance_loss_mlp": 1.03004479, + "epoch": 0.2472268149706899, + "flos": 23334935785560.0, + "grad_norm": 1.8916857549315071, + "language_loss": 0.75481659, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.78000236, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.18225098, + "step": 4112, + "time_per_iteration": 2.790278911590576 + }, + { + "auxiliary_loss_clip": 0.01466164, + "auxiliary_loss_mlp": 0.01049974, + "balance_loss_clip": 1.31963694, + "balance_loss_mlp": 1.03178263, + "epoch": 0.24728693822335787, + "flos": 23773992641280.0, + "grad_norm": 1.7435344280390732, + "language_loss": 0.82941079, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.85457218, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.18188477, + "step": 4113, + "time_per_iteration": 2.8481972217559814 + }, + { + "auxiliary_loss_clip": 0.01464388, + "auxiliary_loss_mlp": 0.01040989, + "balance_loss_clip": 1.31861794, + "balance_loss_mlp": 1.02192712, + "epoch": 0.24734706147602586, + "flos": 19322180031240.0, + "grad_norm": 2.0836689119219463, + "language_loss": 0.87140906, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.8964628, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.1907959, + "step": 4114, + "time_per_iteration": 2.7142069339752197 + }, + { + "auxiliary_loss_clip": 0.0146236, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.31781709, + "balance_loss_mlp": 1.02283204, + "epoch": 0.24740718472869383, + "flos": 28042665237000.0, + "grad_norm": 2.2814462780980675, + "language_loss": 0.75330776, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77834094, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.18127441, + "step": 4115, + "time_per_iteration": 2.8242883682250977 + }, + { + "auxiliary_loss_clip": 0.0128432, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.22043824, + "balance_loss_mlp": 1.02314091, + "epoch": 0.2474673079813618, + "flos": 68124508102080.0, + "grad_norm": 0.7733176236039938, + "language_loss": 0.58300459, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60611236, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03320312, + "step": 4116, + "time_per_iteration": 4.806845664978027 + }, + { + "auxiliary_loss_clip": 0.01460145, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_clip": 1.31609774, + "balance_loss_mlp": 1.02644277, + "epoch": 0.24752743123402976, + "flos": 29468331006840.0, + "grad_norm": 1.3961278315265628, + "language_loss": 0.83719933, + "learning_rate": 3.523824079451235e-06, + "loss": 0.86224365, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.1784668, + "step": 4117, + "time_per_iteration": 2.83073353767395 + }, + { + "auxiliary_loss_clip": 0.01284429, + "auxiliary_loss_mlp": 0.01006934, + "balance_loss_clip": 1.22004128, + "balance_loss_mlp": 1.00330973, + "epoch": 0.24758755448669773, + "flos": 58364112186720.0, + "grad_norm": 0.9036503261914747, + "language_loss": 0.63539916, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.6583128, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.03613281, + "step": 4118, + "time_per_iteration": 3.1322200298309326 + }, + { + "auxiliary_loss_clip": 0.01465949, + "auxiliary_loss_mlp": 0.01046229, + "balance_loss_clip": 1.32100928, + "balance_loss_mlp": 1.02827549, + "epoch": 0.2476476777393657, + "flos": 20489167591200.0, + "grad_norm": 2.0750915591452905, + "language_loss": 0.79398274, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81910443, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1796875, + "step": 4119, + "time_per_iteration": 2.7702317237854004 + }, + { + "auxiliary_loss_clip": 0.01457329, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.31349254, + "balance_loss_mlp": 1.02098548, + "epoch": 0.24770780099203366, + "flos": 20490791925600.0, + "grad_norm": 2.1132643765701187, + "language_loss": 0.74463546, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76959425, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.17553711, + "step": 4120, + "time_per_iteration": 2.752471446990967 + }, + { + "auxiliary_loss_clip": 0.01461074, + "auxiliary_loss_mlp": 0.01058657, + "balance_loss_clip": 1.31554341, + "balance_loss_mlp": 1.0405252, + "epoch": 0.24776792424470165, + "flos": 15156926593200.0, + "grad_norm": 1.9925757391948908, + "language_loss": 0.88300788, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90820515, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.18139648, + "step": 4121, + "time_per_iteration": 2.7623114585876465 + }, + { + "auxiliary_loss_clip": 0.01468521, + "auxiliary_loss_mlp": 0.0104462, + "balance_loss_clip": 1.32231832, + "balance_loss_mlp": 1.02529645, + "epoch": 0.2478280474973696, + "flos": 21730434487560.0, + "grad_norm": 2.0270954567810695, + "language_loss": 0.69858956, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.72372103, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.19335938, + "step": 4122, + "time_per_iteration": 2.7410759925842285 + }, + { + "auxiliary_loss_clip": 0.01460488, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.31421924, + "balance_loss_mlp": 1.02389574, + "epoch": 0.24788817075003758, + "flos": 20417040497880.0, + "grad_norm": 2.1690474850485684, + "language_loss": 0.79959303, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82463467, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.19763184, + "step": 4123, + "time_per_iteration": 2.783057928085327 + }, + { + "auxiliary_loss_clip": 0.01461427, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_clip": 1.31704319, + "balance_loss_mlp": 1.03413439, + "epoch": 0.24794829400270554, + "flos": 22598680367520.0, + "grad_norm": 1.7303037431224477, + "language_loss": 0.75309134, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77821767, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.17053223, + "step": 4124, + "time_per_iteration": 4.22322678565979 + }, + { + "auxiliary_loss_clip": 0.01457868, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.31566346, + "balance_loss_mlp": 1.02440238, + "epoch": 0.2480084172553735, + "flos": 39683319798600.0, + "grad_norm": 1.3815733262410004, + "language_loss": 0.73951781, + "learning_rate": 3.521804257268357e-06, + "loss": 0.76451266, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17224121, + "step": 4125, + "time_per_iteration": 2.915817975997925 + }, + { + "auxiliary_loss_clip": 0.0147694, + "auxiliary_loss_mlp": 0.0104534, + "balance_loss_clip": 1.32524586, + "balance_loss_mlp": 1.02711308, + "epoch": 0.24806854050804147, + "flos": 22058722072440.0, + "grad_norm": 1.7554893469363015, + "language_loss": 0.69446242, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.7196852, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.18225098, + "step": 4126, + "time_per_iteration": 2.7471327781677246 + }, + { + "auxiliary_loss_clip": 0.01462872, + "auxiliary_loss_mlp": 0.01045489, + "balance_loss_clip": 1.31803203, + "balance_loss_mlp": 1.02711868, + "epoch": 0.24812866376070947, + "flos": 15490980565200.0, + "grad_norm": 2.763320204455014, + "language_loss": 0.81945479, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.84453839, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.18383789, + "step": 4127, + "time_per_iteration": 2.9177138805389404 + }, + { + "auxiliary_loss_clip": 0.01466222, + "auxiliary_loss_mlp": 0.01050432, + "balance_loss_clip": 1.31969607, + "balance_loss_mlp": 1.03297985, + "epoch": 0.24818878701337743, + "flos": 14761587960000.0, + "grad_norm": 2.2388348758274335, + "language_loss": 0.84437519, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86954176, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.17456055, + "step": 4128, + "time_per_iteration": 4.259047985076904 + }, + { + "auxiliary_loss_clip": 0.01464476, + "auxiliary_loss_mlp": 0.01057738, + "balance_loss_clip": 1.31734419, + "balance_loss_mlp": 1.0399524, + "epoch": 0.2482489102660454, + "flos": 27095267017440.0, + "grad_norm": 1.9079430056338498, + "language_loss": 0.65430689, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67952895, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.17785645, + "step": 4129, + "time_per_iteration": 2.793297529220581 + }, + { + "auxiliary_loss_clip": 0.01454696, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.31131446, + "balance_loss_mlp": 1.02722073, + "epoch": 0.24830903351871336, + "flos": 26472988930680.0, + "grad_norm": 2.525704816809343, + "language_loss": 0.75778109, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.78278834, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.18798828, + "step": 4130, + "time_per_iteration": 2.7875845432281494 + }, + { + "auxiliary_loss_clip": 0.01453995, + "auxiliary_loss_mlp": 0.01051329, + "balance_loss_clip": 1.30950391, + "balance_loss_mlp": 1.03230321, + "epoch": 0.24836915677138133, + "flos": 10231394569200.0, + "grad_norm": 2.1358574760783733, + "language_loss": 0.78123415, + "learning_rate": 3.520286966670535e-06, + "loss": 0.80628741, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.19018555, + "step": 4131, + "time_per_iteration": 2.7431576251983643 + }, + { + "auxiliary_loss_clip": 0.01447756, + "auxiliary_loss_mlp": 0.01044205, + "balance_loss_clip": 1.30545163, + "balance_loss_mlp": 1.02676451, + "epoch": 0.2484292800240493, + "flos": 30086467040880.0, + "grad_norm": 1.5037335921645754, + "language_loss": 0.83686537, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86178505, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.17443848, + "step": 4132, + "time_per_iteration": 2.845679759979248 + }, + { + "auxiliary_loss_clip": 0.01462339, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.31785274, + "balance_loss_mlp": 1.02884865, + "epoch": 0.24848940327671726, + "flos": 13446813286080.0, + "grad_norm": 1.5653393675887335, + "language_loss": 0.71688062, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.74198371, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.19104004, + "step": 4133, + "time_per_iteration": 2.7196943759918213 + }, + { + "auxiliary_loss_clip": 0.01472842, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.31971467, + "balance_loss_mlp": 1.02493942, + "epoch": 0.24854952652938525, + "flos": 19974491323200.0, + "grad_norm": 2.4451810522782322, + "language_loss": 0.61741602, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.64260375, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.20983887, + "step": 4134, + "time_per_iteration": 2.740499973297119 + }, + { + "auxiliary_loss_clip": 0.01458759, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_clip": 1.31265664, + "balance_loss_mlp": 1.02810907, + "epoch": 0.24860964978205322, + "flos": 18154745779320.0, + "grad_norm": 1.8197838926420051, + "language_loss": 0.78581464, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81085616, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.17285156, + "step": 4135, + "time_per_iteration": 2.723597526550293 + }, + { + "auxiliary_loss_clip": 0.01455426, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.31091082, + "balance_loss_mlp": 1.02522528, + "epoch": 0.24866977303472118, + "flos": 11732354884440.0, + "grad_norm": 2.0047251557647385, + "language_loss": 0.83210075, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85708016, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.17297363, + "step": 4136, + "time_per_iteration": 2.753607988357544 + }, + { + "auxiliary_loss_clip": 0.0146108, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.31252861, + "balance_loss_mlp": 1.02737212, + "epoch": 0.24872989628738915, + "flos": 34830239734800.0, + "grad_norm": 1.7859112890843034, + "language_loss": 0.71310508, + "learning_rate": 3.518767600693314e-06, + "loss": 0.73817223, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.18261719, + "step": 4137, + "time_per_iteration": 2.9053854942321777 + }, + { + "auxiliary_loss_clip": 0.01461291, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.31156898, + "balance_loss_mlp": 1.02083254, + "epoch": 0.2487900195400571, + "flos": 13703379861240.0, + "grad_norm": 1.7299946704375817, + "language_loss": 0.66589081, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69089353, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.18151855, + "step": 4138, + "time_per_iteration": 2.8233489990234375 + }, + { + "auxiliary_loss_clip": 0.01449043, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.30679774, + "balance_loss_mlp": 1.0231514, + "epoch": 0.24885014279272508, + "flos": 25343303472360.0, + "grad_norm": 1.915058802368699, + "language_loss": 0.84299785, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86789155, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.171875, + "step": 4139, + "time_per_iteration": 2.769148349761963 + }, + { + "auxiliary_loss_clip": 0.01461144, + "auxiliary_loss_mlp": 0.01044687, + "balance_loss_clip": 1.31162655, + "balance_loss_mlp": 1.02561331, + "epoch": 0.24891026604539307, + "flos": 20636345579760.0, + "grad_norm": 1.5968882066870072, + "language_loss": 0.79198653, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81704485, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.1907959, + "step": 4140, + "time_per_iteration": 2.8104422092437744 + }, + { + "auxiliary_loss_clip": 0.01288726, + "auxiliary_loss_mlp": 0.01017588, + "balance_loss_clip": 1.22238231, + "balance_loss_mlp": 1.01279569, + "epoch": 0.24897038929806103, + "flos": 66974965155720.0, + "grad_norm": 0.8238453025689421, + "language_loss": 0.61031425, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63337737, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.04785156, + "step": 4141, + "time_per_iteration": 3.3055150508880615 + }, + { + "auxiliary_loss_clip": 0.014613, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_clip": 1.31374729, + "balance_loss_mlp": 1.03108382, + "epoch": 0.249030512550729, + "flos": 36399428740800.0, + "grad_norm": 1.778790929384236, + "language_loss": 0.72908473, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75419992, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.19128418, + "step": 4142, + "time_per_iteration": 2.8730340003967285 + }, + { + "auxiliary_loss_clip": 0.01459678, + "auxiliary_loss_mlp": 0.01045016, + "balance_loss_clip": 1.31463265, + "balance_loss_mlp": 1.02652717, + "epoch": 0.24909063580339696, + "flos": 20159093238480.0, + "grad_norm": 1.7462342744594388, + "language_loss": 0.81355476, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83860165, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.18481445, + "step": 4143, + "time_per_iteration": 2.793997049331665 + }, + { + "auxiliary_loss_clip": 0.01453642, + "auxiliary_loss_mlp": 0.01046122, + "balance_loss_clip": 1.30965638, + "balance_loss_mlp": 1.02839565, + "epoch": 0.24915075905606493, + "flos": 26402283129960.0, + "grad_norm": 2.0308475896732396, + "language_loss": 0.59166014, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61665779, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.17712402, + "step": 4144, + "time_per_iteration": 2.7904841899871826 + }, + { + "auxiliary_loss_clip": 0.01453769, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.30933666, + "balance_loss_mlp": 1.0226469, + "epoch": 0.2492108823087329, + "flos": 27533024405640.0, + "grad_norm": 1.9793305013748628, + "language_loss": 0.79212117, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81706548, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.18029785, + "step": 4145, + "time_per_iteration": 2.8093230724334717 + }, + { + "auxiliary_loss_clip": 0.01478694, + "auxiliary_loss_mlp": 0.01049392, + "balance_loss_clip": 1.32536376, + "balance_loss_mlp": 1.02804124, + "epoch": 0.24927100556140086, + "flos": 16695838743840.0, + "grad_norm": 2.0143160829169937, + "language_loss": 0.65439808, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67967904, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.21350098, + "step": 4146, + "time_per_iteration": 2.7374651432037354 + }, + { + "auxiliary_loss_clip": 0.01280382, + "auxiliary_loss_mlp": 0.01018867, + "balance_loss_clip": 1.21405268, + "balance_loss_mlp": 1.01388359, + "epoch": 0.24933112881406885, + "flos": 62787637751040.0, + "grad_norm": 1.0593038914758433, + "language_loss": 0.6734736, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69646603, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.04980469, + "step": 4147, + "time_per_iteration": 3.3872621059417725 + }, + { + "auxiliary_loss_clip": 0.014584, + "auxiliary_loss_mlp": 0.01048228, + "balance_loss_clip": 1.31290352, + "balance_loss_mlp": 1.02925014, + "epoch": 0.24939125206673682, + "flos": 26657631454320.0, + "grad_norm": 1.8410412969424592, + "language_loss": 0.89660287, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.92166913, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.18981934, + "step": 4148, + "time_per_iteration": 2.9749581813812256 + }, + { + "auxiliary_loss_clip": 0.01478714, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.32823896, + "balance_loss_mlp": 1.02945828, + "epoch": 0.24945137531940478, + "flos": 20709244231920.0, + "grad_norm": 1.944111905243203, + "language_loss": 0.68164361, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70693845, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.2130127, + "step": 4149, + "time_per_iteration": 2.8015615940093994 + }, + { + "auxiliary_loss_clip": 0.01461303, + "auxiliary_loss_mlp": 0.01045668, + "balance_loss_clip": 1.31809545, + "balance_loss_mlp": 1.02719021, + "epoch": 0.24951149857207275, + "flos": 23773911424560.0, + "grad_norm": 1.4710500565050488, + "language_loss": 0.71252972, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73759937, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.18469238, + "step": 4150, + "time_per_iteration": 2.801377296447754 + }, + { + "auxiliary_loss_clip": 0.01462882, + "auxiliary_loss_mlp": 0.01044239, + "balance_loss_clip": 1.31637645, + "balance_loss_mlp": 1.02501106, + "epoch": 0.2495716218247407, + "flos": 15668557234200.0, + "grad_norm": 1.7730103345820802, + "language_loss": 0.73314905, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75822026, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.19238281, + "step": 4151, + "time_per_iteration": 2.6916887760162354 + }, + { + "auxiliary_loss_clip": 0.01473487, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.32229674, + "balance_loss_mlp": 1.02315712, + "epoch": 0.24963174507740868, + "flos": 24056734627440.0, + "grad_norm": 2.64141031393225, + "language_loss": 0.64759701, + "learning_rate": 3.514960119583781e-06, + "loss": 0.67275733, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.19372559, + "step": 4152, + "time_per_iteration": 2.7987751960754395 + }, + { + "auxiliary_loss_clip": 0.01462134, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.32135117, + "balance_loss_mlp": 1.02023149, + "epoch": 0.24969186833007664, + "flos": 21804632607240.0, + "grad_norm": 1.8154055967960583, + "language_loss": 0.77398586, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79899073, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.18115234, + "step": 4153, + "time_per_iteration": 2.807985305786133 + }, + { + "auxiliary_loss_clip": 0.01461074, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.31870961, + "balance_loss_mlp": 1.02640748, + "epoch": 0.24975199158274464, + "flos": 19942955608680.0, + "grad_norm": 2.3558947713130878, + "language_loss": 0.77206117, + "learning_rate": 3.514451478119711e-06, + "loss": 0.79711688, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.18103027, + "step": 4154, + "time_per_iteration": 2.721431255340576 + }, + { + "auxiliary_loss_clip": 0.01473096, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.32330918, + "balance_loss_mlp": 1.02426958, + "epoch": 0.2498121148354126, + "flos": 25343953206120.0, + "grad_norm": 1.6867603256727801, + "language_loss": 0.71374375, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.73891407, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.19665527, + "step": 4155, + "time_per_iteration": 4.200450658798218 + }, + { + "auxiliary_loss_clip": 0.0147353, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.32634878, + "balance_loss_mlp": 1.02831388, + "epoch": 0.24987223808808057, + "flos": 20563731186120.0, + "grad_norm": 1.575870974629921, + "language_loss": 0.75032115, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77552462, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.18518066, + "step": 4156, + "time_per_iteration": 2.7196686267852783 + }, + { + "auxiliary_loss_clip": 0.01460099, + "auxiliary_loss_mlp": 0.010391, + "balance_loss_clip": 1.31897211, + "balance_loss_mlp": 1.02106357, + "epoch": 0.24993236134074853, + "flos": 19752952781520.0, + "grad_norm": 5.277698965707662, + "language_loss": 0.77542436, + "learning_rate": 3.513688085236591e-06, + "loss": 0.80041635, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.18029785, + "step": 4157, + "time_per_iteration": 2.7482686042785645 + }, + { + "auxiliary_loss_clip": 0.01466006, + "auxiliary_loss_mlp": 0.01045869, + "balance_loss_clip": 1.3216325, + "balance_loss_mlp": 1.02757001, + "epoch": 0.2499924845934165, + "flos": 18774912231360.0, + "grad_norm": 1.6510378954216467, + "language_loss": 0.81782079, + "learning_rate": 3.513433506130942e-06, + "loss": 0.84293956, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.18310547, + "step": 4158, + "time_per_iteration": 2.8328280448913574 + }, + { + "auxiliary_loss_clip": 0.01468594, + "auxiliary_loss_mlp": 0.01042398, + "balance_loss_clip": 1.32473969, + "balance_loss_mlp": 1.02369404, + "epoch": 0.25005260784608446, + "flos": 16876461039840.0, + "grad_norm": 1.835615638415791, + "language_loss": 0.76387566, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.78898561, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.18713379, + "step": 4159, + "time_per_iteration": 2.8062026500701904 + }, + { + "auxiliary_loss_clip": 0.01470795, + "auxiliary_loss_mlp": 0.0103967, + "balance_loss_clip": 1.32329357, + "balance_loss_mlp": 1.02010751, + "epoch": 0.2501127310987524, + "flos": 22129712131680.0, + "grad_norm": 1.8924161593835742, + "language_loss": 0.71437991, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73948455, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.19567871, + "step": 4160, + "time_per_iteration": 2.8602190017700195 + }, + { + "auxiliary_loss_clip": 0.01295714, + "auxiliary_loss_mlp": 0.01011006, + "balance_loss_clip": 1.2331816, + "balance_loss_mlp": 1.00707209, + "epoch": 0.2501728543514204, + "flos": 69475244801760.0, + "grad_norm": 0.8984968097622648, + "language_loss": 0.5686667, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.59173393, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03930664, + "step": 4161, + "time_per_iteration": 3.329977035522461 + }, + { + "auxiliary_loss_clip": 0.01472882, + "auxiliary_loss_mlp": 0.01045646, + "balance_loss_clip": 1.32529116, + "balance_loss_mlp": 1.02662015, + "epoch": 0.25023297760408836, + "flos": 16294368248280.0, + "grad_norm": 1.8031781606524995, + "language_loss": 0.81202936, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.83721459, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.19030762, + "step": 4162, + "time_per_iteration": 2.8422839641571045 + }, + { + "auxiliary_loss_clip": 0.01466094, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.31879699, + "balance_loss_mlp": 1.02282858, + "epoch": 0.2502931008567563, + "flos": 12241589632200.0, + "grad_norm": 2.1744196771607576, + "language_loss": 0.87986702, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.90494436, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.18823242, + "step": 4163, + "time_per_iteration": 4.231068849563599 + }, + { + "auxiliary_loss_clip": 0.01468144, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.3237741, + "balance_loss_mlp": 1.0198822, + "epoch": 0.25035322410942434, + "flos": 23186661371280.0, + "grad_norm": 1.6284639139619512, + "language_loss": 0.84228277, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.86734462, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.18164062, + "step": 4164, + "time_per_iteration": 2.7803196907043457 + }, + { + "auxiliary_loss_clip": 0.01447844, + "auxiliary_loss_mlp": 0.01049423, + "balance_loss_clip": 1.3109529, + "balance_loss_mlp": 1.03213787, + "epoch": 0.2504133473620923, + "flos": 20921727109320.0, + "grad_norm": 1.8262763479542758, + "language_loss": 0.74336922, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.7683419, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.17272949, + "step": 4165, + "time_per_iteration": 2.8182547092437744 + }, + { + "auxiliary_loss_clip": 0.01465177, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.31906605, + "balance_loss_mlp": 1.02356219, + "epoch": 0.2504734706147603, + "flos": 20781371325240.0, + "grad_norm": 2.8051528433585715, + "language_loss": 0.74081862, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76589525, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.18933105, + "step": 4166, + "time_per_iteration": 2.7641329765319824 + }, + { + "auxiliary_loss_clip": 0.01453679, + "auxiliary_loss_mlp": 0.01040984, + "balance_loss_clip": 1.31093001, + "balance_loss_mlp": 1.02366316, + "epoch": 0.25053359386742824, + "flos": 24354298665000.0, + "grad_norm": 1.7536432811746914, + "language_loss": 0.82026684, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84521341, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.17321777, + "step": 4167, + "time_per_iteration": 4.280304193496704 + }, + { + "auxiliary_loss_clip": 0.0145495, + "auxiliary_loss_mlp": 0.01040897, + "balance_loss_clip": 1.31587982, + "balance_loss_mlp": 1.02324247, + "epoch": 0.2505937171200962, + "flos": 21219209930160.0, + "grad_norm": 2.6215052727063863, + "language_loss": 0.80438876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.82934725, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.17675781, + "step": 4168, + "time_per_iteration": 2.7976443767547607 + }, + { + "auxiliary_loss_clip": 0.01465, + "auxiliary_loss_mlp": 0.01038201, + "balance_loss_clip": 1.31710899, + "balance_loss_mlp": 1.01894832, + "epoch": 0.25065384037276417, + "flos": 41434714826640.0, + "grad_norm": 2.7978099440698974, + "language_loss": 0.69666576, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72169781, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.19238281, + "step": 4169, + "time_per_iteration": 2.9111461639404297 + }, + { + "auxiliary_loss_clip": 0.0144721, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.30817437, + "balance_loss_mlp": 1.02514136, + "epoch": 0.25071396362543213, + "flos": 26107764719400.0, + "grad_norm": 1.6915630977627816, + "language_loss": 0.77591765, + "learning_rate": 3.510374083241361e-06, + "loss": 0.80081326, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17199707, + "step": 4170, + "time_per_iteration": 2.8066177368164062 + }, + { + "auxiliary_loss_clip": 0.01451548, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.30875897, + "balance_loss_mlp": 1.02173972, + "epoch": 0.2507740868781001, + "flos": 19103687116560.0, + "grad_norm": 2.107869406232585, + "language_loss": 0.76685727, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.79176742, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.17724609, + "step": 4171, + "time_per_iteration": 2.779998540878296 + }, + { + "auxiliary_loss_clip": 0.01284235, + "auxiliary_loss_mlp": 0.01019987, + "balance_loss_clip": 1.22177112, + "balance_loss_mlp": 1.01588583, + "epoch": 0.25083421013076806, + "flos": 64357314057360.0, + "grad_norm": 0.8539939163429573, + "language_loss": 0.60135537, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62439758, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.04101562, + "step": 4172, + "time_per_iteration": 3.2777254581451416 + }, + { + "auxiliary_loss_clip": 0.01448805, + "auxiliary_loss_mlp": 0.01041034, + "balance_loss_clip": 1.30605614, + "balance_loss_mlp": 1.02284288, + "epoch": 0.25089433338343603, + "flos": 24284932940160.0, + "grad_norm": 1.4256171363684293, + "language_loss": 0.79380691, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81870526, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.18188477, + "step": 4173, + "time_per_iteration": 2.7914910316467285 + }, + { + "auxiliary_loss_clip": 0.01449723, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_clip": 1.30850399, + "balance_loss_mlp": 1.02834558, + "epoch": 0.250954456636104, + "flos": 14725707150960.0, + "grad_norm": 1.9418772379841127, + "language_loss": 0.84132004, + "learning_rate": 3.509352442032875e-06, + "loss": 0.86627281, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.17211914, + "step": 4174, + "time_per_iteration": 2.7265169620513916 + }, + { + "auxiliary_loss_clip": 0.01455954, + "auxiliary_loss_mlp": 0.01038785, + "balance_loss_clip": 1.31364655, + "balance_loss_mlp": 1.02104628, + "epoch": 0.25101457988877196, + "flos": 22278758104800.0, + "grad_norm": 2.0746662312390485, + "language_loss": 0.71141702, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73636442, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.17736816, + "step": 4175, + "time_per_iteration": 2.772341728210449 + }, + { + "auxiliary_loss_clip": 0.01451915, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.30680656, + "balance_loss_mlp": 1.0151974, + "epoch": 0.2510747031414399, + "flos": 25194013849080.0, + "grad_norm": 1.725396392487946, + "language_loss": 0.8078115, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83266658, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.1842041, + "step": 4176, + "time_per_iteration": 2.768709659576416 + }, + { + "auxiliary_loss_clip": 0.01443594, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.301561, + "balance_loss_mlp": 1.01925516, + "epoch": 0.25113482639410795, + "flos": 20709081798480.0, + "grad_norm": 1.871426291834914, + "language_loss": 0.83428091, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.85909498, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.18554688, + "step": 4177, + "time_per_iteration": 2.7506065368652344 + }, + { + "auxiliary_loss_clip": 0.01445484, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.3063941, + "balance_loss_mlp": 1.02528358, + "epoch": 0.2511949496467759, + "flos": 21511941572880.0, + "grad_norm": 2.1934663764773927, + "language_loss": 0.83293706, + "learning_rate": 3.508329885067698e-06, + "loss": 0.85781586, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.17126465, + "step": 4178, + "time_per_iteration": 2.8116455078125 + }, + { + "auxiliary_loss_clip": 0.01434364, + "auxiliary_loss_mlp": 0.01040961, + "balance_loss_clip": 1.29646969, + "balance_loss_mlp": 1.02504683, + "epoch": 0.2512550728994439, + "flos": 20706604688520.0, + "grad_norm": 2.3007465766487796, + "language_loss": 0.75632763, + "learning_rate": 3.508074102812112e-06, + "loss": 0.78108084, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.15917969, + "step": 4179, + "time_per_iteration": 2.762789726257324 + }, + { + "auxiliary_loss_clip": 0.01447084, + "auxiliary_loss_mlp": 0.01047943, + "balance_loss_clip": 1.30425656, + "balance_loss_mlp": 1.02931035, + "epoch": 0.25131519615211184, + "flos": 18483236406000.0, + "grad_norm": 2.000050718788955, + "language_loss": 0.71159202, + "learning_rate": 3.507818263370206e-06, + "loss": 0.73654228, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.18652344, + "step": 4180, + "time_per_iteration": 2.744610548019409 + }, + { + "auxiliary_loss_clip": 0.01442069, + "auxiliary_loss_mlp": 0.01043326, + "balance_loss_clip": 1.3038826, + "balance_loss_mlp": 1.02694702, + "epoch": 0.2513753194047798, + "flos": 20489776716600.0, + "grad_norm": 1.923946787419355, + "language_loss": 0.85932201, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88417596, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.16381836, + "step": 4181, + "time_per_iteration": 2.769188642501831 + }, + { + "auxiliary_loss_clip": 0.01444977, + "auxiliary_loss_mlp": 0.01044652, + "balance_loss_clip": 1.30499506, + "balance_loss_mlp": 1.02772474, + "epoch": 0.25143544265744777, + "flos": 37677185571600.0, + "grad_norm": 1.9734450978989297, + "language_loss": 0.68791157, + "learning_rate": 3.507306412966238e-06, + "loss": 0.71280783, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.16931152, + "step": 4182, + "time_per_iteration": 2.887260913848877 + }, + { + "auxiliary_loss_clip": 0.01287831, + "auxiliary_loss_mlp": 0.01014091, + "balance_loss_clip": 1.22482872, + "balance_loss_mlp": 1.01063395, + "epoch": 0.25149556591011574, + "flos": 69382326228120.0, + "grad_norm": 0.8541274186207141, + "language_loss": 0.70179415, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72481334, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.03466797, + "step": 4183, + "time_per_iteration": 3.2671167850494385 + }, + { + "auxiliary_loss_clip": 0.01445579, + "auxiliary_loss_mlp": 0.01041808, + "balance_loss_clip": 1.3044765, + "balance_loss_mlp": 1.02381921, + "epoch": 0.2515556891627837, + "flos": 13994365344480.0, + "grad_norm": 1.6509566336777721, + "language_loss": 0.74528563, + "learning_rate": 3.506794333933431e-06, + "loss": 0.77015948, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17993164, + "step": 4184, + "time_per_iteration": 2.808732271194458 + }, + { + "auxiliary_loss_clip": 0.01446244, + "auxiliary_loss_mlp": 0.01043768, + "balance_loss_clip": 1.30574632, + "balance_loss_mlp": 1.02575541, + "epoch": 0.25161581241545167, + "flos": 22168597959360.0, + "grad_norm": 3.2015904625106737, + "language_loss": 0.83548087, + "learning_rate": 3.506538208705484e-06, + "loss": 0.86038101, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.18029785, + "step": 4185, + "time_per_iteration": 2.7879490852355957 + }, + { + "auxiliary_loss_clip": 0.01280373, + "auxiliary_loss_mlp": 0.01015049, + "balance_loss_clip": 1.21811962, + "balance_loss_mlp": 1.01137733, + "epoch": 0.25167593566811963, + "flos": 69372092921400.0, + "grad_norm": 0.8112299042804653, + "language_loss": 0.61470115, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63765538, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03662109, + "step": 4186, + "time_per_iteration": 3.1305298805236816 + }, + { + "auxiliary_loss_clip": 0.01445808, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_clip": 1.30588341, + "balance_loss_mlp": 1.02652895, + "epoch": 0.2517360589207876, + "flos": 13265419431240.0, + "grad_norm": 1.847136370545374, + "language_loss": 0.79819274, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.8230921, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17590332, + "step": 4187, + "time_per_iteration": 2.7881603240966797 + }, + { + "auxiliary_loss_clip": 0.01442863, + "auxiliary_loss_mlp": 0.01053876, + "balance_loss_clip": 1.30509996, + "balance_loss_mlp": 1.03679323, + "epoch": 0.25179618217345556, + "flos": 20381809422600.0, + "grad_norm": 1.6197613040401093, + "language_loss": 0.79840636, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82337379, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17089844, + "step": 4188, + "time_per_iteration": 2.7345285415649414 + }, + { + "auxiliary_loss_clip": 0.01450823, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.31089151, + "balance_loss_mlp": 1.02679837, + "epoch": 0.25185630542612353, + "flos": 27669441178800.0, + "grad_norm": 1.8044407364164676, + "language_loss": 0.74226332, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76720768, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.16809082, + "step": 4189, + "time_per_iteration": 2.8012471199035645 + }, + { + "auxiliary_loss_clip": 0.01439508, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.30358982, + "balance_loss_mlp": 1.0249455, + "epoch": 0.25191642867879155, + "flos": 21001407357600.0, + "grad_norm": 1.8994034631688832, + "language_loss": 0.85196722, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.87677753, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16589355, + "step": 4190, + "time_per_iteration": 2.9036366939544678 + }, + { + "auxiliary_loss_clip": 0.01447244, + "auxiliary_loss_mlp": 0.01044438, + "balance_loss_clip": 1.30761337, + "balance_loss_mlp": 1.02401769, + "epoch": 0.2519765519314595, + "flos": 21110633510760.0, + "grad_norm": 1.6942930097592699, + "language_loss": 0.75726604, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.78218293, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.20422363, + "step": 4191, + "time_per_iteration": 2.747919797897339 + }, + { + "auxiliary_loss_clip": 0.01269373, + "auxiliary_loss_mlp": 0.01012992, + "balance_loss_clip": 1.20757735, + "balance_loss_mlp": 1.00946355, + "epoch": 0.2520366751841275, + "flos": 62761340514960.0, + "grad_norm": 0.7214638737513498, + "language_loss": 0.57196862, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59479225, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03540039, + "step": 4192, + "time_per_iteration": 3.3379642963409424 + }, + { + "auxiliary_loss_clip": 0.01446035, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.30858636, + "balance_loss_mlp": 1.01969957, + "epoch": 0.25209679843679544, + "flos": 22235039882280.0, + "grad_norm": 2.045590952590254, + "language_loss": 0.76561671, + "learning_rate": 3.504487151087323e-06, + "loss": 0.79044425, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.17028809, + "step": 4193, + "time_per_iteration": 4.218744277954102 + }, + { + "auxiliary_loss_clip": 0.01448706, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.3076036, + "balance_loss_mlp": 1.02574325, + "epoch": 0.2521569216894634, + "flos": 12170437139520.0, + "grad_norm": 1.9116916077290902, + "language_loss": 0.845137, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.87005579, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17431641, + "step": 4194, + "time_per_iteration": 2.7191522121429443 + }, + { + "auxiliary_loss_clip": 0.01448932, + "auxiliary_loss_mlp": 0.01043655, + "balance_loss_clip": 1.30668473, + "balance_loss_mlp": 1.02809787, + "epoch": 0.2522170449421314, + "flos": 23705560908720.0, + "grad_norm": 1.4672491942328114, + "language_loss": 0.88493794, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90986383, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.15576172, + "step": 4195, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.0145732, + "auxiliary_loss_mlp": 0.01043808, + "balance_loss_clip": 1.31635439, + "balance_loss_mlp": 1.02509224, + "epoch": 0.25227716819479934, + "flos": 20959841378160.0, + "grad_norm": 1.871418012650188, + "language_loss": 0.85737139, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88238269, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.18737793, + "step": 4196, + "time_per_iteration": 2.7779273986816406 + }, + { + "auxiliary_loss_clip": 0.01453396, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.31154418, + "balance_loss_mlp": 1.02400887, + "epoch": 0.2523372914474673, + "flos": 23336600728320.0, + "grad_norm": 1.9674327154746716, + "language_loss": 0.83887661, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.86381435, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.16381836, + "step": 4197, + "time_per_iteration": 2.906425714492798 + }, + { + "auxiliary_loss_clip": 0.01455908, + "auxiliary_loss_mlp": 0.01048264, + "balance_loss_clip": 1.31386697, + "balance_loss_mlp": 1.02957249, + "epoch": 0.25239741470013527, + "flos": 36976486095720.0, + "grad_norm": 1.8534826458335882, + "language_loss": 0.73408794, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75912964, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.18701172, + "step": 4198, + "time_per_iteration": 2.8842720985412598 + }, + { + "auxiliary_loss_clip": 0.01454681, + "auxiliary_loss_mlp": 0.01039931, + "balance_loss_clip": 1.31049764, + "balance_loss_mlp": 1.02169168, + "epoch": 0.25245753795280323, + "flos": 18520457290920.0, + "grad_norm": 1.9024515784206457, + "language_loss": 0.77183998, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79678607, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.18237305, + "step": 4199, + "time_per_iteration": 2.7588393688201904 + }, + { + "auxiliary_loss_clip": 0.01455237, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.31161284, + "balance_loss_mlp": 1.02751315, + "epoch": 0.2525176612054712, + "flos": 32351523127920.0, + "grad_norm": 1.9289197926004358, + "language_loss": 0.73614287, + "learning_rate": 3.502689480360739e-06, + "loss": 0.76113653, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.1661377, + "step": 4200, + "time_per_iteration": 2.94952654838562 + }, + { + "auxiliary_loss_clip": 0.01450451, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.30949044, + "balance_loss_mlp": 1.02938962, + "epoch": 0.25257778445813917, + "flos": 45266198551200.0, + "grad_norm": 2.0236298098336403, + "language_loss": 0.82718611, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.85215497, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17041016, + "step": 4201, + "time_per_iteration": 3.0247371196746826 + }, + { + "auxiliary_loss_clip": 0.01457907, + "auxiliary_loss_mlp": 0.01042037, + "balance_loss_clip": 1.3150692, + "balance_loss_mlp": 1.02513266, + "epoch": 0.25263790771080713, + "flos": 23373050054400.0, + "grad_norm": 1.603290228856865, + "language_loss": 0.74749213, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77249157, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.16894531, + "step": 4202, + "time_per_iteration": 4.228376865386963 + }, + { + "auxiliary_loss_clip": 0.01451719, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.31307983, + "balance_loss_mlp": 1.0215044, + "epoch": 0.25269803096347515, + "flos": 18519645123720.0, + "grad_norm": 8.678505247396105, + "language_loss": 0.73507822, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75998002, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.1696167, + "step": 4203, + "time_per_iteration": 2.8666720390319824 + }, + { + "auxiliary_loss_clip": 0.01458567, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.31560862, + "balance_loss_mlp": 1.02255034, + "epoch": 0.2527581542161431, + "flos": 24616266152040.0, + "grad_norm": 1.765238448676078, + "language_loss": 0.78110743, + "learning_rate": 3.501660986124297e-06, + "loss": 0.80609518, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.17651367, + "step": 4204, + "time_per_iteration": 2.829845905303955 + }, + { + "auxiliary_loss_clip": 0.01452941, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.31110406, + "balance_loss_mlp": 1.02038169, + "epoch": 0.2528182774688111, + "flos": 12645171762480.0, + "grad_norm": 1.9313290505325387, + "language_loss": 0.7225275, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74742943, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.16870117, + "step": 4205, + "time_per_iteration": 5.715506553649902 + }, + { + "auxiliary_loss_clip": 0.01440383, + "auxiliary_loss_mlp": 0.01040966, + "balance_loss_clip": 1.30480742, + "balance_loss_mlp": 1.02561152, + "epoch": 0.25287840072147905, + "flos": 46945019793960.0, + "grad_norm": 1.3651006627194509, + "language_loss": 0.75787461, + "learning_rate": 3.50114639730826e-06, + "loss": 0.78268814, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.15362549, + "step": 4206, + "time_per_iteration": 2.999711751937866 + }, + { + "auxiliary_loss_clip": 0.01452028, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.3098774, + "balance_loss_mlp": 1.02382588, + "epoch": 0.252938523974147, + "flos": 18884341426320.0, + "grad_norm": 1.6748004640572685, + "language_loss": 0.79017127, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.815099, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.16906738, + "step": 4207, + "time_per_iteration": 2.763817310333252 + }, + { + "auxiliary_loss_clip": 0.01446324, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.30830741, + "balance_loss_mlp": 1.02451658, + "epoch": 0.252998647226815, + "flos": 21439814479560.0, + "grad_norm": 1.5776484995575728, + "language_loss": 0.76607776, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.79094785, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.16149902, + "step": 4208, + "time_per_iteration": 2.750213146209717 + }, + { + "auxiliary_loss_clip": 0.01444195, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_clip": 1.30628538, + "balance_loss_mlp": 1.02663505, + "epoch": 0.25305877047948294, + "flos": 25447006888560.0, + "grad_norm": 2.3723874707468453, + "language_loss": 0.70103598, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72590542, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.16101074, + "step": 4209, + "time_per_iteration": 2.8422558307647705 + }, + { + "auxiliary_loss_clip": 0.01264648, + "auxiliary_loss_mlp": 0.0100574, + "balance_loss_clip": 1.20097184, + "balance_loss_mlp": 1.00240195, + "epoch": 0.2531188937321509, + "flos": 60200913241800.0, + "grad_norm": 0.7561441105799587, + "language_loss": 0.55138528, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57408917, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.03344727, + "step": 4210, + "time_per_iteration": 3.3120734691619873 + }, + { + "auxiliary_loss_clip": 0.01446981, + "auxiliary_loss_mlp": 0.01037834, + "balance_loss_clip": 1.30485284, + "balance_loss_mlp": 1.02071571, + "epoch": 0.25317901698481887, + "flos": 19687119984000.0, + "grad_norm": 2.314404661150003, + "language_loss": 0.80558723, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.8304354, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17138672, + "step": 4211, + "time_per_iteration": 2.8177175521850586 + }, + { + "auxiliary_loss_clip": 0.01441653, + "auxiliary_loss_mlp": 0.01040126, + "balance_loss_clip": 1.30502343, + "balance_loss_mlp": 1.02505183, + "epoch": 0.25323914023748684, + "flos": 24429552602040.0, + "grad_norm": 1.4449347618813624, + "language_loss": 0.78033686, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80515468, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.1506958, + "step": 4212, + "time_per_iteration": 2.779204845428467 + }, + { + "auxiliary_loss_clip": 0.01450957, + "auxiliary_loss_mlp": 0.0103808, + "balance_loss_clip": 1.3092308, + "balance_loss_mlp": 1.02037716, + "epoch": 0.2532992634901548, + "flos": 25452976317480.0, + "grad_norm": 2.1652856223569907, + "language_loss": 0.537368, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.56225836, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.17700195, + "step": 4213, + "time_per_iteration": 2.9824485778808594 + }, + { + "auxiliary_loss_clip": 0.01448686, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_clip": 1.30697775, + "balance_loss_mlp": 1.02863538, + "epoch": 0.25335938674282277, + "flos": 18885275418600.0, + "grad_norm": 3.0690834164119334, + "language_loss": 0.6536001, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67855543, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.18225098, + "step": 4214, + "time_per_iteration": 2.727525472640991 + }, + { + "auxiliary_loss_clip": 0.0126072, + "auxiliary_loss_mlp": 0.01003983, + "balance_loss_clip": 1.19798076, + "balance_loss_mlp": 1.00007331, + "epoch": 0.25341950999549073, + "flos": 53075954886480.0, + "grad_norm": 0.8612471063617134, + "language_loss": 0.58071637, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60336339, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.0390625, + "step": 4215, + "time_per_iteration": 3.085954427719116 + }, + { + "auxiliary_loss_clip": 0.01450856, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.3113395, + "balance_loss_mlp": 1.02550006, + "epoch": 0.2534796332481587, + "flos": 39027556796040.0, + "grad_norm": 1.6156981878933714, + "language_loss": 0.83930171, + "learning_rate": 3.498570039373066e-06, + "loss": 0.86423504, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16955566, + "step": 4216, + "time_per_iteration": 2.9346415996551514 + }, + { + "auxiliary_loss_clip": 0.01450867, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.31008327, + "balance_loss_mlp": 1.0206809, + "epoch": 0.2535397565008267, + "flos": 23592314527920.0, + "grad_norm": 1.8138634539453238, + "language_loss": 0.80720466, + "learning_rate": 3.498312090875666e-06, + "loss": 0.83209491, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17492676, + "step": 4217, + "time_per_iteration": 2.7749297618865967 + }, + { + "auxiliary_loss_clip": 0.01444903, + "auxiliary_loss_mlp": 0.01039307, + "balance_loss_clip": 1.3043704, + "balance_loss_mlp": 1.02255857, + "epoch": 0.2535998797534947, + "flos": 19286096180400.0, + "grad_norm": 2.2882835084023125, + "language_loss": 0.75072211, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77556431, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1673584, + "step": 4218, + "time_per_iteration": 2.765687942504883 + }, + { + "auxiliary_loss_clip": 0.01452904, + "auxiliary_loss_mlp": 0.01042498, + "balance_loss_clip": 1.30850029, + "balance_loss_mlp": 1.02460515, + "epoch": 0.25366000300616265, + "flos": 24029706440880.0, + "grad_norm": 1.899534259799688, + "language_loss": 0.75363779, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.77859181, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.17895508, + "step": 4219, + "time_per_iteration": 2.8023247718811035 + }, + { + "auxiliary_loss_clip": 0.01459047, + "auxiliary_loss_mlp": 0.01043743, + "balance_loss_clip": 1.31503272, + "balance_loss_mlp": 1.02521765, + "epoch": 0.2537201262588306, + "flos": 16293677906160.0, + "grad_norm": 1.9238128130237897, + "language_loss": 0.81988901, + "learning_rate": 3.497537904525736e-06, + "loss": 0.84491688, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.1854248, + "step": 4220, + "time_per_iteration": 2.866450786590576 + }, + { + "auxiliary_loss_clip": 0.01459279, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_clip": 1.31583107, + "balance_loss_mlp": 1.02298427, + "epoch": 0.2537802495114986, + "flos": 23299907752080.0, + "grad_norm": 2.1484891838439353, + "language_loss": 0.71490723, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73991495, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.18505859, + "step": 4221, + "time_per_iteration": 2.8114025592803955 + }, + { + "auxiliary_loss_clip": 0.01456461, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.31172013, + "balance_loss_mlp": 1.02117157, + "epoch": 0.25384037276416654, + "flos": 17643602438640.0, + "grad_norm": 2.016801210801429, + "language_loss": 0.62528253, + "learning_rate": 3.497021496342202e-06, + "loss": 0.65024674, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.18786621, + "step": 4222, + "time_per_iteration": 2.774909734725952 + }, + { + "auxiliary_loss_clip": 0.01465427, + "auxiliary_loss_mlp": 0.01050218, + "balance_loss_clip": 1.31932187, + "balance_loss_mlp": 1.03187203, + "epoch": 0.2539004960168345, + "flos": 21512063397960.0, + "grad_norm": 1.7169820167183383, + "language_loss": 0.7503922, + "learning_rate": 3.496763207094731e-06, + "loss": 0.77554864, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.18359375, + "step": 4223, + "time_per_iteration": 2.8251688480377197 + }, + { + "auxiliary_loss_clip": 0.01449099, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.31019378, + "balance_loss_mlp": 1.01804793, + "epoch": 0.2539606192695025, + "flos": 23956158054960.0, + "grad_norm": 1.581059594877548, + "language_loss": 0.80417049, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82900906, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16699219, + "step": 4224, + "time_per_iteration": 2.7924365997314453 + }, + { + "auxiliary_loss_clip": 0.01453312, + "auxiliary_loss_mlp": 0.01045585, + "balance_loss_clip": 1.31264389, + "balance_loss_mlp": 1.02760863, + "epoch": 0.25402074252217044, + "flos": 24175057053240.0, + "grad_norm": 1.3818575773030388, + "language_loss": 0.77985561, + "learning_rate": 3.496246458337354e-06, + "loss": 0.80484456, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17993164, + "step": 4225, + "time_per_iteration": 2.90047025680542 + }, + { + "auxiliary_loss_clip": 0.01458311, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_clip": 1.31608176, + "balance_loss_mlp": 1.02775955, + "epoch": 0.2540808657748384, + "flos": 22308100967880.0, + "grad_norm": 1.5838158690429804, + "language_loss": 0.84609985, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.87114555, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.18505859, + "step": 4226, + "time_per_iteration": 2.753413200378418 + }, + { + "auxiliary_loss_clip": 0.01455593, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.3136698, + "balance_loss_mlp": 1.02551353, + "epoch": 0.25414098902750637, + "flos": 27605273324040.0, + "grad_norm": 1.503317639569364, + "language_loss": 0.71324408, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73824453, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.18933105, + "step": 4227, + "time_per_iteration": 2.8464810848236084 + }, + { + "auxiliary_loss_clip": 0.01264345, + "auxiliary_loss_mlp": 0.01013668, + "balance_loss_clip": 1.20193946, + "balance_loss_mlp": 1.0098294, + "epoch": 0.25420111228017434, + "flos": 58184829966600.0, + "grad_norm": 0.9773439854177551, + "language_loss": 0.61834973, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64112985, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03833008, + "step": 4228, + "time_per_iteration": 3.091219425201416 + }, + { + "auxiliary_loss_clip": 0.01461323, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_clip": 1.31688285, + "balance_loss_mlp": 1.0255878, + "epoch": 0.2542612355328423, + "flos": 11466935686800.0, + "grad_norm": 2.1507882921176713, + "language_loss": 0.86248589, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88754654, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.19165039, + "step": 4229, + "time_per_iteration": 2.813645601272583 + }, + { + "auxiliary_loss_clip": 0.01464865, + "auxiliary_loss_mlp": 0.01044506, + "balance_loss_clip": 1.32196617, + "balance_loss_mlp": 1.02548051, + "epoch": 0.2543213587855103, + "flos": 22971011041800.0, + "grad_norm": 2.2296665621126293, + "language_loss": 0.77566296, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.80075669, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.19006348, + "step": 4230, + "time_per_iteration": 2.8010270595550537 + }, + { + "auxiliary_loss_clip": 0.01457233, + "auxiliary_loss_mlp": 0.01047377, + "balance_loss_clip": 1.3139298, + "balance_loss_mlp": 1.0281961, + "epoch": 0.2543814820381783, + "flos": 18256906077840.0, + "grad_norm": 2.2891464805357895, + "language_loss": 0.7549907, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.78003687, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.19189453, + "step": 4231, + "time_per_iteration": 2.787749767303467 + }, + { + "auxiliary_loss_clip": 0.01457298, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_clip": 1.31442928, + "balance_loss_mlp": 1.02614713, + "epoch": 0.25444160529084625, + "flos": 15636940302960.0, + "grad_norm": 1.7801635902054587, + "language_loss": 0.74003398, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.7650435, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.17504883, + "step": 4232, + "time_per_iteration": 4.129711627960205 + }, + { + "auxiliary_loss_clip": 0.01457838, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_clip": 1.31444407, + "balance_loss_mlp": 1.02597666, + "epoch": 0.2545017285435142, + "flos": 24606154670400.0, + "grad_norm": 1.8486936052051461, + "language_loss": 0.87038827, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.8954159, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.18945312, + "step": 4233, + "time_per_iteration": 2.8021368980407715 + }, + { + "auxiliary_loss_clip": 0.01443637, + "auxiliary_loss_mlp": 0.01046967, + "balance_loss_clip": 1.30671501, + "balance_loss_mlp": 1.02968156, + "epoch": 0.2545618517961822, + "flos": 24684210584280.0, + "grad_norm": 1.5469818435551412, + "language_loss": 0.74969608, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77460212, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.17272949, + "step": 4234, + "time_per_iteration": 2.8192670345306396 + }, + { + "auxiliary_loss_clip": 0.01463922, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.31929803, + "balance_loss_mlp": 1.02915871, + "epoch": 0.25462197504885015, + "flos": 23920439679360.0, + "grad_norm": 1.5339006248214608, + "language_loss": 0.75390327, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77900761, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.17346191, + "step": 4235, + "time_per_iteration": 2.8241541385650635 + }, + { + "auxiliary_loss_clip": 0.01479466, + "auxiliary_loss_mlp": 0.01048553, + "balance_loss_clip": 1.3265779, + "balance_loss_mlp": 1.02847815, + "epoch": 0.2546820983015181, + "flos": 24794654988240.0, + "grad_norm": 1.9438895869515629, + "language_loss": 0.65190399, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6771841, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.20056152, + "step": 4236, + "time_per_iteration": 2.879593849182129 + }, + { + "auxiliary_loss_clip": 0.01454737, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.31380177, + "balance_loss_mlp": 1.02171612, + "epoch": 0.2547422215541861, + "flos": 18738787772160.0, + "grad_norm": 1.5335808177005492, + "language_loss": 0.67533505, + "learning_rate": 3.493141202562354e-06, + "loss": 0.70026314, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16357422, + "step": 4237, + "time_per_iteration": 2.8595871925354004 + }, + { + "auxiliary_loss_clip": 0.01456479, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.31346643, + "balance_loss_mlp": 1.03313136, + "epoch": 0.25480234480685404, + "flos": 21037369383360.0, + "grad_norm": 1.8333338066795857, + "language_loss": 0.75652534, + "learning_rate": 3.492882062983333e-06, + "loss": 0.7816056, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1842041, + "step": 4238, + "time_per_iteration": 2.7355198860168457 + }, + { + "auxiliary_loss_clip": 0.01459467, + "auxiliary_loss_mlp": 0.0104972, + "balance_loss_clip": 1.3153491, + "balance_loss_mlp": 1.03090882, + "epoch": 0.254862468059522, + "flos": 25087589672760.0, + "grad_norm": 1.8569577491826006, + "language_loss": 0.80725312, + "learning_rate": 3.492622866794074e-06, + "loss": 0.83234501, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.18823242, + "step": 4239, + "time_per_iteration": 2.8337814807891846 + }, + { + "auxiliary_loss_clip": 0.01455039, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.31628728, + "balance_loss_mlp": 1.02472043, + "epoch": 0.25492259131219, + "flos": 20563284494160.0, + "grad_norm": 1.5641525139392625, + "language_loss": 0.77931648, + "learning_rate": 3.492363614004407e-06, + "loss": 0.80428976, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.17553711, + "step": 4240, + "time_per_iteration": 4.2319111824035645 + }, + { + "auxiliary_loss_clip": 0.01464447, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.31928635, + "balance_loss_mlp": 1.02404475, + "epoch": 0.25498271456485794, + "flos": 25047485594280.0, + "grad_norm": 1.6681328473116872, + "language_loss": 0.83584237, + "learning_rate": 3.492104304624162e-06, + "loss": 0.86090994, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.18261719, + "step": 4241, + "time_per_iteration": 2.923496723175049 + }, + { + "auxiliary_loss_clip": 0.01456761, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.31423569, + "balance_loss_mlp": 1.02284527, + "epoch": 0.2550428378175259, + "flos": 26184196298880.0, + "grad_norm": 1.5149949081793093, + "language_loss": 0.73246622, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75743872, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.17651367, + "step": 4242, + "time_per_iteration": 2.853299140930176 + }, + { + "auxiliary_loss_clip": 0.01457895, + "auxiliary_loss_mlp": 0.01040663, + "balance_loss_clip": 1.3139714, + "balance_loss_mlp": 1.02282906, + "epoch": 0.2551029610701939, + "flos": 15271269399720.0, + "grad_norm": 5.2290888967279825, + "language_loss": 0.73604393, + "learning_rate": 3.491585516131273e-06, + "loss": 0.76102954, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.1784668, + "step": 4243, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.01451496, + "auxiliary_loss_mlp": 0.01044318, + "balance_loss_clip": 1.31070888, + "balance_loss_mlp": 1.02637744, + "epoch": 0.2551630843228619, + "flos": 18116550293760.0, + "grad_norm": 1.626103564340621, + "language_loss": 0.81877369, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84373182, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.17944336, + "step": 4244, + "time_per_iteration": 5.849533557891846 + }, + { + "auxiliary_loss_clip": 0.0128555, + "auxiliary_loss_mlp": 0.0100611, + "balance_loss_clip": 1.22388268, + "balance_loss_mlp": 1.00265253, + "epoch": 0.25522320757552985, + "flos": 70538041645200.0, + "grad_norm": 0.6974520169818771, + "language_loss": 0.57780159, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.60071814, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03466797, + "step": 4245, + "time_per_iteration": 3.34749174118042 + }, + { + "auxiliary_loss_clip": 0.01460574, + "auxiliary_loss_mlp": 0.01043936, + "balance_loss_clip": 1.31513965, + "balance_loss_mlp": 1.0257448, + "epoch": 0.2552833308281978, + "flos": 22898234214720.0, + "grad_norm": 2.5337334499130466, + "language_loss": 0.65908301, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.68412817, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.18188477, + "step": 4246, + "time_per_iteration": 2.7669498920440674 + }, + { + "auxiliary_loss_clip": 0.01445407, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.30791867, + "balance_loss_mlp": 1.02352214, + "epoch": 0.2553434540808658, + "flos": 22058600247360.0, + "grad_norm": 1.829967572141715, + "language_loss": 0.81494886, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83980531, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.16711426, + "step": 4247, + "time_per_iteration": 2.7476930618286133 + }, + { + "auxiliary_loss_clip": 0.0147016, + "auxiliary_loss_mlp": 0.01044363, + "balance_loss_clip": 1.3211087, + "balance_loss_mlp": 1.02505124, + "epoch": 0.25540357733353375, + "flos": 16548457713480.0, + "grad_norm": 2.5082999879967307, + "language_loss": 0.83812249, + "learning_rate": 3.490287555252514e-06, + "loss": 0.86326766, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.1932373, + "step": 4248, + "time_per_iteration": 2.7804415225982666 + }, + { + "auxiliary_loss_clip": 0.01459108, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_clip": 1.31417811, + "balance_loss_mlp": 1.02813315, + "epoch": 0.2554637005862017, + "flos": 17569323102240.0, + "grad_norm": 1.9603531712225324, + "language_loss": 0.84487975, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86993444, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.18237305, + "step": 4249, + "time_per_iteration": 2.7520363330841064 + }, + { + "auxiliary_loss_clip": 0.01288612, + "auxiliary_loss_mlp": 0.01004773, + "balance_loss_clip": 1.22656286, + "balance_loss_mlp": 1.00095868, + "epoch": 0.2555238238388697, + "flos": 72259345878480.0, + "grad_norm": 0.772720972324536, + "language_loss": 0.56303221, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58596611, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.03808594, + "step": 4250, + "time_per_iteration": 3.270336627960205 + }, + { + "auxiliary_loss_clip": 0.01453912, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.30974567, + "balance_loss_mlp": 1.02019429, + "epoch": 0.25558394709153764, + "flos": 24394808827080.0, + "grad_norm": 1.9997643271501016, + "language_loss": 0.82082963, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.84576356, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.19299316, + "step": 4251, + "time_per_iteration": 2.8162314891815186 + }, + { + "auxiliary_loss_clip": 0.01280644, + "auxiliary_loss_mlp": 0.01014666, + "balance_loss_clip": 1.21867442, + "balance_loss_mlp": 1.01113749, + "epoch": 0.2556440703442056, + "flos": 69247330747560.0, + "grad_norm": 0.8002706786115882, + "language_loss": 0.66087091, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.683824, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.03540039, + "step": 4252, + "time_per_iteration": 3.230161190032959 + }, + { + "auxiliary_loss_clip": 0.01449393, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.31006181, + "balance_loss_mlp": 1.02329099, + "epoch": 0.2557041935968736, + "flos": 24869381016600.0, + "grad_norm": 1.84296205996976, + "language_loss": 0.73666704, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.76156384, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.16992188, + "step": 4253, + "time_per_iteration": 2.8341429233551025 + }, + { + "auxiliary_loss_clip": 0.01452596, + "auxiliary_loss_mlp": 0.01043579, + "balance_loss_clip": 1.31054723, + "balance_loss_mlp": 1.02590013, + "epoch": 0.25576431684954154, + "flos": 22497251019480.0, + "grad_norm": 2.10250111187508, + "language_loss": 0.73313218, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75809395, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.17675781, + "step": 4254, + "time_per_iteration": 2.889212131500244 + }, + { + "auxiliary_loss_clip": 0.0144922, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.30703855, + "balance_loss_mlp": 1.0249902, + "epoch": 0.2558244401022095, + "flos": 19831292953920.0, + "grad_norm": 2.590072231053863, + "language_loss": 0.81136715, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.83628958, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.18029785, + "step": 4255, + "time_per_iteration": 2.793555974960327 + }, + { + "auxiliary_loss_clip": 0.01452465, + "auxiliary_loss_mlp": 0.01044409, + "balance_loss_clip": 1.31237555, + "balance_loss_mlp": 1.02670634, + "epoch": 0.2558845633548775, + "flos": 23225547198960.0, + "grad_norm": 1.4479370469297415, + "language_loss": 0.85640371, + "learning_rate": 3.488207879742721e-06, + "loss": 0.88137245, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.17700195, + "step": 4256, + "time_per_iteration": 2.872166633605957 + }, + { + "auxiliary_loss_clip": 0.01461375, + "auxiliary_loss_mlp": 0.01049695, + "balance_loss_clip": 1.31435513, + "balance_loss_mlp": 1.02945304, + "epoch": 0.2559446866075455, + "flos": 16842407607000.0, + "grad_norm": 1.931894892204984, + "language_loss": 0.75709832, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.78220904, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.20239258, + "step": 4257, + "time_per_iteration": 2.743300437927246 + }, + { + "auxiliary_loss_clip": 0.01278717, + "auxiliary_loss_mlp": 0.01006085, + "balance_loss_clip": 1.21674657, + "balance_loss_mlp": 1.00248516, + "epoch": 0.25600480986021346, + "flos": 57608584778880.0, + "grad_norm": 0.7960145665891584, + "language_loss": 0.65287209, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.6757201, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03588867, + "step": 4258, + "time_per_iteration": 3.2198967933654785 + }, + { + "auxiliary_loss_clip": 0.01436439, + "auxiliary_loss_mlp": 0.01040495, + "balance_loss_clip": 1.2996074, + "balance_loss_mlp": 1.02174306, + "epoch": 0.2560649331128814, + "flos": 27825431181480.0, + "grad_norm": 1.5868984096403924, + "language_loss": 0.76129872, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78606802, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.1875, + "step": 4259, + "time_per_iteration": 2.7874486446380615 + }, + { + "auxiliary_loss_clip": 0.0127921, + "auxiliary_loss_mlp": 0.01012672, + "balance_loss_clip": 1.21712971, + "balance_loss_mlp": 1.00954866, + "epoch": 0.2561250563655494, + "flos": 70967433711240.0, + "grad_norm": 0.8003214050986316, + "language_loss": 0.58436143, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60728025, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.03125, + "step": 4260, + "time_per_iteration": 3.3285255432128906 + }, + { + "auxiliary_loss_clip": 0.01444997, + "auxiliary_loss_mlp": 0.01049889, + "balance_loss_clip": 1.30308318, + "balance_loss_mlp": 1.0313406, + "epoch": 0.25618517961821735, + "flos": 27017535970440.0, + "grad_norm": 1.6430707103764608, + "language_loss": 0.76536989, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79031873, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.1854248, + "step": 4261, + "time_per_iteration": 2.8488316535949707 + }, + { + "auxiliary_loss_clip": 0.01446446, + "auxiliary_loss_mlp": 0.01053324, + "balance_loss_clip": 1.30555987, + "balance_loss_mlp": 1.02834928, + "epoch": 0.2562453028708853, + "flos": 23072521606560.0, + "grad_norm": 2.2661708569626744, + "language_loss": 0.83142519, + "learning_rate": 3.486645752648842e-06, + "loss": 0.8564229, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.24951172, + "step": 4262, + "time_per_iteration": 2.777883768081665 + }, + { + "auxiliary_loss_clip": 0.01454182, + "auxiliary_loss_mlp": 0.01048782, + "balance_loss_clip": 1.30691886, + "balance_loss_mlp": 1.03028083, + "epoch": 0.2563054261235533, + "flos": 15124984795080.0, + "grad_norm": 3.073753346787644, + "language_loss": 0.73754406, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76257372, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.18493652, + "step": 4263, + "time_per_iteration": 2.7247934341430664 + }, + { + "auxiliary_loss_clip": 0.01445436, + "auxiliary_loss_mlp": 0.01050347, + "balance_loss_clip": 1.30705011, + "balance_loss_mlp": 1.03244197, + "epoch": 0.25636554937622125, + "flos": 27860621648400.0, + "grad_norm": 1.5408764107422181, + "language_loss": 0.82799178, + "learning_rate": 3.486124592522163e-06, + "loss": 0.85294962, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.17919922, + "step": 4264, + "time_per_iteration": 2.7772116661071777 + }, + { + "auxiliary_loss_clip": 0.01449711, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.30797291, + "balance_loss_mlp": 1.02304697, + "epoch": 0.2564256726288892, + "flos": 28911560850720.0, + "grad_norm": 1.7467706303320067, + "language_loss": 0.75240123, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.77731615, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.18725586, + "step": 4265, + "time_per_iteration": 2.8070645332336426 + }, + { + "auxiliary_loss_clip": 0.01445894, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.30466342, + "balance_loss_mlp": 1.01488459, + "epoch": 0.2564857958815572, + "flos": 18519523298640.0, + "grad_norm": 1.7112272774055548, + "language_loss": 0.8206979, + "learning_rate": 3.485603206979513e-06, + "loss": 0.84547889, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.17321777, + "step": 4266, + "time_per_iteration": 2.710175037384033 + }, + { + "auxiliary_loss_clip": 0.01434376, + "auxiliary_loss_mlp": 0.01042081, + "balance_loss_clip": 1.29589677, + "balance_loss_mlp": 1.02384233, + "epoch": 0.25654591913422514, + "flos": 25813327525560.0, + "grad_norm": 1.458042954967938, + "language_loss": 0.79512262, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81988716, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.18237305, + "step": 4267, + "time_per_iteration": 2.7805206775665283 + }, + { + "auxiliary_loss_clip": 0.01435169, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.30091953, + "balance_loss_mlp": 1.02073121, + "epoch": 0.2566060423868931, + "flos": 19104458675400.0, + "grad_norm": 1.5536204837580847, + "language_loss": 0.79205698, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81679201, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.17590332, + "step": 4268, + "time_per_iteration": 2.7685108184814453 + }, + { + "auxiliary_loss_clip": 0.01439579, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_clip": 1.29931521, + "balance_loss_mlp": 1.02969861, + "epoch": 0.25666616563956113, + "flos": 23848068935880.0, + "grad_norm": 1.5324342853099513, + "language_loss": 0.6843695, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70923811, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.17590332, + "step": 4269, + "time_per_iteration": 2.774477481842041 + }, + { + "auxiliary_loss_clip": 0.01444554, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.30293846, + "balance_loss_mlp": 1.02006078, + "epoch": 0.2567262888922291, + "flos": 14607465941880.0, + "grad_norm": 2.6613471786514253, + "language_loss": 0.79979098, + "learning_rate": 3.484559759962666e-06, + "loss": 0.82462144, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.1842041, + "step": 4270, + "time_per_iteration": 4.087042331695557 + }, + { + "auxiliary_loss_clip": 0.01456171, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.30958438, + "balance_loss_mlp": 1.02257538, + "epoch": 0.25678641214489706, + "flos": 32929108391520.0, + "grad_norm": 1.8809722027096443, + "language_loss": 0.68802238, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.71301389, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.20385742, + "step": 4271, + "time_per_iteration": 2.855652093887329 + }, + { + "auxiliary_loss_clip": 0.01449527, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.30486572, + "balance_loss_mlp": 1.02434254, + "epoch": 0.256846535397565, + "flos": 24104473077600.0, + "grad_norm": 1.3194825432872086, + "language_loss": 0.87321484, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8981424, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1887207, + "step": 4272, + "time_per_iteration": 2.8236474990844727 + }, + { + "auxiliary_loss_clip": 0.01453847, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.31072283, + "balance_loss_mlp": 1.02198148, + "epoch": 0.256906658650233, + "flos": 19723122618120.0, + "grad_norm": 1.583215721621724, + "language_loss": 0.8164953, + "learning_rate": 3.483776583571541e-06, + "loss": 0.84144253, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.18896484, + "step": 4273, + "time_per_iteration": 2.7799489498138428 + }, + { + "auxiliary_loss_clip": 0.01432395, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.29708731, + "balance_loss_mlp": 1.01898527, + "epoch": 0.25696678190290095, + "flos": 22930703921520.0, + "grad_norm": 1.4664602679912835, + "language_loss": 0.77506256, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79975283, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.1763916, + "step": 4274, + "time_per_iteration": 2.764949083328247 + }, + { + "auxiliary_loss_clip": 0.01429404, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.29188812, + "balance_loss_mlp": 1.01861906, + "epoch": 0.2570269051555689, + "flos": 27313556890320.0, + "grad_norm": 1.6972795511344154, + "language_loss": 0.83812988, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86278534, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.1751709, + "step": 4275, + "time_per_iteration": 2.810710906982422 + }, + { + "auxiliary_loss_clip": 0.01445782, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.3038013, + "balance_loss_mlp": 1.01886225, + "epoch": 0.2570870284082369, + "flos": 27569189473200.0, + "grad_norm": 1.9565653593135433, + "language_loss": 0.78567576, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.81049919, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.17712402, + "step": 4276, + "time_per_iteration": 2.7893974781036377 + }, + { + "auxiliary_loss_clip": 0.0144528, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.30440784, + "balance_loss_mlp": 1.02432775, + "epoch": 0.25714715166090485, + "flos": 28736745550200.0, + "grad_norm": 1.665993946476206, + "language_loss": 0.79495412, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81982303, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17297363, + "step": 4277, + "time_per_iteration": 2.928270101547241 + }, + { + "auxiliary_loss_clip": 0.01442748, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.30274391, + "balance_loss_mlp": 1.01835728, + "epoch": 0.2572072749135728, + "flos": 20120410452600.0, + "grad_norm": 1.8091076033938809, + "language_loss": 0.79091763, + "learning_rate": 3.482470164419295e-06, + "loss": 0.81570423, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17565918, + "step": 4278, + "time_per_iteration": 2.7492446899414062 + }, + { + "auxiliary_loss_clip": 0.01443896, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.30211329, + "balance_loss_mlp": 1.01631176, + "epoch": 0.2572673981662408, + "flos": 26036409184920.0, + "grad_norm": 1.7448394432595329, + "language_loss": 0.74767226, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77245069, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.17626953, + "step": 4279, + "time_per_iteration": 4.241356611251831 + }, + { + "auxiliary_loss_clip": 0.01444684, + "auxiliary_loss_mlp": 0.01045105, + "balance_loss_clip": 1.30098629, + "balance_loss_mlp": 1.02702069, + "epoch": 0.25732752141890874, + "flos": 16110862758720.0, + "grad_norm": 1.9658704180726554, + "language_loss": 0.85954869, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.88444662, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.18078613, + "step": 4280, + "time_per_iteration": 2.885514974594116 + }, + { + "auxiliary_loss_clip": 0.01442705, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.2990942, + "balance_loss_mlp": 1.02100646, + "epoch": 0.2573876446715767, + "flos": 22529030384160.0, + "grad_norm": 2.9764065859775224, + "language_loss": 0.79971272, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.82452977, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.17993164, + "step": 4281, + "time_per_iteration": 2.8004260063171387 + }, + { + "auxiliary_loss_clip": 0.0144413, + "auxiliary_loss_mlp": 0.01040293, + "balance_loss_clip": 1.30383062, + "balance_loss_mlp": 1.02315104, + "epoch": 0.2574477679242447, + "flos": 23956076838240.0, + "grad_norm": 1.6761053750760748, + "language_loss": 0.87297463, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89781886, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17150879, + "step": 4282, + "time_per_iteration": 2.773320198059082 + }, + { + "auxiliary_loss_clip": 0.01450941, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_clip": 1.30752897, + "balance_loss_mlp": 1.02865624, + "epoch": 0.2575078911769127, + "flos": 21986838629280.0, + "grad_norm": 1.4549178761492958, + "language_loss": 0.70682275, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.73178697, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.16833496, + "step": 4283, + "time_per_iteration": 4.31053614616394 + }, + { + "auxiliary_loss_clip": 0.01434858, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.29880404, + "balance_loss_mlp": 1.02257848, + "epoch": 0.25756801442958066, + "flos": 21950551736640.0, + "grad_norm": 1.6398305698895717, + "language_loss": 0.80965739, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.8343941, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.16223145, + "step": 4284, + "time_per_iteration": 2.7609353065490723 + }, + { + "auxiliary_loss_clip": 0.01446223, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.30474114, + "balance_loss_mlp": 1.01830482, + "epoch": 0.2576281376822486, + "flos": 35268768681840.0, + "grad_norm": 1.8541281955223297, + "language_loss": 0.70432663, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72913289, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.16113281, + "step": 4285, + "time_per_iteration": 2.864339590072632 + }, + { + "auxiliary_loss_clip": 0.0144377, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.30308115, + "balance_loss_mlp": 1.02480328, + "epoch": 0.2576882609349166, + "flos": 14135736337560.0, + "grad_norm": 2.2872374686732373, + "language_loss": 0.59047759, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.61533439, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17089844, + "step": 4286, + "time_per_iteration": 2.706902503967285 + }, + { + "auxiliary_loss_clip": 0.01458533, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.31428361, + "balance_loss_mlp": 1.02940893, + "epoch": 0.25774838418758456, + "flos": 23263580251080.0, + "grad_norm": 1.7589361856337802, + "language_loss": 0.64353108, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66858304, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17260742, + "step": 4287, + "time_per_iteration": 2.770494222640991 + }, + { + "auxiliary_loss_clip": 0.01453761, + "auxiliary_loss_mlp": 0.01042746, + "balance_loss_clip": 1.30866385, + "balance_loss_mlp": 1.02376831, + "epoch": 0.2578085074402525, + "flos": 22606923864600.0, + "grad_norm": 1.7091948484516004, + "language_loss": 0.71536446, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74032956, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.18981934, + "step": 4288, + "time_per_iteration": 2.782048225402832 + }, + { + "auxiliary_loss_clip": 0.01433638, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.29383755, + "balance_loss_mlp": 1.02117372, + "epoch": 0.2578686306929205, + "flos": 24577258499280.0, + "grad_norm": 1.525746888947042, + "language_loss": 0.77176762, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79647559, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.16003418, + "step": 4289, + "time_per_iteration": 2.774038076400757 + }, + { + "auxiliary_loss_clip": 0.01438326, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.29808974, + "balance_loss_mlp": 1.02041912, + "epoch": 0.25792875394558845, + "flos": 18118540103400.0, + "grad_norm": 6.947519802650053, + "language_loss": 0.85438704, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87915641, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.18200684, + "step": 4290, + "time_per_iteration": 2.743501663208008 + }, + { + "auxiliary_loss_clip": 0.0143975, + "auxiliary_loss_mlp": 0.01046053, + "balance_loss_clip": 1.29708171, + "balance_loss_mlp": 1.02823162, + "epoch": 0.2579888771982564, + "flos": 17717475691440.0, + "grad_norm": 2.020480487142494, + "language_loss": 0.73263663, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.75749469, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.17822266, + "step": 4291, + "time_per_iteration": 2.7174153327941895 + }, + { + "auxiliary_loss_clip": 0.01442299, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.29863596, + "balance_loss_mlp": 1.02052569, + "epoch": 0.2580490004509244, + "flos": 16439150343600.0, + "grad_norm": 2.435508643369777, + "language_loss": 0.81264174, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83744848, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.1784668, + "step": 4292, + "time_per_iteration": 2.8126838207244873 + }, + { + "auxiliary_loss_clip": 0.01442352, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.30254126, + "balance_loss_mlp": 1.0220654, + "epoch": 0.25810912370359235, + "flos": 33841234927440.0, + "grad_norm": 2.1449051705961413, + "language_loss": 0.68464148, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.70945811, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.17236328, + "step": 4293, + "time_per_iteration": 2.8475139141082764 + }, + { + "auxiliary_loss_clip": 0.0143362, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.29685438, + "balance_loss_mlp": 1.02287781, + "epoch": 0.2581692469562603, + "flos": 25197709209840.0, + "grad_norm": 1.7772338596826194, + "language_loss": 0.75529766, + "learning_rate": 3.478280185054542e-06, + "loss": 0.78002143, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.15869141, + "step": 4294, + "time_per_iteration": 2.8401951789855957 + }, + { + "auxiliary_loss_clip": 0.01429495, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.29029751, + "balance_loss_mlp": 1.02740407, + "epoch": 0.2582293702089283, + "flos": 34938775545840.0, + "grad_norm": 1.864349297134081, + "language_loss": 0.8121649, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83691204, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.17810059, + "step": 4295, + "time_per_iteration": 2.887596607208252 + }, + { + "auxiliary_loss_clip": 0.01445467, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.30021596, + "balance_loss_mlp": 1.02015531, + "epoch": 0.2582894934615963, + "flos": 26839431392760.0, + "grad_norm": 6.033249809047699, + "language_loss": 0.73589927, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.76074016, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.18457031, + "step": 4296, + "time_per_iteration": 2.784024238586426 + }, + { + "auxiliary_loss_clip": 0.01445063, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.3022536, + "balance_loss_mlp": 1.02251947, + "epoch": 0.25834961671426426, + "flos": 23520715343280.0, + "grad_norm": 1.5242886622595408, + "language_loss": 0.87025011, + "learning_rate": 3.477492965085067e-06, + "loss": 0.89509606, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.17016602, + "step": 4297, + "time_per_iteration": 2.834263324737549 + }, + { + "auxiliary_loss_clip": 0.01445372, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.30267775, + "balance_loss_mlp": 1.02945304, + "epoch": 0.25840973996693223, + "flos": 22455563214960.0, + "grad_norm": 1.6559845710502223, + "language_loss": 0.85046196, + "learning_rate": 3.477230446361943e-06, + "loss": 0.87538081, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.1706543, + "step": 4298, + "time_per_iteration": 2.7525830268859863 + }, + { + "auxiliary_loss_clip": 0.01440017, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.29851174, + "balance_loss_mlp": 1.02134681, + "epoch": 0.2584698632196002, + "flos": 11294516279520.0, + "grad_norm": 2.0251985030653, + "language_loss": 0.84377456, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.86856699, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.17883301, + "step": 4299, + "time_per_iteration": 2.7440075874328613 + }, + { + "auxiliary_loss_clip": 0.0143668, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.29938233, + "balance_loss_mlp": 1.01761794, + "epoch": 0.25852998647226816, + "flos": 17934344271720.0, + "grad_norm": 2.3150716717240285, + "language_loss": 0.83643758, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.86114359, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.16296387, + "step": 4300, + "time_per_iteration": 2.724207878112793 + }, + { + "auxiliary_loss_clip": 0.01447495, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.30569577, + "balance_loss_mlp": 1.01869571, + "epoch": 0.2585901097249361, + "flos": 33262756279920.0, + "grad_norm": 1.9026257425952304, + "language_loss": 0.67711526, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.70194441, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.1673584, + "step": 4301, + "time_per_iteration": 2.915585994720459 + }, + { + "auxiliary_loss_clip": 0.01446981, + "auxiliary_loss_mlp": 0.01047565, + "balance_loss_clip": 1.30173206, + "balance_loss_mlp": 1.03054225, + "epoch": 0.2586502329776041, + "flos": 18445771870920.0, + "grad_norm": 2.5744249898452476, + "language_loss": 0.82298481, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.84793031, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.17004395, + "step": 4302, + "time_per_iteration": 2.796034812927246 + }, + { + "auxiliary_loss_clip": 0.0144314, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.30306554, + "balance_loss_mlp": 1.02835, + "epoch": 0.25871035623027205, + "flos": 17972702190720.0, + "grad_norm": 1.7632025431077312, + "language_loss": 0.9210999, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94598556, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.17077637, + "step": 4303, + "time_per_iteration": 2.805082321166992 + }, + { + "auxiliary_loss_clip": 0.01440801, + "auxiliary_loss_mlp": 0.01036939, + "balance_loss_clip": 1.30071867, + "balance_loss_mlp": 1.01984453, + "epoch": 0.25877047948294, + "flos": 27782931209760.0, + "grad_norm": 1.8292752431460313, + "language_loss": 0.67977464, + "learning_rate": 3.475654158020507e-06, + "loss": 0.70455205, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17102051, + "step": 4304, + "time_per_iteration": 2.833646535873413 + }, + { + "auxiliary_loss_clip": 0.01445152, + "auxiliary_loss_mlp": 0.01043507, + "balance_loss_clip": 1.30117857, + "balance_loss_mlp": 1.02642417, + "epoch": 0.258830602735608, + "flos": 27131594518440.0, + "grad_norm": 2.3838733505132454, + "language_loss": 0.72913206, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75401866, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.17089844, + "step": 4305, + "time_per_iteration": 2.777522087097168 + }, + { + "auxiliary_loss_clip": 0.01443523, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_clip": 1.29995763, + "balance_loss_mlp": 1.03001761, + "epoch": 0.25889072598827595, + "flos": 17895580269120.0, + "grad_norm": 1.8590215042530107, + "language_loss": 0.762564, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.78747272, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.17333984, + "step": 4306, + "time_per_iteration": 2.7624568939208984 + }, + { + "auxiliary_loss_clip": 0.01301043, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.24052238, + "balance_loss_mlp": 1.02376986, + "epoch": 0.2589508492409439, + "flos": 53947936735560.0, + "grad_norm": 0.8637540062888108, + "language_loss": 0.57230544, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59558547, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.03198242, + "step": 4307, + "time_per_iteration": 3.1636264324188232 + }, + { + "auxiliary_loss_clip": 0.01433101, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.29698265, + "balance_loss_mlp": 1.02168727, + "epoch": 0.2590109724936119, + "flos": 22130686732320.0, + "grad_norm": 1.4732268386203244, + "language_loss": 0.71626949, + "learning_rate": 3.474602179854327e-06, + "loss": 0.74098414, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.16687012, + "step": 4308, + "time_per_iteration": 2.762845277786255 + }, + { + "auxiliary_loss_clip": 0.01445075, + "auxiliary_loss_mlp": 0.01045017, + "balance_loss_clip": 1.30160964, + "balance_loss_mlp": 1.02828026, + "epoch": 0.2590710957462799, + "flos": 13477496225040.0, + "grad_norm": 1.7426950188110275, + "language_loss": 0.846178, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.87107897, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.1673584, + "step": 4309, + "time_per_iteration": 4.130009412765503 + }, + { + "auxiliary_loss_clip": 0.01441068, + "auxiliary_loss_mlp": 0.01041897, + "balance_loss_clip": 1.30212927, + "balance_loss_mlp": 1.02542257, + "epoch": 0.25913121899894787, + "flos": 22311877545360.0, + "grad_norm": 1.4790081533998483, + "language_loss": 0.84658319, + "learning_rate": 3.474075855228966e-06, + "loss": 0.87141281, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16467285, + "step": 4310, + "time_per_iteration": 2.74177885055542 + }, + { + "auxiliary_loss_clip": 0.01448468, + "auxiliary_loss_mlp": 0.01048195, + "balance_loss_clip": 1.30572653, + "balance_loss_mlp": 1.03086162, + "epoch": 0.25919134225161583, + "flos": 25817225928120.0, + "grad_norm": 1.8048975874900013, + "language_loss": 0.77616608, + "learning_rate": 3.473812609065639e-06, + "loss": 0.80113268, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17333984, + "step": 4311, + "time_per_iteration": 2.800712823867798 + }, + { + "auxiliary_loss_clip": 0.01447641, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.30511749, + "balance_loss_mlp": 1.02269137, + "epoch": 0.2592514655042838, + "flos": 31218548392440.0, + "grad_norm": 1.843577297454338, + "language_loss": 0.72267199, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74754465, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.16918945, + "step": 4312, + "time_per_iteration": 2.833956003189087 + }, + { + "auxiliary_loss_clip": 0.01441761, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.30091, + "balance_loss_mlp": 1.02696466, + "epoch": 0.25931158875695176, + "flos": 18479134961640.0, + "grad_norm": 2.03479493659454, + "language_loss": 0.70718354, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.73203707, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16607666, + "step": 4313, + "time_per_iteration": 2.8382046222686768 + }, + { + "auxiliary_loss_clip": 0.01435321, + "auxiliary_loss_mlp": 0.01043095, + "balance_loss_clip": 1.29658341, + "balance_loss_mlp": 1.02226901, + "epoch": 0.2593717120096197, + "flos": 19212588402840.0, + "grad_norm": 1.5610345602540774, + "language_loss": 0.80775559, + "learning_rate": 3.473022535292867e-06, + "loss": 0.83253968, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.20812988, + "step": 4314, + "time_per_iteration": 2.7635750770568848 + }, + { + "auxiliary_loss_clip": 0.01449945, + "auxiliary_loss_mlp": 0.01055889, + "balance_loss_clip": 1.30452156, + "balance_loss_mlp": 1.03716087, + "epoch": 0.2594318352622877, + "flos": 31254307376400.0, + "grad_norm": 2.0762279243061936, + "language_loss": 0.67438281, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69944119, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.18713379, + "step": 4315, + "time_per_iteration": 2.894411563873291 + }, + { + "auxiliary_loss_clip": 0.01438891, + "auxiliary_loss_mlp": 0.01043559, + "balance_loss_clip": 1.29881942, + "balance_loss_mlp": 1.02812183, + "epoch": 0.25949195851495566, + "flos": 22242308778720.0, + "grad_norm": 1.4322950591178096, + "language_loss": 0.7957626, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.8205871, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.15441895, + "step": 4316, + "time_per_iteration": 2.7576406002044678 + }, + { + "auxiliary_loss_clip": 0.0144345, + "auxiliary_loss_mlp": 0.01041376, + "balance_loss_clip": 1.29932141, + "balance_loss_mlp": 1.02385187, + "epoch": 0.2595520817676236, + "flos": 28081754106480.0, + "grad_norm": 1.4637207210052485, + "language_loss": 0.7747699, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79961818, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1751709, + "step": 4317, + "time_per_iteration": 2.788843870162964 + }, + { + "auxiliary_loss_clip": 0.01441305, + "auxiliary_loss_mlp": 0.01047532, + "balance_loss_clip": 1.30083203, + "balance_loss_mlp": 1.0300436, + "epoch": 0.2596122050202916, + "flos": 20195461347840.0, + "grad_norm": 2.1958914710918687, + "language_loss": 0.78129607, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80618447, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17504883, + "step": 4318, + "time_per_iteration": 4.1739182472229 + }, + { + "auxiliary_loss_clip": 0.0144319, + "auxiliary_loss_mlp": 0.01044285, + "balance_loss_clip": 1.3022052, + "balance_loss_mlp": 1.02539003, + "epoch": 0.25967232827295955, + "flos": 22533010003440.0, + "grad_norm": 1.635455891341472, + "language_loss": 0.76285386, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78772867, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.18896484, + "step": 4319, + "time_per_iteration": 2.782594680786133 + }, + { + "auxiliary_loss_clip": 0.01438632, + "auxiliary_loss_mlp": 0.01036397, + "balance_loss_clip": 1.29951656, + "balance_loss_mlp": 1.01988649, + "epoch": 0.2597324515256275, + "flos": 21073087758960.0, + "grad_norm": 1.574877601878838, + "language_loss": 0.77035224, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.7951026, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.16503906, + "step": 4320, + "time_per_iteration": 2.75984525680542 + }, + { + "auxiliary_loss_clip": 0.01445564, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.3033365, + "balance_loss_mlp": 1.02330816, + "epoch": 0.2597925747782955, + "flos": 22054539411360.0, + "grad_norm": 1.6278034967779536, + "language_loss": 0.71248472, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73735404, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.18066406, + "step": 4321, + "time_per_iteration": 2.7633862495422363 + }, + { + "auxiliary_loss_clip": 0.01450782, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.3041656, + "balance_loss_mlp": 1.01758814, + "epoch": 0.2598526980309635, + "flos": 19541728763280.0, + "grad_norm": 1.779330338469268, + "language_loss": 0.74787301, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.77273458, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.17785645, + "step": 4322, + "time_per_iteration": 4.244766712188721 + }, + { + "auxiliary_loss_clip": 0.01446094, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.30228639, + "balance_loss_mlp": 1.02179551, + "epoch": 0.25991282128363147, + "flos": 24500136577680.0, + "grad_norm": 2.219753092800767, + "language_loss": 0.73840988, + "learning_rate": 3.470649298767278e-06, + "loss": 0.76327354, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.18481445, + "step": 4323, + "time_per_iteration": 4.292095899581909 + }, + { + "auxiliary_loss_clip": 0.0145355, + "auxiliary_loss_mlp": 0.01049555, + "balance_loss_clip": 1.30392396, + "balance_loss_mlp": 1.02949238, + "epoch": 0.25997294453629943, + "flos": 24206105467440.0, + "grad_norm": 1.7074270010945611, + "language_loss": 0.67180955, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69684058, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.20068359, + "step": 4324, + "time_per_iteration": 2.8251230716705322 + }, + { + "auxiliary_loss_clip": 0.01447724, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.30475259, + "balance_loss_mlp": 1.02040625, + "epoch": 0.2600330677889674, + "flos": 31437812865960.0, + "grad_norm": 2.1817204473319705, + "language_loss": 0.71195853, + "learning_rate": 3.470121299177082e-06, + "loss": 0.73680264, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1628418, + "step": 4325, + "time_per_iteration": 2.8211793899536133 + }, + { + "auxiliary_loss_clip": 0.01442861, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.30008364, + "balance_loss_mlp": 1.0185492, + "epoch": 0.26009319104163536, + "flos": 32272452005040.0, + "grad_norm": 1.8500924456544339, + "language_loss": 0.73685986, + "learning_rate": 3.469857215756257e-06, + "loss": 0.76165408, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.18017578, + "step": 4326, + "time_per_iteration": 2.8529393672943115 + }, + { + "auxiliary_loss_clip": 0.01436375, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.29675865, + "balance_loss_mlp": 1.01866734, + "epoch": 0.26015331429430333, + "flos": 26292407243040.0, + "grad_norm": 2.0487876375956535, + "language_loss": 0.873119, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.89783227, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.16259766, + "step": 4327, + "time_per_iteration": 2.8140528202056885 + }, + { + "auxiliary_loss_clip": 0.01454, + "auxiliary_loss_mlp": 0.01052945, + "balance_loss_clip": 1.30812955, + "balance_loss_mlp": 1.03263175, + "epoch": 0.2602134375469713, + "flos": 21147326487000.0, + "grad_norm": 1.6073458681603772, + "language_loss": 0.81050891, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.83557832, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.20336914, + "step": 4328, + "time_per_iteration": 2.7492828369140625 + }, + { + "auxiliary_loss_clip": 0.01439933, + "auxiliary_loss_mlp": 0.0104083, + "balance_loss_clip": 1.29718661, + "balance_loss_mlp": 1.02346158, + "epoch": 0.26027356079963926, + "flos": 25926858164880.0, + "grad_norm": 1.3701971220738995, + "language_loss": 0.88199633, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.90680397, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.17382812, + "step": 4329, + "time_per_iteration": 2.7814078330993652 + }, + { + "auxiliary_loss_clip": 0.0143479, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.29457664, + "balance_loss_mlp": 1.02449131, + "epoch": 0.2603336840523072, + "flos": 26364818594880.0, + "grad_norm": 2.4326766232595434, + "language_loss": 0.78180629, + "learning_rate": 3.468800324801802e-06, + "loss": 0.80656886, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1697998, + "step": 4330, + "time_per_iteration": 2.7838737964630127 + }, + { + "auxiliary_loss_clip": 0.01446509, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_clip": 1.30190861, + "balance_loss_mlp": 1.03102589, + "epoch": 0.2603938073049752, + "flos": 23518928575440.0, + "grad_norm": 1.4876242026862423, + "language_loss": 0.76036716, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.78532648, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.18408203, + "step": 4331, + "time_per_iteration": 2.8111045360565186 + }, + { + "auxiliary_loss_clip": 0.01442892, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_clip": 1.30121374, + "balance_loss_mlp": 1.02817833, + "epoch": 0.26045393055764315, + "flos": 25379793406800.0, + "grad_norm": 1.4231607827045074, + "language_loss": 0.69454229, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71942568, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.17272949, + "step": 4332, + "time_per_iteration": 2.768773317337036 + }, + { + "auxiliary_loss_clip": 0.01445431, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_clip": 1.30160093, + "balance_loss_mlp": 1.0320884, + "epoch": 0.2605140538103111, + "flos": 27640910482920.0, + "grad_norm": 3.595599011116576, + "language_loss": 0.79858398, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.82354146, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.18237305, + "step": 4333, + "time_per_iteration": 2.820403814315796 + }, + { + "auxiliary_loss_clip": 0.0143481, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.2948513, + "balance_loss_mlp": 1.02660418, + "epoch": 0.2605741770629791, + "flos": 13773557753280.0, + "grad_norm": 1.733314152339423, + "language_loss": 0.80938005, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83416337, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.16906738, + "step": 4334, + "time_per_iteration": 2.7776362895965576 + }, + { + "auxiliary_loss_clip": 0.01443595, + "auxiliary_loss_mlp": 0.01044096, + "balance_loss_clip": 1.30058885, + "balance_loss_mlp": 1.02576196, + "epoch": 0.26063430031564705, + "flos": 26037343177200.0, + "grad_norm": 1.8822266192889316, + "language_loss": 0.79850662, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82338345, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.18334961, + "step": 4335, + "time_per_iteration": 2.7984378337860107 + }, + { + "auxiliary_loss_clip": 0.01290543, + "auxiliary_loss_mlp": 0.01005772, + "balance_loss_clip": 1.22910166, + "balance_loss_mlp": 1.00207615, + "epoch": 0.26069442356831507, + "flos": 62458903474200.0, + "grad_norm": 0.8302047394098171, + "language_loss": 0.60816735, + "learning_rate": 3.467213317659068e-06, + "loss": 0.63113046, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.03686523, + "step": 4336, + "time_per_iteration": 3.2616777420043945 + }, + { + "auxiliary_loss_clip": 0.01440933, + "auxiliary_loss_mlp": 0.01047191, + "balance_loss_clip": 1.29698491, + "balance_loss_mlp": 1.02965569, + "epoch": 0.26075454682098304, + "flos": 13630399992360.0, + "grad_norm": 1.8696603843986173, + "language_loss": 0.77255476, + "learning_rate": 3.46694862168102e-06, + "loss": 0.797436, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.17553711, + "step": 4337, + "time_per_iteration": 2.738086223602295 + }, + { + "auxiliary_loss_clip": 0.01436789, + "auxiliary_loss_mlp": 0.01051887, + "balance_loss_clip": 1.29292083, + "balance_loss_mlp": 1.03268278, + "epoch": 0.260814670073651, + "flos": 12129845760720.0, + "grad_norm": 2.0255657588208074, + "language_loss": 0.74641633, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.77130306, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.19177246, + "step": 4338, + "time_per_iteration": 2.795100450515747 + }, + { + "auxiliary_loss_clip": 0.01446435, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.29809475, + "balance_loss_mlp": 1.02250409, + "epoch": 0.26087479332631897, + "flos": 15126771562920.0, + "grad_norm": 1.906948264015511, + "language_loss": 0.80864096, + "learning_rate": 3.466419062854447e-06, + "loss": 0.83352119, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.19091797, + "step": 4339, + "time_per_iteration": 2.779741048812866 + }, + { + "auxiliary_loss_clip": 0.01432105, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_clip": 1.29134357, + "balance_loss_mlp": 1.0296309, + "epoch": 0.26093491657898693, + "flos": 24686322219000.0, + "grad_norm": 1.76613998492437, + "language_loss": 0.76994407, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.79472709, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16552734, + "step": 4340, + "time_per_iteration": 2.8818583488464355 + }, + { + "auxiliary_loss_clip": 0.01445533, + "auxiliary_loss_mlp": 0.01047007, + "balance_loss_clip": 1.3005867, + "balance_loss_mlp": 1.02954268, + "epoch": 0.2609950398316549, + "flos": 25121237022000.0, + "grad_norm": 1.4131013018075216, + "language_loss": 0.82694328, + "learning_rate": 3.465889281600845e-06, + "loss": 0.85186869, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.17468262, + "step": 4341, + "time_per_iteration": 2.8115313053131104 + }, + { + "auxiliary_loss_clip": 0.01435122, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.29196441, + "balance_loss_mlp": 1.02671671, + "epoch": 0.26105516308432286, + "flos": 28554498919800.0, + "grad_norm": 1.780725193053805, + "language_loss": 0.7678256, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79264593, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.20202637, + "step": 4342, + "time_per_iteration": 2.8321869373321533 + }, + { + "auxiliary_loss_clip": 0.01437848, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.29264736, + "balance_loss_mlp": 1.02018833, + "epoch": 0.2611152863369908, + "flos": 39537928577880.0, + "grad_norm": 1.7427341661057743, + "language_loss": 0.66365623, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68842661, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.18994141, + "step": 4343, + "time_per_iteration": 2.9269609451293945 + }, + { + "auxiliary_loss_clip": 0.01438448, + "auxiliary_loss_mlp": 0.01047515, + "balance_loss_clip": 1.29246509, + "balance_loss_mlp": 1.02957416, + "epoch": 0.2611754095896588, + "flos": 13739301278640.0, + "grad_norm": 1.8818404003766696, + "language_loss": 0.73936981, + "learning_rate": 3.465094192845553e-06, + "loss": 0.76422942, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.17944336, + "step": 4344, + "time_per_iteration": 2.763610601425171 + }, + { + "auxiliary_loss_clip": 0.01439389, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.2942636, + "balance_loss_mlp": 1.02735257, + "epoch": 0.26123553284232676, + "flos": 21511616706000.0, + "grad_norm": 3.322947420694177, + "language_loss": 0.86403036, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88886833, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.17041016, + "step": 4345, + "time_per_iteration": 2.780454635620117 + }, + { + "auxiliary_loss_clip": 0.01432641, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.29195011, + "balance_loss_mlp": 1.0307126, + "epoch": 0.2612956560949947, + "flos": 21144443293440.0, + "grad_norm": 2.102837976474257, + "language_loss": 0.76544368, + "learning_rate": 3.464563855876015e-06, + "loss": 0.79024816, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.17102051, + "step": 4346, + "time_per_iteration": 2.767054796218872 + }, + { + "auxiliary_loss_clip": 0.01430607, + "auxiliary_loss_mlp": 0.0105187, + "balance_loss_clip": 1.28693819, + "balance_loss_mlp": 1.03468013, + "epoch": 0.2613557793476627, + "flos": 25124485690800.0, + "grad_norm": 1.5903833539263534, + "language_loss": 0.76451886, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78934366, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.17199707, + "step": 4347, + "time_per_iteration": 2.7735979557037354 + }, + { + "auxiliary_loss_clip": 0.01430279, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.28790343, + "balance_loss_mlp": 1.02257419, + "epoch": 0.26141590260033065, + "flos": 26073305202960.0, + "grad_norm": 1.2732745256051772, + "language_loss": 0.73880851, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.76352054, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.18347168, + "step": 4348, + "time_per_iteration": 4.201573610305786 + }, + { + "auxiliary_loss_clip": 0.01446296, + "auxiliary_loss_mlp": 0.01044588, + "balance_loss_clip": 1.30105901, + "balance_loss_mlp": 1.02531195, + "epoch": 0.2614760258529987, + "flos": 25706903349240.0, + "grad_norm": 1.7299367939140957, + "language_loss": 0.91250116, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93741006, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.19262695, + "step": 4349, + "time_per_iteration": 2.7594292163848877 + }, + { + "auxiliary_loss_clip": 0.01427107, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_clip": 1.28820848, + "balance_loss_mlp": 1.03174853, + "epoch": 0.26153614910566664, + "flos": 17461640066760.0, + "grad_norm": 1.5823563747455995, + "language_loss": 0.8015492, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82631481, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.17712402, + "step": 4350, + "time_per_iteration": 2.7611451148986816 + }, + { + "auxiliary_loss_clip": 0.01424262, + "auxiliary_loss_mlp": 0.0104351, + "balance_loss_clip": 1.28470504, + "balance_loss_mlp": 1.02332807, + "epoch": 0.2615962723583346, + "flos": 17717394474720.0, + "grad_norm": 1.7482680003360513, + "language_loss": 0.62225688, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64693463, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.2019043, + "step": 4351, + "time_per_iteration": 2.735123634338379 + }, + { + "auxiliary_loss_clip": 0.01431267, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.28653204, + "balance_loss_mlp": 1.02530301, + "epoch": 0.26165639561100257, + "flos": 23262889908960.0, + "grad_norm": 1.8416152624056654, + "language_loss": 0.8423475, + "learning_rate": 3.462971512415555e-06, + "loss": 0.86709905, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.18591309, + "step": 4352, + "time_per_iteration": 2.7949440479278564 + }, + { + "auxiliary_loss_clip": 0.01281551, + "auxiliary_loss_mlp": 0.01003106, + "balance_loss_clip": 1.21751595, + "balance_loss_mlp": 0.99924338, + "epoch": 0.26171651886367053, + "flos": 66752167754880.0, + "grad_norm": 0.810446512980357, + "language_loss": 0.70637643, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72922301, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03857422, + "step": 4353, + "time_per_iteration": 3.141181230545044 + }, + { + "auxiliary_loss_clip": 0.01431541, + "auxiliary_loss_mlp": 0.01043943, + "balance_loss_clip": 1.29000735, + "balance_loss_mlp": 1.0243566, + "epoch": 0.2617766421163385, + "flos": 22355149075920.0, + "grad_norm": 2.2508636047015123, + "language_loss": 0.78149354, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.80624831, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.19580078, + "step": 4354, + "time_per_iteration": 2.872738838195801 + }, + { + "auxiliary_loss_clip": 0.01436991, + "auxiliary_loss_mlp": 0.010483, + "balance_loss_clip": 1.28935194, + "balance_loss_mlp": 1.02997804, + "epoch": 0.26183676536900646, + "flos": 26072655469200.0, + "grad_norm": 1.7903623422172243, + "language_loss": 0.6868217, + "learning_rate": 3.462174591623085e-06, + "loss": 0.71167457, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.18334961, + "step": 4355, + "time_per_iteration": 2.8868868350982666 + }, + { + "auxiliary_loss_clip": 0.01430385, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.28802001, + "balance_loss_mlp": 1.01970673, + "epoch": 0.26189688862167443, + "flos": 21001447965960.0, + "grad_norm": 1.7936464659052163, + "language_loss": 0.67506289, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69977653, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.21252441, + "step": 4356, + "time_per_iteration": 4.240374326705933 + }, + { + "auxiliary_loss_clip": 0.01278733, + "auxiliary_loss_mlp": 0.01005575, + "balance_loss_clip": 1.21566749, + "balance_loss_mlp": 1.00183225, + "epoch": 0.2619570118743424, + "flos": 65813500332720.0, + "grad_norm": 0.6778374549216507, + "language_loss": 0.5314979, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55434102, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.03735352, + "step": 4357, + "time_per_iteration": 3.1288280487060547 + }, + { + "auxiliary_loss_clip": 0.01441277, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.29408824, + "balance_loss_mlp": 1.02638149, + "epoch": 0.26201713512701036, + "flos": 28772666967600.0, + "grad_norm": 1.7542518598903327, + "language_loss": 0.84612465, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.87096733, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.1661377, + "step": 4358, + "time_per_iteration": 2.819380521774292 + }, + { + "auxiliary_loss_clip": 0.01449953, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.29833341, + "balance_loss_mlp": 1.02058959, + "epoch": 0.2620772583796783, + "flos": 26438163939000.0, + "grad_norm": 2.1516175558754154, + "language_loss": 0.67560548, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.70051086, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.20007324, + "step": 4359, + "time_per_iteration": 2.7944462299346924 + }, + { + "auxiliary_loss_clip": 0.01440943, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_clip": 1.29533517, + "balance_loss_mlp": 1.02292311, + "epoch": 0.2621373816323463, + "flos": 20161326698280.0, + "grad_norm": 2.502361246107745, + "language_loss": 0.78742743, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.81224048, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.17431641, + "step": 4360, + "time_per_iteration": 4.255789756774902 + }, + { + "auxiliary_loss_clip": 0.01421697, + "auxiliary_loss_mlp": 0.01040337, + "balance_loss_clip": 1.28266287, + "balance_loss_mlp": 1.02370739, + "epoch": 0.26219750488501425, + "flos": 28627113313440.0, + "grad_norm": 1.6595822537598894, + "language_loss": 0.68388146, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70850188, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16638184, + "step": 4361, + "time_per_iteration": 4.260794401168823 + }, + { + "auxiliary_loss_clip": 0.01442004, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_clip": 1.29453933, + "balance_loss_mlp": 1.0344218, + "epoch": 0.2622576281376823, + "flos": 15045873063840.0, + "grad_norm": 1.7138303005627353, + "language_loss": 0.84423935, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86918443, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.1809082, + "step": 4362, + "time_per_iteration": 2.7527172565460205 + }, + { + "auxiliary_loss_clip": 0.0142919, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.28772974, + "balance_loss_mlp": 1.022017, + "epoch": 0.26231775139035024, + "flos": 26401836438000.0, + "grad_norm": 1.942748322936224, + "language_loss": 0.65542603, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.68012285, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.18469238, + "step": 4363, + "time_per_iteration": 2.7818708419799805 + }, + { + "auxiliary_loss_clip": 0.0128256, + "auxiliary_loss_mlp": 0.01012927, + "balance_loss_clip": 1.22088361, + "balance_loss_mlp": 1.00944567, + "epoch": 0.2623778746430182, + "flos": 65425674246120.0, + "grad_norm": 0.8900483914798566, + "language_loss": 0.61215901, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63511395, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03491211, + "step": 4364, + "time_per_iteration": 3.349613666534424 + }, + { + "auxiliary_loss_clip": 0.01437989, + "auxiliary_loss_mlp": 0.01050143, + "balance_loss_clip": 1.29324889, + "balance_loss_mlp": 1.03133202, + "epoch": 0.26243799789568617, + "flos": 12608032094280.0, + "grad_norm": 2.426677186351026, + "language_loss": 0.72116435, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74604571, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.18823242, + "step": 4365, + "time_per_iteration": 2.6987500190734863 + }, + { + "auxiliary_loss_clip": 0.01428807, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.28848624, + "balance_loss_mlp": 1.02696216, + "epoch": 0.26249812114835414, + "flos": 28630037115360.0, + "grad_norm": 2.563528187929582, + "language_loss": 0.7746141, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79933596, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.16418457, + "step": 4366, + "time_per_iteration": 2.797950267791748 + }, + { + "auxiliary_loss_clip": 0.01436664, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_clip": 1.29376566, + "balance_loss_mlp": 1.02727985, + "epoch": 0.2625582444010221, + "flos": 14469506051040.0, + "grad_norm": 1.7200414018614016, + "language_loss": 0.76540989, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.79022467, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17529297, + "step": 4367, + "time_per_iteration": 2.712357759475708 + }, + { + "auxiliary_loss_clip": 0.01431543, + "auxiliary_loss_mlp": 0.01045798, + "balance_loss_clip": 1.29141521, + "balance_loss_mlp": 1.02919245, + "epoch": 0.26261836765369007, + "flos": 16616930054400.0, + "grad_norm": 1.7695008283288733, + "language_loss": 0.70049429, + "learning_rate": 3.458715505320736e-06, + "loss": 0.72526765, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.16601562, + "step": 4368, + "time_per_iteration": 2.772002935409546 + }, + { + "auxiliary_loss_clip": 0.01427104, + "auxiliary_loss_mlp": 0.01044091, + "balance_loss_clip": 1.2854141, + "balance_loss_mlp": 1.0252676, + "epoch": 0.26267849090635803, + "flos": 20524561099920.0, + "grad_norm": 1.777500527634138, + "language_loss": 0.79429221, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81900418, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.18823242, + "step": 4369, + "time_per_iteration": 2.789818048477173 + }, + { + "auxiliary_loss_clip": 0.0142982, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.28809381, + "balance_loss_mlp": 1.02562225, + "epoch": 0.262738614159026, + "flos": 21328842166920.0, + "grad_norm": 3.1856412564591414, + "language_loss": 0.83623588, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.86096787, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1776123, + "step": 4370, + "time_per_iteration": 2.735396146774292 + }, + { + "auxiliary_loss_clip": 0.01438931, + "auxiliary_loss_mlp": 0.0104956, + "balance_loss_clip": 1.2926383, + "balance_loss_mlp": 1.02928281, + "epoch": 0.26279873741169396, + "flos": 17608168321560.0, + "grad_norm": 1.9607167124357805, + "language_loss": 0.71390057, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73878551, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.20288086, + "step": 4371, + "time_per_iteration": 2.7387778759002686 + }, + { + "auxiliary_loss_clip": 0.01280032, + "auxiliary_loss_mlp": 0.01018478, + "balance_loss_clip": 1.21945632, + "balance_loss_mlp": 1.01525974, + "epoch": 0.2628588606643619, + "flos": 60964887188520.0, + "grad_norm": 0.744097091672165, + "language_loss": 0.56404895, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58703405, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.03222656, + "step": 4372, + "time_per_iteration": 3.366389036178589 + }, + { + "auxiliary_loss_clip": 0.01424279, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.28408885, + "balance_loss_mlp": 1.01801682, + "epoch": 0.2629189839170299, + "flos": 27022043498400.0, + "grad_norm": 1.686354070851473, + "language_loss": 0.77740479, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.80200136, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17370605, + "step": 4373, + "time_per_iteration": 2.865630626678467 + }, + { + "auxiliary_loss_clip": 0.0142855, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.28738451, + "balance_loss_mlp": 1.02274203, + "epoch": 0.26297910716969786, + "flos": 17024369978880.0, + "grad_norm": 2.157216861836272, + "language_loss": 0.71541399, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.74009705, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17022705, + "step": 4374, + "time_per_iteration": 2.726696729660034 + }, + { + "auxiliary_loss_clip": 0.01433498, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.29148734, + "balance_loss_mlp": 1.02222133, + "epoch": 0.2630392304223659, + "flos": 24902459848800.0, + "grad_norm": 1.5819594705374977, + "language_loss": 0.80986857, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.8346079, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.18225098, + "step": 4375, + "time_per_iteration": 2.8911454677581787 + }, + { + "auxiliary_loss_clip": 0.01420107, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.28057623, + "balance_loss_mlp": 1.02284396, + "epoch": 0.26309935367503384, + "flos": 32860514225520.0, + "grad_norm": 1.8627608430307308, + "language_loss": 0.66601706, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.69061172, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16516113, + "step": 4376, + "time_per_iteration": 2.8690545558929443 + }, + { + "auxiliary_loss_clip": 0.01437911, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_clip": 1.29262769, + "balance_loss_mlp": 1.0229485, + "epoch": 0.2631594769277018, + "flos": 15892207410600.0, + "grad_norm": 1.7162143424436758, + "language_loss": 0.69438243, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71917915, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.18798828, + "step": 4377, + "time_per_iteration": 2.7172884941101074 + }, + { + "auxiliary_loss_clip": 0.01428148, + "auxiliary_loss_mlp": 0.01041289, + "balance_loss_clip": 1.28548694, + "balance_loss_mlp": 1.02310944, + "epoch": 0.2632196001803698, + "flos": 50813765011800.0, + "grad_norm": 1.6894599031829147, + "language_loss": 0.79497612, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.8196705, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1817627, + "step": 4378, + "time_per_iteration": 3.0325944423675537 + }, + { + "auxiliary_loss_clip": 0.01436048, + "auxiliary_loss_mlp": 0.01042085, + "balance_loss_clip": 1.29450417, + "balance_loss_mlp": 1.02538347, + "epoch": 0.26327972343303774, + "flos": 13736702343600.0, + "grad_norm": 2.0167003370137966, + "language_loss": 0.76904249, + "learning_rate": 3.455781283723846e-06, + "loss": 0.79382384, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.16699219, + "step": 4379, + "time_per_iteration": 2.726731061935425 + }, + { + "auxiliary_loss_clip": 0.0144016, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.29391372, + "balance_loss_mlp": 1.02111554, + "epoch": 0.2633398466857057, + "flos": 23774358116520.0, + "grad_norm": 1.8940481172319052, + "language_loss": 0.77816629, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.80297893, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.19995117, + "step": 4380, + "time_per_iteration": 2.788496971130371 + }, + { + "auxiliary_loss_clip": 0.0142951, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.28564692, + "balance_loss_mlp": 1.01746392, + "epoch": 0.26339996993837367, + "flos": 27606044882880.0, + "grad_norm": 1.7614026264408507, + "language_loss": 0.64105666, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66569936, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.17297363, + "step": 4381, + "time_per_iteration": 2.794161081314087 + }, + { + "auxiliary_loss_clip": 0.01430967, + "auxiliary_loss_mlp": 0.01043932, + "balance_loss_clip": 1.28697288, + "balance_loss_mlp": 1.02370238, + "epoch": 0.26346009319104163, + "flos": 16950740376240.0, + "grad_norm": 2.030328965009999, + "language_loss": 0.8291117, + "learning_rate": 3.454979881632595e-06, + "loss": 0.85386074, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.20227051, + "step": 4382, + "time_per_iteration": 2.7323570251464844 + }, + { + "auxiliary_loss_clip": 0.01438111, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.29145551, + "balance_loss_mlp": 1.02619004, + "epoch": 0.2635202164437096, + "flos": 37239265749960.0, + "grad_norm": 3.632194475757137, + "language_loss": 0.70118976, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.72602117, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.18835449, + "step": 4383, + "time_per_iteration": 2.889296531677246 + }, + { + "auxiliary_loss_clip": 0.01429824, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.28877246, + "balance_loss_mlp": 1.02496195, + "epoch": 0.26358033969637756, + "flos": 21001326140880.0, + "grad_norm": 3.3865919137081626, + "language_loss": 0.69573563, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.7204538, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17041016, + "step": 4384, + "time_per_iteration": 2.7236740589141846 + }, + { + "auxiliary_loss_clip": 0.01423112, + "auxiliary_loss_mlp": 0.01040402, + "balance_loss_clip": 1.28297985, + "balance_loss_mlp": 1.02144742, + "epoch": 0.26364046294904553, + "flos": 27751923403920.0, + "grad_norm": 1.9558426069886046, + "language_loss": 0.70009917, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72473431, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.18969727, + "step": 4385, + "time_per_iteration": 2.793412923812866 + }, + { + "auxiliary_loss_clip": 0.01425562, + "auxiliary_loss_mlp": 0.01046409, + "balance_loss_clip": 1.28435397, + "balance_loss_mlp": 1.02826524, + "epoch": 0.2637005862017135, + "flos": 22898234214720.0, + "grad_norm": 1.7334771222570935, + "language_loss": 0.85772216, + "learning_rate": 3.453910573136482e-06, + "loss": 0.88244182, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.18151855, + "step": 4386, + "time_per_iteration": 2.7734274864196777 + }, + { + "auxiliary_loss_clip": 0.01427822, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.28739452, + "balance_loss_mlp": 1.02221775, + "epoch": 0.26376070945438146, + "flos": 15052979526840.0, + "grad_norm": 1.9682161335027226, + "language_loss": 0.77889895, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.80357325, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.17407227, + "step": 4387, + "time_per_iteration": 4.106396436691284 + }, + { + "auxiliary_loss_clip": 0.01431832, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.29151845, + "balance_loss_mlp": 1.02186799, + "epoch": 0.2638208327070494, + "flos": 21146757969960.0, + "grad_norm": 1.974439945980248, + "language_loss": 0.76771623, + "learning_rate": 3.453375588053264e-06, + "loss": 0.79242969, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1763916, + "step": 4388, + "time_per_iteration": 2.77065110206604 + }, + { + "auxiliary_loss_clip": 0.0142341, + "auxiliary_loss_mlp": 0.0103676, + "balance_loss_clip": 1.28312588, + "balance_loss_mlp": 1.01931953, + "epoch": 0.26388095595971744, + "flos": 21730596921000.0, + "grad_norm": 1.9088276723785509, + "language_loss": 0.87023485, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.89483649, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.17437744, + "step": 4389, + "time_per_iteration": 2.7393808364868164 + }, + { + "auxiliary_loss_clip": 0.01287895, + "auxiliary_loss_mlp": 0.01006608, + "balance_loss_clip": 1.22482669, + "balance_loss_mlp": 1.00369954, + "epoch": 0.2639410792123854, + "flos": 65531935989000.0, + "grad_norm": 0.8110378959138462, + "language_loss": 0.60375214, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62669718, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.02905273, + "step": 4390, + "time_per_iteration": 3.269153594970703 + }, + { + "auxiliary_loss_clip": 0.01434665, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.28876376, + "balance_loss_mlp": 1.01648784, + "epoch": 0.2640012024650534, + "flos": 23953518511560.0, + "grad_norm": 1.5962334583275657, + "language_loss": 0.77911305, + "learning_rate": 3.4525726971127e-06, + "loss": 0.80380744, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.18286133, + "step": 4391, + "time_per_iteration": 2.757406234741211 + }, + { + "auxiliary_loss_clip": 0.01284735, + "auxiliary_loss_mlp": 0.01003804, + "balance_loss_clip": 1.22179687, + "balance_loss_mlp": 1.00076413, + "epoch": 0.26406132571772134, + "flos": 56457759346200.0, + "grad_norm": 0.8700004826671485, + "language_loss": 0.58752316, + "learning_rate": 3.45230495662224e-06, + "loss": 0.61040854, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.03039551, + "step": 4392, + "time_per_iteration": 3.211318254470825 + }, + { + "auxiliary_loss_clip": 0.01428466, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.28642082, + "balance_loss_mlp": 1.0254848, + "epoch": 0.2641214489703893, + "flos": 22095618090480.0, + "grad_norm": 1.8754609593921863, + "language_loss": 0.69249082, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.71720147, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17126465, + "step": 4393, + "time_per_iteration": 2.77604603767395 + }, + { + "auxiliary_loss_clip": 0.01439881, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.29192162, + "balance_loss_mlp": 1.0254153, + "epoch": 0.26418157222305727, + "flos": 16549107447240.0, + "grad_norm": 1.8307070435440456, + "language_loss": 0.84463561, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86947864, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.18994141, + "step": 4394, + "time_per_iteration": 4.1560914516448975 + }, + { + "auxiliary_loss_clip": 0.01431049, + "auxiliary_loss_mlp": 0.01042017, + "balance_loss_clip": 1.28453851, + "balance_loss_mlp": 1.02232385, + "epoch": 0.26424169547572524, + "flos": 18006796231920.0, + "grad_norm": 2.091504576758344, + "language_loss": 0.70418233, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72891301, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.19702148, + "step": 4395, + "time_per_iteration": 2.759719133377075 + }, + { + "auxiliary_loss_clip": 0.01422249, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.28177536, + "balance_loss_mlp": 1.01625419, + "epoch": 0.2643018187283932, + "flos": 16987636394280.0, + "grad_norm": 2.3121680250373084, + "language_loss": 0.86974549, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.8943069, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.17626953, + "step": 4396, + "time_per_iteration": 2.8139872550964355 + }, + { + "auxiliary_loss_clip": 0.01278827, + "auxiliary_loss_mlp": 0.01008809, + "balance_loss_clip": 1.21637535, + "balance_loss_mlp": 1.0060668, + "epoch": 0.26436194198106117, + "flos": 59678237126880.0, + "grad_norm": 0.7897144064459278, + "language_loss": 0.55099201, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57386839, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.02746582, + "step": 4397, + "time_per_iteration": 3.0807735919952393 + }, + { + "auxiliary_loss_clip": 0.01422848, + "auxiliary_loss_mlp": 0.01041296, + "balance_loss_clip": 1.28346956, + "balance_loss_mlp": 1.02416587, + "epoch": 0.26442206523372913, + "flos": 32927077973520.0, + "grad_norm": 2.165013451097769, + "language_loss": 0.77915132, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80379272, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17126465, + "step": 4398, + "time_per_iteration": 4.361672639846802 + }, + { + "auxiliary_loss_clip": 0.01430178, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.28665257, + "balance_loss_mlp": 1.02151287, + "epoch": 0.2644821884863971, + "flos": 21036069915840.0, + "grad_norm": 1.6988004460515715, + "language_loss": 0.67781651, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.70251155, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.17822266, + "step": 4399, + "time_per_iteration": 2.7752833366394043 + }, + { + "auxiliary_loss_clip": 0.01416371, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.2802248, + "balance_loss_mlp": 1.02108264, + "epoch": 0.26454231173906506, + "flos": 20781655583760.0, + "grad_norm": 1.7692103045590988, + "language_loss": 0.86477244, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88930738, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16052246, + "step": 4400, + "time_per_iteration": 2.7637462615966797 + }, + { + "auxiliary_loss_clip": 0.01428415, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.28633976, + "balance_loss_mlp": 1.01777697, + "epoch": 0.264602434991733, + "flos": 16623143133480.0, + "grad_norm": 2.1969771944603096, + "language_loss": 0.7624042, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.78704476, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.17858887, + "step": 4401, + "time_per_iteration": 2.733044385910034 + }, + { + "auxiliary_loss_clip": 0.01435075, + "auxiliary_loss_mlp": 0.01044753, + "balance_loss_clip": 1.28935289, + "balance_loss_mlp": 1.0249877, + "epoch": 0.26466255824440105, + "flos": 19067400223920.0, + "grad_norm": 1.7716984208755537, + "language_loss": 0.88238448, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90718281, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.19775391, + "step": 4402, + "time_per_iteration": 2.7408242225646973 + }, + { + "auxiliary_loss_clip": 0.01422154, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.28007817, + "balance_loss_mlp": 1.02504206, + "epoch": 0.264722681497069, + "flos": 22643779274280.0, + "grad_norm": 2.0766459338297345, + "language_loss": 0.78577948, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.81042111, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.16967773, + "step": 4403, + "time_per_iteration": 2.770191192626953 + }, + { + "auxiliary_loss_clip": 0.01427108, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.28371096, + "balance_loss_mlp": 1.01798403, + "epoch": 0.264782804749737, + "flos": 22497413452920.0, + "grad_norm": 1.9983181765315297, + "language_loss": 0.88906264, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.91368878, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.1751709, + "step": 4404, + "time_per_iteration": 2.744879961013794 + }, + { + "auxiliary_loss_clip": 0.01419187, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.27494574, + "balance_loss_mlp": 1.02053607, + "epoch": 0.26484292800240494, + "flos": 16804658813400.0, + "grad_norm": 1.662049627883088, + "language_loss": 0.76890105, + "learning_rate": 3.448819322433709e-06, + "loss": 0.79346567, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.16748047, + "step": 4405, + "time_per_iteration": 2.757894992828369 + }, + { + "auxiliary_loss_clip": 0.0143053, + "auxiliary_loss_mlp": 0.01042641, + "balance_loss_clip": 1.28743303, + "balance_loss_mlp": 1.02433002, + "epoch": 0.2649030512550729, + "flos": 20454545641320.0, + "grad_norm": 1.9120884485588208, + "language_loss": 0.70627809, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.73100978, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.1829834, + "step": 4406, + "time_per_iteration": 2.778791904449463 + }, + { + "auxiliary_loss_clip": 0.01417831, + "auxiliary_loss_mlp": 0.01042296, + "balance_loss_clip": 1.27880979, + "balance_loss_mlp": 1.02604771, + "epoch": 0.2649631745077409, + "flos": 22420860048360.0, + "grad_norm": 1.5807365863216325, + "language_loss": 0.8373307, + "learning_rate": 3.448282246369912e-06, + "loss": 0.86193198, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16259766, + "step": 4407, + "time_per_iteration": 2.76824951171875 + }, + { + "auxiliary_loss_clip": 0.01426463, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.28403354, + "balance_loss_mlp": 1.01758218, + "epoch": 0.26502329776040884, + "flos": 35122890160800.0, + "grad_norm": 1.5352056280038928, + "language_loss": 0.76803118, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.79264307, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.17150879, + "step": 4408, + "time_per_iteration": 2.924461603164673 + }, + { + "auxiliary_loss_clip": 0.01418115, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.27900529, + "balance_loss_mlp": 1.01868653, + "epoch": 0.2650834210130768, + "flos": 38694193166160.0, + "grad_norm": 1.635160944051144, + "language_loss": 0.71486473, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73940969, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.17700195, + "step": 4409, + "time_per_iteration": 2.8826940059661865 + }, + { + "auxiliary_loss_clip": 0.01428164, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.28467584, + "balance_loss_mlp": 1.0230639, + "epoch": 0.26514354426574477, + "flos": 24722162419680.0, + "grad_norm": 1.7341267496967818, + "language_loss": 0.74114138, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.76583362, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.17993164, + "step": 4410, + "time_per_iteration": 2.8440678119659424 + }, + { + "auxiliary_loss_clip": 0.01425442, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.27965212, + "balance_loss_mlp": 1.0275265, + "epoch": 0.26520366751841273, + "flos": 20344994621280.0, + "grad_norm": 1.81355460588601, + "language_loss": 0.73888475, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.76358932, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.17492676, + "step": 4411, + "time_per_iteration": 2.75321626663208 + }, + { + "auxiliary_loss_clip": 0.01416221, + "auxiliary_loss_mlp": 0.01036366, + "balance_loss_clip": 1.27566206, + "balance_loss_mlp": 1.01873493, + "epoch": 0.2652637907710807, + "flos": 22348692346680.0, + "grad_norm": 1.891157842475472, + "language_loss": 0.82581639, + "learning_rate": 3.446938595306071e-06, + "loss": 0.85034227, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.17626953, + "step": 4412, + "time_per_iteration": 2.791515588760376 + }, + { + "auxiliary_loss_clip": 0.01425555, + "auxiliary_loss_mlp": 0.01044336, + "balance_loss_clip": 1.28294277, + "balance_loss_mlp": 1.02659798, + "epoch": 0.26532391402374866, + "flos": 19358954224200.0, + "grad_norm": 1.5811631444439842, + "language_loss": 0.74288821, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76758713, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.17736816, + "step": 4413, + "time_per_iteration": 2.7882678508758545 + }, + { + "auxiliary_loss_clip": 0.01276446, + "auxiliary_loss_mlp": 0.01012415, + "balance_loss_clip": 1.21555173, + "balance_loss_mlp": 1.00902987, + "epoch": 0.26538403727641663, + "flos": 44800780223880.0, + "grad_norm": 0.8840287569553474, + "language_loss": 0.56979108, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59267968, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03393555, + "step": 4414, + "time_per_iteration": 3.202998161315918 + }, + { + "auxiliary_loss_clip": 0.01412963, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.27536547, + "balance_loss_mlp": 1.0176183, + "epoch": 0.26544416052908465, + "flos": 28187772199200.0, + "grad_norm": 1.6456490364215235, + "language_loss": 0.74650764, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.77097356, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.16003418, + "step": 4415, + "time_per_iteration": 2.8693997859954834 + }, + { + "auxiliary_loss_clip": 0.01428635, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.2839613, + "balance_loss_mlp": 1.02334428, + "epoch": 0.2655042837817526, + "flos": 17569607360760.0, + "grad_norm": 1.9922962213668487, + "language_loss": 0.87119484, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.89591014, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.19543457, + "step": 4416, + "time_per_iteration": 2.7366786003112793 + }, + { + "auxiliary_loss_clip": 0.01429165, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.28557169, + "balance_loss_mlp": 1.02465653, + "epoch": 0.2655644070344206, + "flos": 23409946072440.0, + "grad_norm": 1.517756404889253, + "language_loss": 0.76464367, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78937429, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.19262695, + "step": 4417, + "time_per_iteration": 2.8379571437835693 + }, + { + "auxiliary_loss_clip": 0.0141885, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.27905536, + "balance_loss_mlp": 1.02149892, + "epoch": 0.26562453028708854, + "flos": 26474044748040.0, + "grad_norm": 1.3940653514867465, + "language_loss": 0.80463368, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82922149, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.18432617, + "step": 4418, + "time_per_iteration": 2.797441244125366 + }, + { + "auxiliary_loss_clip": 0.01425013, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.28248608, + "balance_loss_mlp": 1.02636063, + "epoch": 0.2656846535397565, + "flos": 19212182319240.0, + "grad_norm": 2.3375248692660624, + "language_loss": 0.67539138, + "learning_rate": 3.445055179644071e-06, + "loss": 0.70008862, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.18347168, + "step": 4419, + "time_per_iteration": 2.7878201007843018 + }, + { + "auxiliary_loss_clip": 0.01423909, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.28106284, + "balance_loss_mlp": 1.01663613, + "epoch": 0.2657447767924245, + "flos": 30556978394400.0, + "grad_norm": 1.893787794602084, + "language_loss": 0.79708189, + "learning_rate": 3.444785900995585e-06, + "loss": 0.82167554, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.18811035, + "step": 4420, + "time_per_iteration": 2.8175628185272217 + }, + { + "auxiliary_loss_clip": 0.0143094, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.28570223, + "balance_loss_mlp": 1.02290106, + "epoch": 0.26580490004509244, + "flos": 20927615321520.0, + "grad_norm": 1.9726679773838467, + "language_loss": 0.81714267, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84188247, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.20141602, + "step": 4421, + "time_per_iteration": 2.746706485748291 + }, + { + "auxiliary_loss_clip": 0.0141825, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.2779156, + "balance_loss_mlp": 1.01534176, + "epoch": 0.2658650232977604, + "flos": 43953616728720.0, + "grad_norm": 1.4632455264288702, + "language_loss": 0.6658175, + "learning_rate": 3.444247179349548e-06, + "loss": 0.69032967, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.17602539, + "step": 4422, + "time_per_iteration": 2.9636611938476562 + }, + { + "auxiliary_loss_clip": 0.01425608, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.28351784, + "balance_loss_mlp": 1.02195191, + "epoch": 0.26592514655042837, + "flos": 29722542297120.0, + "grad_norm": 2.058983203916909, + "language_loss": 0.74850619, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.77314824, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.16650391, + "step": 4423, + "time_per_iteration": 2.8080151081085205 + }, + { + "auxiliary_loss_clip": 0.01426501, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.28213024, + "balance_loss_mlp": 1.02583289, + "epoch": 0.26598526980309634, + "flos": 46685448200160.0, + "grad_norm": 2.0008655541653955, + "language_loss": 0.77983582, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80453175, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.17272949, + "step": 4424, + "time_per_iteration": 2.9791250228881836 + }, + { + "auxiliary_loss_clip": 0.01426443, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.28305614, + "balance_loss_mlp": 1.02250957, + "epoch": 0.2660453930557643, + "flos": 11513090410920.0, + "grad_norm": 1.869371818882694, + "language_loss": 0.79662848, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.82128942, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.17132568, + "step": 4425, + "time_per_iteration": 2.7138831615448 + }, + { + "auxiliary_loss_clip": 0.01417843, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.27687824, + "balance_loss_mlp": 1.02367854, + "epoch": 0.26610551630843227, + "flos": 24797132098200.0, + "grad_norm": 1.5775772969145758, + "language_loss": 0.80929774, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.83388531, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17224121, + "step": 4426, + "time_per_iteration": 4.201253652572632 + }, + { + "auxiliary_loss_clip": 0.0142905, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.28726506, + "balance_loss_mlp": 1.02652693, + "epoch": 0.26616563956110023, + "flos": 27642169342080.0, + "grad_norm": 1.6404527774857514, + "language_loss": 0.77247381, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79720545, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.17578125, + "step": 4427, + "time_per_iteration": 2.8357110023498535 + }, + { + "auxiliary_loss_clip": 0.014125, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.27552414, + "balance_loss_mlp": 1.02063394, + "epoch": 0.26622576281376825, + "flos": 28368353886840.0, + "grad_norm": 1.6328985547969932, + "language_loss": 0.77404618, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79853356, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.15612793, + "step": 4428, + "time_per_iteration": 2.7877650260925293 + }, + { + "auxiliary_loss_clip": 0.01428567, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.28516948, + "balance_loss_mlp": 1.01989603, + "epoch": 0.2662858860664362, + "flos": 18045966318120.0, + "grad_norm": 1.75128121844075, + "language_loss": 0.82841903, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.85306716, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.16357422, + "step": 4429, + "time_per_iteration": 2.755307197570801 + }, + { + "auxiliary_loss_clip": 0.01420887, + "auxiliary_loss_mlp": 0.01043356, + "balance_loss_clip": 1.28048408, + "balance_loss_mlp": 1.02348351, + "epoch": 0.2663460093191042, + "flos": 22750771967640.0, + "grad_norm": 1.6066907049804497, + "language_loss": 0.73201317, + "learning_rate": 3.442090102943143e-06, + "loss": 0.75665557, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.1986084, + "step": 4430, + "time_per_iteration": 2.7471799850463867 + }, + { + "auxiliary_loss_clip": 0.01424199, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_clip": 1.28326964, + "balance_loss_mlp": 1.03064024, + "epoch": 0.26640613257177215, + "flos": 16513186029840.0, + "grad_norm": 1.9037577047665346, + "language_loss": 0.82666123, + "learning_rate": 3.441820222206035e-06, + "loss": 0.8513906, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.1809082, + "step": 4431, + "time_per_iteration": 2.7527496814727783 + }, + { + "auxiliary_loss_clip": 0.01435053, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_clip": 1.28793335, + "balance_loss_mlp": 1.02894604, + "epoch": 0.2664662558244401, + "flos": 23081171187240.0, + "grad_norm": 2.1211904859400152, + "language_loss": 0.76883245, + "learning_rate": 3.44155028679496e-06, + "loss": 0.79365933, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.18688965, + "step": 4432, + "time_per_iteration": 2.762880802154541 + }, + { + "auxiliary_loss_clip": 0.0142743, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.28340161, + "balance_loss_mlp": 1.0264169, + "epoch": 0.2665263790771081, + "flos": 23774601766680.0, + "grad_norm": 2.2112242886363456, + "language_loss": 0.82813501, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85284841, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17480469, + "step": 4433, + "time_per_iteration": 4.310573101043701 + }, + { + "auxiliary_loss_clip": 0.0142364, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_clip": 1.28320193, + "balance_loss_mlp": 1.02457249, + "epoch": 0.26658650232977604, + "flos": 28007109294840.0, + "grad_norm": 1.7871887726824824, + "language_loss": 0.77235627, + "learning_rate": 3.441010251991854e-06, + "loss": 0.79701149, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17333984, + "step": 4434, + "time_per_iteration": 2.8638415336608887 + }, + { + "auxiliary_loss_clip": 0.01416242, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.27715659, + "balance_loss_mlp": 1.02047777, + "epoch": 0.266646625582444, + "flos": 22168719784440.0, + "grad_norm": 1.7149657837141101, + "language_loss": 0.82698321, + "learning_rate": 3.440740152620301e-06, + "loss": 0.85152411, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.17358398, + "step": 4435, + "time_per_iteration": 2.733372688293457 + }, + { + "auxiliary_loss_clip": 0.01429218, + "auxiliary_loss_mlp": 0.01055394, + "balance_loss_clip": 1.28339529, + "balance_loss_mlp": 1.0364759, + "epoch": 0.266706748835112, + "flos": 27859119139080.0, + "grad_norm": 2.080831211757193, + "language_loss": 0.87610328, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.90094936, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.18908691, + "step": 4436, + "time_per_iteration": 2.835693597793579 + }, + { + "auxiliary_loss_clip": 0.01431794, + "auxiliary_loss_mlp": 0.01039384, + "balance_loss_clip": 1.28697491, + "balance_loss_mlp": 1.02217054, + "epoch": 0.26676687208777994, + "flos": 25817672620080.0, + "grad_norm": 1.5038600092258032, + "language_loss": 0.78613257, + "learning_rate": 3.440199789988407e-06, + "loss": 0.8108443, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.17199707, + "step": 4437, + "time_per_iteration": 4.289813756942749 + }, + { + "auxiliary_loss_clip": 0.01421401, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_clip": 1.27977145, + "balance_loss_mlp": 1.02395403, + "epoch": 0.2668269953404479, + "flos": 36071466022800.0, + "grad_norm": 2.0596767242117053, + "language_loss": 0.64912391, + "learning_rate": 3.439929526748556e-06, + "loss": 0.67374986, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.17224121, + "step": 4438, + "time_per_iteration": 4.428094387054443 + }, + { + "auxiliary_loss_clip": 0.01427686, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.28497589, + "balance_loss_mlp": 1.02267575, + "epoch": 0.26688711859311587, + "flos": 26575677137880.0, + "grad_norm": 1.7427476989420259, + "language_loss": 0.7627992, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.78747094, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.16796875, + "step": 4439, + "time_per_iteration": 2.7682456970214844 + }, + { + "auxiliary_loss_clip": 0.0142722, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.28279018, + "balance_loss_mlp": 1.01849377, + "epoch": 0.26694724184578383, + "flos": 26767344907800.0, + "grad_norm": 2.0236432628252037, + "language_loss": 0.71901602, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.74366212, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.18908691, + "step": 4440, + "time_per_iteration": 2.8224406242370605 + }, + { + "auxiliary_loss_clip": 0.01427763, + "auxiliary_loss_mlp": 0.01042599, + "balance_loss_clip": 1.28332186, + "balance_loss_mlp": 1.02347767, + "epoch": 0.2670073650984518, + "flos": 20964511339560.0, + "grad_norm": 2.350400126530815, + "language_loss": 0.67149007, + "learning_rate": 3.439118409456376e-06, + "loss": 0.6961937, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.19128418, + "step": 4441, + "time_per_iteration": 2.731344223022461 + }, + { + "auxiliary_loss_clip": 0.0143015, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.28648877, + "balance_loss_mlp": 1.02723169, + "epoch": 0.2670674883511198, + "flos": 28371927422520.0, + "grad_norm": 1.4986941939821674, + "language_loss": 0.76940656, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.794164, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.18334961, + "step": 4442, + "time_per_iteration": 2.8308098316192627 + }, + { + "auxiliary_loss_clip": 0.01302137, + "auxiliary_loss_mlp": 0.01021829, + "balance_loss_clip": 1.2387892, + "balance_loss_mlp": 1.01870537, + "epoch": 0.2671276116037878, + "flos": 58985131414320.0, + "grad_norm": 0.9343645745927828, + "language_loss": 0.61303627, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63627589, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.03112793, + "step": 4443, + "time_per_iteration": 3.161325216293335 + }, + { + "auxiliary_loss_clip": 0.01432803, + "auxiliary_loss_mlp": 0.01040645, + "balance_loss_clip": 1.28915477, + "balance_loss_mlp": 1.02327585, + "epoch": 0.26718773485645575, + "flos": 43952926386600.0, + "grad_norm": 2.2194803359231075, + "language_loss": 0.76414073, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78887516, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.17358398, + "step": 4444, + "time_per_iteration": 2.955645799636841 + }, + { + "auxiliary_loss_clip": 0.01426777, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.28359175, + "balance_loss_mlp": 1.01963091, + "epoch": 0.2672478581091237, + "flos": 25233793060680.0, + "grad_norm": 1.6858990053021623, + "language_loss": 0.80611539, + "learning_rate": 3.438036155780158e-06, + "loss": 0.83076274, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.18347168, + "step": 4445, + "time_per_iteration": 2.807490587234497 + }, + { + "auxiliary_loss_clip": 0.01429106, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.28621697, + "balance_loss_mlp": 1.0190475, + "epoch": 0.2673079813617917, + "flos": 15272406433800.0, + "grad_norm": 1.9173653751968938, + "language_loss": 0.89501548, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91968334, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.1862793, + "step": 4446, + "time_per_iteration": 2.7178118228912354 + }, + { + "auxiliary_loss_clip": 0.01429123, + "auxiliary_loss_mlp": 0.01039565, + "balance_loss_clip": 1.28465474, + "balance_loss_mlp": 1.0220654, + "epoch": 0.26736810461445965, + "flos": 25818362962200.0, + "grad_norm": 1.6359541718536963, + "language_loss": 0.68656796, + "learning_rate": 3.437494701718153e-06, + "loss": 0.71125484, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.17504883, + "step": 4447, + "time_per_iteration": 2.8439717292785645 + }, + { + "auxiliary_loss_clip": 0.01428889, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.28503799, + "balance_loss_mlp": 1.02196789, + "epoch": 0.2674282278671276, + "flos": 24317605688760.0, + "grad_norm": 2.1111853986249107, + "language_loss": 0.83147037, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85615933, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.18041992, + "step": 4448, + "time_per_iteration": 2.757266044616699 + }, + { + "auxiliary_loss_clip": 0.01427719, + "auxiliary_loss_mlp": 0.01042803, + "balance_loss_clip": 1.28716445, + "balance_loss_mlp": 1.02473104, + "epoch": 0.2674883511197956, + "flos": 22820056475760.0, + "grad_norm": 1.7462748621233521, + "language_loss": 0.84612584, + "learning_rate": 3.436953029616378e-06, + "loss": 0.87083101, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.18066406, + "step": 4449, + "time_per_iteration": 2.775634765625 + }, + { + "auxiliary_loss_clip": 0.01445642, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.29353118, + "balance_loss_mlp": 1.0224452, + "epoch": 0.26754847437246354, + "flos": 25375123445400.0, + "grad_norm": 1.5702745390989625, + "language_loss": 0.83878815, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86366045, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.19140625, + "step": 4450, + "time_per_iteration": 2.801575183868408 + }, + { + "auxiliary_loss_clip": 0.01422444, + "auxiliary_loss_mlp": 0.01039668, + "balance_loss_clip": 1.28237414, + "balance_loss_mlp": 1.02259719, + "epoch": 0.2676085976251315, + "flos": 20235321776160.0, + "grad_norm": 1.714882812409534, + "language_loss": 0.81221867, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.8368398, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1706543, + "step": 4451, + "time_per_iteration": 2.7493348121643066 + }, + { + "auxiliary_loss_clip": 0.01431389, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.29083121, + "balance_loss_mlp": 1.02199316, + "epoch": 0.26766872087779947, + "flos": 28044127137960.0, + "grad_norm": 1.5862385172876152, + "language_loss": 0.86493623, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88963497, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.16479492, + "step": 4452, + "time_per_iteration": 2.813756227493286 + }, + { + "auxiliary_loss_clip": 0.01437942, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.29551339, + "balance_loss_mlp": 1.02106023, + "epoch": 0.26772884413046744, + "flos": 18328789521000.0, + "grad_norm": 2.207668231273939, + "language_loss": 0.83674991, + "learning_rate": 3.435869031622194e-06, + "loss": 0.86151803, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.17810059, + "step": 4453, + "time_per_iteration": 2.722576141357422 + }, + { + "auxiliary_loss_clip": 0.01433023, + "auxiliary_loss_mlp": 0.01046539, + "balance_loss_clip": 1.29112422, + "balance_loss_mlp": 1.02801383, + "epoch": 0.2677889673831354, + "flos": 22132635933600.0, + "grad_norm": 1.7934474686888975, + "language_loss": 0.79577547, + "learning_rate": 3.435597895977208e-06, + "loss": 0.82057112, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.18530273, + "step": 4454, + "time_per_iteration": 2.787609815597534 + }, + { + "auxiliary_loss_clip": 0.01441331, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.29695344, + "balance_loss_mlp": 1.02226543, + "epoch": 0.2678490906358034, + "flos": 23734619513280.0, + "grad_norm": 1.5855518285483552, + "language_loss": 0.73240101, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75721812, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.18103027, + "step": 4455, + "time_per_iteration": 2.779148578643799 + }, + { + "auxiliary_loss_clip": 0.01432196, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.29371095, + "balance_loss_mlp": 1.01781082, + "epoch": 0.2679092138884714, + "flos": 21767980239360.0, + "grad_norm": 1.4796958161867273, + "language_loss": 0.73887807, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76354909, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.17089844, + "step": 4456, + "time_per_iteration": 2.850728988647461 + }, + { + "auxiliary_loss_clip": 0.01450519, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.30341291, + "balance_loss_mlp": 1.02087784, + "epoch": 0.26796933714113935, + "flos": 19865468211840.0, + "grad_norm": 1.923561774061317, + "language_loss": 0.70853674, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73343623, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.18554688, + "step": 4457, + "time_per_iteration": 2.7883965969085693 + }, + { + "auxiliary_loss_clip": 0.01444976, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.30208397, + "balance_loss_mlp": 1.02342856, + "epoch": 0.2680294603938073, + "flos": 20052750278880.0, + "grad_norm": 1.457823362746268, + "language_loss": 0.79022515, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81508714, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.17797852, + "step": 4458, + "time_per_iteration": 2.911803960800171 + }, + { + "auxiliary_loss_clip": 0.01324961, + "auxiliary_loss_mlp": 0.01006078, + "balance_loss_clip": 1.26284635, + "balance_loss_mlp": 1.00233459, + "epoch": 0.2680895836464753, + "flos": 72130279218480.0, + "grad_norm": 0.8664912487387225, + "language_loss": 0.58814847, + "learning_rate": 3.434241401387739e-06, + "loss": 0.6114589, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.03735352, + "step": 4459, + "time_per_iteration": 3.2374587059020996 + }, + { + "auxiliary_loss_clip": 0.01436243, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.29579389, + "balance_loss_mlp": 1.02040911, + "epoch": 0.26814970689914325, + "flos": 20453936515920.0, + "grad_norm": 1.9862389020607603, + "language_loss": 0.85350239, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87823969, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.17077637, + "step": 4460, + "time_per_iteration": 2.7460787296295166 + }, + { + "auxiliary_loss_clip": 0.01435156, + "auxiliary_loss_mlp": 0.01046104, + "balance_loss_clip": 1.29544592, + "balance_loss_mlp": 1.02868807, + "epoch": 0.2682098301518112, + "flos": 17571191086800.0, + "grad_norm": 1.7807035617624347, + "language_loss": 0.68391335, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70872593, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.17407227, + "step": 4461, + "time_per_iteration": 2.750253915786743 + }, + { + "auxiliary_loss_clip": 0.01433219, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_clip": 1.29191041, + "balance_loss_mlp": 1.02940524, + "epoch": 0.2682699534044792, + "flos": 18337926402000.0, + "grad_norm": 1.4904149892864749, + "language_loss": 0.67700082, + "learning_rate": 3.43342685191282e-06, + "loss": 0.70180696, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17993164, + "step": 4462, + "time_per_iteration": 2.781240940093994 + }, + { + "auxiliary_loss_clip": 0.01433339, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.29324222, + "balance_loss_mlp": 1.01639032, + "epoch": 0.26833007665714714, + "flos": 25306772929560.0, + "grad_norm": 1.5695251613936456, + "language_loss": 0.69549906, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.7201755, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17932129, + "step": 4463, + "time_per_iteration": 2.8264410495758057 + }, + { + "auxiliary_loss_clip": 0.01438062, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.29484868, + "balance_loss_mlp": 1.01857686, + "epoch": 0.2683901999098151, + "flos": 16103106561960.0, + "grad_norm": 3.090320908834191, + "language_loss": 0.78172272, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80646336, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.17419434, + "step": 4464, + "time_per_iteration": 2.7219111919403076 + }, + { + "auxiliary_loss_clip": 0.01429761, + "auxiliary_loss_mlp": 0.01048037, + "balance_loss_clip": 1.29002512, + "balance_loss_mlp": 1.03029859, + "epoch": 0.2684503231624831, + "flos": 27313800540480.0, + "grad_norm": 2.2834726979873814, + "language_loss": 0.71607673, + "learning_rate": 3.432611813236704e-06, + "loss": 0.74085468, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.17736816, + "step": 4465, + "time_per_iteration": 4.225724935531616 + }, + { + "auxiliary_loss_clip": 0.01309208, + "auxiliary_loss_mlp": 0.01010971, + "balance_loss_clip": 1.24800479, + "balance_loss_mlp": 1.00744236, + "epoch": 0.26851044641515104, + "flos": 71874443593800.0, + "grad_norm": 0.688060758203873, + "language_loss": 0.53106153, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55426329, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03540039, + "step": 4466, + "time_per_iteration": 3.430128812789917 + }, + { + "auxiliary_loss_clip": 0.01433118, + "auxiliary_loss_mlp": 0.01049111, + "balance_loss_clip": 1.29198146, + "balance_loss_mlp": 1.03140819, + "epoch": 0.268570569667819, + "flos": 18738381688560.0, + "grad_norm": 1.776189056440991, + "language_loss": 0.74089617, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76571846, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17700195, + "step": 4467, + "time_per_iteration": 2.72672963142395 + }, + { + "auxiliary_loss_clip": 0.01434574, + "auxiliary_loss_mlp": 0.01046019, + "balance_loss_clip": 1.29132593, + "balance_loss_mlp": 1.02837586, + "epoch": 0.268630692920487, + "flos": 18181855182600.0, + "grad_norm": 2.2676690900922294, + "language_loss": 0.8133626, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.8381685, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.17651367, + "step": 4468, + "time_per_iteration": 2.808789014816284 + }, + { + "auxiliary_loss_clip": 0.01305592, + "auxiliary_loss_mlp": 0.01008319, + "balance_loss_clip": 1.24509084, + "balance_loss_mlp": 1.00535107, + "epoch": 0.268690816173155, + "flos": 68749303907160.0, + "grad_norm": 0.8444639329356811, + "language_loss": 0.5964185, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61955762, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.02966309, + "step": 4469, + "time_per_iteration": 3.3328914642333984 + }, + { + "auxiliary_loss_clip": 0.01435203, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.29187417, + "balance_loss_mlp": 1.02355194, + "epoch": 0.26875093942582295, + "flos": 23298730109640.0, + "grad_norm": 2.081210510610432, + "language_loss": 0.81446958, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83924991, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.19274902, + "step": 4470, + "time_per_iteration": 2.7778446674346924 + }, + { + "auxiliary_loss_clip": 0.01420989, + "auxiliary_loss_mlp": 0.01042836, + "balance_loss_clip": 1.2843318, + "balance_loss_mlp": 1.02520525, + "epoch": 0.2688110626784909, + "flos": 21548472115680.0, + "grad_norm": 1.5590455325266712, + "language_loss": 0.82898295, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.85362118, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.17626953, + "step": 4471, + "time_per_iteration": 4.328341007232666 + }, + { + "auxiliary_loss_clip": 0.0142367, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.28774464, + "balance_loss_mlp": 1.02038085, + "epoch": 0.2688711859311589, + "flos": 28406224505520.0, + "grad_norm": 2.2255343550558524, + "language_loss": 0.69176787, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71637589, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.16760254, + "step": 4472, + "time_per_iteration": 2.831976890563965 + }, + { + "auxiliary_loss_clip": 0.01429153, + "auxiliary_loss_mlp": 0.01040782, + "balance_loss_clip": 1.29030609, + "balance_loss_mlp": 1.02340114, + "epoch": 0.26893130918382685, + "flos": 26000568984240.0, + "grad_norm": 1.702161141096559, + "language_loss": 0.67934418, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70404357, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.17352295, + "step": 4473, + "time_per_iteration": 2.793999671936035 + }, + { + "auxiliary_loss_clip": 0.01427039, + "auxiliary_loss_mlp": 0.01041141, + "balance_loss_clip": 1.28981984, + "balance_loss_mlp": 1.02489281, + "epoch": 0.2689914324364948, + "flos": 20344507320960.0, + "grad_norm": 1.650473689110527, + "language_loss": 0.83331108, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85799289, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16247559, + "step": 4474, + "time_per_iteration": 2.798969268798828 + }, + { + "auxiliary_loss_clip": 0.01430005, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.29308486, + "balance_loss_mlp": 1.02873945, + "epoch": 0.2690515556891628, + "flos": 19469398628160.0, + "grad_norm": 1.8207165062605508, + "language_loss": 0.70779932, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.73255664, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.1697998, + "step": 4475, + "time_per_iteration": 4.283073902130127 + }, + { + "auxiliary_loss_clip": 0.01438639, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.29592657, + "balance_loss_mlp": 1.01956856, + "epoch": 0.26911167894183075, + "flos": 18150725551680.0, + "grad_norm": 1.6833183424199945, + "language_loss": 0.73388118, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75863713, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.17382812, + "step": 4476, + "time_per_iteration": 4.273772239685059 + }, + { + "auxiliary_loss_clip": 0.01435144, + "auxiliary_loss_mlp": 0.01040781, + "balance_loss_clip": 1.29680514, + "balance_loss_mlp": 1.02424645, + "epoch": 0.2691718021944987, + "flos": 19980014060160.0, + "grad_norm": 1.5326084064036798, + "language_loss": 0.81047273, + "learning_rate": 3.429346772085922e-06, + "loss": 0.83523202, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.16540527, + "step": 4477, + "time_per_iteration": 2.749411106109619 + }, + { + "auxiliary_loss_clip": 0.01439881, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.29607093, + "balance_loss_mlp": 1.02531207, + "epoch": 0.2692319254471667, + "flos": 37453779045360.0, + "grad_norm": 1.5873992473333047, + "language_loss": 0.6550467, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67987692, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.17822266, + "step": 4478, + "time_per_iteration": 2.9006738662719727 + }, + { + "auxiliary_loss_clip": 0.0143398, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.29297948, + "balance_loss_mlp": 1.02457428, + "epoch": 0.26929204869983464, + "flos": 22132960800480.0, + "grad_norm": 1.931937574471386, + "language_loss": 0.80990148, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83465415, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.16723633, + "step": 4479, + "time_per_iteration": 2.766954183578491 + }, + { + "auxiliary_loss_clip": 0.01443234, + "auxiliary_loss_mlp": 0.01040091, + "balance_loss_clip": 1.30070782, + "balance_loss_mlp": 1.02173257, + "epoch": 0.2693521719525026, + "flos": 19797686213040.0, + "grad_norm": 2.1217457115229346, + "language_loss": 0.81246734, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.8373006, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.18347168, + "step": 4480, + "time_per_iteration": 2.781440019607544 + }, + { + "auxiliary_loss_clip": 0.01429941, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.29326582, + "balance_loss_mlp": 1.02177548, + "epoch": 0.2694122952051706, + "flos": 20999092681080.0, + "grad_norm": 1.4307466318807542, + "language_loss": 0.77813971, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80281603, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.15905762, + "step": 4481, + "time_per_iteration": 2.7322001457214355 + }, + { + "auxiliary_loss_clip": 0.01434824, + "auxiliary_loss_mlp": 0.01043986, + "balance_loss_clip": 1.294173, + "balance_loss_mlp": 1.02528167, + "epoch": 0.2694724184578386, + "flos": 25854934113360.0, + "grad_norm": 1.6251389941606207, + "language_loss": 0.7432822, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76807034, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.18701172, + "step": 4482, + "time_per_iteration": 2.8183341026306152 + }, + { + "auxiliary_loss_clip": 0.01441188, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.29905105, + "balance_loss_mlp": 1.02067423, + "epoch": 0.26953254171050656, + "flos": 21732140038680.0, + "grad_norm": 2.0182361469269345, + "language_loss": 0.72745287, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.75225353, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.18200684, + "step": 4483, + "time_per_iteration": 2.7542564868927 + }, + { + "auxiliary_loss_clip": 0.01439456, + "auxiliary_loss_mlp": 0.01040352, + "balance_loss_clip": 1.29596221, + "balance_loss_mlp": 1.02197003, + "epoch": 0.2695926649631745, + "flos": 19687729109400.0, + "grad_norm": 2.418980043925866, + "language_loss": 0.87060583, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89540392, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.18383789, + "step": 4484, + "time_per_iteration": 2.757899284362793 + }, + { + "auxiliary_loss_clip": 0.01442137, + "auxiliary_loss_mlp": 0.01046388, + "balance_loss_clip": 1.29969335, + "balance_loss_mlp": 1.02887583, + "epoch": 0.2696527882158425, + "flos": 32892577848720.0, + "grad_norm": 1.5857985627343099, + "language_loss": 0.66797471, + "learning_rate": 3.427165740807239e-06, + "loss": 0.69285995, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1751709, + "step": 4485, + "time_per_iteration": 2.8883097171783447 + }, + { + "auxiliary_loss_clip": 0.0144, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.29852724, + "balance_loss_mlp": 1.02348471, + "epoch": 0.26971291146851045, + "flos": 12127937167800.0, + "grad_norm": 3.8500625836052977, + "language_loss": 0.72995579, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75476694, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.1762085, + "step": 4486, + "time_per_iteration": 2.717233180999756 + }, + { + "auxiliary_loss_clip": 0.01443651, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.30026329, + "balance_loss_mlp": 1.02208614, + "epoch": 0.2697730347211784, + "flos": 22639149921240.0, + "grad_norm": 1.7381007011519998, + "language_loss": 0.84073222, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86555827, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.16882324, + "step": 4487, + "time_per_iteration": 2.7917590141296387 + }, + { + "auxiliary_loss_clip": 0.01448192, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.30166423, + "balance_loss_mlp": 1.02047873, + "epoch": 0.2698331579738464, + "flos": 23518238233320.0, + "grad_norm": 1.988939293499908, + "language_loss": 0.72359049, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.74847984, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.20263672, + "step": 4488, + "time_per_iteration": 2.775792360305786 + }, + { + "auxiliary_loss_clip": 0.0144252, + "auxiliary_loss_mlp": 0.01046036, + "balance_loss_clip": 1.30038309, + "balance_loss_mlp": 1.02619958, + "epoch": 0.26989328122651435, + "flos": 24645974490360.0, + "grad_norm": 3.1648886217479992, + "language_loss": 0.84052444, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86540997, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.19824219, + "step": 4489, + "time_per_iteration": 2.8551125526428223 + }, + { + "auxiliary_loss_clip": 0.01442698, + "auxiliary_loss_mlp": 0.01056283, + "balance_loss_clip": 1.29652488, + "balance_loss_mlp": 1.03746009, + "epoch": 0.2699534044791823, + "flos": 10775170050120.0, + "grad_norm": 2.3922477628181666, + "language_loss": 0.89947075, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92446053, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.18835449, + "step": 4490, + "time_per_iteration": 2.7141454219818115 + }, + { + "auxiliary_loss_clip": 0.01431423, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.2929436, + "balance_loss_mlp": 1.01988971, + "epoch": 0.2700135277318503, + "flos": 36178174457640.0, + "grad_norm": 2.086707478483441, + "language_loss": 0.73541617, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.76009756, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16833496, + "step": 4491, + "time_per_iteration": 2.912015914916992 + }, + { + "auxiliary_loss_clip": 0.01440934, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.29915011, + "balance_loss_mlp": 1.02780461, + "epoch": 0.27007365098451824, + "flos": 17422835455800.0, + "grad_norm": 2.335665278656338, + "language_loss": 0.74501169, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7698772, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.17822266, + "step": 4492, + "time_per_iteration": 2.7346832752227783 + }, + { + "auxiliary_loss_clip": 0.01438339, + "auxiliary_loss_mlp": 0.01047901, + "balance_loss_clip": 1.29731691, + "balance_loss_mlp": 1.02991176, + "epoch": 0.2701337742371862, + "flos": 23190965857440.0, + "grad_norm": 1.7270140044933922, + "language_loss": 0.89456475, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91942716, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17980957, + "step": 4493, + "time_per_iteration": 2.833934783935547 + }, + { + "auxiliary_loss_clip": 0.01444875, + "auxiliary_loss_mlp": 0.01044949, + "balance_loss_clip": 1.30254459, + "balance_loss_mlp": 1.02762771, + "epoch": 0.2701938974898542, + "flos": 24394849435440.0, + "grad_norm": 1.4377848505475646, + "language_loss": 0.71342891, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73832715, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.17333984, + "step": 4494, + "time_per_iteration": 2.8220627307891846 + }, + { + "auxiliary_loss_clip": 0.01441877, + "auxiliary_loss_mlp": 0.01040666, + "balance_loss_clip": 1.30301332, + "balance_loss_mlp": 1.02377367, + "epoch": 0.2702540207425222, + "flos": 26220036499560.0, + "grad_norm": 1.8233208847498898, + "language_loss": 0.86605585, + "learning_rate": 3.42443458168683e-06, + "loss": 0.8908813, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16882324, + "step": 4495, + "time_per_iteration": 2.786681652069092 + }, + { + "auxiliary_loss_clip": 0.01444698, + "auxiliary_loss_mlp": 0.01054428, + "balance_loss_clip": 1.3016181, + "balance_loss_mlp": 1.03678489, + "epoch": 0.27031414399519016, + "flos": 22931110005120.0, + "grad_norm": 1.7128776203829237, + "language_loss": 0.76325053, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78824174, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17626953, + "step": 4496, + "time_per_iteration": 2.8478386402130127 + }, + { + "auxiliary_loss_clip": 0.01316415, + "auxiliary_loss_mlp": 0.01010206, + "balance_loss_clip": 1.25680101, + "balance_loss_mlp": 1.00632024, + "epoch": 0.2703742672478581, + "flos": 63033930411120.0, + "grad_norm": 0.7341229033881864, + "language_loss": 0.50235653, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52562284, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.03881836, + "step": 4497, + "time_per_iteration": 3.305100202560425 + }, + { + "auxiliary_loss_clip": 0.01452711, + "auxiliary_loss_mlp": 0.01050446, + "balance_loss_clip": 1.31052113, + "balance_loss_mlp": 1.03417349, + "epoch": 0.2704343905005261, + "flos": 18845130731760.0, + "grad_norm": 1.6959330234345502, + "language_loss": 0.72471809, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74974966, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.16271973, + "step": 4498, + "time_per_iteration": 2.7851107120513916 + }, + { + "auxiliary_loss_clip": 0.01305495, + "auxiliary_loss_mlp": 0.01015598, + "balance_loss_clip": 1.24515676, + "balance_loss_mlp": 1.01187825, + "epoch": 0.27049451375319405, + "flos": 71249201096760.0, + "grad_norm": 0.7618869973252423, + "language_loss": 0.59138954, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61460048, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.03710938, + "step": 4499, + "time_per_iteration": 3.257486581802368 + }, + { + "auxiliary_loss_clip": 0.01449454, + "auxiliary_loss_mlp": 0.01054891, + "balance_loss_clip": 1.30746746, + "balance_loss_mlp": 1.03689039, + "epoch": 0.270554637005862, + "flos": 24283795906080.0, + "grad_norm": 1.880600289339255, + "language_loss": 0.74285626, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.76789963, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17993164, + "step": 4500, + "time_per_iteration": 2.764744281768799 + }, + { + "auxiliary_loss_clip": 0.01445839, + "auxiliary_loss_mlp": 0.01060953, + "balance_loss_clip": 1.3042891, + "balance_loss_mlp": 1.0432508, + "epoch": 0.27061476025853, + "flos": 17635886850240.0, + "grad_norm": 3.3997744052768035, + "language_loss": 0.8133809, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.83844882, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.17712402, + "step": 4501, + "time_per_iteration": 2.8888285160064697 + }, + { + "auxiliary_loss_clip": 0.01451576, + "auxiliary_loss_mlp": 0.01057992, + "balance_loss_clip": 1.30549061, + "balance_loss_mlp": 1.03997934, + "epoch": 0.27067488351119795, + "flos": 22715175417120.0, + "grad_norm": 1.6901815525506867, + "language_loss": 0.72519052, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75028622, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.18005371, + "step": 4502, + "time_per_iteration": 2.989898920059204 + }, + { + "auxiliary_loss_clip": 0.01450904, + "auxiliary_loss_mlp": 0.01058429, + "balance_loss_clip": 1.30421269, + "balance_loss_mlp": 1.04015458, + "epoch": 0.2707350067638659, + "flos": 41727202819200.0, + "grad_norm": 1.8911261009922316, + "language_loss": 0.68821287, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.71330625, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.18273926, + "step": 4503, + "time_per_iteration": 2.9182887077331543 + }, + { + "auxiliary_loss_clip": 0.01446577, + "auxiliary_loss_mlp": 0.01058435, + "balance_loss_clip": 1.30277073, + "balance_loss_mlp": 1.04087496, + "epoch": 0.2707951300165339, + "flos": 20197816632720.0, + "grad_norm": 1.8521239006492167, + "language_loss": 0.68288541, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70793557, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.17565918, + "step": 4504, + "time_per_iteration": 4.170639276504517 + }, + { + "auxiliary_loss_clip": 0.01444669, + "auxiliary_loss_mlp": 0.01064757, + "balance_loss_clip": 1.30342722, + "balance_loss_mlp": 1.04813886, + "epoch": 0.27085525326920185, + "flos": 21438961704000.0, + "grad_norm": 1.4942243758709093, + "language_loss": 0.75638485, + "learning_rate": 3.421698021097902e-06, + "loss": 0.78147912, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.16601562, + "step": 4505, + "time_per_iteration": 2.7408993244171143 + }, + { + "auxiliary_loss_clip": 0.01447662, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.30147219, + "balance_loss_mlp": 1.03517711, + "epoch": 0.2709153765218698, + "flos": 17679645681120.0, + "grad_norm": 2.6119131580042616, + "language_loss": 0.73277771, + "learning_rate": 3.42142406835758e-06, + "loss": 0.7577951, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.18896484, + "step": 4506, + "time_per_iteration": 2.7964718341827393 + }, + { + "auxiliary_loss_clip": 0.01444865, + "auxiliary_loss_mlp": 0.01055696, + "balance_loss_clip": 1.30001235, + "balance_loss_mlp": 1.03638411, + "epoch": 0.2709754997745378, + "flos": 24460844666400.0, + "grad_norm": 1.8171122350986109, + "language_loss": 0.8109709, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83597648, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.1932373, + "step": 4507, + "time_per_iteration": 2.777769088745117 + }, + { + "auxiliary_loss_clip": 0.01279784, + "auxiliary_loss_mlp": 0.01004186, + "balance_loss_clip": 1.21936882, + "balance_loss_mlp": 1.00061023, + "epoch": 0.2710356230272058, + "flos": 65225478720600.0, + "grad_norm": 1.0247420592384413, + "language_loss": 0.50927973, + "learning_rate": 3.420876001185698e-06, + "loss": 0.53211951, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.03564453, + "step": 4508, + "time_per_iteration": 3.193495512008667 + }, + { + "auxiliary_loss_clip": 0.0143597, + "auxiliary_loss_mlp": 0.01040951, + "balance_loss_clip": 1.29546845, + "balance_loss_mlp": 1.02433312, + "epoch": 0.27109574627987376, + "flos": 25490034768960.0, + "grad_norm": 1.9863521651904723, + "language_loss": 0.75137126, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.77614045, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.16638184, + "step": 4509, + "time_per_iteration": 2.8558290004730225 + }, + { + "auxiliary_loss_clip": 0.01430192, + "auxiliary_loss_mlp": 0.01037379, + "balance_loss_clip": 1.29118228, + "balance_loss_mlp": 1.02123833, + "epoch": 0.2711558695325417, + "flos": 19687566675960.0, + "grad_norm": 1.5910517277739922, + "language_loss": 0.71900916, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.74368489, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.16125488, + "step": 4510, + "time_per_iteration": 4.363580226898193 + }, + { + "auxiliary_loss_clip": 0.01441658, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.30050611, + "balance_loss_mlp": 1.01936889, + "epoch": 0.2712159927852097, + "flos": 18592340734080.0, + "grad_norm": 2.9369733269595883, + "language_loss": 0.70473182, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72951472, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17260742, + "step": 4511, + "time_per_iteration": 2.7515480518341064 + }, + { + "auxiliary_loss_clip": 0.01437388, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.29271603, + "balance_loss_mlp": 1.01906264, + "epoch": 0.27127611603787766, + "flos": 25635832073280.0, + "grad_norm": 2.0912413496530875, + "language_loss": 0.81358945, + "learning_rate": 3.419779220367979e-06, + "loss": 0.8383342, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.18005371, + "step": 4512, + "time_per_iteration": 2.804973840713501 + }, + { + "auxiliary_loss_clip": 0.01435956, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.29538417, + "balance_loss_mlp": 1.01999426, + "epoch": 0.2713362392905456, + "flos": 23154191664480.0, + "grad_norm": 1.4131071119731877, + "language_loss": 0.80534887, + "learning_rate": 3.419504890542124e-06, + "loss": 0.83006704, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.15869141, + "step": 4513, + "time_per_iteration": 2.755234479904175 + }, + { + "auxiliary_loss_clip": 0.01437178, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.2945112, + "balance_loss_mlp": 1.02301502, + "epoch": 0.2713963625432136, + "flos": 18370233675360.0, + "grad_norm": 1.832757327972247, + "language_loss": 0.88426363, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90903449, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.16882324, + "step": 4514, + "time_per_iteration": 4.267492771148682 + }, + { + "auxiliary_loss_clip": 0.01436961, + "auxiliary_loss_mlp": 0.01040762, + "balance_loss_clip": 1.29481006, + "balance_loss_mlp": 1.02288079, + "epoch": 0.27145648579588155, + "flos": 22496844935880.0, + "grad_norm": 1.6785705241972393, + "language_loss": 0.92293358, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94771075, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17883301, + "step": 4515, + "time_per_iteration": 4.359050273895264 + }, + { + "auxiliary_loss_clip": 0.0144747, + "auxiliary_loss_mlp": 0.01054201, + "balance_loss_clip": 1.3004849, + "balance_loss_mlp": 1.03412604, + "epoch": 0.2715166090485495, + "flos": 19243149516720.0, + "grad_norm": 3.5358134603972475, + "language_loss": 0.74020082, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76521754, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.20056152, + "step": 4516, + "time_per_iteration": 2.7277157306671143 + }, + { + "auxiliary_loss_clip": 0.01438745, + "auxiliary_loss_mlp": 0.01046555, + "balance_loss_clip": 1.2977488, + "balance_loss_mlp": 1.02822065, + "epoch": 0.2715767323012175, + "flos": 17713739722320.0, + "grad_norm": 1.9765355299395293, + "language_loss": 0.76584911, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.7907021, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.18310547, + "step": 4517, + "time_per_iteration": 2.7497527599334717 + }, + { + "auxiliary_loss_clip": 0.01431308, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_clip": 1.29036021, + "balance_loss_mlp": 1.0317862, + "epoch": 0.27163685555388545, + "flos": 22387984257960.0, + "grad_norm": 2.325654171917609, + "language_loss": 0.77370483, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79851449, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17858887, + "step": 4518, + "time_per_iteration": 2.774475336074829 + }, + { + "auxiliary_loss_clip": 0.01434891, + "auxiliary_loss_mlp": 0.01044038, + "balance_loss_clip": 1.29279733, + "balance_loss_mlp": 1.02703929, + "epoch": 0.2716969788065534, + "flos": 22351900407120.0, + "grad_norm": 1.5764156142091363, + "language_loss": 0.68429857, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70908791, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.16986084, + "step": 4519, + "time_per_iteration": 2.7696070671081543 + }, + { + "auxiliary_loss_clip": 0.01433786, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.29286337, + "balance_loss_mlp": 1.01969767, + "epoch": 0.2717571020592214, + "flos": 25233955494120.0, + "grad_norm": 3.584897437470708, + "language_loss": 0.75769472, + "learning_rate": 3.417583075166451e-06, + "loss": 0.78239626, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16662598, + "step": 4520, + "time_per_iteration": 2.7904040813446045 + }, + { + "auxiliary_loss_clip": 0.01437956, + "auxiliary_loss_mlp": 0.01045825, + "balance_loss_clip": 1.29459906, + "balance_loss_mlp": 1.02724004, + "epoch": 0.2718172253118894, + "flos": 20194121271960.0, + "grad_norm": 2.20408701430007, + "language_loss": 0.76302123, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78785902, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.18566895, + "step": 4521, + "time_per_iteration": 2.731673240661621 + }, + { + "auxiliary_loss_clip": 0.01447067, + "auxiliary_loss_mlp": 0.0104977, + "balance_loss_clip": 1.30162001, + "balance_loss_mlp": 1.03164983, + "epoch": 0.27187734856455736, + "flos": 14323180838040.0, + "grad_norm": 2.094086339329889, + "language_loss": 0.75143516, + "learning_rate": 3.417033501108875e-06, + "loss": 0.77640355, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.18115234, + "step": 4522, + "time_per_iteration": 2.7466959953308105 + }, + { + "auxiliary_loss_clip": 0.01442726, + "auxiliary_loss_mlp": 0.01044163, + "balance_loss_clip": 1.29744244, + "balance_loss_mlp": 1.02593613, + "epoch": 0.27193747181722533, + "flos": 21112907578920.0, + "grad_norm": 1.6835171016131933, + "language_loss": 0.725734, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75060284, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.18225098, + "step": 4523, + "time_per_iteration": 2.7661993503570557 + }, + { + "auxiliary_loss_clip": 0.01437436, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_clip": 1.29741216, + "balance_loss_mlp": 1.02392507, + "epoch": 0.2719975950698933, + "flos": 19687566675960.0, + "grad_norm": 1.4998503910223986, + "language_loss": 0.74354804, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.7683506, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.18908691, + "step": 4524, + "time_per_iteration": 2.7863223552703857 + }, + { + "auxiliary_loss_clip": 0.01443368, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_clip": 1.29996562, + "balance_loss_mlp": 1.02366066, + "epoch": 0.27205771832256126, + "flos": 24759951821640.0, + "grad_norm": 1.5415711452289618, + "language_loss": 0.76087403, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78572559, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.18139648, + "step": 4525, + "time_per_iteration": 2.776236057281494 + }, + { + "auxiliary_loss_clip": 0.01434234, + "auxiliary_loss_mlp": 0.01047441, + "balance_loss_clip": 1.29412937, + "balance_loss_mlp": 1.02974987, + "epoch": 0.2721178415752292, + "flos": 21759736742280.0, + "grad_norm": 1.9868252854050228, + "language_loss": 0.8187201, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84353685, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.17700195, + "step": 4526, + "time_per_iteration": 2.7494022846221924 + }, + { + "auxiliary_loss_clip": 0.01448373, + "auxiliary_loss_mlp": 0.01044884, + "balance_loss_clip": 1.29895449, + "balance_loss_mlp": 1.02606082, + "epoch": 0.2721779648278972, + "flos": 12680605879560.0, + "grad_norm": 1.9648537367597827, + "language_loss": 0.77634013, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.80127275, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.18823242, + "step": 4527, + "time_per_iteration": 2.7823548316955566 + }, + { + "auxiliary_loss_clip": 0.01443132, + "auxiliary_loss_mlp": 0.01042445, + "balance_loss_clip": 1.30022991, + "balance_loss_mlp": 1.0244925, + "epoch": 0.27223808808056515, + "flos": 16257553446960.0, + "grad_norm": 1.963838414308417, + "language_loss": 0.82372212, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84857792, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.17944336, + "step": 4528, + "time_per_iteration": 2.7831642627716064 + }, + { + "auxiliary_loss_clip": 0.01428912, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.28926933, + "balance_loss_mlp": 1.02127469, + "epoch": 0.2722982113332331, + "flos": 27751598537040.0, + "grad_norm": 2.791847627382441, + "language_loss": 0.78246093, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.80713248, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.16967773, + "step": 4529, + "time_per_iteration": 2.826259136199951 + }, + { + "auxiliary_loss_clip": 0.01438493, + "auxiliary_loss_mlp": 0.01043673, + "balance_loss_clip": 1.29340029, + "balance_loss_mlp": 1.02693582, + "epoch": 0.2723583345859011, + "flos": 21731327871480.0, + "grad_norm": 1.8506649228100525, + "language_loss": 0.82707292, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.85189462, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.1673584, + "step": 4530, + "time_per_iteration": 2.861351490020752 + }, + { + "auxiliary_loss_clip": 0.01434035, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_clip": 1.29233408, + "balance_loss_mlp": 1.02433944, + "epoch": 0.27241845783856905, + "flos": 17351682963120.0, + "grad_norm": 1.9781369519275975, + "language_loss": 0.91873491, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.94349682, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.17822266, + "step": 4531, + "time_per_iteration": 2.7408974170684814 + }, + { + "auxiliary_loss_clip": 0.01441023, + "auxiliary_loss_mlp": 0.01044522, + "balance_loss_clip": 1.29468036, + "balance_loss_mlp": 1.02724862, + "epoch": 0.272478581091237, + "flos": 24759748779840.0, + "grad_norm": 2.0676894912476724, + "language_loss": 0.76607692, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.7909323, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.17272949, + "step": 4532, + "time_per_iteration": 2.764880418777466 + }, + { + "auxiliary_loss_clip": 0.01431963, + "auxiliary_loss_mlp": 0.01042561, + "balance_loss_clip": 1.29326987, + "balance_loss_mlp": 1.02558613, + "epoch": 0.272538704343905, + "flos": 17894605668480.0, + "grad_norm": 2.4532212073297988, + "language_loss": 0.89190888, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91665417, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.16992188, + "step": 4533, + "time_per_iteration": 2.703697681427002 + }, + { + "auxiliary_loss_clip": 0.01426161, + "auxiliary_loss_mlp": 0.01035434, + "balance_loss_clip": 1.28782701, + "balance_loss_mlp": 1.01882863, + "epoch": 0.272598827596573, + "flos": 22937485517640.0, + "grad_norm": 1.7987775935023593, + "language_loss": 0.71497691, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73959285, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1661377, + "step": 4534, + "time_per_iteration": 2.7513225078582764 + }, + { + "auxiliary_loss_clip": 0.01437083, + "auxiliary_loss_mlp": 0.01042305, + "balance_loss_clip": 1.29317617, + "balance_loss_mlp": 1.02465057, + "epoch": 0.27265895084924097, + "flos": 24243123310560.0, + "grad_norm": 1.7403013888698502, + "language_loss": 0.9139728, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93876672, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.17651367, + "step": 4535, + "time_per_iteration": 2.786848306655884 + }, + { + "auxiliary_loss_clip": 0.0143755, + "auxiliary_loss_mlp": 0.0104742, + "balance_loss_clip": 1.29515803, + "balance_loss_mlp": 1.03033781, + "epoch": 0.27271907410190893, + "flos": 27018469962720.0, + "grad_norm": 1.5504408995293362, + "language_loss": 0.73026341, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75511312, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.17077637, + "step": 4536, + "time_per_iteration": 2.8154854774475098 + }, + { + "auxiliary_loss_clip": 0.01433961, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.29213607, + "balance_loss_mlp": 1.02513313, + "epoch": 0.2727791973545769, + "flos": 34458396360840.0, + "grad_norm": 1.6911514914676276, + "language_loss": 0.71743578, + "learning_rate": 3.41290485034781e-06, + "loss": 0.74220717, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.18041992, + "step": 4537, + "time_per_iteration": 2.907339096069336 + }, + { + "auxiliary_loss_clip": 0.01432035, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.28915739, + "balance_loss_mlp": 1.02597022, + "epoch": 0.27283932060724486, + "flos": 15044939071560.0, + "grad_norm": 6.022200857025042, + "language_loss": 0.7851398, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80990958, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.1895752, + "step": 4538, + "time_per_iteration": 2.7256462574005127 + }, + { + "auxiliary_loss_clip": 0.01432853, + "auxiliary_loss_mlp": 0.01049634, + "balance_loss_clip": 1.28866005, + "balance_loss_mlp": 1.03233695, + "epoch": 0.2728994438599128, + "flos": 21657373401960.0, + "grad_norm": 1.4355909572902308, + "language_loss": 0.90443701, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92926186, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.1730957, + "step": 4539, + "time_per_iteration": 2.7641897201538086 + }, + { + "auxiliary_loss_clip": 0.01431503, + "auxiliary_loss_mlp": 0.01044782, + "balance_loss_clip": 1.29182959, + "balance_loss_mlp": 1.02661395, + "epoch": 0.2729595671125808, + "flos": 17492241789000.0, + "grad_norm": 2.8902476120279883, + "language_loss": 0.88256228, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90732509, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.18164062, + "step": 4540, + "time_per_iteration": 2.7417445182800293 + }, + { + "auxiliary_loss_clip": 0.01432029, + "auxiliary_loss_mlp": 0.01036504, + "balance_loss_clip": 1.28945935, + "balance_loss_mlp": 1.01919436, + "epoch": 0.27301969036524876, + "flos": 19322910981720.0, + "grad_norm": 2.0214594477498586, + "language_loss": 0.82494402, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84962934, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.17297363, + "step": 4541, + "time_per_iteration": 4.197892904281616 + }, + { + "auxiliary_loss_clip": 0.01430333, + "auxiliary_loss_mlp": 0.01045048, + "balance_loss_clip": 1.2885015, + "balance_loss_mlp": 1.02772641, + "epoch": 0.2730798136179167, + "flos": 21070163957040.0, + "grad_norm": 1.750563947894255, + "language_loss": 0.80223477, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82698858, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.17333984, + "step": 4542, + "time_per_iteration": 2.74332857131958 + }, + { + "auxiliary_loss_clip": 0.01434951, + "auxiliary_loss_mlp": 0.01043203, + "balance_loss_clip": 1.29347086, + "balance_loss_mlp": 1.02535772, + "epoch": 0.2731399368705847, + "flos": 19176423335280.0, + "grad_norm": 1.870694175378794, + "language_loss": 0.90001446, + "learning_rate": 3.411250012687582e-06, + "loss": 0.92479604, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.1784668, + "step": 4543, + "time_per_iteration": 2.7828071117401123 + }, + { + "auxiliary_loss_clip": 0.0143936, + "auxiliary_loss_mlp": 0.01044902, + "balance_loss_clip": 1.29481113, + "balance_loss_mlp": 1.0263052, + "epoch": 0.27320006012325265, + "flos": 18293923920960.0, + "grad_norm": 4.223832442834931, + "language_loss": 0.64461529, + "learning_rate": 3.410974019048255e-06, + "loss": 0.66945797, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.18603516, + "step": 4544, + "time_per_iteration": 2.7176618576049805 + }, + { + "auxiliary_loss_clip": 0.01432664, + "auxiliary_loss_mlp": 0.01046408, + "balance_loss_clip": 1.29194736, + "balance_loss_mlp": 1.02856207, + "epoch": 0.2732601833759206, + "flos": 34867704269880.0, + "grad_norm": 1.5803982008752702, + "language_loss": 0.69758785, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72237855, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.17858887, + "step": 4545, + "time_per_iteration": 2.9061155319213867 + }, + { + "auxiliary_loss_clip": 0.01279243, + "auxiliary_loss_mlp": 0.01020675, + "balance_loss_clip": 1.22007358, + "balance_loss_mlp": 1.01728964, + "epoch": 0.2733203066285886, + "flos": 53925416076960.0, + "grad_norm": 0.7206438818092902, + "language_loss": 0.61701137, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.64001054, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.03393555, + "step": 4546, + "time_per_iteration": 3.3352317810058594 + }, + { + "auxiliary_loss_clip": 0.01437748, + "auxiliary_loss_mlp": 0.01055341, + "balance_loss_clip": 1.29650235, + "balance_loss_mlp": 1.03781724, + "epoch": 0.2733804298812566, + "flos": 20664307758600.0, + "grad_norm": 1.8522930287728625, + "language_loss": 0.64679933, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67173016, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.17529297, + "step": 4547, + "time_per_iteration": 2.7483108043670654 + }, + { + "auxiliary_loss_clip": 0.01420304, + "auxiliary_loss_mlp": 0.01047783, + "balance_loss_clip": 1.28357363, + "balance_loss_mlp": 1.03220844, + "epoch": 0.27344055313392457, + "flos": 25889799713400.0, + "grad_norm": 4.711126101658949, + "language_loss": 0.77775776, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.80243862, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.15582275, + "step": 4548, + "time_per_iteration": 4.257373809814453 + }, + { + "auxiliary_loss_clip": 0.01426947, + "auxiliary_loss_mlp": 0.01047137, + "balance_loss_clip": 1.28759933, + "balance_loss_mlp": 1.0314548, + "epoch": 0.27350067638659253, + "flos": 22934927190960.0, + "grad_norm": 2.0235182979407824, + "language_loss": 0.83293664, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.85767752, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.15686035, + "step": 4549, + "time_per_iteration": 2.8030571937561035 + }, + { + "auxiliary_loss_clip": 0.01434205, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.29224491, + "balance_loss_mlp": 1.02910566, + "epoch": 0.2735607996392605, + "flos": 16578328485240.0, + "grad_norm": 1.9723288960313945, + "language_loss": 0.71117043, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73598158, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.17797852, + "step": 4550, + "time_per_iteration": 2.7332229614257812 + }, + { + "auxiliary_loss_clip": 0.01422312, + "auxiliary_loss_mlp": 0.01048099, + "balance_loss_clip": 1.28408813, + "balance_loss_mlp": 1.03251874, + "epoch": 0.27362092289192846, + "flos": 19649939707440.0, + "grad_norm": 2.0643730120966715, + "language_loss": 0.78693455, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81163865, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.15600586, + "step": 4551, + "time_per_iteration": 2.82666277885437 + }, + { + "auxiliary_loss_clip": 0.01432006, + "auxiliary_loss_mlp": 0.0105147, + "balance_loss_clip": 1.29096889, + "balance_loss_mlp": 1.03468537, + "epoch": 0.27368104614459643, + "flos": 17643602438640.0, + "grad_norm": 2.295279714482716, + "language_loss": 0.7132616, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.73809636, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16784668, + "step": 4552, + "time_per_iteration": 2.738668203353882 + }, + { + "auxiliary_loss_clip": 0.01435118, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.29471564, + "balance_loss_mlp": 1.02903879, + "epoch": 0.2737411693972644, + "flos": 21585043266840.0, + "grad_norm": 2.1116449008357647, + "language_loss": 0.715657, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74046624, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.16760254, + "step": 4553, + "time_per_iteration": 4.269854545593262 + }, + { + "auxiliary_loss_clip": 0.01429956, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.28990102, + "balance_loss_mlp": 1.02715123, + "epoch": 0.27380129264993236, + "flos": 25489588077000.0, + "grad_norm": 1.5307080620260607, + "language_loss": 0.59149224, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.6162408, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17749023, + "step": 4554, + "time_per_iteration": 4.379674434661865 + }, + { + "auxiliary_loss_clip": 0.01443122, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.29689777, + "balance_loss_mlp": 1.01940179, + "epoch": 0.2738614159026003, + "flos": 18665848511640.0, + "grad_norm": 1.6708144065033455, + "language_loss": 0.74220312, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76699936, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.17114258, + "step": 4555, + "time_per_iteration": 2.690749168395996 + }, + { + "auxiliary_loss_clip": 0.01440958, + "auxiliary_loss_mlp": 0.01039426, + "balance_loss_clip": 1.29820311, + "balance_loss_mlp": 1.02196145, + "epoch": 0.2739215391552683, + "flos": 23482398032640.0, + "grad_norm": 1.7575201228333863, + "language_loss": 0.77920556, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80400932, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.17456055, + "step": 4556, + "time_per_iteration": 2.7774693965911865 + }, + { + "auxiliary_loss_clip": 0.01451286, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.29913688, + "balance_loss_mlp": 1.02457786, + "epoch": 0.27398166240793626, + "flos": 17133068223360.0, + "grad_norm": 1.824974278780477, + "language_loss": 0.8264159, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.85135841, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.18383789, + "step": 4557, + "time_per_iteration": 2.7269468307495117 + }, + { + "auxiliary_loss_clip": 0.01432134, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.29029477, + "balance_loss_mlp": 1.02046752, + "epoch": 0.2740417856606042, + "flos": 23410067897520.0, + "grad_norm": 1.7990472066970131, + "language_loss": 0.73531753, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.76001072, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.16711426, + "step": 4558, + "time_per_iteration": 2.748800277709961 + }, + { + "auxiliary_loss_clip": 0.01434662, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.29173946, + "balance_loss_mlp": 1.02628446, + "epoch": 0.2741019089132722, + "flos": 12783456520200.0, + "grad_norm": 2.258836003455774, + "language_loss": 0.68442225, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70921195, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.18005371, + "step": 4559, + "time_per_iteration": 2.7788593769073486 + }, + { + "auxiliary_loss_clip": 0.01428241, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.28856468, + "balance_loss_mlp": 1.02924919, + "epoch": 0.27416203216594015, + "flos": 20636467404840.0, + "grad_norm": 1.8636069173699237, + "language_loss": 0.72884429, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.75359601, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.17687988, + "step": 4560, + "time_per_iteration": 2.732606887817383 + }, + { + "auxiliary_loss_clip": 0.0144035, + "auxiliary_loss_mlp": 0.01043317, + "balance_loss_clip": 1.29703927, + "balance_loss_mlp": 1.02532792, + "epoch": 0.27422215541860817, + "flos": 26547187050360.0, + "grad_norm": 1.684099127163911, + "language_loss": 0.81765902, + "learning_rate": 3.406273949573303e-06, + "loss": 0.84249568, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.17980957, + "step": 4561, + "time_per_iteration": 2.809948682785034 + }, + { + "auxiliary_loss_clip": 0.01440737, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.2957449, + "balance_loss_mlp": 1.02460742, + "epoch": 0.27428227867127614, + "flos": 23336397686520.0, + "grad_norm": 1.685875977406773, + "language_loss": 0.75315785, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77799189, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.18066406, + "step": 4562, + "time_per_iteration": 2.7525174617767334 + }, + { + "auxiliary_loss_clip": 0.01439597, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.29694462, + "balance_loss_mlp": 1.01818371, + "epoch": 0.2743424019239441, + "flos": 23039767641240.0, + "grad_norm": 1.7232217236714034, + "language_loss": 0.74751532, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.77226484, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.17175293, + "step": 4563, + "time_per_iteration": 2.792729616165161 + }, + { + "auxiliary_loss_clip": 0.01449395, + "auxiliary_loss_mlp": 0.010547, + "balance_loss_clip": 1.30176663, + "balance_loss_mlp": 1.03556705, + "epoch": 0.27440252517661207, + "flos": 21986148287160.0, + "grad_norm": 1.9258712400330413, + "language_loss": 0.63297123, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.65801215, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.19116211, + "step": 4564, + "time_per_iteration": 2.782925605773926 + }, + { + "auxiliary_loss_clip": 0.01442239, + "auxiliary_loss_mlp": 0.01040984, + "balance_loss_clip": 1.29764366, + "balance_loss_mlp": 1.02366304, + "epoch": 0.27446264842928003, + "flos": 40194869222880.0, + "grad_norm": 2.035920392262794, + "language_loss": 0.79175013, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.81658238, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.1730957, + "step": 4565, + "time_per_iteration": 2.9741156101226807 + }, + { + "auxiliary_loss_clip": 0.01434286, + "auxiliary_loss_mlp": 0.01043611, + "balance_loss_clip": 1.29196692, + "balance_loss_mlp": 1.02638495, + "epoch": 0.274522771681948, + "flos": 13483587479040.0, + "grad_norm": 1.8703903655820673, + "language_loss": 0.68818557, + "learning_rate": 3.404888640957477e-06, + "loss": 0.71296453, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.17224121, + "step": 4566, + "time_per_iteration": 2.77117919921875 + }, + { + "auxiliary_loss_clip": 0.01428879, + "auxiliary_loss_mlp": 0.0104629, + "balance_loss_clip": 1.28760195, + "balance_loss_mlp": 1.02887392, + "epoch": 0.27458289493461596, + "flos": 28628128522440.0, + "grad_norm": 1.845262040086627, + "language_loss": 0.61097872, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63573039, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.17419434, + "step": 4567, + "time_per_iteration": 2.848599433898926 + }, + { + "auxiliary_loss_clip": 0.01431405, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.28890347, + "balance_loss_mlp": 1.02298474, + "epoch": 0.2746430181872839, + "flos": 20124552505320.0, + "grad_norm": 1.6175186264105326, + "language_loss": 0.82735515, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.85207891, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.17980957, + "step": 4568, + "time_per_iteration": 2.7967026233673096 + }, + { + "auxiliary_loss_clip": 0.01442996, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.29699707, + "balance_loss_mlp": 1.02036989, + "epoch": 0.2747031414399519, + "flos": 20198303933040.0, + "grad_norm": 2.3986529109254726, + "language_loss": 0.68536794, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.71018136, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.17993164, + "step": 4569, + "time_per_iteration": 2.721250534057617 + }, + { + "auxiliary_loss_clip": 0.01435756, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.29064393, + "balance_loss_mlp": 1.02128899, + "epoch": 0.27476326469261986, + "flos": 13520280455280.0, + "grad_norm": 2.0163254130964856, + "language_loss": 0.71428627, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73904371, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.18701172, + "step": 4570, + "time_per_iteration": 2.737313747406006 + }, + { + "auxiliary_loss_clip": 0.0126377, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.20452762, + "balance_loss_mlp": 1.02993762, + "epoch": 0.2748233879452878, + "flos": 65951460223560.0, + "grad_norm": 0.7494405595626791, + "language_loss": 0.55869687, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.58167422, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.0402832, + "step": 4571, + "time_per_iteration": 3.37286114692688 + }, + { + "auxiliary_loss_clip": 0.0143778, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.2928071, + "balance_loss_mlp": 1.02643061, + "epoch": 0.2748835111979558, + "flos": 17388700806240.0, + "grad_norm": 1.850467202022682, + "language_loss": 0.77996218, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.80477887, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.17456055, + "step": 4572, + "time_per_iteration": 2.7863500118255615 + }, + { + "auxiliary_loss_clip": 0.01421009, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.28218293, + "balance_loss_mlp": 1.0210979, + "epoch": 0.27494363445062375, + "flos": 23592964261680.0, + "grad_norm": 1.494073692423168, + "language_loss": 0.81588721, + "learning_rate": 3.402946971702147e-06, + "loss": 0.84046865, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16040039, + "step": 4573, + "time_per_iteration": 2.7879014015197754 + }, + { + "auxiliary_loss_clip": 0.01427458, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.28632069, + "balance_loss_mlp": 1.02176392, + "epoch": 0.2750037577032918, + "flos": 17168949032400.0, + "grad_norm": 1.6487407391429352, + "language_loss": 0.79477531, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81944501, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.17736816, + "step": 4574, + "time_per_iteration": 2.747579574584961 + }, + { + "auxiliary_loss_clip": 0.01431429, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.28718531, + "balance_loss_mlp": 1.0322485, + "epoch": 0.27506388095595974, + "flos": 24496400608560.0, + "grad_norm": 1.8926714060231693, + "language_loss": 0.74808335, + "learning_rate": 3.402391730100936e-06, + "loss": 0.77288771, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.16748047, + "step": 4575, + "time_per_iteration": 2.814154863357544 + }, + { + "auxiliary_loss_clip": 0.01425329, + "auxiliary_loss_mlp": 0.01044297, + "balance_loss_clip": 1.28433371, + "balance_loss_mlp": 1.0276674, + "epoch": 0.2751240042086277, + "flos": 38771964821520.0, + "grad_norm": 2.1691317969146255, + "language_loss": 0.71917212, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74386835, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.16625977, + "step": 4576, + "time_per_iteration": 2.89379620552063 + }, + { + "auxiliary_loss_clip": 0.01426512, + "auxiliary_loss_mlp": 0.01044442, + "balance_loss_clip": 1.28345323, + "balance_loss_mlp": 1.02660787, + "epoch": 0.27518412746129567, + "flos": 26912776736880.0, + "grad_norm": 1.862087893505806, + "language_loss": 0.73725188, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.7619614, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.17834473, + "step": 4577, + "time_per_iteration": 2.8586549758911133 + }, + { + "auxiliary_loss_clip": 0.01438476, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.29448783, + "balance_loss_mlp": 1.0322082, + "epoch": 0.27524425071396363, + "flos": 24906033384480.0, + "grad_norm": 1.7267617244120697, + "language_loss": 0.7584157, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78329778, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17541504, + "step": 4578, + "time_per_iteration": 2.785344362258911 + }, + { + "auxiliary_loss_clip": 0.01435934, + "auxiliary_loss_mlp": 0.01055739, + "balance_loss_clip": 1.29101896, + "balance_loss_mlp": 1.03627229, + "epoch": 0.2753043739666316, + "flos": 26293869144000.0, + "grad_norm": 1.4222042779171347, + "language_loss": 0.67009306, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.69500971, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.19482422, + "step": 4579, + "time_per_iteration": 2.814000129699707 + }, + { + "auxiliary_loss_clip": 0.01432888, + "auxiliary_loss_mlp": 0.01062831, + "balance_loss_clip": 1.28753066, + "balance_loss_mlp": 1.04481876, + "epoch": 0.27536449721929956, + "flos": 24212115504720.0, + "grad_norm": 1.7712846816348846, + "language_loss": 0.80251038, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82746756, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.18029785, + "step": 4580, + "time_per_iteration": 4.175474166870117 + }, + { + "auxiliary_loss_clip": 0.01430181, + "auxiliary_loss_mlp": 0.0105776, + "balance_loss_clip": 1.28824759, + "balance_loss_mlp": 1.03887761, + "epoch": 0.27542462047196753, + "flos": 19542094238520.0, + "grad_norm": 1.538715743998047, + "language_loss": 0.67566943, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.70054889, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.18896484, + "step": 4581, + "time_per_iteration": 2.746534585952759 + }, + { + "auxiliary_loss_clip": 0.01441745, + "auxiliary_loss_mlp": 0.01050787, + "balance_loss_clip": 1.29400158, + "balance_loss_mlp": 1.03410995, + "epoch": 0.2754847437246355, + "flos": 14323221446400.0, + "grad_norm": 2.035264317107414, + "language_loss": 0.78427935, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80920464, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.16662598, + "step": 4582, + "time_per_iteration": 2.787214756011963 + }, + { + "auxiliary_loss_clip": 0.01423098, + "auxiliary_loss_mlp": 0.01048161, + "balance_loss_clip": 1.28290451, + "balance_loss_mlp": 1.03169823, + "epoch": 0.27554486697730346, + "flos": 18842816055240.0, + "grad_norm": 1.728158386031469, + "language_loss": 0.846506, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.87121856, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.16455078, + "step": 4583, + "time_per_iteration": 2.741537570953369 + }, + { + "auxiliary_loss_clip": 0.01434171, + "auxiliary_loss_mlp": 0.01045439, + "balance_loss_clip": 1.28904188, + "balance_loss_mlp": 1.02795064, + "epoch": 0.2756049902299714, + "flos": 22387375132560.0, + "grad_norm": 1.881949530762262, + "language_loss": 0.67429364, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69908977, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.17504883, + "step": 4584, + "time_per_iteration": 2.7887556552886963 + }, + { + "auxiliary_loss_clip": 0.01424567, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.28464985, + "balance_loss_mlp": 1.03017604, + "epoch": 0.2756651134826394, + "flos": 19578746606400.0, + "grad_norm": 1.73927502042509, + "language_loss": 0.77465582, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79937643, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.1730957, + "step": 4585, + "time_per_iteration": 2.7787580490112305 + }, + { + "auxiliary_loss_clip": 0.0143023, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.28410721, + "balance_loss_mlp": 1.02063894, + "epoch": 0.27572523673530736, + "flos": 23591827227600.0, + "grad_norm": 1.589187343256445, + "language_loss": 0.71732259, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74200606, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.17480469, + "step": 4586, + "time_per_iteration": 2.787442445755005 + }, + { + "auxiliary_loss_clip": 0.01428544, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.28573275, + "balance_loss_mlp": 1.02338719, + "epoch": 0.2757853599879754, + "flos": 22825254345840.0, + "grad_norm": 1.5465149130758875, + "language_loss": 0.80692661, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.83161962, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17358398, + "step": 4587, + "time_per_iteration": 4.2611753940582275 + }, + { + "auxiliary_loss_clip": 0.01428388, + "auxiliary_loss_mlp": 0.01038502, + "balance_loss_clip": 1.28637338, + "balance_loss_mlp": 1.02125263, + "epoch": 0.27584548324064334, + "flos": 18556338099960.0, + "grad_norm": 2.338650188284172, + "language_loss": 0.83195227, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85662115, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.17242432, + "step": 4588, + "time_per_iteration": 2.771881580352783 + }, + { + "auxiliary_loss_clip": 0.01421476, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_clip": 1.28240085, + "balance_loss_mlp": 1.0247165, + "epoch": 0.2759056064933113, + "flos": 23775292108800.0, + "grad_norm": 1.295380015183072, + "language_loss": 0.76339871, + "learning_rate": 3.398499087583342e-06, + "loss": 0.78803468, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.1739502, + "step": 4589, + "time_per_iteration": 2.8256750106811523 + }, + { + "auxiliary_loss_clip": 0.01419613, + "auxiliary_loss_mlp": 0.01037265, + "balance_loss_clip": 1.28016305, + "balance_loss_mlp": 1.0202657, + "epoch": 0.27596572974597927, + "flos": 24287531875200.0, + "grad_norm": 1.7460521503410085, + "language_loss": 0.88812548, + "learning_rate": 3.398220643612143e-06, + "loss": 0.91269428, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.16992188, + "step": 4590, + "time_per_iteration": 2.820782423019409 + }, + { + "auxiliary_loss_clip": 0.01422606, + "auxiliary_loss_mlp": 0.01042439, + "balance_loss_clip": 1.28181159, + "balance_loss_mlp": 1.02445078, + "epoch": 0.27602585299864724, + "flos": 35047473790320.0, + "grad_norm": 1.4942594657314854, + "language_loss": 0.71991283, + "learning_rate": 3.397942146620277e-06, + "loss": 0.74456328, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17980957, + "step": 4591, + "time_per_iteration": 2.862926959991455 + }, + { + "auxiliary_loss_clip": 0.01425811, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.28265524, + "balance_loss_mlp": 1.02222395, + "epoch": 0.2760859762513152, + "flos": 24313869719640.0, + "grad_norm": 1.7664468003788434, + "language_loss": 0.79781497, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82247317, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.17785645, + "step": 4592, + "time_per_iteration": 4.291538715362549 + }, + { + "auxiliary_loss_clip": 0.01267418, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.20588481, + "balance_loss_mlp": 1.03681123, + "epoch": 0.27614609950398317, + "flos": 71275335899400.0, + "grad_norm": 0.742392123865541, + "language_loss": 0.61625397, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63932943, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03320312, + "step": 4593, + "time_per_iteration": 4.662216424942017 + }, + { + "auxiliary_loss_clip": 0.01423827, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.2822299, + "balance_loss_mlp": 1.02429211, + "epoch": 0.27620622275665113, + "flos": 29680285975560.0, + "grad_norm": 2.14717696310293, + "language_loss": 0.77806544, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.80271679, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.17028809, + "step": 4594, + "time_per_iteration": 2.842438220977783 + }, + { + "auxiliary_loss_clip": 0.01420638, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.28004766, + "balance_loss_mlp": 1.02103877, + "epoch": 0.2762663460093191, + "flos": 15382363537440.0, + "grad_norm": 1.41297796834854, + "language_loss": 0.92050028, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.94509453, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17749023, + "step": 4595, + "time_per_iteration": 2.778714418411255 + }, + { + "auxiliary_loss_clip": 0.0143149, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.28865981, + "balance_loss_mlp": 1.02330232, + "epoch": 0.27632646926198706, + "flos": 20708959973400.0, + "grad_norm": 1.863494361242009, + "language_loss": 0.6977672, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.72249728, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.18225098, + "step": 4596, + "time_per_iteration": 2.8491175174713135 + }, + { + "auxiliary_loss_clip": 0.01436378, + "auxiliary_loss_mlp": 0.01043051, + "balance_loss_clip": 1.29020846, + "balance_loss_mlp": 1.02569437, + "epoch": 0.276386592514655, + "flos": 32819313721320.0, + "grad_norm": 2.1012774542504618, + "language_loss": 0.6391772, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.66397154, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.17346191, + "step": 4597, + "time_per_iteration": 2.825016975402832 + }, + { + "auxiliary_loss_clip": 0.01416828, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.27888227, + "balance_loss_mlp": 1.01837695, + "epoch": 0.276446715767323, + "flos": 18556338099960.0, + "grad_norm": 1.891105735252417, + "language_loss": 0.86846507, + "learning_rate": 3.395991183985887e-06, + "loss": 0.89298236, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1652832, + "step": 4598, + "time_per_iteration": 2.81298565864563 + }, + { + "auxiliary_loss_clip": 0.01429228, + "auxiliary_loss_mlp": 0.01043085, + "balance_loss_clip": 1.28670764, + "balance_loss_mlp": 1.0246675, + "epoch": 0.27650683901999096, + "flos": 22824685828800.0, + "grad_norm": 2.3612076663958756, + "language_loss": 0.80329764, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82802069, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.1842041, + "step": 4599, + "time_per_iteration": 2.736175060272217 + }, + { + "auxiliary_loss_clip": 0.01432005, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.28539383, + "balance_loss_mlp": 1.0232532, + "epoch": 0.276566962272659, + "flos": 21366590960520.0, + "grad_norm": 1.9918367762364193, + "language_loss": 0.78846228, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81318116, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.16638184, + "step": 4600, + "time_per_iteration": 2.7726144790649414 + }, + { + "auxiliary_loss_clip": 0.01432426, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_clip": 1.2864027, + "balance_loss_mlp": 1.02785814, + "epoch": 0.27662708552532694, + "flos": 17714876756400.0, + "grad_norm": 1.8577689777178643, + "language_loss": 0.73948359, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.7642529, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.16662598, + "step": 4601, + "time_per_iteration": 2.7548763751983643 + }, + { + "auxiliary_loss_clip": 0.01422291, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.2805208, + "balance_loss_mlp": 1.02388096, + "epoch": 0.2766872087779949, + "flos": 21257852107680.0, + "grad_norm": 1.5486257595490516, + "language_loss": 0.80575001, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.83037871, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.16711426, + "step": 4602, + "time_per_iteration": 2.828861951828003 + }, + { + "auxiliary_loss_clip": 0.01433963, + "auxiliary_loss_mlp": 0.01058242, + "balance_loss_clip": 1.28705311, + "balance_loss_mlp": 1.03963375, + "epoch": 0.2767473320306629, + "flos": 12935669945400.0, + "grad_norm": 2.4094752055588327, + "language_loss": 0.77037859, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.7953006, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.18615723, + "step": 4603, + "time_per_iteration": 2.781252384185791 + }, + { + "auxiliary_loss_clip": 0.01414004, + "auxiliary_loss_mlp": 0.01050161, + "balance_loss_clip": 1.27632403, + "balance_loss_mlp": 1.03487802, + "epoch": 0.27680745528333084, + "flos": 15017058109440.0, + "grad_norm": 1.4294566220639897, + "language_loss": 0.81604624, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.84068787, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15283203, + "step": 4604, + "time_per_iteration": 2.703695058822632 + }, + { + "auxiliary_loss_clip": 0.01425386, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.28171706, + "balance_loss_mlp": 1.02838492, + "epoch": 0.2768675785359988, + "flos": 22642967107080.0, + "grad_norm": 1.8460127198401484, + "language_loss": 0.70384049, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72854424, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.16601562, + "step": 4605, + "time_per_iteration": 2.7909867763519287 + }, + { + "auxiliary_loss_clip": 0.0126459, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_clip": 1.20092213, + "balance_loss_mlp": 1.07809663, + "epoch": 0.27692770178866677, + "flos": 66146336053920.0, + "grad_norm": 0.7281007332885425, + "language_loss": 0.57198101, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.5954451, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.03710938, + "step": 4606, + "time_per_iteration": 3.3363168239593506 + }, + { + "auxiliary_loss_clip": 0.01434792, + "auxiliary_loss_mlp": 0.01039485, + "balance_loss_clip": 1.2887274, + "balance_loss_mlp": 1.02172256, + "epoch": 0.27698782504133473, + "flos": 26470105737120.0, + "grad_norm": 1.9134018032666706, + "language_loss": 0.69296861, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71771145, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.1776123, + "step": 4607, + "time_per_iteration": 2.820634603500366 + }, + { + "auxiliary_loss_clip": 0.01422602, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.28237605, + "balance_loss_mlp": 1.02266634, + "epoch": 0.2770479482940027, + "flos": 25890327622080.0, + "grad_norm": 1.5106465135645748, + "language_loss": 0.70225382, + "learning_rate": 3.393199595837555e-06, + "loss": 0.72686803, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.16162109, + "step": 4608, + "time_per_iteration": 2.8091156482696533 + }, + { + "auxiliary_loss_clip": 0.01430839, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.28551352, + "balance_loss_mlp": 1.0243541, + "epoch": 0.27710807154667066, + "flos": 22862475230760.0, + "grad_norm": 1.6852768030245586, + "language_loss": 0.73164403, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75636613, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.17004395, + "step": 4609, + "time_per_iteration": 2.8183977603912354 + }, + { + "auxiliary_loss_clip": 0.01424458, + "auxiliary_loss_mlp": 0.01049514, + "balance_loss_clip": 1.27978003, + "balance_loss_mlp": 1.03204966, + "epoch": 0.27716819479933863, + "flos": 17715404665080.0, + "grad_norm": 2.25003333493895, + "language_loss": 0.83871293, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86345255, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.17480469, + "step": 4610, + "time_per_iteration": 2.7349929809570312 + }, + { + "auxiliary_loss_clip": 0.0142865, + "auxiliary_loss_mlp": 0.01047318, + "balance_loss_clip": 1.28177428, + "balance_loss_mlp": 1.02948427, + "epoch": 0.2772283180520066, + "flos": 19650792483000.0, + "grad_norm": 2.0969885525539746, + "language_loss": 0.69355106, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71831077, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.1784668, + "step": 4611, + "time_per_iteration": 2.7507424354553223 + }, + { + "auxiliary_loss_clip": 0.0141334, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.27457714, + "balance_loss_mlp": 1.01890087, + "epoch": 0.27728844130467456, + "flos": 21037572425160.0, + "grad_norm": 2.0622765590898746, + "language_loss": 0.75173306, + "learning_rate": 3.392081480737698e-06, + "loss": 0.776218, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16235352, + "step": 4612, + "time_per_iteration": 2.830141544342041 + }, + { + "auxiliary_loss_clip": 0.01427069, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.28115809, + "balance_loss_mlp": 1.02308297, + "epoch": 0.2773485645573425, + "flos": 18993851838000.0, + "grad_norm": 2.9129236571498045, + "language_loss": 0.67463589, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.69931561, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.17834473, + "step": 4613, + "time_per_iteration": 2.868718385696411 + }, + { + "auxiliary_loss_clip": 0.01419033, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_clip": 1.2785573, + "balance_loss_mlp": 1.03006649, + "epoch": 0.27740868781001055, + "flos": 21473177570280.0, + "grad_norm": 1.5099618420697747, + "language_loss": 0.79930139, + "learning_rate": 3.39152210641815e-06, + "loss": 0.82397133, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.17883301, + "step": 4614, + "time_per_iteration": 2.7845518589019775 + }, + { + "auxiliary_loss_clip": 0.01421631, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_clip": 1.27712488, + "balance_loss_mlp": 1.02633393, + "epoch": 0.2774688110626785, + "flos": 19832511204720.0, + "grad_norm": 2.7646893425688868, + "language_loss": 0.80638897, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.83104002, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.17150879, + "step": 4615, + "time_per_iteration": 2.7382094860076904 + }, + { + "auxiliary_loss_clip": 0.0143301, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.28489745, + "balance_loss_mlp": 1.02955055, + "epoch": 0.2775289343153465, + "flos": 18219888234720.0, + "grad_norm": 2.813858343043598, + "language_loss": 0.63735944, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66216707, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.18188477, + "step": 4616, + "time_per_iteration": 2.7441234588623047 + }, + { + "auxiliary_loss_clip": 0.01419297, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.27586126, + "balance_loss_mlp": 1.02228343, + "epoch": 0.27758905756801444, + "flos": 16476899137200.0, + "grad_norm": 3.3165393217429746, + "language_loss": 0.82857388, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.85316575, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.17614746, + "step": 4617, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.01419911, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.27581954, + "balance_loss_mlp": 1.02922916, + "epoch": 0.2776491808206824, + "flos": 18732128001120.0, + "grad_norm": 2.3540997334186797, + "language_loss": 0.77477574, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79944003, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.17272949, + "step": 4618, + "time_per_iteration": 2.749683380126953 + }, + { + "auxiliary_loss_clip": 0.01424605, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.28100789, + "balance_loss_mlp": 1.01795292, + "epoch": 0.27770930407335037, + "flos": 28044086529600.0, + "grad_norm": 1.7360878873945373, + "language_loss": 0.85344809, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87804085, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.16699219, + "step": 4619, + "time_per_iteration": 4.202209711074829 + }, + { + "auxiliary_loss_clip": 0.01406149, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.26834786, + "balance_loss_mlp": 1.02746677, + "epoch": 0.27776942732601834, + "flos": 23555053034640.0, + "grad_norm": 1.4873207721023471, + "language_loss": 0.77169168, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79618573, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.15795898, + "step": 4620, + "time_per_iteration": 2.7924163341522217 + }, + { + "auxiliary_loss_clip": 0.01415342, + "auxiliary_loss_mlp": 0.01050734, + "balance_loss_clip": 1.27431226, + "balance_loss_mlp": 1.03312707, + "epoch": 0.2778295505786863, + "flos": 23913211391280.0, + "grad_norm": 1.7202013321268403, + "language_loss": 0.79089433, + "learning_rate": 3.389562634707122e-06, + "loss": 0.8155551, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17626953, + "step": 4621, + "time_per_iteration": 2.861675262451172 + }, + { + "auxiliary_loss_clip": 0.01425918, + "auxiliary_loss_mlp": 0.01049463, + "balance_loss_clip": 1.28176725, + "balance_loss_mlp": 1.03222561, + "epoch": 0.27788967383135427, + "flos": 25559806577400.0, + "grad_norm": 2.252455556700867, + "language_loss": 0.88136256, + "learning_rate": 3.389282499322611e-06, + "loss": 0.90611637, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.17211914, + "step": 4622, + "time_per_iteration": 2.800855875015259 + }, + { + "auxiliary_loss_clip": 0.01420165, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.27634358, + "balance_loss_mlp": 1.02444768, + "epoch": 0.27794979708402223, + "flos": 16256903713200.0, + "grad_norm": 1.7737010770790747, + "language_loss": 0.81345266, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83807307, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.17431641, + "step": 4623, + "time_per_iteration": 2.715291738510132 + }, + { + "auxiliary_loss_clip": 0.01419551, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.27720213, + "balance_loss_mlp": 1.02401435, + "epoch": 0.2780099203366902, + "flos": 20672510647320.0, + "grad_norm": 2.067026016925283, + "language_loss": 0.822586, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.84719402, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.17224121, + "step": 4624, + "time_per_iteration": 2.851898670196533 + }, + { + "auxiliary_loss_clip": 0.0141883, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.27891994, + "balance_loss_mlp": 1.01656866, + "epoch": 0.27807004358935816, + "flos": 17742879543600.0, + "grad_norm": 2.0101309757097456, + "language_loss": 0.77002037, + "learning_rate": 3.388441777121191e-06, + "loss": 0.79454386, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.16943359, + "step": 4625, + "time_per_iteration": 4.160382270812988 + }, + { + "auxiliary_loss_clip": 0.01412223, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.27131796, + "balance_loss_mlp": 1.02117813, + "epoch": 0.2781301668420261, + "flos": 16731475902720.0, + "grad_norm": 1.7670915291974831, + "language_loss": 0.6996336, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72413445, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16674805, + "step": 4626, + "time_per_iteration": 2.7243382930755615 + }, + { + "auxiliary_loss_clip": 0.01423087, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.28022313, + "balance_loss_mlp": 1.01567793, + "epoch": 0.27819029009469415, + "flos": 13848730473600.0, + "grad_norm": 2.2446775944263284, + "language_loss": 0.93161368, + "learning_rate": 3.38788103238661e-06, + "loss": 0.95617598, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.17456055, + "step": 4627, + "time_per_iteration": 2.745065927505493 + }, + { + "auxiliary_loss_clip": 0.01418748, + "auxiliary_loss_mlp": 0.01037262, + "balance_loss_clip": 1.27374911, + "balance_loss_mlp": 1.02156258, + "epoch": 0.2782504133473621, + "flos": 27095023367280.0, + "grad_norm": 1.8974939810943978, + "language_loss": 0.85771334, + "learning_rate": 3.387600581071121e-06, + "loss": 0.88227344, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.15704346, + "step": 4628, + "time_per_iteration": 2.786640167236328 + }, + { + "auxiliary_loss_clip": 0.01412304, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.27146673, + "balance_loss_mlp": 1.02339268, + "epoch": 0.2783105366000301, + "flos": 21073737492720.0, + "grad_norm": 1.5999383824086622, + "language_loss": 0.79506785, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81958508, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16009521, + "step": 4629, + "time_per_iteration": 2.7799601554870605 + }, + { + "auxiliary_loss_clip": 0.01399947, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.26499319, + "balance_loss_mlp": 1.02287984, + "epoch": 0.27837065985269804, + "flos": 26507001755160.0, + "grad_norm": 1.4832159145571095, + "language_loss": 0.84835136, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.87273026, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1505127, + "step": 4630, + "time_per_iteration": 2.7787787914276123 + }, + { + "auxiliary_loss_clip": 0.01417345, + "auxiliary_loss_mlp": 0.01041667, + "balance_loss_clip": 1.27537131, + "balance_loss_mlp": 1.02467966, + "epoch": 0.278430783105366, + "flos": 20227159495800.0, + "grad_norm": 2.017314352220667, + "language_loss": 0.81402397, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83861411, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.1697998, + "step": 4631, + "time_per_iteration": 4.2422544956207275 + }, + { + "auxiliary_loss_clip": 0.01422262, + "auxiliary_loss_mlp": 0.01054574, + "balance_loss_clip": 1.27908373, + "balance_loss_mlp": 1.03796816, + "epoch": 0.278490906358034, + "flos": 25598367538200.0, + "grad_norm": 1.850710940982316, + "language_loss": 0.7135092, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73827761, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.16601562, + "step": 4632, + "time_per_iteration": 4.193769931793213 + }, + { + "auxiliary_loss_clip": 0.01395421, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.26018786, + "balance_loss_mlp": 1.0281918, + "epoch": 0.27855102961070194, + "flos": 16174746354960.0, + "grad_norm": 1.9024833620765484, + "language_loss": 0.82826614, + "learning_rate": 3.386197535437145e-06, + "loss": 0.85264814, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.14587402, + "step": 4633, + "time_per_iteration": 2.767500638961792 + }, + { + "auxiliary_loss_clip": 0.01407264, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.26684237, + "balance_loss_mlp": 1.02620816, + "epoch": 0.2786111528633699, + "flos": 22932490689360.0, + "grad_norm": 1.592094920761425, + "language_loss": 0.88143885, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90594989, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.1763916, + "step": 4634, + "time_per_iteration": 2.7576844692230225 + }, + { + "auxiliary_loss_clip": 0.01417323, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.27541292, + "balance_loss_mlp": 1.02897418, + "epoch": 0.27867127611603787, + "flos": 23409458772120.0, + "grad_norm": 1.605678858765293, + "language_loss": 0.77029181, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79492277, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.16784668, + "step": 4635, + "time_per_iteration": 2.807389974594116 + }, + { + "auxiliary_loss_clip": 0.01406747, + "auxiliary_loss_mlp": 0.01045474, + "balance_loss_clip": 1.26538062, + "balance_loss_mlp": 1.02895176, + "epoch": 0.27873139936870583, + "flos": 19833851280600.0, + "grad_norm": 2.178144409377028, + "language_loss": 0.6562956, + "learning_rate": 3.385355077194637e-06, + "loss": 0.68081784, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.16503906, + "step": 4636, + "time_per_iteration": 2.774106979370117 + }, + { + "auxiliary_loss_clip": 0.01422467, + "auxiliary_loss_mlp": 0.01048085, + "balance_loss_clip": 1.27826381, + "balance_loss_mlp": 1.03056157, + "epoch": 0.2787915226213738, + "flos": 17711546870880.0, + "grad_norm": 2.251082676060449, + "language_loss": 0.84350514, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86821067, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17529297, + "step": 4637, + "time_per_iteration": 2.7793703079223633 + }, + { + "auxiliary_loss_clip": 0.01408787, + "auxiliary_loss_mlp": 0.01057502, + "balance_loss_clip": 1.26927662, + "balance_loss_mlp": 1.04115868, + "epoch": 0.27885164587404176, + "flos": 22095618090480.0, + "grad_norm": 1.4271176261487142, + "language_loss": 0.76023602, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78489888, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16333008, + "step": 4638, + "time_per_iteration": 2.8142788410186768 + }, + { + "auxiliary_loss_clip": 0.01411728, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_clip": 1.27035642, + "balance_loss_mlp": 1.03897238, + "epoch": 0.27891176912670973, + "flos": 19212304144320.0, + "grad_norm": 1.498011409565535, + "language_loss": 0.7205323, + "learning_rate": 3.38451214615691e-06, + "loss": 0.74521005, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.17089844, + "step": 4639, + "time_per_iteration": 2.7806410789489746 + }, + { + "auxiliary_loss_clip": 0.01416851, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.2729063, + "balance_loss_mlp": 1.02731407, + "epoch": 0.27897189237937775, + "flos": 27605801232720.0, + "grad_norm": 1.823533663632698, + "language_loss": 0.66045636, + "learning_rate": 3.384231064128447e-06, + "loss": 0.68507087, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.17297363, + "step": 4640, + "time_per_iteration": 2.796649932861328 + }, + { + "auxiliary_loss_clip": 0.0141302, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.27082396, + "balance_loss_mlp": 1.03190839, + "epoch": 0.2790320156320457, + "flos": 21183126079320.0, + "grad_norm": 1.780509304885882, + "language_loss": 0.72498059, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74958909, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.15924072, + "step": 4641, + "time_per_iteration": 2.8021113872528076 + }, + { + "auxiliary_loss_clip": 0.01420606, + "auxiliary_loss_mlp": 0.01042824, + "balance_loss_clip": 1.27517009, + "balance_loss_mlp": 1.025455, + "epoch": 0.2790921388847137, + "flos": 22789089278280.0, + "grad_norm": 1.7581796343748974, + "language_loss": 0.74923408, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77386838, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.17370605, + "step": 4642, + "time_per_iteration": 2.7883358001708984 + }, + { + "auxiliary_loss_clip": 0.01419609, + "auxiliary_loss_mlp": 0.01042536, + "balance_loss_clip": 1.27664781, + "balance_loss_mlp": 1.0250721, + "epoch": 0.27915226213738165, + "flos": 23405438544480.0, + "grad_norm": 1.7880409708699028, + "language_loss": 0.86362326, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.88824475, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17468262, + "step": 4643, + "time_per_iteration": 2.7907211780548096 + }, + { + "auxiliary_loss_clip": 0.01416478, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.27560949, + "balance_loss_mlp": 1.02469456, + "epoch": 0.2792123853900496, + "flos": 22753005427440.0, + "grad_norm": 1.7406810202655163, + "language_loss": 0.8361063, + "learning_rate": 3.383106211219407e-06, + "loss": 0.86067969, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.16162109, + "step": 4644, + "time_per_iteration": 2.8154032230377197 + }, + { + "auxiliary_loss_clip": 0.01411654, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.26994133, + "balance_loss_mlp": 1.02296102, + "epoch": 0.2792725086427176, + "flos": 15053588652240.0, + "grad_norm": 1.7731740065750738, + "language_loss": 0.78933942, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.8138541, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.16845703, + "step": 4645, + "time_per_iteration": 2.74570631980896 + }, + { + "auxiliary_loss_clip": 0.01276692, + "auxiliary_loss_mlp": 0.01084702, + "balance_loss_clip": 1.21083307, + "balance_loss_mlp": 1.08043468, + "epoch": 0.27933263189538554, + "flos": 62558870921280.0, + "grad_norm": 0.8642235389766524, + "language_loss": 0.62283623, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64645016, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.04272461, + "step": 4646, + "time_per_iteration": 3.2810826301574707 + }, + { + "auxiliary_loss_clip": 0.01400981, + "auxiliary_loss_mlp": 0.01039732, + "balance_loss_clip": 1.26370656, + "balance_loss_mlp": 1.02410412, + "epoch": 0.2793927551480535, + "flos": 25123754740320.0, + "grad_norm": 1.6854984230576995, + "language_loss": 0.89537126, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91977835, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15625, + "step": 4647, + "time_per_iteration": 2.8667891025543213 + }, + { + "auxiliary_loss_clip": 0.01417876, + "auxiliary_loss_mlp": 0.01044291, + "balance_loss_clip": 1.2735579, + "balance_loss_mlp": 1.02729177, + "epoch": 0.27945287840072147, + "flos": 21329573117400.0, + "grad_norm": 2.160199496948085, + "language_loss": 0.87159026, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89621186, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.17016602, + "step": 4648, + "time_per_iteration": 2.8240315914154053 + }, + { + "auxiliary_loss_clip": 0.01420081, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.2749995, + "balance_loss_mlp": 1.01935494, + "epoch": 0.27951300165338944, + "flos": 27455861875680.0, + "grad_norm": 2.116949279653453, + "language_loss": 0.73157024, + "learning_rate": 3.38169896509385e-06, + "loss": 0.7561301, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.16540527, + "step": 4649, + "time_per_iteration": 2.8488857746124268 + }, + { + "auxiliary_loss_clip": 0.01411336, + "auxiliary_loss_mlp": 0.01044857, + "balance_loss_clip": 1.26998043, + "balance_loss_mlp": 1.02665412, + "epoch": 0.2795731249060574, + "flos": 15163261497360.0, + "grad_norm": 1.9845637385210695, + "language_loss": 0.80474299, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82930487, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.18200684, + "step": 4650, + "time_per_iteration": 2.7577788829803467 + }, + { + "auxiliary_loss_clip": 0.01255315, + "auxiliary_loss_mlp": 0.01045645, + "balance_loss_clip": 1.1915592, + "balance_loss_mlp": 1.04197311, + "epoch": 0.27963324815872537, + "flos": 60136379911800.0, + "grad_norm": 0.830770383040581, + "language_loss": 0.58870339, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.61171299, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.03662109, + "step": 4651, + "time_per_iteration": 3.322676420211792 + }, + { + "auxiliary_loss_clip": 0.01419269, + "auxiliary_loss_mlp": 0.01048437, + "balance_loss_clip": 1.27373791, + "balance_loss_mlp": 1.03042459, + "epoch": 0.27969337141139333, + "flos": 21771472558320.0, + "grad_norm": 1.50503270007246, + "language_loss": 0.74374545, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76842248, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.18005371, + "step": 4652, + "time_per_iteration": 2.791532516479492 + }, + { + "auxiliary_loss_clip": 0.01418755, + "auxiliary_loss_mlp": 0.0106149, + "balance_loss_clip": 1.27461946, + "balance_loss_mlp": 1.04397798, + "epoch": 0.27975349466406135, + "flos": 39857769623880.0, + "grad_norm": 2.0935373859594386, + "language_loss": 0.80081004, + "learning_rate": 3.380572225034461e-06, + "loss": 0.82561243, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.17529297, + "step": 4653, + "time_per_iteration": 2.9060122966766357 + }, + { + "auxiliary_loss_clip": 0.0141412, + "auxiliary_loss_mlp": 0.01048862, + "balance_loss_clip": 1.27177787, + "balance_loss_mlp": 1.03222084, + "epoch": 0.2798136179167293, + "flos": 21584799616680.0, + "grad_norm": 2.0129410951891877, + "language_loss": 0.78799707, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81262696, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.16638184, + "step": 4654, + "time_per_iteration": 2.767441987991333 + }, + { + "auxiliary_loss_clip": 0.01432543, + "auxiliary_loss_mlp": 0.01050655, + "balance_loss_clip": 1.28621364, + "balance_loss_mlp": 1.03266668, + "epoch": 0.2798737411693973, + "flos": 21541934169720.0, + "grad_norm": 1.9436058386998687, + "language_loss": 0.80944252, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83427447, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.17993164, + "step": 4655, + "time_per_iteration": 2.778135061264038 + }, + { + "auxiliary_loss_clip": 0.01413333, + "auxiliary_loss_mlp": 0.0105321, + "balance_loss_clip": 1.27136409, + "balance_loss_mlp": 1.03581762, + "epoch": 0.27993386442206525, + "flos": 26986934248200.0, + "grad_norm": 1.520766849166065, + "language_loss": 0.81831276, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.84297824, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17382812, + "step": 4656, + "time_per_iteration": 2.814222574234009 + }, + { + "auxiliary_loss_clip": 0.01411487, + "auxiliary_loss_mlp": 0.01045518, + "balance_loss_clip": 1.26909077, + "balance_loss_mlp": 1.02904367, + "epoch": 0.2799939876747332, + "flos": 24354867182040.0, + "grad_norm": 2.2695042297050363, + "language_loss": 0.83526456, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85983461, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.16491699, + "step": 4657, + "time_per_iteration": 4.260867118835449 + }, + { + "auxiliary_loss_clip": 0.01414918, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.27275252, + "balance_loss_mlp": 1.03609884, + "epoch": 0.2800541109274012, + "flos": 33663617650080.0, + "grad_norm": 1.9430723754182018, + "language_loss": 0.64128041, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66595483, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.16430664, + "step": 4658, + "time_per_iteration": 2.9105796813964844 + }, + { + "auxiliary_loss_clip": 0.01414666, + "auxiliary_loss_mlp": 0.010501, + "balance_loss_clip": 1.27176309, + "balance_loss_mlp": 1.03281474, + "epoch": 0.28011423418006914, + "flos": 21619177916400.0, + "grad_norm": 1.8462434546385662, + "language_loss": 0.78462809, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80927581, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.17285156, + "step": 4659, + "time_per_iteration": 2.773845672607422 + }, + { + "auxiliary_loss_clip": 0.01424072, + "auxiliary_loss_mlp": 0.01050846, + "balance_loss_clip": 1.28092957, + "balance_loss_mlp": 1.0333941, + "epoch": 0.2801743574327371, + "flos": 23117661121680.0, + "grad_norm": 1.6956920958073909, + "language_loss": 0.79300976, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81775892, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.17456055, + "step": 4660, + "time_per_iteration": 2.7603368759155273 + }, + { + "auxiliary_loss_clip": 0.01404441, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.26587534, + "balance_loss_mlp": 1.02814603, + "epoch": 0.2802344806854051, + "flos": 12645334195920.0, + "grad_norm": 1.9881491057954568, + "language_loss": 0.80601001, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.83049655, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16064453, + "step": 4661, + "time_per_iteration": 2.7436468601226807 + }, + { + "auxiliary_loss_clip": 0.01420151, + "auxiliary_loss_mlp": 0.01055327, + "balance_loss_clip": 1.28004336, + "balance_loss_mlp": 1.03876853, + "epoch": 0.28029460393807304, + "flos": 37274821692120.0, + "grad_norm": 1.7421702134403605, + "language_loss": 0.79100287, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.81575763, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.16564941, + "step": 4662, + "time_per_iteration": 2.8741822242736816 + }, + { + "auxiliary_loss_clip": 0.01427109, + "auxiliary_loss_mlp": 0.01052451, + "balance_loss_clip": 1.27931213, + "balance_loss_mlp": 1.03424811, + "epoch": 0.280354727190741, + "flos": 20746059033240.0, + "grad_norm": 1.6976818455057154, + "language_loss": 0.70082033, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72561586, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.18200684, + "step": 4663, + "time_per_iteration": 2.7661666870117188 + }, + { + "auxiliary_loss_clip": 0.01422725, + "auxiliary_loss_mlp": 0.01058775, + "balance_loss_clip": 1.2764895, + "balance_loss_mlp": 1.03980911, + "epoch": 0.28041485044340897, + "flos": 21476101372200.0, + "grad_norm": 1.6860385273252911, + "language_loss": 0.77990198, + "learning_rate": 3.377469372935791e-06, + "loss": 0.804717, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.18981934, + "step": 4664, + "time_per_iteration": 4.2937328815460205 + }, + { + "auxiliary_loss_clip": 0.01414538, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.27471733, + "balance_loss_mlp": 1.03235888, + "epoch": 0.28047497369607693, + "flos": 14798483978040.0, + "grad_norm": 3.0685684449942694, + "language_loss": 0.79865265, + "learning_rate": 3.377186981855578e-06, + "loss": 0.82329416, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17260742, + "step": 4665, + "time_per_iteration": 2.748465061187744 + }, + { + "auxiliary_loss_clip": 0.01418184, + "auxiliary_loss_mlp": 0.01053796, + "balance_loss_clip": 1.27648866, + "balance_loss_mlp": 1.03782165, + "epoch": 0.2805350969487449, + "flos": 23075282975040.0, + "grad_norm": 1.7390357034316244, + "language_loss": 0.81331581, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.83803558, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.1595459, + "step": 4666, + "time_per_iteration": 2.786755084991455 + }, + { + "auxiliary_loss_clip": 0.01424799, + "auxiliary_loss_mlp": 0.01068555, + "balance_loss_clip": 1.28182983, + "balance_loss_mlp": 1.05037582, + "epoch": 0.2805952202014129, + "flos": 20484010329480.0, + "grad_norm": 1.966616833957774, + "language_loss": 0.84889507, + "learning_rate": 3.376622043036658e-06, + "loss": 0.87382865, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.18164062, + "step": 4667, + "time_per_iteration": 2.7406723499298096 + }, + { + "auxiliary_loss_clip": 0.01430123, + "auxiliary_loss_mlp": 0.01064929, + "balance_loss_clip": 1.2844274, + "balance_loss_mlp": 1.04781055, + "epoch": 0.2806553434540809, + "flos": 27423067302000.0, + "grad_norm": 1.6018309787493525, + "language_loss": 0.80016106, + "learning_rate": 3.376339495319373e-06, + "loss": 0.82511163, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.17126465, + "step": 4668, + "time_per_iteration": 2.8359034061431885 + }, + { + "auxiliary_loss_clip": 0.01426985, + "auxiliary_loss_mlp": 0.01063057, + "balance_loss_clip": 1.28134525, + "balance_loss_mlp": 1.0449487, + "epoch": 0.28071546670674885, + "flos": 26510615899200.0, + "grad_norm": 1.366533809172931, + "language_loss": 0.76716316, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.79206359, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.18103027, + "step": 4669, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.01429587, + "auxiliary_loss_mlp": 0.01066592, + "balance_loss_clip": 1.28512669, + "balance_loss_mlp": 1.04991472, + "epoch": 0.2807755899594168, + "flos": 20563609361040.0, + "grad_norm": 1.8719655090385137, + "language_loss": 0.79692948, + "learning_rate": 3.375774243322725e-06, + "loss": 0.82189131, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.16674805, + "step": 4670, + "time_per_iteration": 4.355552911758423 + }, + { + "auxiliary_loss_clip": 0.01436378, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_clip": 1.28933966, + "balance_loss_mlp": 1.04829979, + "epoch": 0.2808357132120848, + "flos": 24318296030880.0, + "grad_norm": 2.6940009729342673, + "language_loss": 0.79944956, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.82447338, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.17687988, + "step": 4671, + "time_per_iteration": 4.1021623611450195 + }, + { + "auxiliary_loss_clip": 0.0142322, + "auxiliary_loss_mlp": 0.01061608, + "balance_loss_clip": 1.28345203, + "balance_loss_mlp": 1.04547346, + "epoch": 0.28089583646475275, + "flos": 26438042113920.0, + "grad_norm": 2.064147450916578, + "language_loss": 0.75113398, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77598226, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.16131592, + "step": 4672, + "time_per_iteration": 2.7971017360687256 + }, + { + "auxiliary_loss_clip": 0.01432548, + "auxiliary_loss_mlp": 0.01062892, + "balance_loss_clip": 1.28629899, + "balance_loss_mlp": 1.0438062, + "epoch": 0.2809559597174207, + "flos": 23116970779560.0, + "grad_norm": 3.3057033807674423, + "language_loss": 0.76243234, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.78738672, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.1907959, + "step": 4673, + "time_per_iteration": 2.7415931224823 + }, + { + "auxiliary_loss_clip": 0.0142849, + "auxiliary_loss_mlp": 0.01051606, + "balance_loss_clip": 1.28464925, + "balance_loss_mlp": 1.03492904, + "epoch": 0.2810160829700887, + "flos": 20928468097080.0, + "grad_norm": 2.005485958823957, + "language_loss": 0.72220743, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74700838, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.16674805, + "step": 4674, + "time_per_iteration": 2.801326274871826 + }, + { + "auxiliary_loss_clip": 0.01438409, + "auxiliary_loss_mlp": 0.01059522, + "balance_loss_clip": 1.29250979, + "balance_loss_mlp": 1.04115224, + "epoch": 0.28107620622275664, + "flos": 14359995639360.0, + "grad_norm": 1.7244786565636672, + "language_loss": 0.77169174, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79667103, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.18371582, + "step": 4675, + "time_per_iteration": 2.7349438667297363 + }, + { + "auxiliary_loss_clip": 0.01429949, + "auxiliary_loss_mlp": 0.01050106, + "balance_loss_clip": 1.28326321, + "balance_loss_mlp": 1.03254676, + "epoch": 0.2811363294754246, + "flos": 20923473268800.0, + "grad_norm": 2.924808055927797, + "language_loss": 0.70447427, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72927481, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.17578125, + "step": 4676, + "time_per_iteration": 2.843474864959717 + }, + { + "auxiliary_loss_clip": 0.01414308, + "auxiliary_loss_mlp": 0.01051032, + "balance_loss_clip": 1.27794349, + "balance_loss_mlp": 1.03526068, + "epoch": 0.28119645272809257, + "flos": 20599733820240.0, + "grad_norm": 1.4845112363394926, + "language_loss": 0.70783782, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7324912, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.15771484, + "step": 4677, + "time_per_iteration": 2.8136682510375977 + }, + { + "auxiliary_loss_clip": 0.01425051, + "auxiliary_loss_mlp": 0.01048403, + "balance_loss_clip": 1.28205323, + "balance_loss_mlp": 1.02963948, + "epoch": 0.28125657598076054, + "flos": 25342856780400.0, + "grad_norm": 1.6890662934210048, + "language_loss": 0.63804549, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.66278005, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.18768311, + "step": 4678, + "time_per_iteration": 2.8057861328125 + }, + { + "auxiliary_loss_clip": 0.0142153, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.28005457, + "balance_loss_mlp": 1.03337622, + "epoch": 0.2813166992334285, + "flos": 24832525606920.0, + "grad_norm": 1.4045670112679558, + "language_loss": 0.70856452, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.73327613, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.16247559, + "step": 4679, + "time_per_iteration": 2.8098511695861816 + }, + { + "auxiliary_loss_clip": 0.01424176, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.27955151, + "balance_loss_mlp": 1.02669406, + "epoch": 0.2813768224860965, + "flos": 21765584346120.0, + "grad_norm": 1.9709087155651186, + "language_loss": 0.74588382, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77056664, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.1739502, + "step": 4680, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.01423112, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.27969158, + "balance_loss_mlp": 1.02762187, + "epoch": 0.2814369457387645, + "flos": 24322235041800.0, + "grad_norm": 1.5896048788542132, + "language_loss": 0.77519715, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79986912, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.16467285, + "step": 4681, + "time_per_iteration": 2.792473316192627 + }, + { + "auxiliary_loss_clip": 0.01427168, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.28139305, + "balance_loss_mlp": 1.01963806, + "epoch": 0.28149706899143245, + "flos": 18519969990600.0, + "grad_norm": 2.512082636084329, + "language_loss": 0.74783957, + "learning_rate": 3.372378352108146e-06, + "loss": 0.77248597, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.1784668, + "step": 4682, + "time_per_iteration": 2.716437816619873 + }, + { + "auxiliary_loss_clip": 0.01415622, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.2747792, + "balance_loss_mlp": 1.01779163, + "epoch": 0.2815571922441004, + "flos": 24868650066120.0, + "grad_norm": 1.3987464592263519, + "language_loss": 0.81009865, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.83459628, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16333008, + "step": 4683, + "time_per_iteration": 2.791849374771118 + }, + { + "auxiliary_loss_clip": 0.01427214, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.28228807, + "balance_loss_mlp": 1.02061248, + "epoch": 0.2816173154967684, + "flos": 19906425065880.0, + "grad_norm": 1.4999016048590679, + "language_loss": 0.76143062, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78609145, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.18261719, + "step": 4684, + "time_per_iteration": 2.73108172416687 + }, + { + "auxiliary_loss_clip": 0.01417257, + "auxiliary_loss_mlp": 0.0103484, + "balance_loss_clip": 1.27649522, + "balance_loss_mlp": 1.01800787, + "epoch": 0.28167743874943635, + "flos": 17495531066160.0, + "grad_norm": 1.572604972323455, + "language_loss": 0.76833266, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.79285359, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16833496, + "step": 4685, + "time_per_iteration": 2.7667694091796875 + }, + { + "auxiliary_loss_clip": 0.01416119, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.27553368, + "balance_loss_mlp": 1.01735425, + "epoch": 0.2817375620021043, + "flos": 25307706921840.0, + "grad_norm": 1.443020964640453, + "language_loss": 0.75951254, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78401107, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.16394043, + "step": 4686, + "time_per_iteration": 2.8159778118133545 + }, + { + "auxiliary_loss_clip": 0.01421068, + "auxiliary_loss_mlp": 0.01046197, + "balance_loss_clip": 1.27621293, + "balance_loss_mlp": 1.02817285, + "epoch": 0.2817976852547723, + "flos": 18697303009440.0, + "grad_norm": 2.5014960711143983, + "language_loss": 0.63935554, + "learning_rate": 3.370961184640025e-06, + "loss": 0.66402817, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.18017578, + "step": 4687, + "time_per_iteration": 2.718118667602539 + }, + { + "auxiliary_loss_clip": 0.01422393, + "auxiliary_loss_mlp": 0.01039042, + "balance_loss_clip": 1.27960193, + "balance_loss_mlp": 1.02217364, + "epoch": 0.28185780850744024, + "flos": 22746751740000.0, + "grad_norm": 2.0344353306356338, + "language_loss": 0.76934451, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.7939589, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.16870117, + "step": 4688, + "time_per_iteration": 2.7659168243408203 + }, + { + "auxiliary_loss_clip": 0.01412326, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.2706039, + "balance_loss_mlp": 1.01810503, + "epoch": 0.2819179317601082, + "flos": 14937337252800.0, + "grad_norm": 1.8869219394431396, + "language_loss": 0.79450941, + "learning_rate": 3.37039395366863e-06, + "loss": 0.81897449, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.16088867, + "step": 4689, + "time_per_iteration": 2.684070348739624 + }, + { + "auxiliary_loss_clip": 0.01415005, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.27265716, + "balance_loss_mlp": 1.01613641, + "epoch": 0.2819780550127762, + "flos": 23150090220120.0, + "grad_norm": 1.741477721231208, + "language_loss": 0.78058624, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80507779, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.18017578, + "step": 4690, + "time_per_iteration": 2.7687084674835205 + }, + { + "auxiliary_loss_clip": 0.01419193, + "auxiliary_loss_mlp": 0.01041693, + "balance_loss_clip": 1.27598977, + "balance_loss_mlp": 1.02427638, + "epoch": 0.28203817826544414, + "flos": 21621370767840.0, + "grad_norm": 1.7063004011735725, + "language_loss": 0.87756252, + "learning_rate": 3.369826514835332e-06, + "loss": 0.90217137, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.17419434, + "step": 4691, + "time_per_iteration": 2.746685743331909 + }, + { + "auxiliary_loss_clip": 0.01428508, + "auxiliary_loss_mlp": 0.01039833, + "balance_loss_clip": 1.27981925, + "balance_loss_mlp": 1.02156997, + "epoch": 0.2820983015181121, + "flos": 24032427201000.0, + "grad_norm": 1.6160419364734504, + "language_loss": 0.82162559, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84630907, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.18261719, + "step": 4692, + "time_per_iteration": 2.8076963424682617 + }, + { + "auxiliary_loss_clip": 0.01415865, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.27316296, + "balance_loss_mlp": 1.01962161, + "epoch": 0.2821584247707801, + "flos": 30014015080680.0, + "grad_norm": 1.5304973277362284, + "language_loss": 0.74860841, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.77314317, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.17980957, + "step": 4693, + "time_per_iteration": 2.814286708831787 + }, + { + "auxiliary_loss_clip": 0.01422095, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.27730131, + "balance_loss_mlp": 1.01836133, + "epoch": 0.2822185480234481, + "flos": 21401659602360.0, + "grad_norm": 1.7671988106304697, + "language_loss": 0.77814877, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.80272818, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.17480469, + "step": 4694, + "time_per_iteration": 2.7525839805603027 + }, + { + "auxiliary_loss_clip": 0.01412226, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.27174044, + "balance_loss_mlp": 1.01853466, + "epoch": 0.28227867127611606, + "flos": 27458420202360.0, + "grad_norm": 1.6708343079402996, + "language_loss": 0.66871405, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69318962, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.16809082, + "step": 4695, + "time_per_iteration": 2.8228628635406494 + }, + { + "auxiliary_loss_clip": 0.01423879, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.27774644, + "balance_loss_mlp": 1.02885056, + "epoch": 0.282338794528784, + "flos": 22597868200320.0, + "grad_norm": 1.9483774223959442, + "language_loss": 0.75964767, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78436744, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.19250488, + "step": 4696, + "time_per_iteration": 4.236368656158447 + }, + { + "auxiliary_loss_clip": 0.01422447, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.2777431, + "balance_loss_mlp": 1.02272654, + "epoch": 0.282398917781452, + "flos": 42018472560960.0, + "grad_norm": 1.4306256910730963, + "language_loss": 0.62019712, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64482957, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.18078613, + "step": 4697, + "time_per_iteration": 2.9574437141418457 + }, + { + "auxiliary_loss_clip": 0.01410574, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.27019608, + "balance_loss_mlp": 1.01581383, + "epoch": 0.28245904103411995, + "flos": 23230338985440.0, + "grad_norm": 1.4742640278313517, + "language_loss": 0.73441809, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75885212, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.17004395, + "step": 4698, + "time_per_iteration": 2.850398540496826 + }, + { + "auxiliary_loss_clip": 0.01407683, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.26741397, + "balance_loss_mlp": 1.02083087, + "epoch": 0.2825191642867879, + "flos": 25380158882040.0, + "grad_norm": 2.5062685917208127, + "language_loss": 0.75433964, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.7787894, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.16467285, + "step": 4699, + "time_per_iteration": 2.8653411865234375 + }, + { + "auxiliary_loss_clip": 0.01419284, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.27316344, + "balance_loss_mlp": 1.01854515, + "epoch": 0.2825792875394559, + "flos": 17241076125720.0, + "grad_norm": 2.975987684748359, + "language_loss": 0.81313092, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.83768845, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.17944336, + "step": 4700, + "time_per_iteration": 2.754359483718872 + }, + { + "auxiliary_loss_clip": 0.01412302, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.27379024, + "balance_loss_mlp": 1.02745926, + "epoch": 0.28263941079212385, + "flos": 26729596114200.0, + "grad_norm": 1.7354442850587464, + "language_loss": 0.81573355, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.84028679, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.15551758, + "step": 4701, + "time_per_iteration": 2.793980836868286 + }, + { + "auxiliary_loss_clip": 0.01409494, + "auxiliary_loss_mlp": 0.0103812, + "balance_loss_clip": 1.26753092, + "balance_loss_mlp": 1.02060795, + "epoch": 0.2826995340447918, + "flos": 25926858164880.0, + "grad_norm": 2.419396028425674, + "language_loss": 0.74109042, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.76556659, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.17504883, + "step": 4702, + "time_per_iteration": 2.815561056137085 + }, + { + "auxiliary_loss_clip": 0.01412343, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.27124035, + "balance_loss_mlp": 1.01783228, + "epoch": 0.2827596572974598, + "flos": 22384451330640.0, + "grad_norm": 1.7003749817625327, + "language_loss": 0.78517956, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80965829, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.17700195, + "step": 4703, + "time_per_iteration": 4.2857911586761475 + }, + { + "auxiliary_loss_clip": 0.0142109, + "auxiliary_loss_mlp": 0.010472, + "balance_loss_clip": 1.27646995, + "balance_loss_mlp": 1.02983165, + "epoch": 0.28281978055012774, + "flos": 33553701154800.0, + "grad_norm": 2.197395015618111, + "language_loss": 0.69760156, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.72228444, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.17382812, + "step": 4704, + "time_per_iteration": 2.8322010040283203 + }, + { + "auxiliary_loss_clip": 0.01413881, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.27148557, + "balance_loss_mlp": 1.01957905, + "epoch": 0.2828799038027957, + "flos": 23446151748360.0, + "grad_norm": 1.8984402064005026, + "language_loss": 0.70716274, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.73167449, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.17712402, + "step": 4705, + "time_per_iteration": 2.8005309104919434 + }, + { + "auxiliary_loss_clip": 0.01324593, + "auxiliary_loss_mlp": 0.01067092, + "balance_loss_clip": 1.24177086, + "balance_loss_mlp": 1.06170356, + "epoch": 0.2829400270554637, + "flos": 69888028048560.0, + "grad_norm": 1.172992845604558, + "language_loss": 0.59348178, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61739862, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.05395508, + "step": 4706, + "time_per_iteration": 3.315213441848755 + }, + { + "auxiliary_loss_clip": 0.01405439, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.26591706, + "balance_loss_mlp": 1.02129805, + "epoch": 0.2830001503081317, + "flos": 24794248904640.0, + "grad_norm": 1.4052368556000916, + "language_loss": 0.82441568, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84884632, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.16333008, + "step": 4707, + "time_per_iteration": 2.852001667022705 + }, + { + "auxiliary_loss_clip": 0.01428155, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.27841949, + "balance_loss_mlp": 1.0211215, + "epoch": 0.28306027356079966, + "flos": 27674720265600.0, + "grad_norm": 1.5737968908701236, + "language_loss": 0.80450916, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82918715, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 1.49804688, + "router_z_loss_mlp": 0.18530273, + "step": 4708, + "time_per_iteration": 4.318579196929932 + }, + { + "auxiliary_loss_clip": 0.01308927, + "auxiliary_loss_mlp": 0.01059498, + "balance_loss_clip": 1.22799325, + "balance_loss_mlp": 1.05456257, + "epoch": 0.2831203968134676, + "flos": 60540351144480.0, + "grad_norm": 0.895240977855652, + "language_loss": 0.62975621, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.65344042, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.04931641, + "step": 4709, + "time_per_iteration": 3.1100168228149414 + }, + { + "auxiliary_loss_clip": 0.01418756, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.2759316, + "balance_loss_mlp": 1.01982737, + "epoch": 0.2831805200661356, + "flos": 22059940323240.0, + "grad_norm": 1.4237618316846106, + "language_loss": 0.73890072, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76345658, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.17004395, + "step": 4710, + "time_per_iteration": 4.230166912078857 + }, + { + "auxiliary_loss_clip": 0.01417689, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.27326179, + "balance_loss_mlp": 1.02744031, + "epoch": 0.28324064331880355, + "flos": 22605705613800.0, + "grad_norm": 1.9602631162016397, + "language_loss": 0.79251063, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81713694, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.17504883, + "step": 4711, + "time_per_iteration": 2.7737386226654053 + }, + { + "auxiliary_loss_clip": 0.01426127, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.27922082, + "balance_loss_mlp": 1.0280416, + "epoch": 0.2833007665714715, + "flos": 30409597364040.0, + "grad_norm": 1.7347289752658015, + "language_loss": 0.7184186, + "learning_rate": 3.363855879093996e-06, + "loss": 0.74315369, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.19335938, + "step": 4712, + "time_per_iteration": 2.820115804672241 + }, + { + "auxiliary_loss_clip": 0.01422483, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_clip": 1.27731848, + "balance_loss_mlp": 1.03523827, + "epoch": 0.2833608898241395, + "flos": 23554687559400.0, + "grad_norm": 1.8685541587477545, + "language_loss": 0.82035279, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84511435, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.18444824, + "step": 4713, + "time_per_iteration": 2.7702271938323975 + }, + { + "auxiliary_loss_clip": 0.01422903, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.27898049, + "balance_loss_mlp": 1.02029383, + "epoch": 0.28342101307680745, + "flos": 20271608668800.0, + "grad_norm": 1.6574411670704186, + "language_loss": 0.75357372, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77818722, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.1817627, + "step": 4714, + "time_per_iteration": 2.752814769744873 + }, + { + "auxiliary_loss_clip": 0.01421635, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.27752352, + "balance_loss_mlp": 1.0292964, + "epoch": 0.2834811363294754, + "flos": 30853242964440.0, + "grad_norm": 1.684796958060293, + "language_loss": 0.78537053, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.81006789, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.18798828, + "step": 4715, + "time_per_iteration": 2.8208839893341064 + }, + { + "auxiliary_loss_clip": 0.0141676, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.27341974, + "balance_loss_mlp": 1.02681816, + "epoch": 0.2835412595821434, + "flos": 22716190626120.0, + "grad_norm": 3.392159843644722, + "language_loss": 0.73785871, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.76247334, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.17883301, + "step": 4716, + "time_per_iteration": 2.7912251949310303 + }, + { + "auxiliary_loss_clip": 0.01436645, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.2840116, + "balance_loss_mlp": 1.02974069, + "epoch": 0.28360138283481134, + "flos": 18082334427480.0, + "grad_norm": 2.632437248101453, + "language_loss": 0.74345207, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.7683028, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.18676758, + "step": 4717, + "time_per_iteration": 2.7351367473602295 + }, + { + "auxiliary_loss_clip": 0.01426349, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_clip": 1.27911639, + "balance_loss_mlp": 1.03137553, + "epoch": 0.2836615060874793, + "flos": 17858846684520.0, + "grad_norm": 1.4496661469944938, + "language_loss": 0.67464387, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69939494, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.17382812, + "step": 4718, + "time_per_iteration": 2.791104316711426 + }, + { + "auxiliary_loss_clip": 0.01431303, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.28423762, + "balance_loss_mlp": 1.02576649, + "epoch": 0.2837216293401473, + "flos": 25745992218720.0, + "grad_norm": 1.5658554720111622, + "language_loss": 0.73171091, + "learning_rate": 3.361860593925566e-06, + "loss": 0.75646871, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.18701172, + "step": 4719, + "time_per_iteration": 2.800532341003418 + }, + { + "auxiliary_loss_clip": 0.01414357, + "auxiliary_loss_mlp": 0.01046591, + "balance_loss_clip": 1.27251148, + "balance_loss_mlp": 1.0290556, + "epoch": 0.2837817525928153, + "flos": 20928508705440.0, + "grad_norm": 1.6464135401688478, + "language_loss": 0.80326104, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82787049, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.17529297, + "step": 4720, + "time_per_iteration": 2.795393466949463 + }, + { + "auxiliary_loss_clip": 0.01430182, + "auxiliary_loss_mlp": 0.01045087, + "balance_loss_clip": 1.28376639, + "balance_loss_mlp": 1.02552509, + "epoch": 0.28384187584548326, + "flos": 18921399877800.0, + "grad_norm": 2.079530565477406, + "language_loss": 0.79714507, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.82189775, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.19555664, + "step": 4721, + "time_per_iteration": 2.7485575675964355 + }, + { + "auxiliary_loss_clip": 0.01419606, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.27422953, + "balance_loss_mlp": 1.02167988, + "epoch": 0.2839019990981512, + "flos": 27349640741160.0, + "grad_norm": 1.9787482637469471, + "language_loss": 0.82856572, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.85315812, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.17944336, + "step": 4722, + "time_per_iteration": 2.815119504928589 + }, + { + "auxiliary_loss_clip": 0.01430998, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.28547859, + "balance_loss_mlp": 1.02119684, + "epoch": 0.2839621223508192, + "flos": 18118946187000.0, + "grad_norm": 1.7191396809807777, + "language_loss": 0.70240307, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72709441, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.16931152, + "step": 4723, + "time_per_iteration": 2.765620231628418 + }, + { + "auxiliary_loss_clip": 0.01424985, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.28029227, + "balance_loss_mlp": 1.02568424, + "epoch": 0.28402224560348716, + "flos": 26363843994240.0, + "grad_norm": 1.5746414680922243, + "language_loss": 0.78750962, + "learning_rate": 3.360433840760998e-06, + "loss": 0.81220192, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.18554688, + "step": 4724, + "time_per_iteration": 2.807661294937134 + }, + { + "auxiliary_loss_clip": 0.01416331, + "auxiliary_loss_mlp": 0.01046173, + "balance_loss_clip": 1.27150476, + "balance_loss_mlp": 1.0283637, + "epoch": 0.2840823688561551, + "flos": 24066115158600.0, + "grad_norm": 1.5444131638674874, + "language_loss": 0.92676115, + "learning_rate": 3.36014833532143e-06, + "loss": 0.95138615, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.17822266, + "step": 4725, + "time_per_iteration": 2.783757448196411 + }, + { + "auxiliary_loss_clip": 0.01429343, + "auxiliary_loss_mlp": 0.01048718, + "balance_loss_clip": 1.28228211, + "balance_loss_mlp": 1.03014541, + "epoch": 0.2841424921088231, + "flos": 29465732071800.0, + "grad_norm": 1.4891787165491066, + "language_loss": 0.8887617, + "learning_rate": 3.3598627783049e-06, + "loss": 0.91354227, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.18566895, + "step": 4726, + "time_per_iteration": 2.819915294647217 + }, + { + "auxiliary_loss_clip": 0.0143647, + "auxiliary_loss_mlp": 0.01047603, + "balance_loss_clip": 1.28785586, + "balance_loss_mlp": 1.02984095, + "epoch": 0.28420261536149105, + "flos": 48108230776440.0, + "grad_norm": 1.8854532450723929, + "language_loss": 0.78638047, + "learning_rate": 3.359577169722238e-06, + "loss": 0.81122124, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.1776123, + "step": 4727, + "time_per_iteration": 3.035740613937378 + }, + { + "auxiliary_loss_clip": 0.01420153, + "auxiliary_loss_mlp": 0.01044043, + "balance_loss_clip": 1.2779423, + "balance_loss_mlp": 1.02737749, + "epoch": 0.284262738614159, + "flos": 25671672273960.0, + "grad_norm": 2.6370989034780985, + "language_loss": 0.67472035, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.69936228, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.16674805, + "step": 4728, + "time_per_iteration": 2.776071786880493 + }, + { + "auxiliary_loss_clip": 0.01425752, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.28086698, + "balance_loss_mlp": 1.02574039, + "epoch": 0.284322861866827, + "flos": 19723731743520.0, + "grad_norm": 1.6799041305968807, + "language_loss": 0.76562774, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.79032171, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.17919922, + "step": 4729, + "time_per_iteration": 2.773634433746338 + }, + { + "auxiliary_loss_clip": 0.01431852, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.28555381, + "balance_loss_mlp": 1.02151585, + "epoch": 0.28438298511949495, + "flos": 23920602112800.0, + "grad_norm": 1.7289644635181458, + "language_loss": 0.67121911, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.69593495, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.18212891, + "step": 4730, + "time_per_iteration": 2.821249485015869 + }, + { + "auxiliary_loss_clip": 0.0143355, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_clip": 1.28724742, + "balance_loss_mlp": 1.01976919, + "epoch": 0.2844431083721629, + "flos": 26072899119360.0, + "grad_norm": 1.8433394906829743, + "language_loss": 0.74367046, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76838595, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.18225098, + "step": 4731, + "time_per_iteration": 2.79709792137146 + }, + { + "auxiliary_loss_clip": 0.01431234, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.28614736, + "balance_loss_mlp": 1.02118754, + "epoch": 0.2845032316248309, + "flos": 25815764027160.0, + "grad_norm": 2.4081610668951807, + "language_loss": 0.84084076, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.86553639, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.17138672, + "step": 4732, + "time_per_iteration": 2.8130722045898438 + }, + { + "auxiliary_loss_clip": 0.01432702, + "auxiliary_loss_mlp": 0.0104201, + "balance_loss_clip": 1.28707969, + "balance_loss_mlp": 1.02354407, + "epoch": 0.2845633548774989, + "flos": 19827272726280.0, + "grad_norm": 1.6779054711909407, + "language_loss": 0.7937634, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81851053, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.18457031, + "step": 4733, + "time_per_iteration": 2.7377147674560547 + }, + { + "auxiliary_loss_clip": 0.01438292, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.29086685, + "balance_loss_mlp": 1.02914119, + "epoch": 0.28462347813016686, + "flos": 23187717188640.0, + "grad_norm": 2.094015900458848, + "language_loss": 0.71697259, + "learning_rate": 3.357576466701875e-06, + "loss": 0.74183249, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.18566895, + "step": 4734, + "time_per_iteration": 2.793287992477417 + }, + { + "auxiliary_loss_clip": 0.01427764, + "auxiliary_loss_mlp": 0.01038593, + "balance_loss_clip": 1.28277254, + "balance_loss_mlp": 1.02081919, + "epoch": 0.2846836013828348, + "flos": 18665036344440.0, + "grad_norm": 1.893702700640223, + "language_loss": 0.73765314, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.7623167, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.17773438, + "step": 4735, + "time_per_iteration": 2.728178024291992 + }, + { + "auxiliary_loss_clip": 0.01431398, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_clip": 1.28699684, + "balance_loss_mlp": 1.03094363, + "epoch": 0.2847437246355028, + "flos": 14177586575520.0, + "grad_norm": 1.7794499637423675, + "language_loss": 0.80181533, + "learning_rate": 3.357004373789946e-06, + "loss": 0.82661796, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.17895508, + "step": 4736, + "time_per_iteration": 4.137011766433716 + }, + { + "auxiliary_loss_clip": 0.01431127, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.28559899, + "balance_loss_mlp": 1.03018928, + "epoch": 0.28480384788817076, + "flos": 29284378825320.0, + "grad_norm": 2.14033863791606, + "language_loss": 0.60445744, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62926066, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.19018555, + "step": 4737, + "time_per_iteration": 2.875927686691284 + }, + { + "auxiliary_loss_clip": 0.01420099, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.27931476, + "balance_loss_mlp": 1.02047992, + "epoch": 0.2848639711408387, + "flos": 22606436564280.0, + "grad_norm": 1.6507566668810476, + "language_loss": 0.86937433, + "learning_rate": 3.356432075047052e-06, + "loss": 0.8939445, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16430664, + "step": 4738, + "time_per_iteration": 2.7606287002563477 + }, + { + "auxiliary_loss_clip": 0.01435608, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_clip": 1.28881478, + "balance_loss_mlp": 1.02848947, + "epoch": 0.2849240943935067, + "flos": 17603498360160.0, + "grad_norm": 1.9131470680549045, + "language_loss": 0.90173829, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92657447, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.19519043, + "step": 4739, + "time_per_iteration": 2.7711291313171387 + }, + { + "auxiliary_loss_clip": 0.01423935, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.28115082, + "balance_loss_mlp": 1.02442813, + "epoch": 0.28498421764617465, + "flos": 24867878507280.0, + "grad_norm": 1.3330517080984134, + "language_loss": 0.72566992, + "learning_rate": 3.355859570559998e-06, + "loss": 0.75033033, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17700195, + "step": 4740, + "time_per_iteration": 2.7890355587005615 + }, + { + "auxiliary_loss_clip": 0.01421161, + "auxiliary_loss_mlp": 0.0103601, + "balance_loss_clip": 1.28138173, + "balance_loss_mlp": 1.01881957, + "epoch": 0.2850443408988426, + "flos": 22787546160600.0, + "grad_norm": 1.49514631315544, + "language_loss": 0.78700924, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.81158102, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.171875, + "step": 4741, + "time_per_iteration": 2.77950382232666 + }, + { + "auxiliary_loss_clip": 0.01443223, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.29301083, + "balance_loss_mlp": 1.02046382, + "epoch": 0.2851044641515106, + "flos": 18848785484160.0, + "grad_norm": 2.034657209083476, + "language_loss": 0.76647651, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.79130077, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.18737793, + "step": 4742, + "time_per_iteration": 4.289759874343872 + }, + { + "auxiliary_loss_clip": 0.01442989, + "auxiliary_loss_mlp": 0.01045987, + "balance_loss_clip": 1.29337788, + "balance_loss_mlp": 1.02593601, + "epoch": 0.28516458740417855, + "flos": 18885234810240.0, + "grad_norm": 1.8068680891102193, + "language_loss": 0.57365084, + "learning_rate": 3.355000428249086e-06, + "loss": 0.5985406, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.20043945, + "step": 4743, + "time_per_iteration": 2.8632919788360596 + }, + { + "auxiliary_loss_clip": 0.01442518, + "auxiliary_loss_mlp": 0.01051512, + "balance_loss_clip": 1.29680538, + "balance_loss_mlp": 1.03279638, + "epoch": 0.2852247106568465, + "flos": 25305107986800.0, + "grad_norm": 1.5558693113468887, + "language_loss": 0.7434904, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76843071, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.18713379, + "step": 4744, + "time_per_iteration": 2.8151955604553223 + }, + { + "auxiliary_loss_clip": 0.01433411, + "auxiliary_loss_mlp": 0.01041274, + "balance_loss_clip": 1.2906003, + "balance_loss_mlp": 1.02327323, + "epoch": 0.2852848339095145, + "flos": 11659050148680.0, + "grad_norm": 2.1398372510103645, + "language_loss": 0.78075498, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.80550182, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.18005371, + "step": 4745, + "time_per_iteration": 2.7440896034240723 + }, + { + "auxiliary_loss_clip": 0.01416758, + "auxiliary_loss_mlp": 0.01040967, + "balance_loss_clip": 1.27987993, + "balance_loss_mlp": 1.02405119, + "epoch": 0.2853449571621825, + "flos": 12937172454720.0, + "grad_norm": 1.6984211132084486, + "language_loss": 0.83326256, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85783982, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16906738, + "step": 4746, + "time_per_iteration": 2.7357177734375 + }, + { + "auxiliary_loss_clip": 0.01441558, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.29283738, + "balance_loss_mlp": 1.01805544, + "epoch": 0.28540508041485046, + "flos": 20015326352160.0, + "grad_norm": 1.6554648478732863, + "language_loss": 0.79921031, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.82399136, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.18493652, + "step": 4747, + "time_per_iteration": 4.2637999057769775 + }, + { + "auxiliary_loss_clip": 0.01267595, + "auxiliary_loss_mlp": 0.01049255, + "balance_loss_clip": 1.19781351, + "balance_loss_mlp": 1.04479682, + "epoch": 0.28546520366751843, + "flos": 68155678341360.0, + "grad_norm": 0.7869900204970922, + "language_loss": 0.60510486, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62827337, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.04467773, + "step": 4748, + "time_per_iteration": 3.256513833999634 + }, + { + "auxiliary_loss_clip": 0.01426964, + "auxiliary_loss_mlp": 0.01042845, + "balance_loss_clip": 1.28507519, + "balance_loss_mlp": 1.02535737, + "epoch": 0.2855253269201864, + "flos": 13254211523880.0, + "grad_norm": 2.243673563519649, + "language_loss": 0.80180788, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82650596, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.17480469, + "step": 4749, + "time_per_iteration": 4.172081708908081 + }, + { + "auxiliary_loss_clip": 0.0143065, + "auxiliary_loss_mlp": 0.01039456, + "balance_loss_clip": 1.28802705, + "balance_loss_mlp": 1.02133584, + "epoch": 0.28558545017285436, + "flos": 28626707229840.0, + "grad_norm": 1.7202430360258087, + "language_loss": 0.70863891, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.73333997, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.18115234, + "step": 4750, + "time_per_iteration": 2.827204465866089 + }, + { + "auxiliary_loss_clip": 0.0142215, + "auxiliary_loss_mlp": 0.01039881, + "balance_loss_clip": 1.28337538, + "balance_loss_mlp": 1.02295303, + "epoch": 0.2856455734255223, + "flos": 34137052805520.0, + "grad_norm": 1.474897721399621, + "language_loss": 0.82711828, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.85173857, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.16931152, + "step": 4751, + "time_per_iteration": 2.89447283744812 + }, + { + "auxiliary_loss_clip": 0.01429118, + "auxiliary_loss_mlp": 0.01043076, + "balance_loss_clip": 1.28732276, + "balance_loss_mlp": 1.0256238, + "epoch": 0.2857056966781903, + "flos": 39793520552400.0, + "grad_norm": 1.7034320325911911, + "language_loss": 0.8031311, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82785302, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.17443848, + "step": 4752, + "time_per_iteration": 2.905582904815674 + }, + { + "auxiliary_loss_clip": 0.0142963, + "auxiliary_loss_mlp": 0.01038938, + "balance_loss_clip": 1.28809774, + "balance_loss_mlp": 1.02190328, + "epoch": 0.28576581993085826, + "flos": 21877409434320.0, + "grad_norm": 1.569929004789779, + "language_loss": 0.78687763, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81156331, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.17028809, + "step": 4753, + "time_per_iteration": 2.781928300857544 + }, + { + "auxiliary_loss_clip": 0.01433877, + "auxiliary_loss_mlp": 0.01038793, + "balance_loss_clip": 1.28726149, + "balance_loss_mlp": 1.01975536, + "epoch": 0.2858259431835262, + "flos": 19095403011120.0, + "grad_norm": 2.0529650072481123, + "language_loss": 0.90045917, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.92518592, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.19042969, + "step": 4754, + "time_per_iteration": 2.7308876514434814 + }, + { + "auxiliary_loss_clip": 0.01419222, + "auxiliary_loss_mlp": 0.01039591, + "balance_loss_clip": 1.28071678, + "balance_loss_mlp": 1.02288961, + "epoch": 0.2858860664361942, + "flos": 20338497283680.0, + "grad_norm": 1.6985721076525384, + "language_loss": 0.82338142, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84796959, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16699219, + "step": 4755, + "time_per_iteration": 2.763223648071289 + }, + { + "auxiliary_loss_clip": 0.01426929, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_clip": 1.2837317, + "balance_loss_mlp": 1.02627039, + "epoch": 0.28594618968886215, + "flos": 24467057745480.0, + "grad_norm": 1.5826465905697709, + "language_loss": 0.83919168, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86390138, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.17785645, + "step": 4756, + "time_per_iteration": 2.802150011062622 + }, + { + "auxiliary_loss_clip": 0.01249766, + "auxiliary_loss_mlp": 0.01006851, + "balance_loss_clip": 1.18444228, + "balance_loss_mlp": 1.00220156, + "epoch": 0.2860063129415301, + "flos": 71671056989040.0, + "grad_norm": 0.8742225800169565, + "language_loss": 0.61048758, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63305378, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.04638672, + "step": 4757, + "time_per_iteration": 3.4168553352355957 + }, + { + "auxiliary_loss_clip": 0.0142645, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.28697073, + "balance_loss_mlp": 1.02164984, + "epoch": 0.2860664361941981, + "flos": 20563609361040.0, + "grad_norm": 2.166734769148766, + "language_loss": 0.6651749, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.68982446, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.16851807, + "step": 4758, + "time_per_iteration": 2.837128162384033 + }, + { + "auxiliary_loss_clip": 0.01433574, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_clip": 1.28876734, + "balance_loss_mlp": 1.02607214, + "epoch": 0.2861265594468661, + "flos": 36003887065800.0, + "grad_norm": 1.3035421910365943, + "language_loss": 0.62722778, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65200233, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.17810059, + "step": 4759, + "time_per_iteration": 2.92026686668396 + }, + { + "auxiliary_loss_clip": 0.01423324, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.28243625, + "balance_loss_mlp": 1.02357471, + "epoch": 0.28618668269953407, + "flos": 20052222370200.0, + "grad_norm": 1.803698287189687, + "language_loss": 0.74355865, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76820272, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.1751709, + "step": 4760, + "time_per_iteration": 2.858630657196045 + }, + { + "auxiliary_loss_clip": 0.01419731, + "auxiliary_loss_mlp": 0.01038029, + "balance_loss_clip": 1.2829175, + "balance_loss_mlp": 1.02191186, + "epoch": 0.28624680595220203, + "flos": 24977145268800.0, + "grad_norm": 1.758146269323819, + "language_loss": 0.72479653, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74937415, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16113281, + "step": 4761, + "time_per_iteration": 2.8238203525543213 + }, + { + "auxiliary_loss_clip": 0.01427205, + "auxiliary_loss_mlp": 0.01039638, + "balance_loss_clip": 1.28406739, + "balance_loss_mlp": 1.02309155, + "epoch": 0.28630692920487, + "flos": 22497413452920.0, + "grad_norm": 1.951384949939343, + "language_loss": 0.75188422, + "learning_rate": 3.349548466945793e-06, + "loss": 0.77655262, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.16552734, + "step": 4762, + "time_per_iteration": 2.8033294677734375 + }, + { + "auxiliary_loss_clip": 0.0142344, + "auxiliary_loss_mlp": 0.01048046, + "balance_loss_clip": 1.28381133, + "balance_loss_mlp": 1.03080857, + "epoch": 0.28636705245753796, + "flos": 21254644047240.0, + "grad_norm": 1.3867273368143402, + "language_loss": 0.76233459, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78704941, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.17236328, + "step": 4763, + "time_per_iteration": 2.7570695877075195 + }, + { + "auxiliary_loss_clip": 0.01426823, + "auxiliary_loss_mlp": 0.01045644, + "balance_loss_clip": 1.28533745, + "balance_loss_mlp": 1.02870417, + "epoch": 0.28642717571020593, + "flos": 24100899541920.0, + "grad_norm": 11.269876303513913, + "language_loss": 0.77445489, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79917955, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.16943359, + "step": 4764, + "time_per_iteration": 2.8146214485168457 + }, + { + "auxiliary_loss_clip": 0.01430773, + "auxiliary_loss_mlp": 0.01048255, + "balance_loss_clip": 1.28942084, + "balance_loss_mlp": 1.03067183, + "epoch": 0.2864872989628739, + "flos": 22606517781000.0, + "grad_norm": 2.1111705657349042, + "language_loss": 0.71367443, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73846471, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.17590332, + "step": 4765, + "time_per_iteration": 2.794147491455078 + }, + { + "auxiliary_loss_clip": 0.0141818, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_clip": 1.2795763, + "balance_loss_mlp": 1.02862811, + "epoch": 0.28654742221554186, + "flos": 32751612939240.0, + "grad_norm": 1.4861794282257077, + "language_loss": 0.7607615, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.7853862, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.15661621, + "step": 4766, + "time_per_iteration": 2.85784912109375 + }, + { + "auxiliary_loss_clip": 0.01417928, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_clip": 1.27918458, + "balance_loss_mlp": 1.02780104, + "epoch": 0.2866075454682098, + "flos": 26987583981960.0, + "grad_norm": 1.5337114927261222, + "language_loss": 0.77743089, + "learning_rate": 3.348110666737214e-06, + "loss": 0.8020556, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1673584, + "step": 4767, + "time_per_iteration": 2.847306966781616 + }, + { + "auxiliary_loss_clip": 0.01422133, + "auxiliary_loss_mlp": 0.01042844, + "balance_loss_clip": 1.28305054, + "balance_loss_mlp": 1.02579689, + "epoch": 0.2866676687208778, + "flos": 23258504206080.0, + "grad_norm": 1.8129411094722474, + "language_loss": 0.65320879, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67785859, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17041016, + "step": 4768, + "time_per_iteration": 2.799142837524414 + }, + { + "auxiliary_loss_clip": 0.01435717, + "auxiliary_loss_mlp": 0.01047047, + "balance_loss_clip": 1.29132354, + "balance_loss_mlp": 1.02843881, + "epoch": 0.28672779197354575, + "flos": 21584718399960.0, + "grad_norm": 1.7172408712537535, + "language_loss": 0.71050119, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.73532885, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.18603516, + "step": 4769, + "time_per_iteration": 2.7779440879821777 + }, + { + "auxiliary_loss_clip": 0.01422876, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.28175235, + "balance_loss_mlp": 1.01918566, + "epoch": 0.2867879152262137, + "flos": 19870422431760.0, + "grad_norm": 1.706417522339516, + "language_loss": 0.75197846, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77656496, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.16589355, + "step": 4770, + "time_per_iteration": 2.758058547973633 + }, + { + "auxiliary_loss_clip": 0.01432049, + "auxiliary_loss_mlp": 0.01042328, + "balance_loss_clip": 1.28948307, + "balance_loss_mlp": 1.02432787, + "epoch": 0.2868480384788817, + "flos": 28218292704720.0, + "grad_norm": 2.084436804835739, + "language_loss": 0.68286592, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.70760965, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.17993164, + "step": 4771, + "time_per_iteration": 2.8127033710479736 + }, + { + "auxiliary_loss_clip": 0.0125006, + "auxiliary_loss_mlp": 0.0101019, + "balance_loss_clip": 1.18668795, + "balance_loss_mlp": 1.00537395, + "epoch": 0.2869081617315497, + "flos": 65438286464880.0, + "grad_norm": 0.7730066840056273, + "language_loss": 0.56967491, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.59227741, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.0480957, + "step": 4772, + "time_per_iteration": 3.2081120014190674 + }, + { + "auxiliary_loss_clip": 0.01430002, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.28786433, + "balance_loss_mlp": 1.02918506, + "epoch": 0.28696828498421767, + "flos": 18665320602960.0, + "grad_norm": 2.2192024647498334, + "language_loss": 0.83555704, + "learning_rate": 3.346383619630856e-06, + "loss": 0.86032176, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.17285156, + "step": 4773, + "time_per_iteration": 2.7426114082336426 + }, + { + "auxiliary_loss_clip": 0.01429987, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.28638983, + "balance_loss_mlp": 1.02152276, + "epoch": 0.28702840823688563, + "flos": 23665050746640.0, + "grad_norm": 2.061267055184481, + "language_loss": 0.7861886, + "learning_rate": 3.34609559969027e-06, + "loss": 0.81087273, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.16894531, + "step": 4774, + "time_per_iteration": 4.223647356033325 + }, + { + "auxiliary_loss_clip": 0.01429305, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.28807664, + "balance_loss_mlp": 1.02289653, + "epoch": 0.2870885314895536, + "flos": 13808260919880.0, + "grad_norm": 1.8174841683077536, + "language_loss": 0.73323685, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75793076, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.17163086, + "step": 4775, + "time_per_iteration": 2.7493340969085693 + }, + { + "auxiliary_loss_clip": 0.01429901, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.28894424, + "balance_loss_mlp": 1.02627349, + "epoch": 0.28714865474222157, + "flos": 17791470769320.0, + "grad_norm": 1.7134706471768602, + "language_loss": 0.88810396, + "learning_rate": 3.34551940668778e-06, + "loss": 0.91283679, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17114258, + "step": 4776, + "time_per_iteration": 2.7744712829589844 + }, + { + "auxiliary_loss_clip": 0.01420313, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.28208613, + "balance_loss_mlp": 1.02137184, + "epoch": 0.28720877799488953, + "flos": 16001758430640.0, + "grad_norm": 1.6705252930743757, + "language_loss": 0.74312335, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76770282, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16259766, + "step": 4777, + "time_per_iteration": 2.800191640853882 + }, + { + "auxiliary_loss_clip": 0.01431996, + "auxiliary_loss_mlp": 0.01044595, + "balance_loss_clip": 1.28803372, + "balance_loss_mlp": 1.02777469, + "epoch": 0.2872689012475575, + "flos": 20928062013480.0, + "grad_norm": 2.8402199405367985, + "language_loss": 0.80376053, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82852638, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.16809082, + "step": 4778, + "time_per_iteration": 2.8035247325897217 + }, + { + "auxiliary_loss_clip": 0.01420392, + "auxiliary_loss_mlp": 0.01045624, + "balance_loss_clip": 1.2846241, + "balance_loss_mlp": 1.02801728, + "epoch": 0.28732902450022546, + "flos": 21330182242800.0, + "grad_norm": 20.911571567557825, + "language_loss": 0.74269581, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76735598, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.17590332, + "step": 4779, + "time_per_iteration": 4.265474081039429 + }, + { + "auxiliary_loss_clip": 0.01428832, + "auxiliary_loss_mlp": 0.01045782, + "balance_loss_clip": 1.28810227, + "balance_loss_mlp": 1.02837718, + "epoch": 0.2873891477528934, + "flos": 20855325794760.0, + "grad_norm": 1.5583268252492122, + "language_loss": 0.7647723, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78951842, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17419434, + "step": 4780, + "time_per_iteration": 2.770864248275757 + }, + { + "auxiliary_loss_clip": 0.01420308, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.28329921, + "balance_loss_mlp": 1.02156758, + "epoch": 0.2874492710055614, + "flos": 17424459790200.0, + "grad_norm": 1.7349263760228464, + "language_loss": 0.81405163, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83862603, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.15570068, + "step": 4781, + "time_per_iteration": 2.7342336177825928 + }, + { + "auxiliary_loss_clip": 0.01428416, + "auxiliary_loss_mlp": 0.01041449, + "balance_loss_clip": 1.28649426, + "balance_loss_mlp": 1.02294731, + "epoch": 0.28750939425822936, + "flos": 13410729435240.0, + "grad_norm": 2.146088274181656, + "language_loss": 0.868756, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.89345467, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.18505859, + "step": 4782, + "time_per_iteration": 2.749207019805908 + }, + { + "auxiliary_loss_clip": 0.01428299, + "auxiliary_loss_mlp": 0.01042868, + "balance_loss_clip": 1.28843367, + "balance_loss_mlp": 1.02519536, + "epoch": 0.2875695175108973, + "flos": 21874851107640.0, + "grad_norm": 1.3892399171791614, + "language_loss": 0.71572089, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.74043256, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.17687988, + "step": 4783, + "time_per_iteration": 2.7907280921936035 + }, + { + "auxiliary_loss_clip": 0.01417722, + "auxiliary_loss_mlp": 0.01044576, + "balance_loss_clip": 1.28000879, + "balance_loss_mlp": 1.02704, + "epoch": 0.2876296407635653, + "flos": 26250475788360.0, + "grad_norm": 1.8510472904363513, + "language_loss": 0.77371258, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79833555, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.17541504, + "step": 4784, + "time_per_iteration": 2.808763265609741 + }, + { + "auxiliary_loss_clip": 0.01410666, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.27703357, + "balance_loss_mlp": 1.02085054, + "epoch": 0.28768976401623325, + "flos": 25379062456320.0, + "grad_norm": 1.4303616161551242, + "language_loss": 0.76123619, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.785712, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.16064453, + "step": 4785, + "time_per_iteration": 2.797179937362671 + }, + { + "auxiliary_loss_clip": 0.01414892, + "auxiliary_loss_mlp": 0.01036289, + "balance_loss_clip": 1.2762512, + "balance_loss_mlp": 1.01974249, + "epoch": 0.28774988726890127, + "flos": 30670509033720.0, + "grad_norm": 1.7886802541506686, + "language_loss": 0.8332231, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.8577348, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16540527, + "step": 4786, + "time_per_iteration": 4.287976980209351 + }, + { + "auxiliary_loss_clip": 0.01418311, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.28097725, + "balance_loss_mlp": 1.01981819, + "epoch": 0.28781001052156924, + "flos": 20600424162360.0, + "grad_norm": 1.6332630322497894, + "language_loss": 0.80062926, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82516974, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15905762, + "step": 4787, + "time_per_iteration": 2.82444429397583 + }, + { + "auxiliary_loss_clip": 0.01417573, + "auxiliary_loss_mlp": 0.01038953, + "balance_loss_clip": 1.27794302, + "balance_loss_mlp": 1.02160835, + "epoch": 0.2878701337742372, + "flos": 26548039825920.0, + "grad_norm": 1.761638524107897, + "language_loss": 0.83787298, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.8624382, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.17333984, + "step": 4788, + "time_per_iteration": 2.8092825412750244 + }, + { + "auxiliary_loss_clip": 0.01428214, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.28632951, + "balance_loss_mlp": 1.02445078, + "epoch": 0.28793025702690517, + "flos": 28152662949000.0, + "grad_norm": 16.27595973472846, + "language_loss": 0.7394129, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.76411331, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.17358398, + "step": 4789, + "time_per_iteration": 2.81923246383667 + }, + { + "auxiliary_loss_clip": 0.01408284, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.27406085, + "balance_loss_mlp": 1.02432942, + "epoch": 0.28799038027957313, + "flos": 23810766834240.0, + "grad_norm": 1.7646581311848177, + "language_loss": 0.83910739, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86359137, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.15771484, + "step": 4790, + "time_per_iteration": 2.7791686058044434 + }, + { + "auxiliary_loss_clip": 0.01426059, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_clip": 1.28649855, + "balance_loss_mlp": 1.02531862, + "epoch": 0.2880505035322411, + "flos": 22348976605200.0, + "grad_norm": 1.7268900419820161, + "language_loss": 0.77875471, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80343592, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16748047, + "step": 4791, + "time_per_iteration": 2.783416509628296 + }, + { + "auxiliary_loss_clip": 0.01421732, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.27906156, + "balance_loss_mlp": 1.0217123, + "epoch": 0.28811062678490906, + "flos": 18008786041560.0, + "grad_norm": 1.7437205775422273, + "language_loss": 0.7120434, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.73664653, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.16870117, + "step": 4792, + "time_per_iteration": 2.7659623622894287 + }, + { + "auxiliary_loss_clip": 0.01421965, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.28251112, + "balance_loss_mlp": 1.0248338, + "epoch": 0.28817075003757703, + "flos": 22095983565720.0, + "grad_norm": 1.8808955424521128, + "language_loss": 0.7980293, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.82265687, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.1595459, + "step": 4793, + "time_per_iteration": 2.7396745681762695 + }, + { + "auxiliary_loss_clip": 0.01412254, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_clip": 1.27840734, + "balance_loss_mlp": 1.0287261, + "epoch": 0.288230873290245, + "flos": 41690712884760.0, + "grad_norm": 1.692597591253199, + "language_loss": 0.78221029, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80677402, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.15380859, + "step": 4794, + "time_per_iteration": 2.955059051513672 + }, + { + "auxiliary_loss_clip": 0.01423217, + "auxiliary_loss_mlp": 0.01047854, + "balance_loss_clip": 1.2836591, + "balance_loss_mlp": 1.03114128, + "epoch": 0.28829099654291296, + "flos": 18629033710320.0, + "grad_norm": 2.150136196739085, + "language_loss": 0.83405614, + "learning_rate": 3.340035406592074e-06, + "loss": 0.85876691, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16699219, + "step": 4795, + "time_per_iteration": 2.724043607711792 + }, + { + "auxiliary_loss_clip": 0.01409181, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.27555704, + "balance_loss_mlp": 1.02944398, + "epoch": 0.2883511197955809, + "flos": 24679418797800.0, + "grad_norm": 1.7738296996950933, + "language_loss": 0.75283831, + "learning_rate": 3.339746266208074e-06, + "loss": 0.77737635, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15167236, + "step": 4796, + "time_per_iteration": 2.821958541870117 + }, + { + "auxiliary_loss_clip": 0.01426326, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.28381801, + "balance_loss_mlp": 1.01948309, + "epoch": 0.2884112430482489, + "flos": 23117336254800.0, + "grad_norm": 1.8171259905542485, + "language_loss": 0.73180246, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.75643766, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.17724609, + "step": 4797, + "time_per_iteration": 2.7674007415771484 + }, + { + "auxiliary_loss_clip": 0.01413557, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.27648783, + "balance_loss_mlp": 1.02906621, + "epoch": 0.28847136630091685, + "flos": 16877760507360.0, + "grad_norm": 1.9781283768972568, + "language_loss": 0.74843132, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.77301949, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16186523, + "step": 4798, + "time_per_iteration": 2.75097393989563 + }, + { + "auxiliary_loss_clip": 0.01416957, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.27746034, + "balance_loss_mlp": 1.03308702, + "epoch": 0.2885314895535849, + "flos": 25661926267560.0, + "grad_norm": 2.9203153818371628, + "language_loss": 0.65793312, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.68262005, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.18652344, + "step": 4799, + "time_per_iteration": 2.749094009399414 + }, + { + "auxiliary_loss_clip": 0.01418615, + "auxiliary_loss_mlp": 0.01049979, + "balance_loss_clip": 1.27711391, + "balance_loss_mlp": 1.03395724, + "epoch": 0.28859161280625284, + "flos": 21112420278600.0, + "grad_norm": 1.6126729866750158, + "language_loss": 0.82564241, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.85032833, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.16027832, + "step": 4800, + "time_per_iteration": 2.8473734855651855 + }, + { + "auxiliary_loss_clip": 0.01413757, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.27902865, + "balance_loss_mlp": 1.0254612, + "epoch": 0.2886517360589208, + "flos": 26475262998840.0, + "grad_norm": 1.522009275888523, + "language_loss": 0.91039938, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93495429, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.16271973, + "step": 4801, + "time_per_iteration": 2.8153464794158936 + }, + { + "auxiliary_loss_clip": 0.01416046, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.27723408, + "balance_loss_mlp": 1.01879275, + "epoch": 0.28871185931158877, + "flos": 25270810903800.0, + "grad_norm": 1.9604704766008285, + "language_loss": 0.73792541, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76243269, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.15881348, + "step": 4802, + "time_per_iteration": 2.8381617069244385 + }, + { + "auxiliary_loss_clip": 0.01264857, + "auxiliary_loss_mlp": 0.01018954, + "balance_loss_clip": 1.20082855, + "balance_loss_mlp": 1.01499581, + "epoch": 0.28877198256425674, + "flos": 66679675186320.0, + "grad_norm": 0.7862946812683285, + "language_loss": 0.6303845, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65322262, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03955078, + "step": 4803, + "time_per_iteration": 3.3024203777313232 + }, + { + "auxiliary_loss_clip": 0.01406071, + "auxiliary_loss_mlp": 0.01043807, + "balance_loss_clip": 1.26908195, + "balance_loss_mlp": 1.02771366, + "epoch": 0.2888321058169247, + "flos": 20307895561440.0, + "grad_norm": 2.58158912228731, + "language_loss": 0.71103007, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.73552883, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.16101074, + "step": 4804, + "time_per_iteration": 2.788010835647583 + }, + { + "auxiliary_loss_clip": 0.01417207, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.276106, + "balance_loss_mlp": 1.0241977, + "epoch": 0.28889222906959267, + "flos": 25521773525280.0, + "grad_norm": 2.2900137476181346, + "language_loss": 0.68560737, + "learning_rate": 3.337141717919346e-06, + "loss": 0.71019864, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17724609, + "step": 4805, + "time_per_iteration": 2.860680103302002 + }, + { + "auxiliary_loss_clip": 0.01415009, + "auxiliary_loss_mlp": 0.01039265, + "balance_loss_clip": 1.27550614, + "balance_loss_mlp": 1.0221349, + "epoch": 0.28895235232226063, + "flos": 32677739686440.0, + "grad_norm": 1.4258019779836337, + "language_loss": 0.69318843, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71773118, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.17138672, + "step": 4806, + "time_per_iteration": 2.8828141689300537 + }, + { + "auxiliary_loss_clip": 0.01401656, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.26795053, + "balance_loss_mlp": 1.02199912, + "epoch": 0.2890124755749286, + "flos": 29720552487480.0, + "grad_norm": 1.3082719690865292, + "language_loss": 0.71562004, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.74001819, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.16162109, + "step": 4807, + "time_per_iteration": 2.859551191329956 + }, + { + "auxiliary_loss_clip": 0.01414094, + "auxiliary_loss_mlp": 0.01038073, + "balance_loss_clip": 1.27646208, + "balance_loss_mlp": 1.02125275, + "epoch": 0.28907259882759656, + "flos": 22679619474960.0, + "grad_norm": 1.7081081030363627, + "language_loss": 0.81446564, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83898729, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.16821289, + "step": 4808, + "time_per_iteration": 2.8018999099731445 + }, + { + "auxiliary_loss_clip": 0.01403781, + "auxiliary_loss_mlp": 0.01041812, + "balance_loss_clip": 1.27041793, + "balance_loss_mlp": 1.02569485, + "epoch": 0.2891327220802645, + "flos": 22571327314080.0, + "grad_norm": 1.392169037709963, + "language_loss": 0.78406668, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80852258, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.16113281, + "step": 4809, + "time_per_iteration": 2.7996826171875 + }, + { + "auxiliary_loss_clip": 0.01418044, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.2750001, + "balance_loss_mlp": 1.01578403, + "epoch": 0.2891928453329325, + "flos": 21657820093920.0, + "grad_norm": 1.8534923900895732, + "language_loss": 0.79288304, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.81740272, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.18139648, + "step": 4810, + "time_per_iteration": 2.8342461585998535 + }, + { + "auxiliary_loss_clip": 0.01412011, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.27643895, + "balance_loss_mlp": 1.01894355, + "epoch": 0.28925296858560046, + "flos": 23227374575160.0, + "grad_norm": 1.6664591786788996, + "language_loss": 0.77344453, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79792315, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.16906738, + "step": 4811, + "time_per_iteration": 2.8058841228485107 + }, + { + "auxiliary_loss_clip": 0.01408813, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.27173686, + "balance_loss_mlp": 1.01772571, + "epoch": 0.2893130918382685, + "flos": 28627600613760.0, + "grad_norm": 1.3591097525773659, + "language_loss": 0.77362829, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79806554, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.171875, + "step": 4812, + "time_per_iteration": 2.838099718093872 + }, + { + "auxiliary_loss_clip": 0.01245471, + "auxiliary_loss_mlp": 0.01015524, + "balance_loss_clip": 1.18338668, + "balance_loss_mlp": 1.01166213, + "epoch": 0.28937321509093644, + "flos": 72318495277800.0, + "grad_norm": 0.8440219799882025, + "language_loss": 0.60358298, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62619293, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.03857422, + "step": 4813, + "time_per_iteration": 4.90164041519165 + }, + { + "auxiliary_loss_clip": 0.01409794, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.27413774, + "balance_loss_mlp": 1.0191741, + "epoch": 0.2894333383436044, + "flos": 16220698037280.0, + "grad_norm": 3.0767538121808164, + "language_loss": 0.82549298, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84994888, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1661377, + "step": 4814, + "time_per_iteration": 2.900681734085083 + }, + { + "auxiliary_loss_clip": 0.01422215, + "auxiliary_loss_mlp": 0.01040831, + "balance_loss_clip": 1.27958751, + "balance_loss_mlp": 1.02302122, + "epoch": 0.2894934615962724, + "flos": 24833865682800.0, + "grad_norm": 1.6609261677049203, + "language_loss": 0.72498262, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74961311, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.17822266, + "step": 4815, + "time_per_iteration": 2.90537691116333 + }, + { + "auxiliary_loss_clip": 0.01402973, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.27136278, + "balance_loss_mlp": 1.02112889, + "epoch": 0.28955358484894034, + "flos": 20454992333280.0, + "grad_norm": 1.4703686455056542, + "language_loss": 0.71012628, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.73452771, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.16040039, + "step": 4816, + "time_per_iteration": 2.8433711528778076 + }, + { + "auxiliary_loss_clip": 0.0142334, + "auxiliary_loss_mlp": 0.01041617, + "balance_loss_clip": 1.28048968, + "balance_loss_mlp": 1.02437973, + "epoch": 0.2896137081016083, + "flos": 22570352713440.0, + "grad_norm": 2.179978813606387, + "language_loss": 0.74895799, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.77360761, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.17236328, + "step": 4817, + "time_per_iteration": 2.842928171157837 + }, + { + "auxiliary_loss_clip": 0.01416466, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.27661467, + "balance_loss_mlp": 1.02781653, + "epoch": 0.28967383135427627, + "flos": 26693634088440.0, + "grad_norm": 2.0693165373270257, + "language_loss": 0.77010649, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.79472524, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17602539, + "step": 4818, + "time_per_iteration": 2.861773729324341 + }, + { + "auxiliary_loss_clip": 0.0142168, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.28249633, + "balance_loss_mlp": 1.02319384, + "epoch": 0.28973395460694423, + "flos": 15562295491320.0, + "grad_norm": 1.7207164141058262, + "language_loss": 0.79852867, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82314944, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17199707, + "step": 4819, + "time_per_iteration": 4.245227813720703 + }, + { + "auxiliary_loss_clip": 0.01429577, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.28554583, + "balance_loss_mlp": 1.02020311, + "epoch": 0.2897940778596122, + "flos": 18702094795920.0, + "grad_norm": 1.9058933110944187, + "language_loss": 0.79089075, + "learning_rate": 3.332791681244776e-06, + "loss": 0.81556833, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17993164, + "step": 4820, + "time_per_iteration": 2.8652303218841553 + }, + { + "auxiliary_loss_clip": 0.01429194, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.28795862, + "balance_loss_mlp": 1.01965916, + "epoch": 0.28985420111228016, + "flos": 18774912231360.0, + "grad_norm": 2.0316135537882962, + "language_loss": 0.73102891, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75569469, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17724609, + "step": 4821, + "time_per_iteration": 2.806096315383911 + }, + { + "auxiliary_loss_clip": 0.01415484, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.27649462, + "balance_loss_mlp": 1.02119493, + "epoch": 0.28991432436494813, + "flos": 23074267766040.0, + "grad_norm": 1.8243949652375986, + "language_loss": 0.72648013, + "learning_rate": 3.332210816371104e-06, + "loss": 0.75102091, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17407227, + "step": 4822, + "time_per_iteration": 2.8206160068511963 + }, + { + "auxiliary_loss_clip": 0.01416751, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.27952695, + "balance_loss_mlp": 1.02555037, + "epoch": 0.2899744476176161, + "flos": 17607721629600.0, + "grad_norm": 1.6269024154490042, + "language_loss": 0.66182393, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68641275, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.16589355, + "step": 4823, + "time_per_iteration": 2.7610726356506348 + }, + { + "auxiliary_loss_clip": 0.01418717, + "auxiliary_loss_mlp": 0.01041681, + "balance_loss_clip": 1.27987087, + "balance_loss_mlp": 1.02467012, + "epoch": 0.29003457087028406, + "flos": 22314435872040.0, + "grad_norm": 2.710845379840455, + "language_loss": 0.81364763, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83825159, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.17028809, + "step": 4824, + "time_per_iteration": 2.744072914123535 + }, + { + "auxiliary_loss_clip": 0.01426347, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.28532696, + "balance_loss_mlp": 1.02691138, + "epoch": 0.2900946941229521, + "flos": 21950064436320.0, + "grad_norm": 1.9323411910056691, + "language_loss": 0.7245959, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74930221, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.17358398, + "step": 4825, + "time_per_iteration": 5.943169832229614 + }, + { + "auxiliary_loss_clip": 0.01429336, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.28824103, + "balance_loss_mlp": 1.01809394, + "epoch": 0.29015481737562004, + "flos": 17936821381680.0, + "grad_norm": 2.504558163904467, + "language_loss": 0.73694026, + "learning_rate": 3.331048480501092e-06, + "loss": 0.7615965, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.18200684, + "step": 4826, + "time_per_iteration": 2.7572762966156006 + }, + { + "auxiliary_loss_clip": 0.01421724, + "auxiliary_loss_mlp": 0.0103815, + "balance_loss_clip": 1.28078496, + "balance_loss_mlp": 1.02212799, + "epoch": 0.290214940628288, + "flos": 22788642586320.0, + "grad_norm": 1.809107316770125, + "language_loss": 0.68867886, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.71327758, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.16015625, + "step": 4827, + "time_per_iteration": 2.7820920944213867 + }, + { + "auxiliary_loss_clip": 0.01424154, + "auxiliary_loss_mlp": 0.01041739, + "balance_loss_clip": 1.28402257, + "balance_loss_mlp": 1.02408361, + "epoch": 0.290275063880956, + "flos": 20010696999120.0, + "grad_norm": 3.738322227269094, + "language_loss": 0.80367243, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82833135, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.17651367, + "step": 4828, + "time_per_iteration": 2.74057936668396 + }, + { + "auxiliary_loss_clip": 0.01416385, + "auxiliary_loss_mlp": 0.01045481, + "balance_loss_clip": 1.27895188, + "balance_loss_mlp": 1.0279814, + "epoch": 0.29033518713362394, + "flos": 22058600247360.0, + "grad_norm": 1.9981971430611138, + "language_loss": 0.80367446, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82829309, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.17504883, + "step": 4829, + "time_per_iteration": 2.7827155590057373 + }, + { + "auxiliary_loss_clip": 0.01406621, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.27247, + "balance_loss_mlp": 1.02581882, + "epoch": 0.2903953103862919, + "flos": 25635629031480.0, + "grad_norm": 1.4498602240067169, + "language_loss": 0.82594514, + "learning_rate": 3.329885337055249e-06, + "loss": 0.85043108, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.16174316, + "step": 4830, + "time_per_iteration": 2.8017420768737793 + }, + { + "auxiliary_loss_clip": 0.01421305, + "auxiliary_loss_mlp": 0.01043366, + "balance_loss_clip": 1.28072143, + "balance_loss_mlp": 1.02609289, + "epoch": 0.29045543363895987, + "flos": 16950212467560.0, + "grad_norm": 2.1934920635258295, + "language_loss": 0.79141504, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81606174, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.17285156, + "step": 4831, + "time_per_iteration": 2.730246067047119 + }, + { + "auxiliary_loss_clip": 0.01409116, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.2744056, + "balance_loss_mlp": 1.02350354, + "epoch": 0.29051555689162784, + "flos": 26401064879160.0, + "grad_norm": 1.7069093764841823, + "language_loss": 0.74554181, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.770024, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.15588379, + "step": 4832, + "time_per_iteration": 2.8059372901916504 + }, + { + "auxiliary_loss_clip": 0.01401229, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.26490819, + "balance_loss_mlp": 1.01771605, + "epoch": 0.2905756801442958, + "flos": 21108603092760.0, + "grad_norm": 1.7041335716988943, + "language_loss": 0.76342255, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78776532, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.15332031, + "step": 4833, + "time_per_iteration": 2.773890733718872 + }, + { + "auxiliary_loss_clip": 0.01405102, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.26841593, + "balance_loss_mlp": 1.0192095, + "epoch": 0.29063580339696377, + "flos": 15710366863800.0, + "grad_norm": 1.7717425616122306, + "language_loss": 0.64862955, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.67302579, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.15319824, + "step": 4834, + "time_per_iteration": 2.7640674114227295 + }, + { + "auxiliary_loss_clip": 0.01397269, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.26316273, + "balance_loss_mlp": 1.02004433, + "epoch": 0.29069592664963173, + "flos": 24650441409960.0, + "grad_norm": 1.437287141654585, + "language_loss": 0.72329187, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.74761474, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.14971924, + "step": 4835, + "time_per_iteration": 2.8375470638275146 + }, + { + "auxiliary_loss_clip": 0.01401145, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.2653687, + "balance_loss_mlp": 1.01902676, + "epoch": 0.2907560499022997, + "flos": 24979297511880.0, + "grad_norm": 1.6023058179481318, + "language_loss": 0.79739249, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.82174599, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.15161133, + "step": 4836, + "time_per_iteration": 2.8144803047180176 + }, + { + "auxiliary_loss_clip": 0.01404909, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.26940119, + "balance_loss_mlp": 1.02302527, + "epoch": 0.29081617315496766, + "flos": 18661665850560.0, + "grad_norm": 1.6275598955118087, + "language_loss": 0.80893022, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83336937, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.15979004, + "step": 4837, + "time_per_iteration": 2.7518632411956787 + }, + { + "auxiliary_loss_clip": 0.01405774, + "auxiliary_loss_mlp": 0.01039097, + "balance_loss_clip": 1.26717353, + "balance_loss_mlp": 1.02361131, + "epoch": 0.2908762964076356, + "flos": 35337322239480.0, + "grad_norm": 1.947419316620985, + "language_loss": 0.67081499, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69526374, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.15484619, + "step": 4838, + "time_per_iteration": 2.9496874809265137 + }, + { + "auxiliary_loss_clip": 0.01414877, + "auxiliary_loss_mlp": 0.01042447, + "balance_loss_clip": 1.27501607, + "balance_loss_mlp": 1.02561474, + "epoch": 0.29093641966030365, + "flos": 23081577270840.0, + "grad_norm": 1.7647717196885004, + "language_loss": 0.71448117, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73905438, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.16845703, + "step": 4839, + "time_per_iteration": 2.775536298751831 + }, + { + "auxiliary_loss_clip": 0.01407725, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.26930881, + "balance_loss_mlp": 1.02708662, + "epoch": 0.2909965429129716, + "flos": 35962321086360.0, + "grad_norm": 2.562383358729651, + "language_loss": 0.7628721, + "learning_rate": 3.326973949928776e-06, + "loss": 0.78737438, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.15393066, + "step": 4840, + "time_per_iteration": 2.9249041080474854 + }, + { + "auxiliary_loss_clip": 0.01402762, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.26638985, + "balance_loss_mlp": 1.01824486, + "epoch": 0.2910566661656396, + "flos": 30885996929760.0, + "grad_norm": 2.614460566545144, + "language_loss": 0.60621917, + "learning_rate": 3.326682534279471e-06, + "loss": 0.63059211, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.16296387, + "step": 4841, + "time_per_iteration": 2.8613741397857666 + }, + { + "auxiliary_loss_clip": 0.01408036, + "auxiliary_loss_mlp": 0.01037714, + "balance_loss_clip": 1.26958251, + "balance_loss_mlp": 1.02109647, + "epoch": 0.29111678941830754, + "flos": 30016573407360.0, + "grad_norm": 1.3524865747639911, + "language_loss": 0.71644163, + "learning_rate": 3.326391068322232e-06, + "loss": 0.74089909, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.1661377, + "step": 4842, + "time_per_iteration": 2.806036949157715 + }, + { + "auxiliary_loss_clip": 0.0140565, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.27000332, + "balance_loss_mlp": 1.01630259, + "epoch": 0.2911769126709755, + "flos": 22862637664200.0, + "grad_norm": 1.8228100667474292, + "language_loss": 0.73971939, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.76408535, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.14624023, + "step": 4843, + "time_per_iteration": 2.7596585750579834 + }, + { + "auxiliary_loss_clip": 0.01405619, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.26712286, + "balance_loss_mlp": 1.0210762, + "epoch": 0.2912370359236435, + "flos": 21655221158880.0, + "grad_norm": 2.1227890221599326, + "language_loss": 0.58763123, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.61205983, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16162109, + "step": 4844, + "time_per_iteration": 2.723393440246582 + }, + { + "auxiliary_loss_clip": 0.01417123, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.27599049, + "balance_loss_mlp": 1.0203433, + "epoch": 0.29129715917631144, + "flos": 22898518473240.0, + "grad_norm": 1.775676886688476, + "language_loss": 0.86880451, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.89337146, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.19250488, + "step": 4845, + "time_per_iteration": 2.7589879035949707 + }, + { + "auxiliary_loss_clip": 0.01416688, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.27718592, + "balance_loss_mlp": 1.02444291, + "epoch": 0.2913572824289794, + "flos": 22679619474960.0, + "grad_norm": 1.6147801320218167, + "language_loss": 0.67661583, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.7012006, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.17333984, + "step": 4846, + "time_per_iteration": 2.756704807281494 + }, + { + "auxiliary_loss_clip": 0.01400992, + "auxiliary_loss_mlp": 0.01037664, + "balance_loss_clip": 1.26588428, + "balance_loss_mlp": 1.02214885, + "epoch": 0.29141740568164737, + "flos": 23111975951280.0, + "grad_norm": 1.7748988029978956, + "language_loss": 0.70355117, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.7279377, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.1552124, + "step": 4847, + "time_per_iteration": 2.7507078647613525 + }, + { + "auxiliary_loss_clip": 0.0140827, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.2705009, + "balance_loss_mlp": 1.0164361, + "epoch": 0.29147752893431533, + "flos": 23592355136280.0, + "grad_norm": 1.459634580356863, + "language_loss": 0.73980427, + "learning_rate": 3.324641216731237e-06, + "loss": 0.7642135, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.16223145, + "step": 4848, + "time_per_iteration": 2.774420738220215 + }, + { + "auxiliary_loss_clip": 0.01405551, + "auxiliary_loss_mlp": 0.01036388, + "balance_loss_clip": 1.26543903, + "balance_loss_mlp": 1.01981783, + "epoch": 0.2915376521869833, + "flos": 20596485151440.0, + "grad_norm": 2.0272770005406335, + "language_loss": 0.76862234, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79304171, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.16577148, + "step": 4849, + "time_per_iteration": 2.765441417694092 + }, + { + "auxiliary_loss_clip": 0.01412461, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.27120972, + "balance_loss_mlp": 1.0163554, + "epoch": 0.29159777543965126, + "flos": 20816115100200.0, + "grad_norm": 1.7033951024402425, + "language_loss": 0.79740614, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.82185876, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.16455078, + "step": 4850, + "time_per_iteration": 2.772773027420044 + }, + { + "auxiliary_loss_clip": 0.01395232, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.26189101, + "balance_loss_mlp": 1.02163577, + "epoch": 0.29165789869231923, + "flos": 24249620648160.0, + "grad_norm": 1.6969508723187212, + "language_loss": 0.75819582, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78253329, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.1685791, + "step": 4851, + "time_per_iteration": 2.7913169860839844 + }, + { + "auxiliary_loss_clip": 0.01407295, + "auxiliary_loss_mlp": 0.0104229, + "balance_loss_clip": 1.27223182, + "balance_loss_mlp": 1.02693558, + "epoch": 0.29171802194498725, + "flos": 28955888198640.0, + "grad_norm": 1.6381980889233252, + "language_loss": 0.77594966, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.8004455, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15356445, + "step": 4852, + "time_per_iteration": 4.210231781005859 + }, + { + "auxiliary_loss_clip": 0.01401355, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.26386809, + "balance_loss_mlp": 1.0149169, + "epoch": 0.2917781451976552, + "flos": 22602944245320.0, + "grad_norm": 1.6039579639212505, + "language_loss": 0.78419232, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80851668, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.16162109, + "step": 4853, + "time_per_iteration": 2.8116672039031982 + }, + { + "auxiliary_loss_clip": 0.01406984, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.26981091, + "balance_loss_mlp": 1.01981342, + "epoch": 0.2918382684503232, + "flos": 21578748971040.0, + "grad_norm": 2.699643495421648, + "language_loss": 0.87742734, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90185809, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16271973, + "step": 4854, + "time_per_iteration": 2.7334580421447754 + }, + { + "auxiliary_loss_clip": 0.0140594, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_clip": 1.27015162, + "balance_loss_mlp": 1.03228498, + "epoch": 0.29189839170299114, + "flos": 24358968626400.0, + "grad_norm": 1.6933630020338701, + "language_loss": 0.8616488, + "learning_rate": 3.322597437887519e-06, + "loss": 0.8862133, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.18237305, + "step": 4855, + "time_per_iteration": 2.7927732467651367 + }, + { + "auxiliary_loss_clip": 0.01270012, + "auxiliary_loss_mlp": 0.01016419, + "balance_loss_clip": 1.20696855, + "balance_loss_mlp": 1.01320076, + "epoch": 0.2919585149556591, + "flos": 71333876173320.0, + "grad_norm": 0.9654285890965912, + "language_loss": 0.60277784, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62564218, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.03222656, + "step": 4856, + "time_per_iteration": 3.3743398189544678 + }, + { + "auxiliary_loss_clip": 0.01403482, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.26754546, + "balance_loss_mlp": 1.01761961, + "epoch": 0.2920186382083271, + "flos": 15637508820000.0, + "grad_norm": 1.8281262166657766, + "language_loss": 0.68267214, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70703721, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15393066, + "step": 4857, + "time_per_iteration": 4.206814527511597 + }, + { + "auxiliary_loss_clip": 0.01406439, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.2715621, + "balance_loss_mlp": 1.02047539, + "epoch": 0.29207876146099504, + "flos": 28371886814160.0, + "grad_norm": 1.836921222029205, + "language_loss": 0.83952677, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86394697, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.15100098, + "step": 4858, + "time_per_iteration": 2.8052101135253906 + }, + { + "auxiliary_loss_clip": 0.01401819, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.26812327, + "balance_loss_mlp": 1.0220983, + "epoch": 0.292138884713663, + "flos": 21875906925000.0, + "grad_norm": 1.7013929785204658, + "language_loss": 0.77385247, + "learning_rate": 3.321428460652342e-06, + "loss": 0.7982415, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.14990234, + "step": 4859, + "time_per_iteration": 2.783846378326416 + }, + { + "auxiliary_loss_clip": 0.01412288, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.27244353, + "balance_loss_mlp": 1.02259946, + "epoch": 0.29219900796633097, + "flos": 20997021654720.0, + "grad_norm": 3.4733823071345697, + "language_loss": 0.69354451, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71805382, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.16040039, + "step": 4860, + "time_per_iteration": 2.8145339488983154 + }, + { + "auxiliary_loss_clip": 0.0139991, + "auxiliary_loss_mlp": 0.01043197, + "balance_loss_clip": 1.26778471, + "balance_loss_mlp": 1.02917182, + "epoch": 0.29225913121899894, + "flos": 35011105680960.0, + "grad_norm": 2.0298682696013035, + "language_loss": 0.75997388, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78440499, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.14007568, + "step": 4861, + "time_per_iteration": 2.884721517562866 + }, + { + "auxiliary_loss_clip": 0.01404246, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.27038026, + "balance_loss_mlp": 1.02496421, + "epoch": 0.2923192544716669, + "flos": 13519508896440.0, + "grad_norm": 1.7391481462491432, + "language_loss": 0.91757607, + "learning_rate": 3.320551201545832e-06, + "loss": 0.94200701, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.13879395, + "step": 4862, + "time_per_iteration": 2.8603568077087402 + }, + { + "auxiliary_loss_clip": 0.01405411, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.27043319, + "balance_loss_mlp": 1.01891017, + "epoch": 0.29237937772433487, + "flos": 19468220985720.0, + "grad_norm": 1.8828947694145224, + "language_loss": 0.72954679, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75393963, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.14953613, + "step": 4863, + "time_per_iteration": 2.7615442276000977 + }, + { + "auxiliary_loss_clip": 0.01398957, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_clip": 1.26779389, + "balance_loss_mlp": 1.02218795, + "epoch": 0.29243950097700283, + "flos": 20855569444920.0, + "grad_norm": 1.682285063983723, + "language_loss": 0.78249049, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80684912, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.14727783, + "step": 4864, + "time_per_iteration": 4.1867358684539795 + }, + { + "auxiliary_loss_clip": 0.01410878, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.27495265, + "balance_loss_mlp": 1.02317858, + "epoch": 0.29249962422967085, + "flos": 23589187684200.0, + "grad_norm": 1.533706504988296, + "language_loss": 0.81980723, + "learning_rate": 3.319673491760429e-06, + "loss": 0.84430647, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15869141, + "step": 4865, + "time_per_iteration": 4.284784317016602 + }, + { + "auxiliary_loss_clip": 0.01410422, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.27503669, + "balance_loss_mlp": 1.01978636, + "epoch": 0.2925597474823388, + "flos": 22278473846280.0, + "grad_norm": 2.037087818502391, + "language_loss": 0.85623878, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.88069928, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.15844727, + "step": 4866, + "time_per_iteration": 2.752650022506714 + }, + { + "auxiliary_loss_clip": 0.01408447, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.2749238, + "balance_loss_mlp": 1.01898074, + "epoch": 0.2926198707350068, + "flos": 34462375980120.0, + "grad_norm": 1.6103829725421457, + "language_loss": 0.75748223, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.78190911, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.15283203, + "step": 4867, + "time_per_iteration": 2.8422818183898926 + }, + { + "auxiliary_loss_clip": 0.01409979, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.27270508, + "balance_loss_mlp": 1.02782607, + "epoch": 0.29267999398767475, + "flos": 20709041190120.0, + "grad_norm": 1.8199454640951092, + "language_loss": 0.73292613, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75746888, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.16479492, + "step": 4868, + "time_per_iteration": 2.7519664764404297 + }, + { + "auxiliary_loss_clip": 0.01410553, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.27758145, + "balance_loss_mlp": 1.01556265, + "epoch": 0.2927401172403427, + "flos": 18373198085640.0, + "grad_norm": 1.4737109034446474, + "language_loss": 0.74940294, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.77381551, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.15148926, + "step": 4869, + "time_per_iteration": 2.722893714904785 + }, + { + "auxiliary_loss_clip": 0.01409485, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.27463722, + "balance_loss_mlp": 1.01942527, + "epoch": 0.2928002404930107, + "flos": 26109632703960.0, + "grad_norm": 1.5387770942212546, + "language_loss": 0.76653284, + "learning_rate": 3.318209641423088e-06, + "loss": 0.79097831, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.15637207, + "step": 4870, + "time_per_iteration": 2.7808263301849365 + }, + { + "auxiliary_loss_clip": 0.01423006, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.28255832, + "balance_loss_mlp": 1.02516508, + "epoch": 0.29286036374567864, + "flos": 21329735550840.0, + "grad_norm": 2.3706743684439417, + "language_loss": 0.6907053, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.71535885, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.171875, + "step": 4871, + "time_per_iteration": 2.7689735889434814 + }, + { + "auxiliary_loss_clip": 0.01410956, + "auxiliary_loss_mlp": 0.0103927, + "balance_loss_clip": 1.27582073, + "balance_loss_mlp": 1.02401161, + "epoch": 0.2929204869983466, + "flos": 29575445525280.0, + "grad_norm": 2.148089542022055, + "language_loss": 0.77955782, + "learning_rate": 3.317623751303933e-06, + "loss": 0.8040601, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.15258789, + "step": 4872, + "time_per_iteration": 2.816164970397949 + }, + { + "auxiliary_loss_clip": 0.01416663, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.27713442, + "balance_loss_mlp": 1.02268612, + "epoch": 0.2929806102510146, + "flos": 19062405395640.0, + "grad_norm": 1.8251434122650394, + "language_loss": 0.73001587, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75458825, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.17883301, + "step": 4873, + "time_per_iteration": 2.7802629470825195 + }, + { + "auxiliary_loss_clip": 0.01415475, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.27682912, + "balance_loss_mlp": 1.02281809, + "epoch": 0.29304073350368254, + "flos": 21949211660760.0, + "grad_norm": 1.9084127274843823, + "language_loss": 0.78392655, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80847603, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16650391, + "step": 4874, + "time_per_iteration": 2.7675061225891113 + }, + { + "auxiliary_loss_clip": 0.01423491, + "auxiliary_loss_mlp": 0.01038791, + "balance_loss_clip": 1.28024077, + "balance_loss_mlp": 1.02200675, + "epoch": 0.2931008567563505, + "flos": 15455099756160.0, + "grad_norm": 1.8477712278280671, + "language_loss": 0.77530122, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79992402, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.16784668, + "step": 4875, + "time_per_iteration": 2.757948398590088 + }, + { + "auxiliary_loss_clip": 0.0142271, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.28369069, + "balance_loss_mlp": 1.01863432, + "epoch": 0.29316098000901847, + "flos": 16987636394280.0, + "grad_norm": 1.5697445637570164, + "language_loss": 0.69325733, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71783078, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.16009521, + "step": 4876, + "time_per_iteration": 2.720233678817749 + }, + { + "auxiliary_loss_clip": 0.01409214, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.27316046, + "balance_loss_mlp": 1.01997101, + "epoch": 0.29322110326168643, + "flos": 16361419296600.0, + "grad_norm": 1.857380743083373, + "language_loss": 0.81851494, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84297001, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.16320801, + "step": 4877, + "time_per_iteration": 2.7401459217071533 + }, + { + "auxiliary_loss_clip": 0.01421556, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.28109252, + "balance_loss_mlp": 1.01720619, + "epoch": 0.29328122651435445, + "flos": 13994893253160.0, + "grad_norm": 2.1261535323451874, + "language_loss": 0.67933238, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70388538, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.1652832, + "step": 4878, + "time_per_iteration": 2.7058966159820557 + }, + { + "auxiliary_loss_clip": 0.01414642, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.27797389, + "balance_loss_mlp": 1.01754832, + "epoch": 0.2933413497670224, + "flos": 25270039344960.0, + "grad_norm": 2.1269772961030333, + "language_loss": 0.73904693, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.7635383, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.16937256, + "step": 4879, + "time_per_iteration": 2.7980916500091553 + }, + { + "auxiliary_loss_clip": 0.01418568, + "auxiliary_loss_mlp": 0.01045241, + "balance_loss_clip": 1.27909255, + "balance_loss_mlp": 1.02712095, + "epoch": 0.2934014730196904, + "flos": 32130065802960.0, + "grad_norm": 2.407260925053735, + "language_loss": 0.66745865, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.69209677, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.18115234, + "step": 4880, + "time_per_iteration": 2.82783842086792 + }, + { + "auxiliary_loss_clip": 0.01417251, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.27742815, + "balance_loss_mlp": 1.02099288, + "epoch": 0.29346159627235835, + "flos": 24357831592320.0, + "grad_norm": 3.300744354965877, + "language_loss": 0.71231383, + "learning_rate": 3.314984773812481e-06, + "loss": 0.73686504, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.16894531, + "step": 4881, + "time_per_iteration": 2.7638518810272217 + }, + { + "auxiliary_loss_clip": 0.01418404, + "auxiliary_loss_mlp": 0.01033791, + "balance_loss_clip": 1.27894032, + "balance_loss_mlp": 1.01661313, + "epoch": 0.2935217195250263, + "flos": 22751827785000.0, + "grad_norm": 1.54267627705778, + "language_loss": 0.83646595, + "learning_rate": 3.314691304621127e-06, + "loss": 0.8609879, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.171875, + "step": 4882, + "time_per_iteration": 2.8208425045013428 + }, + { + "auxiliary_loss_clip": 0.01423967, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.28201091, + "balance_loss_mlp": 1.0210309, + "epoch": 0.2935818427776943, + "flos": 21730556312640.0, + "grad_norm": 3.2888184482127425, + "language_loss": 0.72356719, + "learning_rate": 3.314397785576548e-06, + "loss": 0.74818873, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.17138672, + "step": 4883, + "time_per_iteration": 2.830220937728882 + }, + { + "auxiliary_loss_clip": 0.01420026, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.28206491, + "balance_loss_mlp": 1.02074444, + "epoch": 0.29364196603036224, + "flos": 23810198317200.0, + "grad_norm": 1.936373066819549, + "language_loss": 0.92417938, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94875687, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.16992188, + "step": 4884, + "time_per_iteration": 2.7712650299072266 + }, + { + "auxiliary_loss_clip": 0.01425867, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.28555608, + "balance_loss_mlp": 1.02168119, + "epoch": 0.2937020892830302, + "flos": 23474398185720.0, + "grad_norm": 2.047214484961602, + "language_loss": 0.73419958, + "learning_rate": 3.313810597972234e-06, + "loss": 0.7588383, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.16326904, + "step": 4885, + "time_per_iteration": 2.722585439682007 + }, + { + "auxiliary_loss_clip": 0.01424296, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_clip": 1.28425956, + "balance_loss_mlp": 1.03349042, + "epoch": 0.2937622125356982, + "flos": 24277176743400.0, + "grad_norm": 2.054545384315688, + "language_loss": 0.84857947, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87332088, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.16357422, + "step": 4886, + "time_per_iteration": 2.7547719478607178 + }, + { + "auxiliary_loss_clip": 0.01423805, + "auxiliary_loss_mlp": 0.01041438, + "balance_loss_clip": 1.28145313, + "balance_loss_mlp": 1.02520192, + "epoch": 0.29382233578836614, + "flos": 20666663043480.0, + "grad_norm": 2.0617512884570934, + "language_loss": 0.77225792, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79691035, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.16247559, + "step": 4887, + "time_per_iteration": 2.715632200241089 + }, + { + "auxiliary_loss_clip": 0.01425789, + "auxiliary_loss_mlp": 0.01047575, + "balance_loss_clip": 1.28373706, + "balance_loss_mlp": 1.03176761, + "epoch": 0.2938824590410341, + "flos": 16549432314120.0, + "grad_norm": 2.1257264217801235, + "language_loss": 0.80568242, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.83041608, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.15820312, + "step": 4888, + "time_per_iteration": 2.758131504058838 + }, + { + "auxiliary_loss_clip": 0.01425823, + "auxiliary_loss_mlp": 0.01044929, + "balance_loss_clip": 1.28666949, + "balance_loss_mlp": 1.0284183, + "epoch": 0.29394258229370207, + "flos": 37932615112680.0, + "grad_norm": 1.3479933051282844, + "language_loss": 0.55487752, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57958502, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.16503906, + "step": 4889, + "time_per_iteration": 2.905629873275757 + }, + { + "auxiliary_loss_clip": 0.01430744, + "auxiliary_loss_mlp": 0.01046177, + "balance_loss_clip": 1.28865528, + "balance_loss_mlp": 1.02902317, + "epoch": 0.29400270554637004, + "flos": 20049136134840.0, + "grad_norm": 1.7039405682706215, + "language_loss": 0.84415591, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86892515, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.17150879, + "step": 4890, + "time_per_iteration": 2.714733362197876 + }, + { + "auxiliary_loss_clip": 0.01433372, + "auxiliary_loss_mlp": 0.010617, + "balance_loss_clip": 1.28922129, + "balance_loss_mlp": 1.04433179, + "epoch": 0.294062828799038, + "flos": 15270376015800.0, + "grad_norm": 1.6665883628553206, + "language_loss": 0.73181802, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.7567687, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.17346191, + "step": 4891, + "time_per_iteration": 4.105615615844727 + }, + { + "auxiliary_loss_clip": 0.01430924, + "auxiliary_loss_mlp": 0.0104518, + "balance_loss_clip": 1.28827429, + "balance_loss_mlp": 1.02787101, + "epoch": 0.294122952051706, + "flos": 22752274476960.0, + "grad_norm": 1.499099892097033, + "language_loss": 0.77496135, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79972243, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.1730957, + "step": 4892, + "time_per_iteration": 2.758260726928711 + }, + { + "auxiliary_loss_clip": 0.01427503, + "auxiliary_loss_mlp": 0.01049643, + "balance_loss_clip": 1.28716469, + "balance_loss_mlp": 1.03313255, + "epoch": 0.294183075304374, + "flos": 24978322911240.0, + "grad_norm": 1.7107382583207562, + "language_loss": 0.78550792, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.81027937, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.16503906, + "step": 4893, + "time_per_iteration": 2.831173896789551 + }, + { + "auxiliary_loss_clip": 0.0143191, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_clip": 1.29205513, + "balance_loss_mlp": 1.03292835, + "epoch": 0.29424319855704195, + "flos": 30958733148480.0, + "grad_norm": 1.7560865863761101, + "language_loss": 0.85244769, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87726128, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.1652832, + "step": 4894, + "time_per_iteration": 2.8369622230529785 + }, + { + "auxiliary_loss_clip": 0.01434148, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_clip": 1.29171741, + "balance_loss_mlp": 1.03104365, + "epoch": 0.2943033218097099, + "flos": 15235794674280.0, + "grad_norm": 3.418547387453117, + "language_loss": 0.91126513, + "learning_rate": 3.310871672543274e-06, + "loss": 0.93607521, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.15808105, + "step": 4895, + "time_per_iteration": 2.7687337398529053 + }, + { + "auxiliary_loss_clip": 0.01437698, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_clip": 1.29296076, + "balance_loss_mlp": 1.02463889, + "epoch": 0.2943634450623779, + "flos": 21730921787880.0, + "grad_norm": 1.8058124979761152, + "language_loss": 0.87097454, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.89578021, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.18249512, + "step": 4896, + "time_per_iteration": 4.2278594970703125 + }, + { + "auxiliary_loss_clip": 0.01445295, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.3012991, + "balance_loss_mlp": 1.03226817, + "epoch": 0.29442356831504585, + "flos": 22607329948200.0, + "grad_norm": 2.08089909087957, + "language_loss": 0.73886657, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.76383615, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.19396973, + "step": 4897, + "time_per_iteration": 2.904527425765991 + }, + { + "auxiliary_loss_clip": 0.01440879, + "auxiliary_loss_mlp": 0.01048599, + "balance_loss_clip": 1.29248726, + "balance_loss_mlp": 1.02969229, + "epoch": 0.2944836915677138, + "flos": 20016260344440.0, + "grad_norm": 1.8469682113252115, + "language_loss": 0.74789476, + "learning_rate": 3.309989025093813e-06, + "loss": 0.77278948, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.18884277, + "step": 4898, + "time_per_iteration": 2.771789312362671 + }, + { + "auxiliary_loss_clip": 0.01446553, + "auxiliary_loss_mlp": 0.01042904, + "balance_loss_clip": 1.29938293, + "balance_loss_mlp": 1.02272236, + "epoch": 0.2945438148203818, + "flos": 20050476210720.0, + "grad_norm": 3.3781020458066653, + "language_loss": 0.69774336, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72263789, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.20178223, + "step": 4899, + "time_per_iteration": 2.8287389278411865 + }, + { + "auxiliary_loss_clip": 0.01434276, + "auxiliary_loss_mlp": 0.01044522, + "balance_loss_clip": 1.29352093, + "balance_loss_mlp": 1.02577043, + "epoch": 0.29460393807304974, + "flos": 23738964607800.0, + "grad_norm": 3.9925257432887773, + "language_loss": 0.79075062, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.81553864, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.1875, + "step": 4900, + "time_per_iteration": 2.7675909996032715 + }, + { + "auxiliary_loss_clip": 0.01429227, + "auxiliary_loss_mlp": 0.01043257, + "balance_loss_clip": 1.28727424, + "balance_loss_mlp": 1.02667546, + "epoch": 0.2946640613257177, + "flos": 14980243308120.0, + "grad_norm": 1.7106681956363652, + "language_loss": 0.80409092, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82881576, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.16577148, + "step": 4901, + "time_per_iteration": 2.9021565914154053 + }, + { + "auxiliary_loss_clip": 0.01419277, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.28377843, + "balance_loss_mlp": 1.02008665, + "epoch": 0.2947241845783857, + "flos": 24249336389640.0, + "grad_norm": 1.8927189721017634, + "language_loss": 0.583588, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60813546, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.15374756, + "step": 4902, + "time_per_iteration": 2.7737784385681152 + }, + { + "auxiliary_loss_clip": 0.0143274, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.29203129, + "balance_loss_mlp": 1.02463245, + "epoch": 0.29478430783105364, + "flos": 19943239867200.0, + "grad_norm": 1.5932627330753397, + "language_loss": 0.76113737, + "learning_rate": 3.308516952661925e-06, + "loss": 0.78586972, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.15856934, + "step": 4903, + "time_per_iteration": 4.345589876174927 + }, + { + "auxiliary_loss_clip": 0.01440692, + "auxiliary_loss_mlp": 0.0104923, + "balance_loss_clip": 1.29864454, + "balance_loss_mlp": 1.03131258, + "epoch": 0.2948444310837216, + "flos": 27387145884600.0, + "grad_norm": 2.0222495791912527, + "language_loss": 0.62351155, + "learning_rate": 3.3082223892736e-06, + "loss": 0.6484108, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.17919922, + "step": 4904, + "time_per_iteration": 4.203859567642212 + }, + { + "auxiliary_loss_clip": 0.01436378, + "auxiliary_loss_mlp": 0.0104473, + "balance_loss_clip": 1.29264331, + "balance_loss_mlp": 1.02800488, + "epoch": 0.2949045543363896, + "flos": 23410880064720.0, + "grad_norm": 1.8101874504090627, + "language_loss": 0.73433757, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75914866, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.1673584, + "step": 4905, + "time_per_iteration": 2.773529052734375 + }, + { + "auxiliary_loss_clip": 0.01431971, + "auxiliary_loss_mlp": 0.0104514, + "balance_loss_clip": 1.29179168, + "balance_loss_mlp": 1.02804565, + "epoch": 0.2949646775890576, + "flos": 23956807788720.0, + "grad_norm": 1.7124501017111193, + "language_loss": 0.81774223, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.84251332, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17089844, + "step": 4906, + "time_per_iteration": 2.741297960281372 + }, + { + "auxiliary_loss_clip": 0.01432416, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.29400182, + "balance_loss_mlp": 1.02241397, + "epoch": 0.29502480084172555, + "flos": 22789535970240.0, + "grad_norm": 1.7703364758164246, + "language_loss": 0.87704146, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.90175343, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16381836, + "step": 4907, + "time_per_iteration": 2.7684013843536377 + }, + { + "auxiliary_loss_clip": 0.01443389, + "auxiliary_loss_mlp": 0.01043558, + "balance_loss_clip": 1.29981303, + "balance_loss_mlp": 1.02597487, + "epoch": 0.2950849240943935, + "flos": 19651482825120.0, + "grad_norm": 2.1479602748297366, + "language_loss": 0.81728017, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84214967, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.17578125, + "step": 4908, + "time_per_iteration": 2.722964286804199 + }, + { + "auxiliary_loss_clip": 0.01259926, + "auxiliary_loss_mlp": 0.0101235, + "balance_loss_clip": 1.19748068, + "balance_loss_mlp": 1.00886869, + "epoch": 0.2951450473470615, + "flos": 71017063773120.0, + "grad_norm": 0.8063379454630826, + "language_loss": 0.5740118, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59673452, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03491211, + "step": 4909, + "time_per_iteration": 3.121354818344116 + }, + { + "auxiliary_loss_clip": 0.0144048, + "auxiliary_loss_mlp": 0.01046791, + "balance_loss_clip": 1.30119956, + "balance_loss_mlp": 1.03057897, + "epoch": 0.29520517059972945, + "flos": 22971498342120.0, + "grad_norm": 3.1771480541091948, + "language_loss": 0.86616337, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.89103615, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.16210938, + "step": 4910, + "time_per_iteration": 2.795793056488037 + }, + { + "auxiliary_loss_clip": 0.01427674, + "auxiliary_loss_mlp": 0.01041454, + "balance_loss_clip": 1.29044497, + "balance_loss_mlp": 1.02501547, + "epoch": 0.2952652938523974, + "flos": 20490588883800.0, + "grad_norm": 1.6835066034062294, + "language_loss": 0.73138052, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75607181, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16442871, + "step": 4911, + "time_per_iteration": 2.7542247772216797 + }, + { + "auxiliary_loss_clip": 0.01439106, + "auxiliary_loss_mlp": 0.01041427, + "balance_loss_clip": 1.30187953, + "balance_loss_mlp": 1.02464187, + "epoch": 0.2953254171050654, + "flos": 19651401608400.0, + "grad_norm": 1.6429322901879932, + "language_loss": 0.90392268, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.9287281, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16784668, + "step": 4912, + "time_per_iteration": 2.788439989089966 + }, + { + "auxiliary_loss_clip": 0.01439198, + "auxiliary_loss_mlp": 0.01049433, + "balance_loss_clip": 1.29984951, + "balance_loss_mlp": 1.03223157, + "epoch": 0.29538554035773334, + "flos": 22753208469240.0, + "grad_norm": 1.4001189723233536, + "language_loss": 0.83606684, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.86095315, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.17199707, + "step": 4913, + "time_per_iteration": 2.7726738452911377 + }, + { + "auxiliary_loss_clip": 0.01434155, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.29449248, + "balance_loss_mlp": 1.0261662, + "epoch": 0.2954456636104013, + "flos": 21876922134000.0, + "grad_norm": 1.717687619371457, + "language_loss": 0.77612811, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.80089414, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16271973, + "step": 4914, + "time_per_iteration": 2.8148889541625977 + }, + { + "auxiliary_loss_clip": 0.01433838, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.29571962, + "balance_loss_mlp": 1.01810145, + "epoch": 0.2955057868630693, + "flos": 40449973897080.0, + "grad_norm": 1.5772557991446259, + "language_loss": 0.81804758, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.84274328, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17626953, + "step": 4915, + "time_per_iteration": 2.9140143394470215 + }, + { + "auxiliary_loss_clip": 0.0144466, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.30269015, + "balance_loss_mlp": 1.02070832, + "epoch": 0.29556591011573724, + "flos": 22569581154600.0, + "grad_norm": 1.7943598718001907, + "language_loss": 0.84818697, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.87301278, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.17211914, + "step": 4916, + "time_per_iteration": 2.7779674530029297 + }, + { + "auxiliary_loss_clip": 0.01430343, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.29166675, + "balance_loss_mlp": 1.01673555, + "epoch": 0.2956260333684052, + "flos": 22094196797880.0, + "grad_norm": 2.434910710429871, + "language_loss": 0.69948131, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.72410679, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.15490723, + "step": 4917, + "time_per_iteration": 2.7341511249542236 + }, + { + "auxiliary_loss_clip": 0.01443945, + "auxiliary_loss_mlp": 0.01042338, + "balance_loss_clip": 1.30347633, + "balance_loss_mlp": 1.02508891, + "epoch": 0.2956861566210732, + "flos": 16439840685720.0, + "grad_norm": 2.8061562253218164, + "language_loss": 0.91063827, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.9355011, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.17260742, + "step": 4918, + "time_per_iteration": 2.801708459854126 + }, + { + "auxiliary_loss_clip": 0.01436996, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_clip": 1.29597962, + "balance_loss_mlp": 1.02491784, + "epoch": 0.2957462798737412, + "flos": 25817632011720.0, + "grad_norm": 1.9712079138822716, + "language_loss": 0.72934437, + "learning_rate": 3.303797991757425e-06, + "loss": 0.75413334, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.16992188, + "step": 4919, + "time_per_iteration": 2.814145088195801 + }, + { + "auxiliary_loss_clip": 0.01429033, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.29173946, + "balance_loss_mlp": 1.02737021, + "epoch": 0.29580640312640916, + "flos": 16695229618440.0, + "grad_norm": 1.6867844082659373, + "language_loss": 0.76643872, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.79116523, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16247559, + "step": 4920, + "time_per_iteration": 2.733018636703491 + }, + { + "auxiliary_loss_clip": 0.01438864, + "auxiliary_loss_mlp": 0.01054294, + "balance_loss_clip": 1.29776597, + "balance_loss_mlp": 1.03712761, + "epoch": 0.2958665263790771, + "flos": 23950148017680.0, + "grad_norm": 2.0088136007547375, + "language_loss": 0.69110084, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71603239, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17175293, + "step": 4921, + "time_per_iteration": 2.8322129249572754 + }, + { + "auxiliary_loss_clip": 0.0144321, + "auxiliary_loss_mlp": 0.01046742, + "balance_loss_clip": 1.3002373, + "balance_loss_mlp": 1.02877736, + "epoch": 0.2959266496317451, + "flos": 18482992755840.0, + "grad_norm": 1.772712499800358, + "language_loss": 0.75302523, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.77792478, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.17944336, + "step": 4922, + "time_per_iteration": 2.7501680850982666 + }, + { + "auxiliary_loss_clip": 0.01451372, + "auxiliary_loss_mlp": 0.01048824, + "balance_loss_clip": 1.30756164, + "balance_loss_mlp": 1.03094268, + "epoch": 0.29598677288441305, + "flos": 25962982624080.0, + "grad_norm": 1.6255967676155174, + "language_loss": 0.76954132, + "learning_rate": 3.302616272134737e-06, + "loss": 0.79454327, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.17883301, + "step": 4923, + "time_per_iteration": 2.80087947845459 + }, + { + "auxiliary_loss_clip": 0.01440335, + "auxiliary_loss_mlp": 0.01042286, + "balance_loss_clip": 1.30162406, + "balance_loss_mlp": 1.02547753, + "epoch": 0.296046896137081, + "flos": 25161544142280.0, + "grad_norm": 1.7107096581432435, + "language_loss": 0.86483222, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88965839, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.16796875, + "step": 4924, + "time_per_iteration": 2.784088373184204 + }, + { + "auxiliary_loss_clip": 0.0143349, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.29561913, + "balance_loss_mlp": 1.02435601, + "epoch": 0.296107019389749, + "flos": 21766031038080.0, + "grad_norm": 1.4542952110783882, + "language_loss": 0.82186902, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84661424, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.16662598, + "step": 4925, + "time_per_iteration": 2.784946918487549 + }, + { + "auxiliary_loss_clip": 0.01437276, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.30005443, + "balance_loss_mlp": 1.02505338, + "epoch": 0.29616714264241695, + "flos": 17963565309720.0, + "grad_norm": 4.356764975918296, + "language_loss": 0.85672939, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88151276, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16027832, + "step": 4926, + "time_per_iteration": 2.6860783100128174 + }, + { + "auxiliary_loss_clip": 0.01443778, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_clip": 1.30278623, + "balance_loss_mlp": 1.02503324, + "epoch": 0.2962272658950849, + "flos": 15016976892720.0, + "grad_norm": 2.2915601253725795, + "language_loss": 0.86398411, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88883674, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.16455078, + "step": 4927, + "time_per_iteration": 2.717036724090576 + }, + { + "auxiliary_loss_clip": 0.01438452, + "auxiliary_loss_mlp": 0.01051887, + "balance_loss_clip": 1.30157614, + "balance_loss_mlp": 1.03565097, + "epoch": 0.2962873891477529, + "flos": 14725016808840.0, + "grad_norm": 1.778239697617132, + "language_loss": 0.80866373, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83356714, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.16235352, + "step": 4928, + "time_per_iteration": 2.7126970291137695 + }, + { + "auxiliary_loss_clip": 0.01455302, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.30959773, + "balance_loss_mlp": 1.02496815, + "epoch": 0.29634751240042084, + "flos": 26729393072400.0, + "grad_norm": 2.392923333097889, + "language_loss": 0.72791296, + "learning_rate": 3.300842211064773e-06, + "loss": 0.75291544, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.1998291, + "step": 4929, + "time_per_iteration": 4.1613311767578125 + }, + { + "auxiliary_loss_clip": 0.01443428, + "auxiliary_loss_mlp": 0.01042428, + "balance_loss_clip": 1.30151415, + "balance_loss_mlp": 1.02473712, + "epoch": 0.2964076356530888, + "flos": 14574955626720.0, + "grad_norm": 2.4442044172558353, + "language_loss": 0.72475779, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74961632, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.17663574, + "step": 4930, + "time_per_iteration": 2.7481331825256348 + }, + { + "auxiliary_loss_clip": 0.01256748, + "auxiliary_loss_mlp": 0.0101115, + "balance_loss_clip": 1.19533014, + "balance_loss_mlp": 1.00764525, + "epoch": 0.29646775890575683, + "flos": 63119986171920.0, + "grad_norm": 0.8165757749667564, + "language_loss": 0.60714877, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62982774, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.03515625, + "step": 4931, + "time_per_iteration": 3.1831934452056885 + }, + { + "auxiliary_loss_clip": 0.01254473, + "auxiliary_loss_mlp": 0.01008544, + "balance_loss_clip": 1.19194436, + "balance_loss_mlp": 1.00492048, + "epoch": 0.2965278821584248, + "flos": 63083333804040.0, + "grad_norm": 0.7457196972984569, + "language_loss": 0.52432925, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54695946, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03613281, + "step": 4932, + "time_per_iteration": 3.0813004970550537 + }, + { + "auxiliary_loss_clip": 0.01441097, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_clip": 1.30394173, + "balance_loss_mlp": 1.02676463, + "epoch": 0.29658800541109276, + "flos": 23773992641280.0, + "grad_norm": 1.5582891606929437, + "language_loss": 0.81015605, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83499086, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15612793, + "step": 4933, + "time_per_iteration": 2.7804148197174072 + }, + { + "auxiliary_loss_clip": 0.01428495, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.29521513, + "balance_loss_mlp": 1.02326357, + "epoch": 0.2966481286637607, + "flos": 23993947456920.0, + "grad_norm": 1.9205404537069692, + "language_loss": 0.75899863, + "learning_rate": 3.299362470215261e-06, + "loss": 0.78367257, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15637207, + "step": 4934, + "time_per_iteration": 2.8006184101104736 + }, + { + "auxiliary_loss_clip": 0.01439157, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.29846931, + "balance_loss_mlp": 1.03315091, + "epoch": 0.2967082519164287, + "flos": 17169598766160.0, + "grad_norm": 2.2173778301084432, + "language_loss": 0.62728798, + "learning_rate": 3.299066374184594e-06, + "loss": 0.65218139, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.17028809, + "step": 4935, + "time_per_iteration": 4.198742151260376 + }, + { + "auxiliary_loss_clip": 0.01441671, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.30663276, + "balance_loss_mlp": 1.02279162, + "epoch": 0.29676837516909665, + "flos": 29393726803560.0, + "grad_norm": 1.3987601826295537, + "language_loss": 0.79783881, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82265496, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.17138672, + "step": 4936, + "time_per_iteration": 2.84190034866333 + }, + { + "auxiliary_loss_clip": 0.01456952, + "auxiliary_loss_mlp": 0.0104648, + "balance_loss_clip": 1.31724739, + "balance_loss_mlp": 1.02933788, + "epoch": 0.2968284984217646, + "flos": 34757950208040.0, + "grad_norm": 1.5906441969522012, + "language_loss": 0.74507308, + "learning_rate": 3.298474034352309e-06, + "loss": 0.77010745, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.17132568, + "step": 4937, + "time_per_iteration": 2.8499834537506104 + }, + { + "auxiliary_loss_clip": 0.01440783, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.30418324, + "balance_loss_mlp": 1.02548695, + "epoch": 0.2968886216744326, + "flos": 21549203066160.0, + "grad_norm": 1.6053370907415172, + "language_loss": 0.78399384, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.8088178, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.16113281, + "step": 4938, + "time_per_iteration": 2.8020336627960205 + }, + { + "auxiliary_loss_clip": 0.01451788, + "auxiliary_loss_mlp": 0.01046439, + "balance_loss_clip": 1.31129527, + "balance_loss_mlp": 1.02934384, + "epoch": 0.29694874492710055, + "flos": 12791253325320.0, + "grad_norm": 1.8014460692306895, + "language_loss": 0.77348047, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79846275, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17089844, + "step": 4939, + "time_per_iteration": 2.72835111618042 + }, + { + "auxiliary_loss_clip": 0.01448511, + "auxiliary_loss_mlp": 0.01044157, + "balance_loss_clip": 1.30722058, + "balance_loss_mlp": 1.0275507, + "epoch": 0.2970088681797685, + "flos": 24575146864560.0, + "grad_norm": 1.6723570862889103, + "language_loss": 0.78617269, + "learning_rate": 3.297585155344979e-06, + "loss": 0.81109935, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.1661377, + "step": 4940, + "time_per_iteration": 2.805445432662964 + }, + { + "auxiliary_loss_clip": 0.01450719, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.30953026, + "balance_loss_mlp": 1.02670813, + "epoch": 0.2970689914324365, + "flos": 23664401012880.0, + "grad_norm": 1.7991588426265541, + "language_loss": 0.75730264, + "learning_rate": 3.297288763918435e-06, + "loss": 0.78226107, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.1842041, + "step": 4941, + "time_per_iteration": 4.39372706413269 + }, + { + "auxiliary_loss_clip": 0.01455831, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.31235468, + "balance_loss_mlp": 1.02710438, + "epoch": 0.29712911468510445, + "flos": 39676822461000.0, + "grad_norm": 2.2060306844221578, + "language_loss": 0.7474798, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.77249128, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.18200684, + "step": 4942, + "time_per_iteration": 4.334664583206177 + }, + { + "auxiliary_loss_clip": 0.01455806, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.31221068, + "balance_loss_mlp": 1.03101075, + "epoch": 0.2971892379377724, + "flos": 26400861837360.0, + "grad_norm": 1.739535533940642, + "language_loss": 0.70681137, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.73186266, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.1829834, + "step": 4943, + "time_per_iteration": 2.76595139503479 + }, + { + "auxiliary_loss_clip": 0.01448216, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.30767918, + "balance_loss_mlp": 1.02076316, + "epoch": 0.2972493611904404, + "flos": 17607721629600.0, + "grad_norm": 1.9725578444388938, + "language_loss": 0.80240339, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82726508, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17199707, + "step": 4944, + "time_per_iteration": 2.844712018966675 + }, + { + "auxiliary_loss_clip": 0.01436216, + "auxiliary_loss_mlp": 0.01040804, + "balance_loss_clip": 1.30050635, + "balance_loss_mlp": 1.02561629, + "epoch": 0.2973094844431084, + "flos": 20417690231640.0, + "grad_norm": 1.940091318358835, + "language_loss": 0.83497095, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85974121, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15179443, + "step": 4945, + "time_per_iteration": 2.7538206577301025 + }, + { + "auxiliary_loss_clip": 0.01442682, + "auxiliary_loss_mlp": 0.01040115, + "balance_loss_clip": 1.30742598, + "balance_loss_mlp": 1.0241884, + "epoch": 0.29736960769577636, + "flos": 17497967567760.0, + "grad_norm": 1.6918596633654097, + "language_loss": 0.67365849, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69848645, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.15942383, + "step": 4946, + "time_per_iteration": 2.78476619720459 + }, + { + "auxiliary_loss_clip": 0.01444555, + "auxiliary_loss_mlp": 0.01036247, + "balance_loss_clip": 1.30510759, + "balance_loss_mlp": 1.0201304, + "epoch": 0.2974297309484443, + "flos": 26109551487240.0, + "grad_norm": 1.6761887792451657, + "language_loss": 0.74349952, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.76830757, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.16113281, + "step": 4947, + "time_per_iteration": 2.794288158416748 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01039151, + "balance_loss_clip": 1.31492066, + "balance_loss_mlp": 1.0220927, + "epoch": 0.2974898542011123, + "flos": 25671997140840.0, + "grad_norm": 2.1132552717121125, + "language_loss": 0.73228896, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.7572242, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17053223, + "step": 4948, + "time_per_iteration": 2.797638416290283 + }, + { + "auxiliary_loss_clip": 0.01435256, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.29934168, + "balance_loss_mlp": 1.01801658, + "epoch": 0.29754997745378026, + "flos": 18666010945080.0, + "grad_norm": 2.06924412571517, + "language_loss": 0.83688766, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86158025, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.15991211, + "step": 4949, + "time_per_iteration": 2.7772226333618164 + }, + { + "auxiliary_loss_clip": 0.01440199, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.30381012, + "balance_loss_mlp": 1.02116024, + "epoch": 0.2976101007064482, + "flos": 22279895138880.0, + "grad_norm": 1.7631281455812962, + "language_loss": 0.71267694, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73746324, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.17272949, + "step": 4950, + "time_per_iteration": 2.7484211921691895 + }, + { + "auxiliary_loss_clip": 0.01427927, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.29627895, + "balance_loss_mlp": 1.02409959, + "epoch": 0.2976702239591162, + "flos": 21950835995160.0, + "grad_norm": 2.08793175676135, + "language_loss": 0.83167124, + "learning_rate": 3.294322145875789e-06, + "loss": 0.85635257, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.16088867, + "step": 4951, + "time_per_iteration": 2.773580312728882 + }, + { + "auxiliary_loss_clip": 0.01439136, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.30091131, + "balance_loss_mlp": 1.01899123, + "epoch": 0.29773034721178415, + "flos": 24641629395840.0, + "grad_norm": 3.003952391400952, + "language_loss": 0.74016118, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76491463, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17211914, + "step": 4952, + "time_per_iteration": 2.763831377029419 + }, + { + "auxiliary_loss_clip": 0.01437715, + "auxiliary_loss_mlp": 0.01042691, + "balance_loss_clip": 1.30130374, + "balance_loss_mlp": 1.02480936, + "epoch": 0.2977904704644521, + "flos": 20562350501880.0, + "grad_norm": 1.7007122239574095, + "language_loss": 0.84678912, + "learning_rate": 3.293728232937228e-06, + "loss": 0.87159318, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.17883301, + "step": 4953, + "time_per_iteration": 2.78828501701355 + }, + { + "auxiliary_loss_clip": 0.01443663, + "auxiliary_loss_mlp": 0.01038352, + "balance_loss_clip": 1.30550122, + "balance_loss_mlp": 1.02186537, + "epoch": 0.2978505937171201, + "flos": 18921115619280.0, + "grad_norm": 2.6652838315883964, + "language_loss": 0.74051929, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76533943, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16473389, + "step": 4954, + "time_per_iteration": 2.7228784561157227 + }, + { + "auxiliary_loss_clip": 0.01438564, + "auxiliary_loss_mlp": 0.01036595, + "balance_loss_clip": 1.30096149, + "balance_loss_mlp": 1.02090728, + "epoch": 0.29791071696978805, + "flos": 19322464289760.0, + "grad_norm": 2.4170654136192704, + "language_loss": 0.75294518, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77769673, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.15686035, + "step": 4955, + "time_per_iteration": 2.8197243213653564 + }, + { + "auxiliary_loss_clip": 0.01446524, + "auxiliary_loss_mlp": 0.01042239, + "balance_loss_clip": 1.30570579, + "balance_loss_mlp": 1.02580023, + "epoch": 0.297970840222456, + "flos": 18811442774160.0, + "grad_norm": 1.5386020125986863, + "language_loss": 0.72589207, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.75077969, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.16442871, + "step": 4956, + "time_per_iteration": 2.738044500350952 + }, + { + "auxiliary_loss_clip": 0.01448969, + "auxiliary_loss_mlp": 0.01043452, + "balance_loss_clip": 1.30755138, + "balance_loss_mlp": 1.02629757, + "epoch": 0.298030963475124, + "flos": 22857399185760.0, + "grad_norm": 1.8278369817108941, + "language_loss": 0.7934866, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81841087, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.17163086, + "step": 4957, + "time_per_iteration": 2.8087522983551025 + }, + { + "auxiliary_loss_clip": 0.01435123, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.2964381, + "balance_loss_mlp": 1.0242815, + "epoch": 0.298091086727792, + "flos": 21873104948160.0, + "grad_norm": 1.6317950808526693, + "language_loss": 0.70721054, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.73198795, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.18347168, + "step": 4958, + "time_per_iteration": 2.786945104598999 + }, + { + "auxiliary_loss_clip": 0.01436736, + "auxiliary_loss_mlp": 0.01042237, + "balance_loss_clip": 1.3007164, + "balance_loss_mlp": 1.02573872, + "epoch": 0.29815120998045996, + "flos": 21179511935280.0, + "grad_norm": 1.6923438771393846, + "language_loss": 0.79056853, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81535828, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.16491699, + "step": 4959, + "time_per_iteration": 2.749438524246216 + }, + { + "auxiliary_loss_clip": 0.01436085, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_clip": 1.29929173, + "balance_loss_mlp": 1.02529716, + "epoch": 0.29821133323312793, + "flos": 19900374420240.0, + "grad_norm": 1.6759586980715029, + "language_loss": 0.79763472, + "learning_rate": 3.291647992907147e-06, + "loss": 0.82241285, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.16442871, + "step": 4960, + "time_per_iteration": 2.834400177001953 + }, + { + "auxiliary_loss_clip": 0.01442121, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.30078912, + "balance_loss_mlp": 1.02598119, + "epoch": 0.2982714564857959, + "flos": 12754682174160.0, + "grad_norm": 2.49815311575095, + "language_loss": 0.74583316, + "learning_rate": 3.291350619752129e-06, + "loss": 0.77070111, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.18676758, + "step": 4961, + "time_per_iteration": 2.799333333969116 + }, + { + "auxiliary_loss_clip": 0.01435336, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.29689658, + "balance_loss_mlp": 1.0230099, + "epoch": 0.29833157973846386, + "flos": 22276849511880.0, + "grad_norm": 1.7929291404898045, + "language_loss": 0.62472469, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64946592, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.15771484, + "step": 4962, + "time_per_iteration": 2.7929980754852295 + }, + { + "auxiliary_loss_clip": 0.01438479, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_clip": 1.29995036, + "balance_loss_mlp": 1.02972758, + "epoch": 0.2983917029911318, + "flos": 15376637758680.0, + "grad_norm": 1.5941454847782677, + "language_loss": 0.83601451, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.86087322, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.17675781, + "step": 4963, + "time_per_iteration": 2.698910713195801 + }, + { + "auxiliary_loss_clip": 0.01431178, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.2957505, + "balance_loss_mlp": 1.01993465, + "epoch": 0.2984518262437998, + "flos": 15381876237120.0, + "grad_norm": 2.293223612884718, + "language_loss": 0.67153692, + "learning_rate": 3.290458206523322e-06, + "loss": 0.69621623, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.16821289, + "step": 4964, + "time_per_iteration": 2.7457127571105957 + }, + { + "auxiliary_loss_clip": 0.01430372, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.29583538, + "balance_loss_mlp": 1.02048564, + "epoch": 0.29851194949646775, + "flos": 18112327024320.0, + "grad_norm": 2.0439860448928946, + "language_loss": 0.71887356, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.74354076, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.15881348, + "step": 4965, + "time_per_iteration": 2.7247426509857178 + }, + { + "auxiliary_loss_clip": 0.01446669, + "auxiliary_loss_mlp": 0.01044937, + "balance_loss_clip": 1.30563772, + "balance_loss_mlp": 1.02743673, + "epoch": 0.2985720727491357, + "flos": 22023369172080.0, + "grad_norm": 1.6423829928590707, + "language_loss": 0.66395617, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68887222, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17492676, + "step": 4966, + "time_per_iteration": 2.773181200027466 + }, + { + "auxiliary_loss_clip": 0.01440243, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.30068648, + "balance_loss_mlp": 1.02431917, + "epoch": 0.2986321960018037, + "flos": 13045139748720.0, + "grad_norm": 2.467771678673201, + "language_loss": 0.74592572, + "learning_rate": 3.289565352885785e-06, + "loss": 0.77073264, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.16125488, + "step": 4967, + "time_per_iteration": 2.7618086338043213 + }, + { + "auxiliary_loss_clip": 0.01436401, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.29760373, + "balance_loss_mlp": 1.020118, + "epoch": 0.29869231925447165, + "flos": 14469099967440.0, + "grad_norm": 1.903852276808341, + "language_loss": 0.71414858, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73887432, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16064453, + "step": 4968, + "time_per_iteration": 4.100582838058472 + }, + { + "auxiliary_loss_clip": 0.01444083, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.30447674, + "balance_loss_mlp": 1.01788461, + "epoch": 0.2987524425071396, + "flos": 31656590039160.0, + "grad_norm": 1.7480184199413842, + "language_loss": 0.76963866, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79442334, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16491699, + "step": 4969, + "time_per_iteration": 2.8345353603363037 + }, + { + "auxiliary_loss_clip": 0.01438518, + "auxiliary_loss_mlp": 0.01038417, + "balance_loss_clip": 1.30097389, + "balance_loss_mlp": 1.02258635, + "epoch": 0.2988125657598076, + "flos": 21438352578600.0, + "grad_norm": 1.7376935215130218, + "language_loss": 0.70080751, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72557688, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.1583252, + "step": 4970, + "time_per_iteration": 2.7837631702423096 + }, + { + "auxiliary_loss_clip": 0.0144916, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.30553317, + "balance_loss_mlp": 1.02336287, + "epoch": 0.2988726890124756, + "flos": 18081481651920.0, + "grad_norm": 2.3189233754706757, + "language_loss": 0.85134786, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87625903, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.18603516, + "step": 4971, + "time_per_iteration": 2.9565701484680176 + }, + { + "auxiliary_loss_clip": 0.01429519, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.2957685, + "balance_loss_mlp": 1.02408564, + "epoch": 0.29893281226514357, + "flos": 21759817959000.0, + "grad_norm": 1.8332725088168536, + "language_loss": 0.79628944, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.82099497, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.16955566, + "step": 4972, + "time_per_iteration": 2.8263344764709473 + }, + { + "auxiliary_loss_clip": 0.01436126, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.29834342, + "balance_loss_mlp": 1.02740049, + "epoch": 0.29899293551781153, + "flos": 16841108139480.0, + "grad_norm": 2.3145861695845547, + "language_loss": 0.8575179, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.882321, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.16772461, + "step": 4973, + "time_per_iteration": 4.15091609954834 + }, + { + "auxiliary_loss_clip": 0.01425327, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.2942884, + "balance_loss_mlp": 1.0206213, + "epoch": 0.2990530587704795, + "flos": 11733004618200.0, + "grad_norm": 1.9548373644448263, + "language_loss": 0.77731168, + "learning_rate": 3.287480316742863e-06, + "loss": 0.8019278, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15673828, + "step": 4974, + "time_per_iteration": 2.758843421936035 + }, + { + "auxiliary_loss_clip": 0.01440199, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.3031776, + "balance_loss_mlp": 1.01901531, + "epoch": 0.29911318202314746, + "flos": 28045995122520.0, + "grad_norm": 1.6241412107050792, + "language_loss": 0.72619694, + "learning_rate": 3.287182259060815e-06, + "loss": 0.75095451, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16540527, + "step": 4975, + "time_per_iteration": 2.8653371334075928 + }, + { + "auxiliary_loss_clip": 0.01432952, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.29570735, + "balance_loss_mlp": 1.02258825, + "epoch": 0.2991733052758154, + "flos": 18738138038400.0, + "grad_norm": 2.4416922730997594, + "language_loss": 0.76781964, + "learning_rate": 3.286884152568687e-06, + "loss": 0.79254413, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16906738, + "step": 4976, + "time_per_iteration": 2.7883682250976562 + }, + { + "auxiliary_loss_clip": 0.01429195, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.29460764, + "balance_loss_mlp": 1.01994801, + "epoch": 0.2992334285284834, + "flos": 15563310700320.0, + "grad_norm": 2.115810167530046, + "language_loss": 0.86137772, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88603449, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.16534424, + "step": 4977, + "time_per_iteration": 2.744595527648926 + }, + { + "auxiliary_loss_clip": 0.01437939, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.30117726, + "balance_loss_mlp": 1.0190928, + "epoch": 0.29929355178115136, + "flos": 21802439755800.0, + "grad_norm": 1.7792748137477064, + "language_loss": 0.68752205, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.71226287, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.17053223, + "step": 4978, + "time_per_iteration": 2.7643468379974365 + }, + { + "auxiliary_loss_clip": 0.01436374, + "auxiliary_loss_mlp": 0.0103832, + "balance_loss_clip": 1.30073166, + "balance_loss_mlp": 1.02194071, + "epoch": 0.2993536750338193, + "flos": 21183491554560.0, + "grad_norm": 2.109201152658339, + "language_loss": 0.76750731, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.79225427, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.16369629, + "step": 4979, + "time_per_iteration": 2.7845330238342285 + }, + { + "auxiliary_loss_clip": 0.01439208, + "auxiliary_loss_mlp": 0.01037486, + "balance_loss_clip": 1.30065775, + "balance_loss_mlp": 1.0202961, + "epoch": 0.2994137982864873, + "flos": 32128238426760.0, + "grad_norm": 1.540524931296359, + "language_loss": 0.6866715, + "learning_rate": 3.285691238725484e-06, + "loss": 0.71143842, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.171875, + "step": 4980, + "time_per_iteration": 4.322749376296997 + }, + { + "auxiliary_loss_clip": 0.01435069, + "auxiliary_loss_mlp": 0.01040455, + "balance_loss_clip": 1.30113125, + "balance_loss_mlp": 1.02458811, + "epoch": 0.29947392153915525, + "flos": 21110349252240.0, + "grad_norm": 1.7571737563473977, + "language_loss": 0.73580408, + "learning_rate": 3.285392888352555e-06, + "loss": 0.76055926, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.15856934, + "step": 4981, + "time_per_iteration": 4.248949766159058 + }, + { + "auxiliary_loss_clip": 0.01442056, + "auxiliary_loss_mlp": 0.0104608, + "balance_loss_clip": 1.29923892, + "balance_loss_mlp": 1.02937841, + "epoch": 0.2995340447918232, + "flos": 21547538123400.0, + "grad_norm": 1.4291759334792737, + "language_loss": 0.86606425, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.89094555, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.16711426, + "step": 4982, + "time_per_iteration": 2.8492374420166016 + }, + { + "auxiliary_loss_clip": 0.01449938, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.30801272, + "balance_loss_mlp": 1.02162719, + "epoch": 0.2995941680444912, + "flos": 16729242442920.0, + "grad_norm": 2.5551778528364535, + "language_loss": 0.86494029, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88983524, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.17932129, + "step": 4983, + "time_per_iteration": 2.79910945892334 + }, + { + "auxiliary_loss_clip": 0.01435942, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.29770756, + "balance_loss_mlp": 1.02695203, + "epoch": 0.2996542912971592, + "flos": 20928468097080.0, + "grad_norm": 2.0413995344294125, + "language_loss": 0.78700733, + "learning_rate": 3.284497544825668e-06, + "loss": 0.8117981, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.1619873, + "step": 4984, + "time_per_iteration": 2.7340309619903564 + }, + { + "auxiliary_loss_clip": 0.01443655, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.30593848, + "balance_loss_mlp": 1.02234626, + "epoch": 0.29971441454982717, + "flos": 25084990737720.0, + "grad_norm": 1.5529360118974935, + "language_loss": 0.7868982, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.81172562, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.1673584, + "step": 4985, + "time_per_iteration": 2.847583532333374 + }, + { + "auxiliary_loss_clip": 0.01455409, + "auxiliary_loss_mlp": 0.010441, + "balance_loss_clip": 1.31319439, + "balance_loss_mlp": 1.02526474, + "epoch": 0.29977453780249513, + "flos": 52564916389680.0, + "grad_norm": 1.9773995246271028, + "language_loss": 0.7161597, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74115479, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.18823242, + "step": 4986, + "time_per_iteration": 3.066758394241333 + }, + { + "auxiliary_loss_clip": 0.01443671, + "auxiliary_loss_mlp": 0.01045577, + "balance_loss_clip": 1.30289447, + "balance_loss_mlp": 1.02813685, + "epoch": 0.2998346610551631, + "flos": 22242308778720.0, + "grad_norm": 1.693886421167088, + "language_loss": 0.73667228, + "learning_rate": 3.283601762924312e-06, + "loss": 0.76156473, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17431641, + "step": 4987, + "time_per_iteration": 2.759546995162964 + }, + { + "auxiliary_loss_clip": 0.01434546, + "auxiliary_loss_mlp": 0.01043499, + "balance_loss_clip": 1.29758859, + "balance_loss_mlp": 1.02746499, + "epoch": 0.29989478430783106, + "flos": 16877598073920.0, + "grad_norm": 1.5897400011734217, + "language_loss": 0.8102082, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.83498871, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16040039, + "step": 4988, + "time_per_iteration": 2.7313199043273926 + }, + { + "auxiliary_loss_clip": 0.01429991, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.29395461, + "balance_loss_mlp": 1.02579486, + "epoch": 0.29995490756049903, + "flos": 23774155074720.0, + "grad_norm": 1.8446129939983293, + "language_loss": 0.70920122, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.73391628, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.15716553, + "step": 4989, + "time_per_iteration": 2.8133296966552734 + }, + { + "auxiliary_loss_clip": 0.01444275, + "auxiliary_loss_mlp": 0.01047806, + "balance_loss_clip": 1.30437469, + "balance_loss_mlp": 1.02979374, + "epoch": 0.300015030813167, + "flos": 14469424834320.0, + "grad_norm": 2.455170019682246, + "language_loss": 0.8593992, + "learning_rate": 3.282705542954199e-06, + "loss": 0.88431996, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.18005371, + "step": 4990, + "time_per_iteration": 2.728543758392334 + }, + { + "auxiliary_loss_clip": 0.01444778, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.30254328, + "balance_loss_mlp": 1.02267933, + "epoch": 0.30007515406583496, + "flos": 25197465559680.0, + "grad_norm": 2.207390510851919, + "language_loss": 0.67306322, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69790757, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.16992188, + "step": 4991, + "time_per_iteration": 2.75852632522583 + }, + { + "auxiliary_loss_clip": 0.01451537, + "auxiliary_loss_mlp": 0.0104007, + "balance_loss_clip": 1.30729997, + "balance_loss_mlp": 1.02185464, + "epoch": 0.3001352773185029, + "flos": 19395972067320.0, + "grad_norm": 2.111736992225188, + "language_loss": 0.79307246, + "learning_rate": 3.28210781975363e-06, + "loss": 0.81798851, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.18212891, + "step": 4992, + "time_per_iteration": 2.7492518424987793 + }, + { + "auxiliary_loss_clip": 0.01438551, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.30110037, + "balance_loss_mlp": 1.02242887, + "epoch": 0.3001954005711709, + "flos": 21548918807640.0, + "grad_norm": 2.008932250244927, + "language_loss": 0.83339196, + "learning_rate": 3.281808885221193e-06, + "loss": 0.8581655, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.16369629, + "step": 4993, + "time_per_iteration": 2.7408833503723145 + }, + { + "auxiliary_loss_clip": 0.01447572, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.30513597, + "balance_loss_mlp": 1.02447617, + "epoch": 0.30025552382383885, + "flos": 17388822631320.0, + "grad_norm": 2.0289637874646433, + "language_loss": 0.86633146, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.89124405, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1920166, + "step": 4994, + "time_per_iteration": 2.751553535461426 + }, + { + "auxiliary_loss_clip": 0.01438326, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.30025697, + "balance_loss_mlp": 1.01669741, + "epoch": 0.3003156470765068, + "flos": 29539321066080.0, + "grad_norm": 1.471409985937456, + "language_loss": 0.81245649, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83716792, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16125488, + "step": 4995, + "time_per_iteration": 2.798720121383667 + }, + { + "auxiliary_loss_clip": 0.01436465, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_clip": 1.30122173, + "balance_loss_mlp": 1.02283716, + "epoch": 0.3003757703291748, + "flos": 43653331931040.0, + "grad_norm": 1.5685914823905183, + "language_loss": 0.67123294, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.6959902, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16424561, + "step": 4996, + "time_per_iteration": 2.9512126445770264 + }, + { + "auxiliary_loss_clip": 0.0144019, + "auxiliary_loss_mlp": 0.01042875, + "balance_loss_clip": 1.30254054, + "balance_loss_mlp": 1.02655578, + "epoch": 0.30043589358184275, + "flos": 22533497303760.0, + "grad_norm": 1.659713111692959, + "language_loss": 0.75755227, + "learning_rate": 3.280612661141615e-06, + "loss": 0.78238285, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.16320801, + "step": 4997, + "time_per_iteration": 2.798142194747925 + }, + { + "auxiliary_loss_clip": 0.01431014, + "auxiliary_loss_mlp": 0.01042921, + "balance_loss_clip": 1.29549825, + "balance_loss_mlp": 1.02719772, + "epoch": 0.30049601683451077, + "flos": 21000798232200.0, + "grad_norm": 1.8242484030583124, + "language_loss": 0.78192437, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.80666375, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.15722656, + "step": 4998, + "time_per_iteration": 2.743135929107666 + }, + { + "auxiliary_loss_clip": 0.01425814, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.29354334, + "balance_loss_mlp": 1.01893997, + "epoch": 0.30055614008717874, + "flos": 23921454888360.0, + "grad_norm": 7.280685316051745, + "language_loss": 0.7309916, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.7555908, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.15155029, + "step": 4999, + "time_per_iteration": 2.8062832355499268 + }, + { + "auxiliary_loss_clip": 0.0143918, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.30176473, + "balance_loss_mlp": 1.02104306, + "epoch": 0.3006162633398467, + "flos": 19174230483840.0, + "grad_norm": 18.774700950991843, + "language_loss": 0.75751615, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78227818, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15966797, + "step": 5000, + "time_per_iteration": 2.7373087406158447 + }, + { + "auxiliary_loss_clip": 0.01433649, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_clip": 1.29921794, + "balance_loss_mlp": 1.02719569, + "epoch": 0.30067638659251467, + "flos": 14682719878920.0, + "grad_norm": 1.790680455686266, + "language_loss": 0.822083, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84684819, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.15655518, + "step": 5001, + "time_per_iteration": 2.7512171268463135 + }, + { + "auxiliary_loss_clip": 0.01434866, + "auxiliary_loss_mlp": 0.01043956, + "balance_loss_clip": 1.2984066, + "balance_loss_mlp": 1.02725518, + "epoch": 0.30073650984518263, + "flos": 23373293704560.0, + "grad_norm": 1.8352257702584391, + "language_loss": 0.81063306, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.83542132, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.16711426, + "step": 5002, + "time_per_iteration": 2.7464053630828857 + }, + { + "auxiliary_loss_clip": 0.01448442, + "auxiliary_loss_mlp": 0.01041986, + "balance_loss_clip": 1.30692565, + "balance_loss_mlp": 1.02574956, + "epoch": 0.3007966330978506, + "flos": 22971863817360.0, + "grad_norm": 1.890105204144586, + "language_loss": 0.7099129, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73481721, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.16247559, + "step": 5003, + "time_per_iteration": 2.763704299926758 + }, + { + "auxiliary_loss_clip": 0.01445884, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.30558348, + "balance_loss_mlp": 1.02323484, + "epoch": 0.30085675635051856, + "flos": 27824212930680.0, + "grad_norm": 1.8597990899911905, + "language_loss": 0.70983613, + "learning_rate": 3.27851739984233e-06, + "loss": 0.73468804, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.1607666, + "step": 5004, + "time_per_iteration": 2.7919511795043945 + }, + { + "auxiliary_loss_clip": 0.01439267, + "auxiliary_loss_mlp": 0.01043661, + "balance_loss_clip": 1.30093884, + "balance_loss_mlp": 1.02718639, + "epoch": 0.3009168796031865, + "flos": 10885452020640.0, + "grad_norm": 2.7286914571141105, + "language_loss": 0.81817687, + "learning_rate": 3.278217882782715e-06, + "loss": 0.84300613, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.16479492, + "step": 5005, + "time_per_iteration": 2.7353386878967285 + }, + { + "auxiliary_loss_clip": 0.01431041, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.29596925, + "balance_loss_mlp": 1.02004862, + "epoch": 0.3009770028558545, + "flos": 23810685617520.0, + "grad_norm": 2.7168917406156075, + "language_loss": 0.74687541, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77154058, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.15405273, + "step": 5006, + "time_per_iteration": 4.18185830116272 + }, + { + "auxiliary_loss_clip": 0.01426338, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.29212391, + "balance_loss_mlp": 1.03136623, + "epoch": 0.30103712610852246, + "flos": 26473760489520.0, + "grad_norm": 1.8315869511608467, + "language_loss": 0.71920991, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.74394232, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.15539551, + "step": 5007, + "time_per_iteration": 2.770350694656372 + }, + { + "auxiliary_loss_clip": 0.01430762, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.29534197, + "balance_loss_mlp": 1.01874089, + "epoch": 0.3010972493611904, + "flos": 22861581846840.0, + "grad_norm": 2.007877104500302, + "language_loss": 0.76754647, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.7922045, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.16296387, + "step": 5008, + "time_per_iteration": 2.860513687133789 + }, + { + "auxiliary_loss_clip": 0.01429036, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.292418, + "balance_loss_mlp": 1.0184989, + "epoch": 0.3011573726138584, + "flos": 24058074703320.0, + "grad_norm": 2.6122288441556205, + "language_loss": 0.84913731, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87376988, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.15722656, + "step": 5009, + "time_per_iteration": 2.7703354358673096 + }, + { + "auxiliary_loss_clip": 0.01440182, + "auxiliary_loss_mlp": 0.01037498, + "balance_loss_clip": 1.29885173, + "balance_loss_mlp": 1.02014101, + "epoch": 0.30121749586652635, + "flos": 20263649430240.0, + "grad_norm": 2.361156652217899, + "language_loss": 0.84027493, + "learning_rate": 3.276719570659604e-06, + "loss": 0.86505175, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.17358398, + "step": 5010, + "time_per_iteration": 2.7352397441864014 + }, + { + "auxiliary_loss_clip": 0.01426205, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.29004264, + "balance_loss_mlp": 1.01938021, + "epoch": 0.3012776191191944, + "flos": 26948738762640.0, + "grad_norm": 2.188961684706966, + "language_loss": 0.85305452, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87765831, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.14807129, + "step": 5011, + "time_per_iteration": 2.8017470836639404 + }, + { + "auxiliary_loss_clip": 0.01433359, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.29412031, + "balance_loss_mlp": 1.01908255, + "epoch": 0.30133774237186234, + "flos": 20417162322960.0, + "grad_norm": 2.1635401584141802, + "language_loss": 0.72446269, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74916506, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.17810059, + "step": 5012, + "time_per_iteration": 4.193013668060303 + }, + { + "auxiliary_loss_clip": 0.01429863, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.29218054, + "balance_loss_mlp": 1.02014828, + "epoch": 0.3013978656245303, + "flos": 19797280129440.0, + "grad_norm": 2.1683548474930077, + "language_loss": 0.87717235, + "learning_rate": 3.275820002334819e-06, + "loss": 0.90183812, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.16564941, + "step": 5013, + "time_per_iteration": 2.7457144260406494 + }, + { + "auxiliary_loss_clip": 0.01440544, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.29936028, + "balance_loss_mlp": 1.01781201, + "epoch": 0.30145798887719827, + "flos": 16253533219320.0, + "grad_norm": 1.868790916851364, + "language_loss": 0.83898997, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.86374176, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.16833496, + "step": 5014, + "time_per_iteration": 2.721764326095581 + }, + { + "auxiliary_loss_clip": 0.01424871, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.2919569, + "balance_loss_mlp": 1.0187664, + "epoch": 0.30151811212986623, + "flos": 24577136674200.0, + "grad_norm": 1.562013693390544, + "language_loss": 0.68412048, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70871162, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.15478516, + "step": 5015, + "time_per_iteration": 2.801924467086792 + }, + { + "auxiliary_loss_clip": 0.01431284, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.29583728, + "balance_loss_mlp": 1.0214777, + "epoch": 0.3015782353825342, + "flos": 21877165784160.0, + "grad_norm": 2.0556926279457692, + "language_loss": 0.75375122, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.77844965, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.17089844, + "step": 5016, + "time_per_iteration": 2.7750089168548584 + }, + { + "auxiliary_loss_clip": 0.01439344, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.30040491, + "balance_loss_mlp": 1.01796079, + "epoch": 0.30163835863520216, + "flos": 28775509552800.0, + "grad_norm": 1.6714807711186532, + "language_loss": 0.65922761, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.68396306, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16235352, + "step": 5017, + "time_per_iteration": 2.8307812213897705 + }, + { + "auxiliary_loss_clip": 0.01433309, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.29581058, + "balance_loss_mlp": 1.02722716, + "epoch": 0.30169848188787013, + "flos": 22971376517040.0, + "grad_norm": 2.3089447875064133, + "language_loss": 0.69347346, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71823043, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15155029, + "step": 5018, + "time_per_iteration": 2.827800989151001 + }, + { + "auxiliary_loss_clip": 0.01423408, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.29142952, + "balance_loss_mlp": 1.02440548, + "epoch": 0.3017586051405381, + "flos": 21840310374480.0, + "grad_norm": 2.0687673229778563, + "language_loss": 0.79393363, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81855965, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.14782715, + "step": 5019, + "time_per_iteration": 4.329486131668091 + }, + { + "auxiliary_loss_clip": 0.0143529, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.29922438, + "balance_loss_mlp": 1.0217514, + "epoch": 0.30181872839320606, + "flos": 22165065032040.0, + "grad_norm": 1.7586683119217483, + "language_loss": 0.69884324, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72356868, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.15490723, + "step": 5020, + "time_per_iteration": 2.7496516704559326 + }, + { + "auxiliary_loss_clip": 0.01440209, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.30254865, + "balance_loss_mlp": 1.02038026, + "epoch": 0.301878851645874, + "flos": 18118783753560.0, + "grad_norm": 1.841415770217213, + "language_loss": 0.78761309, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.81237537, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.15637207, + "step": 5021, + "time_per_iteration": 2.874457597732544 + }, + { + "auxiliary_loss_clip": 0.01432416, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.29646564, + "balance_loss_mlp": 1.01613772, + "epoch": 0.301938974898542, + "flos": 17606625203880.0, + "grad_norm": 2.360296255481602, + "language_loss": 0.76670444, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.7913456, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.15563965, + "step": 5022, + "time_per_iteration": 2.7155818939208984 + }, + { + "auxiliary_loss_clip": 0.01437129, + "auxiliary_loss_mlp": 0.01038081, + "balance_loss_clip": 1.29914641, + "balance_loss_mlp": 1.02183247, + "epoch": 0.30199909815120995, + "flos": 11184843434400.0, + "grad_norm": 1.967965064954327, + "language_loss": 0.70107144, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72582358, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16235352, + "step": 5023, + "time_per_iteration": 2.7608916759490967 + }, + { + "auxiliary_loss_clip": 0.01441749, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.30147099, + "balance_loss_mlp": 1.02193058, + "epoch": 0.302059221403878, + "flos": 21912843551400.0, + "grad_norm": 1.911351248656561, + "language_loss": 0.71877646, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.74356931, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.15600586, + "step": 5024, + "time_per_iteration": 2.739166259765625 + }, + { + "auxiliary_loss_clip": 0.0143116, + "auxiliary_loss_mlp": 0.01040157, + "balance_loss_clip": 1.29927981, + "balance_loss_mlp": 1.02418351, + "epoch": 0.30211934465654594, + "flos": 26402080088160.0, + "grad_norm": 1.6888619891476122, + "language_loss": 0.74284756, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76756072, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.15979004, + "step": 5025, + "time_per_iteration": 2.782447099685669 + }, + { + "auxiliary_loss_clip": 0.01420841, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.29035306, + "balance_loss_mlp": 1.02292395, + "epoch": 0.3021794679092139, + "flos": 23405235502680.0, + "grad_norm": 1.8822429865108246, + "language_loss": 0.67131203, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69589555, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.14587402, + "step": 5026, + "time_per_iteration": 2.771139144897461 + }, + { + "auxiliary_loss_clip": 0.01434347, + "auxiliary_loss_mlp": 0.01047076, + "balance_loss_clip": 1.29921985, + "balance_loss_mlp": 1.03069663, + "epoch": 0.30223959116188187, + "flos": 20265111331200.0, + "grad_norm": 1.7414826650759574, + "language_loss": 0.85366929, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87848353, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.16381836, + "step": 5027, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.01425749, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.29264092, + "balance_loss_mlp": 1.02480209, + "epoch": 0.30229971441454984, + "flos": 26693512263360.0, + "grad_norm": 1.5823098234533879, + "language_loss": 0.78810579, + "learning_rate": 3.271315635661351e-06, + "loss": 0.81275964, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.14831543, + "step": 5028, + "time_per_iteration": 2.8123016357421875 + }, + { + "auxiliary_loss_clip": 0.01434822, + "auxiliary_loss_mlp": 0.01038271, + "balance_loss_clip": 1.29927218, + "balance_loss_mlp": 1.02232099, + "epoch": 0.3023598376672178, + "flos": 34351403667480.0, + "grad_norm": 1.7065498731886992, + "language_loss": 0.77168453, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79641545, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.15966797, + "step": 5029, + "time_per_iteration": 2.8470382690429688 + }, + { + "auxiliary_loss_clip": 0.01437439, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.29976952, + "balance_loss_mlp": 1.02080023, + "epoch": 0.30241996091988577, + "flos": 23117336254800.0, + "grad_norm": 2.0161574993479032, + "language_loss": 0.82307756, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84782577, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.16571045, + "step": 5030, + "time_per_iteration": 2.7802014350891113 + }, + { + "auxiliary_loss_clip": 0.01437921, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.29985285, + "balance_loss_mlp": 1.02242839, + "epoch": 0.30248008417255373, + "flos": 19394631991440.0, + "grad_norm": 1.6099184512050355, + "language_loss": 0.69688714, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72166157, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17077637, + "step": 5031, + "time_per_iteration": 2.7597076892852783 + }, + { + "auxiliary_loss_clip": 0.01433255, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.29742146, + "balance_loss_mlp": 1.01931417, + "epoch": 0.3025402074252217, + "flos": 23775089067000.0, + "grad_norm": 1.7085698942907932, + "language_loss": 0.82577491, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.85045969, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.15917969, + "step": 5032, + "time_per_iteration": 2.896113872528076 + }, + { + "auxiliary_loss_clip": 0.01446627, + "auxiliary_loss_mlp": 0.01038282, + "balance_loss_clip": 1.30726171, + "balance_loss_mlp": 1.02067471, + "epoch": 0.30260033067788966, + "flos": 25999634991960.0, + "grad_norm": 2.7061754433446965, + "language_loss": 0.74171001, + "learning_rate": 3.269811767783906e-06, + "loss": 0.76655912, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.17614746, + "step": 5033, + "time_per_iteration": 2.8318943977355957 + }, + { + "auxiliary_loss_clip": 0.01426866, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.29293942, + "balance_loss_mlp": 1.0211755, + "epoch": 0.3026604539305576, + "flos": 25380077665320.0, + "grad_norm": 1.4944664462135546, + "language_loss": 0.74169469, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76633978, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.16455078, + "step": 5034, + "time_per_iteration": 2.91487717628479 + }, + { + "auxiliary_loss_clip": 0.01437397, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.30057752, + "balance_loss_mlp": 1.01960993, + "epoch": 0.3027205771832256, + "flos": 25818362962200.0, + "grad_norm": 1.926614146189277, + "language_loss": 0.72545123, + "learning_rate": 3.269209883493352e-06, + "loss": 0.75018698, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16564941, + "step": 5035, + "time_per_iteration": 2.8306896686553955 + }, + { + "auxiliary_loss_clip": 0.01426494, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.29430211, + "balance_loss_mlp": 1.01908231, + "epoch": 0.30278070043589356, + "flos": 27350290474920.0, + "grad_norm": 2.138712005263426, + "language_loss": 0.87857765, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.90317929, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.14569092, + "step": 5036, + "time_per_iteration": 2.810774087905884 + }, + { + "auxiliary_loss_clip": 0.01427743, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_clip": 1.29480958, + "balance_loss_mlp": 1.03037786, + "epoch": 0.3028408236885616, + "flos": 24791081452560.0, + "grad_norm": 2.0014802256582387, + "language_loss": 0.77392942, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79868096, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.17028809, + "step": 5037, + "time_per_iteration": 2.81673002243042 + }, + { + "auxiliary_loss_clip": 0.01437769, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.30047119, + "balance_loss_mlp": 1.02184212, + "epoch": 0.30290094694122954, + "flos": 12936319679160.0, + "grad_norm": 2.095805621952191, + "language_loss": 0.78071672, + "learning_rate": 3.268306696121816e-06, + "loss": 0.80547744, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16479492, + "step": 5038, + "time_per_iteration": 2.7148213386535645 + }, + { + "auxiliary_loss_clip": 0.01429032, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.2974968, + "balance_loss_mlp": 1.02272558, + "epoch": 0.3029610701938975, + "flos": 25921335427920.0, + "grad_norm": 1.7242729864031938, + "language_loss": 0.74288201, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76754922, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.14953613, + "step": 5039, + "time_per_iteration": 2.820241689682007 + }, + { + "auxiliary_loss_clip": 0.01426872, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.2947638, + "balance_loss_mlp": 1.02754927, + "epoch": 0.3030211934465655, + "flos": 21986026462080.0, + "grad_norm": 2.017784696835705, + "language_loss": 0.80106854, + "learning_rate": 3.267704330716847e-06, + "loss": 0.825755, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.14227295, + "step": 5040, + "time_per_iteration": 2.785717725753784 + }, + { + "auxiliary_loss_clip": 0.01426522, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.29406166, + "balance_loss_mlp": 1.02655768, + "epoch": 0.30308131669923344, + "flos": 20996371920960.0, + "grad_norm": 1.6013191761329255, + "language_loss": 0.82015777, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8448354, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.14678955, + "step": 5041, + "time_per_iteration": 2.78910756111145 + }, + { + "auxiliary_loss_clip": 0.01257519, + "auxiliary_loss_mlp": 0.01006379, + "balance_loss_clip": 1.20088589, + "balance_loss_mlp": 1.00325537, + "epoch": 0.3031414399519014, + "flos": 60564025818360.0, + "grad_norm": 0.7630220965130637, + "language_loss": 0.59465486, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61729383, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.03112793, + "step": 5042, + "time_per_iteration": 3.3454298973083496 + }, + { + "auxiliary_loss_clip": 0.01436008, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.29880977, + "balance_loss_mlp": 1.01873934, + "epoch": 0.30320156320456937, + "flos": 21912640509600.0, + "grad_norm": 2.9506428034717596, + "language_loss": 0.71917319, + "learning_rate": 3.266800422101892e-06, + "loss": 0.74387872, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15808105, + "step": 5043, + "time_per_iteration": 2.7738037109375 + }, + { + "auxiliary_loss_clip": 0.01431301, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.29593408, + "balance_loss_mlp": 1.01771629, + "epoch": 0.30326168645723733, + "flos": 21657779485560.0, + "grad_norm": 2.2258557102680694, + "language_loss": 0.70073014, + "learning_rate": 3.266499023140606e-06, + "loss": 0.72537005, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1496582, + "step": 5044, + "time_per_iteration": 2.791954517364502 + }, + { + "auxiliary_loss_clip": 0.01427065, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.2946943, + "balance_loss_mlp": 1.02207756, + "epoch": 0.3033218097099053, + "flos": 21876272400240.0, + "grad_norm": 1.342794972801814, + "language_loss": 0.77683389, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.80148017, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.15478516, + "step": 5045, + "time_per_iteration": 4.182086229324341 + }, + { + "auxiliary_loss_clip": 0.01434842, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.29975843, + "balance_loss_mlp": 1.02099383, + "epoch": 0.30338193296257326, + "flos": 27095510667600.0, + "grad_norm": 1.6981502306193133, + "language_loss": 0.7247498, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74947321, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.16516113, + "step": 5046, + "time_per_iteration": 2.814934492111206 + }, + { + "auxiliary_loss_clip": 0.01433702, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.29587674, + "balance_loss_mlp": 1.02529478, + "epoch": 0.30344205621524123, + "flos": 19538804961360.0, + "grad_norm": 1.7597701037925955, + "language_loss": 0.80780381, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83257407, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.18041992, + "step": 5047, + "time_per_iteration": 2.7792294025421143 + }, + { + "auxiliary_loss_clip": 0.01428424, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.29462004, + "balance_loss_mlp": 1.02444673, + "epoch": 0.3035021794679092, + "flos": 23915769717960.0, + "grad_norm": 1.6674325415083622, + "language_loss": 0.72092867, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74560905, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.15161133, + "step": 5048, + "time_per_iteration": 2.779492139816284 + }, + { + "auxiliary_loss_clip": 0.01424192, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.28918779, + "balance_loss_mlp": 1.01770151, + "epoch": 0.30356230272057716, + "flos": 16147921210200.0, + "grad_norm": 1.9099925218117382, + "language_loss": 0.75526547, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77983147, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.14703369, + "step": 5049, + "time_per_iteration": 2.7383594512939453 + }, + { + "auxiliary_loss_clip": 0.01429111, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.2916944, + "balance_loss_mlp": 1.02284086, + "epoch": 0.3036224259732452, + "flos": 28920819556800.0, + "grad_norm": 1.6482374561860529, + "language_loss": 0.8224563, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84713686, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16113281, + "step": 5050, + "time_per_iteration": 4.324928045272827 + }, + { + "auxiliary_loss_clip": 0.01424805, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.28982043, + "balance_loss_mlp": 1.01921403, + "epoch": 0.30368254922591315, + "flos": 21110552294040.0, + "grad_norm": 1.8581407398498349, + "language_loss": 0.73956406, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.76417208, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.16772461, + "step": 5051, + "time_per_iteration": 2.7204749584198 + }, + { + "auxiliary_loss_clip": 0.01429081, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.29325211, + "balance_loss_mlp": 1.01741362, + "epoch": 0.3037426724785811, + "flos": 23007419759520.0, + "grad_norm": 1.837710343798118, + "language_loss": 0.76647663, + "learning_rate": 3.264086103483033e-06, + "loss": 0.79110146, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.15991211, + "step": 5052, + "time_per_iteration": 2.7710819244384766 + }, + { + "auxiliary_loss_clip": 0.01433665, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.2954576, + "balance_loss_mlp": 1.02028799, + "epoch": 0.3038027957312491, + "flos": 15636940302960.0, + "grad_norm": 1.7182397435687702, + "language_loss": 0.82568467, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.8503899, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.16564941, + "step": 5053, + "time_per_iteration": 2.713216543197632 + }, + { + "auxiliary_loss_clip": 0.01427434, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.2921015, + "balance_loss_mlp": 1.01971638, + "epoch": 0.30386291898391704, + "flos": 12717867372840.0, + "grad_norm": 1.5012745327929446, + "language_loss": 0.71167928, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73630929, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.15869141, + "step": 5054, + "time_per_iteration": 2.7423787117004395 + }, + { + "auxiliary_loss_clip": 0.01424011, + "auxiliary_loss_mlp": 0.01037598, + "balance_loss_clip": 1.28838086, + "balance_loss_mlp": 1.02119529, + "epoch": 0.303923042236585, + "flos": 26365062245040.0, + "grad_norm": 2.6117897389883264, + "language_loss": 0.69762468, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.72224081, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1640625, + "step": 5055, + "time_per_iteration": 2.7799768447875977 + }, + { + "auxiliary_loss_clip": 0.01432499, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.29574347, + "balance_loss_mlp": 1.01869106, + "epoch": 0.30398316548925297, + "flos": 19724016002040.0, + "grad_norm": 1.8866737822580921, + "language_loss": 0.68213922, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.70681769, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.16650391, + "step": 5056, + "time_per_iteration": 2.7454779148101807 + }, + { + "auxiliary_loss_clip": 0.01425907, + "auxiliary_loss_mlp": 0.01037355, + "balance_loss_clip": 1.29306519, + "balance_loss_mlp": 1.02185822, + "epoch": 0.30404328874192094, + "flos": 24244747644960.0, + "grad_norm": 1.663149118808972, + "language_loss": 0.82285702, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84748966, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.15490723, + "step": 5057, + "time_per_iteration": 2.778663158416748 + }, + { + "auxiliary_loss_clip": 0.01421229, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.28697634, + "balance_loss_mlp": 1.01995349, + "epoch": 0.3041034119945889, + "flos": 24504359847120.0, + "grad_norm": 1.6878090301475503, + "language_loss": 0.89470786, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91928691, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1673584, + "step": 5058, + "time_per_iteration": 4.342810153961182 + }, + { + "auxiliary_loss_clip": 0.0143751, + "auxiliary_loss_mlp": 0.01041224, + "balance_loss_clip": 1.30028152, + "balance_loss_mlp": 1.02426028, + "epoch": 0.30416353524725687, + "flos": 28294196375520.0, + "grad_norm": 1.973996555223667, + "language_loss": 0.71768022, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.74246752, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.16955566, + "step": 5059, + "time_per_iteration": 4.316181659698486 + }, + { + "auxiliary_loss_clip": 0.01430417, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.2952373, + "balance_loss_mlp": 1.02255058, + "epoch": 0.30422365849992483, + "flos": 23665781697120.0, + "grad_norm": 2.2401124345538865, + "language_loss": 0.72930193, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75398695, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.15527344, + "step": 5060, + "time_per_iteration": 2.858564853668213 + }, + { + "auxiliary_loss_clip": 0.01432384, + "auxiliary_loss_mlp": 0.01041183, + "balance_loss_clip": 1.29695511, + "balance_loss_mlp": 1.02529252, + "epoch": 0.3042837817525928, + "flos": 23003440140240.0, + "grad_norm": 1.6738490028976583, + "language_loss": 0.77048671, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.7952224, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.15893555, + "step": 5061, + "time_per_iteration": 2.7642674446105957 + }, + { + "auxiliary_loss_clip": 0.0143893, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.30074632, + "balance_loss_mlp": 1.02186334, + "epoch": 0.30434390500526076, + "flos": 22086318776040.0, + "grad_norm": 1.938755303451723, + "language_loss": 0.81945825, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84423769, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17144775, + "step": 5062, + "time_per_iteration": 2.7713685035705566 + }, + { + "auxiliary_loss_clip": 0.01428851, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.29382825, + "balance_loss_mlp": 1.02064228, + "epoch": 0.3044040282579287, + "flos": 25488978951600.0, + "grad_norm": 2.140960276838586, + "language_loss": 0.7473371, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.77198392, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1519165, + "step": 5063, + "time_per_iteration": 2.778895854949951 + }, + { + "auxiliary_loss_clip": 0.01433472, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.29894817, + "balance_loss_mlp": 1.03090703, + "epoch": 0.30446415151059675, + "flos": 21950632953360.0, + "grad_norm": 1.606219890071775, + "language_loss": 0.84509951, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86991191, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16870117, + "step": 5064, + "time_per_iteration": 2.8042399883270264 + }, + { + "auxiliary_loss_clip": 0.01433844, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.29766452, + "balance_loss_mlp": 1.02875447, + "epoch": 0.3045242747632647, + "flos": 16439759469000.0, + "grad_norm": 1.7859112247721007, + "language_loss": 0.7580303, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78282714, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.17089844, + "step": 5065, + "time_per_iteration": 2.7869255542755127 + }, + { + "auxiliary_loss_clip": 0.01442669, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.30496931, + "balance_loss_mlp": 1.03269851, + "epoch": 0.3045843980159327, + "flos": 31546632935520.0, + "grad_norm": 2.1842354271390314, + "language_loss": 0.62691516, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.65184307, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.17407227, + "step": 5066, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.01445229, + "auxiliary_loss_mlp": 0.01049989, + "balance_loss_clip": 1.30471992, + "balance_loss_mlp": 1.03345513, + "epoch": 0.30464452126860064, + "flos": 17857547217000.0, + "grad_norm": 1.9698390115127804, + "language_loss": 0.83240455, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85735673, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.16516113, + "step": 5067, + "time_per_iteration": 2.751680612564087 + }, + { + "auxiliary_loss_clip": 0.01427998, + "auxiliary_loss_mlp": 0.01054412, + "balance_loss_clip": 1.29519165, + "balance_loss_mlp": 1.03837824, + "epoch": 0.3047046445212686, + "flos": 20636304971400.0, + "grad_norm": 1.693901673649404, + "language_loss": 0.62883258, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65365672, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.16027832, + "step": 5068, + "time_per_iteration": 2.7665696144104004 + }, + { + "auxiliary_loss_clip": 0.01434666, + "auxiliary_loss_mlp": 0.01055847, + "balance_loss_clip": 1.29932904, + "balance_loss_mlp": 1.03955126, + "epoch": 0.3047647677739366, + "flos": 21292514665920.0, + "grad_norm": 1.6445413637109987, + "language_loss": 0.75384867, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77875382, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16296387, + "step": 5069, + "time_per_iteration": 2.811326265335083 + }, + { + "auxiliary_loss_clip": 0.01432746, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.29873538, + "balance_loss_mlp": 1.04064465, + "epoch": 0.30482489102660454, + "flos": 21000838840560.0, + "grad_norm": 1.8958189520988886, + "language_loss": 0.75630063, + "learning_rate": 3.258645826569261e-06, + "loss": 0.78119099, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.15661621, + "step": 5070, + "time_per_iteration": 2.8030543327331543 + }, + { + "auxiliary_loss_clip": 0.01447479, + "auxiliary_loss_mlp": 0.01057675, + "balance_loss_clip": 1.3072623, + "balance_loss_mlp": 1.04056847, + "epoch": 0.3048850142792725, + "flos": 26296995987720.0, + "grad_norm": 1.5472881442228383, + "language_loss": 0.82117283, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.84622443, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.17089844, + "step": 5071, + "time_per_iteration": 2.8472402095794678 + }, + { + "auxiliary_loss_clip": 0.01443584, + "auxiliary_loss_mlp": 0.01052375, + "balance_loss_clip": 1.30349994, + "balance_loss_mlp": 1.0352205, + "epoch": 0.30494513753194047, + "flos": 22351372498440.0, + "grad_norm": 2.128337858614489, + "language_loss": 0.75970149, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78466105, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17150879, + "step": 5072, + "time_per_iteration": 2.7708539962768555 + }, + { + "auxiliary_loss_clip": 0.01439974, + "auxiliary_loss_mlp": 0.01063295, + "balance_loss_clip": 1.30439389, + "balance_loss_mlp": 1.04649854, + "epoch": 0.30500526078460843, + "flos": 19542540930480.0, + "grad_norm": 1.607030633688473, + "language_loss": 0.71509218, + "learning_rate": 3.257737608512723e-06, + "loss": 0.74012494, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.16796875, + "step": 5073, + "time_per_iteration": 2.7754616737365723 + }, + { + "auxiliary_loss_clip": 0.01445787, + "auxiliary_loss_mlp": 0.01062196, + "balance_loss_clip": 1.30723035, + "balance_loss_mlp": 1.04487538, + "epoch": 0.3050653840372764, + "flos": 14469424834320.0, + "grad_norm": 2.1503765641374684, + "language_loss": 0.7651552, + "learning_rate": 3.257434773758163e-06, + "loss": 0.79023504, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.17321777, + "step": 5074, + "time_per_iteration": 2.8112735748291016 + }, + { + "auxiliary_loss_clip": 0.01442458, + "auxiliary_loss_mlp": 0.0105005, + "balance_loss_clip": 1.30824852, + "balance_loss_mlp": 1.03479135, + "epoch": 0.30512550728994436, + "flos": 24249214564560.0, + "grad_norm": 2.0368426190540836, + "language_loss": 0.75017393, + "learning_rate": 3.25713189132155e-06, + "loss": 0.77509904, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.15258789, + "step": 5075, + "time_per_iteration": 2.787898540496826 + }, + { + "auxiliary_loss_clip": 0.01456439, + "auxiliary_loss_mlp": 0.01054234, + "balance_loss_clip": 1.31544733, + "balance_loss_mlp": 1.03555417, + "epoch": 0.30518563054261233, + "flos": 16364627357040.0, + "grad_norm": 1.8170631758465425, + "language_loss": 0.7553463, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.78045303, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.18676758, + "step": 5076, + "time_per_iteration": 2.8469934463500977 + }, + { + "auxiliary_loss_clip": 0.01449513, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_clip": 1.31366301, + "balance_loss_mlp": 1.02742624, + "epoch": 0.30524575379528035, + "flos": 21584555966520.0, + "grad_norm": 1.8254574138684205, + "language_loss": 0.79329818, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81824386, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.17614746, + "step": 5077, + "time_per_iteration": 2.9389476776123047 + }, + { + "auxiliary_loss_clip": 0.01432369, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.30128789, + "balance_loss_mlp": 1.02351117, + "epoch": 0.3053058770479483, + "flos": 16549635355920.0, + "grad_norm": 4.052633072272034, + "language_loss": 0.74952102, + "learning_rate": 3.256222958034259e-06, + "loss": 0.77423346, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15368652, + "step": 5078, + "time_per_iteration": 2.8223302364349365 + }, + { + "auxiliary_loss_clip": 0.01441286, + "auxiliary_loss_mlp": 0.01049709, + "balance_loss_clip": 1.30717635, + "balance_loss_mlp": 1.03299594, + "epoch": 0.3053660003006163, + "flos": 12316762352520.0, + "grad_norm": 1.848925963918399, + "language_loss": 0.67372572, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69863564, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.16711426, + "step": 5079, + "time_per_iteration": 2.7896952629089355 + }, + { + "auxiliary_loss_clip": 0.01444976, + "auxiliary_loss_mlp": 0.01043654, + "balance_loss_clip": 1.30925989, + "balance_loss_mlp": 1.02689362, + "epoch": 0.30542612355328425, + "flos": 23117498688240.0, + "grad_norm": 1.7083321851646431, + "language_loss": 0.8004306, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82531691, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.16760254, + "step": 5080, + "time_per_iteration": 2.79683780670166 + }, + { + "auxiliary_loss_clip": 0.0144284, + "auxiliary_loss_mlp": 0.01035674, + "balance_loss_clip": 1.3072232, + "balance_loss_mlp": 1.01909161, + "epoch": 0.3054862468059522, + "flos": 24394849435440.0, + "grad_norm": 2.245890327039699, + "language_loss": 0.81268859, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83747375, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.16577148, + "step": 5081, + "time_per_iteration": 2.8630619049072266 + }, + { + "auxiliary_loss_clip": 0.01444797, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.30990434, + "balance_loss_mlp": 1.02534425, + "epoch": 0.3055463700586202, + "flos": 29391899427360.0, + "grad_norm": 1.5737328370242145, + "language_loss": 0.7190358, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74389148, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.15423584, + "step": 5082, + "time_per_iteration": 2.8568930625915527 + }, + { + "auxiliary_loss_clip": 0.01449664, + "auxiliary_loss_mlp": 0.01043291, + "balance_loss_clip": 1.30892408, + "balance_loss_mlp": 1.02445638, + "epoch": 0.30560649331128814, + "flos": 25597108679040.0, + "grad_norm": 1.8368552095336146, + "language_loss": 0.73332512, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75825465, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.18835449, + "step": 5083, + "time_per_iteration": 4.253671169281006 + }, + { + "auxiliary_loss_clip": 0.01445552, + "auxiliary_loss_mlp": 0.01040241, + "balance_loss_clip": 1.30709267, + "balance_loss_mlp": 1.0222044, + "epoch": 0.3056666165639561, + "flos": 19132095987360.0, + "grad_norm": 1.665015227437967, + "language_loss": 0.71623278, + "learning_rate": 3.254403805595344e-06, + "loss": 0.74109066, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.18029785, + "step": 5084, + "time_per_iteration": 2.7634665966033936 + }, + { + "auxiliary_loss_clip": 0.01452444, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.31365609, + "balance_loss_mlp": 1.02180326, + "epoch": 0.30572673981662407, + "flos": 15528160841760.0, + "grad_norm": 1.7161542928997235, + "language_loss": 0.79078782, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.81570101, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1706543, + "step": 5085, + "time_per_iteration": 2.7469570636749268 + }, + { + "auxiliary_loss_clip": 0.01437484, + "auxiliary_loss_mlp": 0.01038913, + "balance_loss_clip": 1.30483568, + "balance_loss_mlp": 1.02231932, + "epoch": 0.30578686306929204, + "flos": 21511494880920.0, + "grad_norm": 1.599362982955388, + "language_loss": 0.78048813, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80525208, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.16589355, + "step": 5086, + "time_per_iteration": 2.8091835975646973 + }, + { + "auxiliary_loss_clip": 0.0144507, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.31029212, + "balance_loss_mlp": 1.02684832, + "epoch": 0.30584698632196, + "flos": 20958501302280.0, + "grad_norm": 1.666347033301634, + "language_loss": 0.76326263, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78815639, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.17474365, + "step": 5087, + "time_per_iteration": 2.7595317363739014 + }, + { + "auxiliary_loss_clip": 0.01445649, + "auxiliary_loss_mlp": 0.01036424, + "balance_loss_clip": 1.30697227, + "balance_loss_mlp": 1.01916254, + "epoch": 0.30590710957462797, + "flos": 24686525260800.0, + "grad_norm": 2.4240421175896896, + "language_loss": 0.72568369, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.75050449, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.17248535, + "step": 5088, + "time_per_iteration": 2.802823781967163 + }, + { + "auxiliary_loss_clip": 0.0145609, + "auxiliary_loss_mlp": 0.0103969, + "balance_loss_clip": 1.31533384, + "balance_loss_mlp": 1.02252388, + "epoch": 0.30596723282729593, + "flos": 17090649468360.0, + "grad_norm": 2.261975824898573, + "language_loss": 0.80004013, + "learning_rate": 3.252886537028521e-06, + "loss": 0.8249979, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.17163086, + "step": 5089, + "time_per_iteration": 4.229748010635376 + }, + { + "auxiliary_loss_clip": 0.01447765, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.31143618, + "balance_loss_mlp": 1.02309191, + "epoch": 0.30602735607996395, + "flos": 22862312797320.0, + "grad_norm": 2.554980929811837, + "language_loss": 0.77626514, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.80114543, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.17175293, + "step": 5090, + "time_per_iteration": 2.7914116382598877 + }, + { + "auxiliary_loss_clip": 0.01451217, + "auxiliary_loss_mlp": 0.01049913, + "balance_loss_clip": 1.31142068, + "balance_loss_mlp": 1.03166246, + "epoch": 0.3060874793326319, + "flos": 29867486825880.0, + "grad_norm": 1.8772653803403347, + "language_loss": 0.76231885, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78733015, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.18249512, + "step": 5091, + "time_per_iteration": 2.8798584938049316 + }, + { + "auxiliary_loss_clip": 0.0145001, + "auxiliary_loss_mlp": 0.01042158, + "balance_loss_clip": 1.31098342, + "balance_loss_mlp": 1.02523041, + "epoch": 0.3061476025852999, + "flos": 20453408607240.0, + "grad_norm": 2.223227406063943, + "language_loss": 0.7195251, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74444675, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.16918945, + "step": 5092, + "time_per_iteration": 2.745239734649658 + }, + { + "auxiliary_loss_clip": 0.01453546, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_clip": 1.31667471, + "balance_loss_mlp": 1.02532256, + "epoch": 0.30620772583796785, + "flos": 19396296934200.0, + "grad_norm": 2.1775695166578153, + "language_loss": 0.82770741, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.85265911, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16308594, + "step": 5093, + "time_per_iteration": 2.8015401363372803 + }, + { + "auxiliary_loss_clip": 0.014458, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.31225967, + "balance_loss_mlp": 1.02453816, + "epoch": 0.3062678490906358, + "flos": 24029909482680.0, + "grad_norm": 1.8625508868076592, + "language_loss": 0.75151491, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77637744, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15905762, + "step": 5094, + "time_per_iteration": 2.8352832794189453 + }, + { + "auxiliary_loss_clip": 0.01447672, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_clip": 1.31348968, + "balance_loss_mlp": 1.02594483, + "epoch": 0.3063279723433038, + "flos": 19759287685680.0, + "grad_norm": 1.8254329843558779, + "language_loss": 0.75950497, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78440869, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.16748047, + "step": 5095, + "time_per_iteration": 4.297528266906738 + }, + { + "auxiliary_loss_clip": 0.01447544, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.31423473, + "balance_loss_mlp": 1.02734447, + "epoch": 0.30638809559597174, + "flos": 22454426180880.0, + "grad_norm": 1.6353992437646288, + "language_loss": 0.80896795, + "learning_rate": 3.250760365955042e-06, + "loss": 0.83388048, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16357422, + "step": 5096, + "time_per_iteration": 2.7801756858825684 + }, + { + "auxiliary_loss_clip": 0.01457469, + "auxiliary_loss_mlp": 0.01036659, + "balance_loss_clip": 1.31982529, + "balance_loss_mlp": 1.02014887, + "epoch": 0.3064482188486397, + "flos": 17169598766160.0, + "grad_norm": 2.0018759058791233, + "language_loss": 0.81557357, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84051484, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.16516113, + "step": 5097, + "time_per_iteration": 4.240710258483887 + }, + { + "auxiliary_loss_clip": 0.01451473, + "auxiliary_loss_mlp": 0.01041253, + "balance_loss_clip": 1.3147918, + "balance_loss_mlp": 1.02401519, + "epoch": 0.3065083421013077, + "flos": 23773708382760.0, + "grad_norm": 2.6410970961402733, + "language_loss": 0.77972716, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80465442, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.17236328, + "step": 5098, + "time_per_iteration": 2.77449107170105 + }, + { + "auxiliary_loss_clip": 0.01444393, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.30969262, + "balance_loss_mlp": 1.01967931, + "epoch": 0.30656846535397564, + "flos": 26437392380160.0, + "grad_norm": 1.7646162786551673, + "language_loss": 0.84289473, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86769891, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.16351318, + "step": 5099, + "time_per_iteration": 2.8259482383728027 + }, + { + "auxiliary_loss_clip": 0.01447755, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_clip": 1.30962992, + "balance_loss_mlp": 1.02790046, + "epoch": 0.3066285886066436, + "flos": 26657509629240.0, + "grad_norm": 1.628829237050154, + "language_loss": 0.85775769, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.88268852, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17431641, + "step": 5100, + "time_per_iteration": 2.79789400100708 + }, + { + "auxiliary_loss_clip": 0.01455301, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.31936502, + "balance_loss_mlp": 1.02124596, + "epoch": 0.30668871185931157, + "flos": 15054238386000.0, + "grad_norm": 1.9168966142852226, + "language_loss": 0.79030108, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81523514, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.16833496, + "step": 5101, + "time_per_iteration": 2.83187198638916 + }, + { + "auxiliary_loss_clip": 0.0145159, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.31334209, + "balance_loss_mlp": 1.02751136, + "epoch": 0.30674883511197953, + "flos": 20086681886640.0, + "grad_norm": 1.7050218113446445, + "language_loss": 0.80170524, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.8266719, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.17541504, + "step": 5102, + "time_per_iteration": 2.8234236240386963 + }, + { + "auxiliary_loss_clip": 0.01454757, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.31779552, + "balance_loss_mlp": 1.02674997, + "epoch": 0.30680895836464755, + "flos": 22899046381920.0, + "grad_norm": 1.8031947473389878, + "language_loss": 0.88923097, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91422772, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.1817627, + "step": 5103, + "time_per_iteration": 2.7841343879699707 + }, + { + "auxiliary_loss_clip": 0.01447194, + "auxiliary_loss_mlp": 0.01047007, + "balance_loss_clip": 1.31069636, + "balance_loss_mlp": 1.03001952, + "epoch": 0.3068690816173155, + "flos": 23701459464360.0, + "grad_norm": 1.6747564411376203, + "language_loss": 0.73919713, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76413918, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.1697998, + "step": 5104, + "time_per_iteration": 2.802513599395752 + }, + { + "auxiliary_loss_clip": 0.01460785, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.31938338, + "balance_loss_mlp": 1.0268172, + "epoch": 0.3069292048699835, + "flos": 23556433718880.0, + "grad_norm": 1.7011211538226045, + "language_loss": 0.73206508, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75711268, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.17163086, + "step": 5105, + "time_per_iteration": 2.781306743621826 + }, + { + "auxiliary_loss_clip": 0.01446492, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.30976391, + "balance_loss_mlp": 1.02287877, + "epoch": 0.30698932812265145, + "flos": 24536585903760.0, + "grad_norm": 1.8848088025897385, + "language_loss": 0.88194138, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.9068104, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.17541504, + "step": 5106, + "time_per_iteration": 2.822664737701416 + }, + { + "auxiliary_loss_clip": 0.01461762, + "auxiliary_loss_mlp": 0.01045536, + "balance_loss_clip": 1.32032037, + "balance_loss_mlp": 1.02795219, + "epoch": 0.3070494513753194, + "flos": 21001407357600.0, + "grad_norm": 2.107652356384652, + "language_loss": 0.71723473, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.74230772, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.17578125, + "step": 5107, + "time_per_iteration": 2.7686123847961426 + }, + { + "auxiliary_loss_clip": 0.01443392, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.3090713, + "balance_loss_mlp": 1.02612281, + "epoch": 0.3071095746279874, + "flos": 19030626030960.0, + "grad_norm": 2.112303217087596, + "language_loss": 0.72211933, + "learning_rate": 3.247110096547814e-06, + "loss": 0.7469871, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.17248535, + "step": 5108, + "time_per_iteration": 2.7565548419952393 + }, + { + "auxiliary_loss_clip": 0.01445384, + "auxiliary_loss_mlp": 0.0103983, + "balance_loss_clip": 1.30993581, + "balance_loss_mlp": 1.02314115, + "epoch": 0.30716969788065535, + "flos": 21220550006040.0, + "grad_norm": 1.5019778587091417, + "language_loss": 0.85903454, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.88388669, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.16687012, + "step": 5109, + "time_per_iteration": 2.7540783882141113 + }, + { + "auxiliary_loss_clip": 0.01441321, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.30556464, + "balance_loss_mlp": 1.01813149, + "epoch": 0.3072298211333233, + "flos": 25777934016840.0, + "grad_norm": 1.5939373476749774, + "language_loss": 0.67600197, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.70075494, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.1585083, + "step": 5110, + "time_per_iteration": 2.8140549659729004 + }, + { + "auxiliary_loss_clip": 0.01444511, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.31105804, + "balance_loss_mlp": 1.01867044, + "epoch": 0.3072899443859913, + "flos": 25854528029760.0, + "grad_norm": 1.4134087572779346, + "language_loss": 0.77214819, + "learning_rate": 3.246196464379919e-06, + "loss": 0.7969321, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15209961, + "step": 5111, + "time_per_iteration": 2.869610548019409 + }, + { + "auxiliary_loss_clip": 0.0145189, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.31383204, + "balance_loss_mlp": 1.02339327, + "epoch": 0.30735006763865924, + "flos": 25928523107640.0, + "grad_norm": 2.3531880210531755, + "language_loss": 0.67036229, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69528329, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.16796875, + "step": 5112, + "time_per_iteration": 2.8636600971221924 + }, + { + "auxiliary_loss_clip": 0.01458046, + "auxiliary_loss_mlp": 0.01042009, + "balance_loss_clip": 1.31817794, + "balance_loss_mlp": 1.02273297, + "epoch": 0.3074101908913272, + "flos": 30922811731080.0, + "grad_norm": 1.9796566280045707, + "language_loss": 0.79555887, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.82055938, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.19274902, + "step": 5113, + "time_per_iteration": 2.853736400604248 + }, + { + "auxiliary_loss_clip": 0.01455249, + "auxiliary_loss_mlp": 0.01045691, + "balance_loss_clip": 1.31666827, + "balance_loss_mlp": 1.02957416, + "epoch": 0.30747031414399517, + "flos": 18405058667040.0, + "grad_norm": 1.8836267449690858, + "language_loss": 0.77275383, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79776323, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.16101074, + "step": 5114, + "time_per_iteration": 2.8007404804229736 + }, + { + "auxiliary_loss_clip": 0.01450335, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.31531692, + "balance_loss_mlp": 1.01917768, + "epoch": 0.30753043739666314, + "flos": 22637688020280.0, + "grad_norm": 2.2550928909015355, + "language_loss": 0.62069106, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64556682, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.18066406, + "step": 5115, + "time_per_iteration": 2.7985825538635254 + }, + { + "auxiliary_loss_clip": 0.01448632, + "auxiliary_loss_mlp": 0.01040364, + "balance_loss_clip": 1.31237125, + "balance_loss_mlp": 1.02382946, + "epoch": 0.3075905606493311, + "flos": 27349762566240.0, + "grad_norm": 2.0720366883635375, + "language_loss": 0.8260324, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85092235, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1652832, + "step": 5116, + "time_per_iteration": 2.800264596939087 + }, + { + "auxiliary_loss_clip": 0.01450692, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.31500173, + "balance_loss_mlp": 1.02694917, + "epoch": 0.3076506839019991, + "flos": 22095902349000.0, + "grad_norm": 1.7326453299435987, + "language_loss": 0.75861311, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78355569, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.16601562, + "step": 5117, + "time_per_iteration": 2.7989814281463623 + }, + { + "auxiliary_loss_clip": 0.01451979, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.31580687, + "balance_loss_mlp": 1.02450562, + "epoch": 0.3077108071546671, + "flos": 21294707517360.0, + "grad_norm": 2.1749347382397577, + "language_loss": 0.71917856, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74411106, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.16760254, + "step": 5118, + "time_per_iteration": 2.87092661857605 + }, + { + "auxiliary_loss_clip": 0.01452473, + "auxiliary_loss_mlp": 0.01041815, + "balance_loss_clip": 1.31669021, + "balance_loss_mlp": 1.0261029, + "epoch": 0.30777093040733505, + "flos": 21435591210120.0, + "grad_norm": 1.4787180654131025, + "language_loss": 0.74593866, + "learning_rate": 3.243758033520219e-06, + "loss": 0.77088153, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15722656, + "step": 5119, + "time_per_iteration": 2.862940549850464 + }, + { + "auxiliary_loss_clip": 0.01458418, + "auxiliary_loss_mlp": 0.01050983, + "balance_loss_clip": 1.31939161, + "balance_loss_mlp": 1.03275561, + "epoch": 0.307831053660003, + "flos": 23154557139720.0, + "grad_norm": 1.7137506667491864, + "language_loss": 0.80620396, + "learning_rate": 3.243453017305926e-06, + "loss": 0.83129799, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.18212891, + "step": 5120, + "time_per_iteration": 2.841555118560791 + }, + { + "auxiliary_loss_clip": 0.01447908, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.31331527, + "balance_loss_mlp": 1.02636492, + "epoch": 0.307891176912671, + "flos": 17024248153800.0, + "grad_norm": 1.8514840197060418, + "language_loss": 0.80413258, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82903105, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15576172, + "step": 5121, + "time_per_iteration": 2.7814130783081055 + }, + { + "auxiliary_loss_clip": 0.01450385, + "auxiliary_loss_mlp": 0.01037853, + "balance_loss_clip": 1.31679547, + "balance_loss_mlp": 1.02177143, + "epoch": 0.30795130016533895, + "flos": 27710276207760.0, + "grad_norm": 1.4652757085628292, + "language_loss": 0.82591081, + "learning_rate": 3.242842843433319e-06, + "loss": 0.85079318, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16064453, + "step": 5122, + "time_per_iteration": 4.3132922649383545 + }, + { + "auxiliary_loss_clip": 0.01275669, + "auxiliary_loss_mlp": 0.01002889, + "balance_loss_clip": 1.21770811, + "balance_loss_mlp": 1.00032604, + "epoch": 0.3080114234180069, + "flos": 69076088982720.0, + "grad_norm": 0.7389445961807807, + "language_loss": 0.58632827, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60911393, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.02563477, + "step": 5123, + "time_per_iteration": 3.512165069580078 + }, + { + "auxiliary_loss_clip": 0.01461896, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.32037795, + "balance_loss_mlp": 1.02290726, + "epoch": 0.3080715466706749, + "flos": 24065465424840.0, + "grad_norm": 1.5072214584757666, + "language_loss": 0.8344115, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85943902, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.17956543, + "step": 5124, + "time_per_iteration": 2.8152871131896973 + }, + { + "auxiliary_loss_clip": 0.01460527, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.32144654, + "balance_loss_mlp": 1.02481675, + "epoch": 0.30813166992334284, + "flos": 25854243771240.0, + "grad_norm": 2.392904727055611, + "language_loss": 0.795187, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.82021171, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17114258, + "step": 5125, + "time_per_iteration": 2.8229637145996094 + }, + { + "auxiliary_loss_clip": 0.01461242, + "auxiliary_loss_mlp": 0.01042508, + "balance_loss_clip": 1.31926656, + "balance_loss_mlp": 1.02430439, + "epoch": 0.3081917931760108, + "flos": 20454586249680.0, + "grad_norm": 1.9227190778495598, + "language_loss": 0.6469931, + "learning_rate": 3.241621930235989e-06, + "loss": 0.67203057, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.18225098, + "step": 5126, + "time_per_iteration": 2.777845859527588 + }, + { + "auxiliary_loss_clip": 0.01437283, + "auxiliary_loss_mlp": 0.01047008, + "balance_loss_clip": 1.30516863, + "balance_loss_mlp": 1.0308075, + "epoch": 0.3082519164286788, + "flos": 22171684194720.0, + "grad_norm": 1.468538266328187, + "language_loss": 0.86992812, + "learning_rate": 3.241316584201646e-06, + "loss": 0.8947711, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.1619873, + "step": 5127, + "time_per_iteration": 2.799454689025879 + }, + { + "auxiliary_loss_clip": 0.01451413, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.31651092, + "balance_loss_mlp": 1.02128494, + "epoch": 0.30831203968134674, + "flos": 28919357655840.0, + "grad_norm": 1.4635979852324594, + "language_loss": 0.68936014, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.71424949, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.16247559, + "step": 5128, + "time_per_iteration": 4.263129472732544 + }, + { + "auxiliary_loss_clip": 0.01456288, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_clip": 1.31644678, + "balance_loss_mlp": 1.02682221, + "epoch": 0.3083721629340147, + "flos": 25673459041800.0, + "grad_norm": 1.7284855068979195, + "language_loss": 0.718692, + "learning_rate": 3.240705750931993e-06, + "loss": 0.74370253, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.17944336, + "step": 5129, + "time_per_iteration": 2.792820453643799 + }, + { + "auxiliary_loss_clip": 0.01263957, + "auxiliary_loss_mlp": 0.01005748, + "balance_loss_clip": 1.20606041, + "balance_loss_mlp": 1.00300646, + "epoch": 0.3084322861866827, + "flos": 68229064293840.0, + "grad_norm": 0.9191909904638366, + "language_loss": 0.59254354, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61524057, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.02746582, + "step": 5130, + "time_per_iteration": 3.2352824211120605 + }, + { + "auxiliary_loss_clip": 0.01461477, + "auxiliary_loss_mlp": 0.01043952, + "balance_loss_clip": 1.32209468, + "balance_loss_mlp": 1.02616644, + "epoch": 0.3084924094393507, + "flos": 20300504839920.0, + "grad_norm": 2.1904731668713886, + "language_loss": 0.73773432, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.76278853, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17773438, + "step": 5131, + "time_per_iteration": 2.7645905017852783 + }, + { + "auxiliary_loss_clip": 0.01450014, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.31534743, + "balance_loss_mlp": 1.02004218, + "epoch": 0.30855253269201866, + "flos": 23954939804160.0, + "grad_norm": 1.663262619543064, + "language_loss": 0.71334648, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73820579, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15869141, + "step": 5132, + "time_per_iteration": 2.7910425662994385 + }, + { + "auxiliary_loss_clip": 0.0144376, + "auxiliary_loss_mlp": 0.01044042, + "balance_loss_clip": 1.31164336, + "balance_loss_mlp": 1.0280323, + "epoch": 0.3086126559446866, + "flos": 19286664697440.0, + "grad_norm": 1.8485997924913444, + "language_loss": 0.90503871, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92991674, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.16015625, + "step": 5133, + "time_per_iteration": 2.796438217163086 + }, + { + "auxiliary_loss_clip": 0.01453552, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.31348836, + "balance_loss_mlp": 1.02483988, + "epoch": 0.3086727791973546, + "flos": 33766671332520.0, + "grad_norm": 1.7681868805636278, + "language_loss": 0.6730594, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69801605, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17272949, + "step": 5134, + "time_per_iteration": 4.394319534301758 + }, + { + "auxiliary_loss_clip": 0.01457907, + "auxiliary_loss_mlp": 0.01046788, + "balance_loss_clip": 1.31801414, + "balance_loss_mlp": 1.02989566, + "epoch": 0.30873290245002255, + "flos": 16038451406880.0, + "grad_norm": 2.1248070279796485, + "language_loss": 0.83926237, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.86430931, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.16882324, + "step": 5135, + "time_per_iteration": 2.753594398498535 + }, + { + "auxiliary_loss_clip": 0.01264455, + "auxiliary_loss_mlp": 0.01003628, + "balance_loss_clip": 1.206617, + "balance_loss_mlp": 1.0008744, + "epoch": 0.3087930257026905, + "flos": 65065566688200.0, + "grad_norm": 0.7028252260805853, + "language_loss": 0.55347061, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57615149, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.02758789, + "step": 5136, + "time_per_iteration": 4.766871213912964 + }, + { + "auxiliary_loss_clip": 0.0145019, + "auxiliary_loss_mlp": 0.01040619, + "balance_loss_clip": 1.31313753, + "balance_loss_mlp": 1.02413285, + "epoch": 0.3088531489553585, + "flos": 74750674704120.0, + "grad_norm": 1.9565692012068496, + "language_loss": 0.76069725, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78560537, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.16491699, + "step": 5137, + "time_per_iteration": 3.177140235900879 + }, + { + "auxiliary_loss_clip": 0.01446803, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.310619, + "balance_loss_mlp": 1.02537036, + "epoch": 0.30891327220802645, + "flos": 21147285878640.0, + "grad_norm": 1.6268951565744638, + "language_loss": 0.80185211, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82672131, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.14758301, + "step": 5138, + "time_per_iteration": 2.746534585952759 + }, + { + "auxiliary_loss_clip": 0.01451176, + "auxiliary_loss_mlp": 0.01044038, + "balance_loss_clip": 1.31243324, + "balance_loss_mlp": 1.02634764, + "epoch": 0.3089733954606944, + "flos": 25669804289400.0, + "grad_norm": 1.6264230089899252, + "language_loss": 0.81536818, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.84032035, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.17712402, + "step": 5139, + "time_per_iteration": 2.8161609172821045 + }, + { + "auxiliary_loss_clip": 0.01461375, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.31875181, + "balance_loss_mlp": 1.02119815, + "epoch": 0.3090335187133624, + "flos": 19432096526520.0, + "grad_norm": 1.8197082479900584, + "language_loss": 0.77944595, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.80445284, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.18121338, + "step": 5140, + "time_per_iteration": 2.7662415504455566 + }, + { + "auxiliary_loss_clip": 0.01431063, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.3004086, + "balance_loss_mlp": 1.02693892, + "epoch": 0.30909364196603034, + "flos": 20016179127720.0, + "grad_norm": 1.7545083359927993, + "language_loss": 0.7916652, + "learning_rate": 3.237036802553252e-06, + "loss": 0.81639409, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.14874268, + "step": 5141, + "time_per_iteration": 2.9005236625671387 + }, + { + "auxiliary_loss_clip": 0.01446976, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_clip": 1.30867589, + "balance_loss_mlp": 1.02599704, + "epoch": 0.3091537652186983, + "flos": 19681678463760.0, + "grad_norm": 2.0070354428675414, + "language_loss": 0.87399274, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89889675, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17431641, + "step": 5142, + "time_per_iteration": 2.772084951400757 + }, + { + "auxiliary_loss_clip": 0.01446896, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_clip": 1.30798829, + "balance_loss_mlp": 1.02877474, + "epoch": 0.3092138884713663, + "flos": 17024573020680.0, + "grad_norm": 3.310034132210836, + "language_loss": 0.79353929, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81846488, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.16882324, + "step": 5143, + "time_per_iteration": 2.716810703277588 + }, + { + "auxiliary_loss_clip": 0.01442638, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.30728197, + "balance_loss_mlp": 1.01828635, + "epoch": 0.3092740117240343, + "flos": 25015503187800.0, + "grad_norm": 1.57300761892155, + "language_loss": 0.72137624, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74615872, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1730957, + "step": 5144, + "time_per_iteration": 2.806694269180298 + }, + { + "auxiliary_loss_clip": 0.01448909, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_clip": 1.30918503, + "balance_loss_mlp": 1.02563763, + "epoch": 0.30933413497670226, + "flos": 25595565561360.0, + "grad_norm": 1.798681464451505, + "language_loss": 0.74518079, + "learning_rate": 3.235812317696702e-06, + "loss": 0.77009952, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.17321777, + "step": 5145, + "time_per_iteration": 2.7881405353546143 + }, + { + "auxiliary_loss_clip": 0.01441004, + "auxiliary_loss_mlp": 0.01043934, + "balance_loss_clip": 1.30435932, + "balance_loss_mlp": 1.02648222, + "epoch": 0.3093942582293702, + "flos": 24395093085600.0, + "grad_norm": 1.6110427007042163, + "language_loss": 0.7635138, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78836316, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.17456055, + "step": 5146, + "time_per_iteration": 2.803661823272705 + }, + { + "auxiliary_loss_clip": 0.01446371, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.30862761, + "balance_loss_mlp": 1.01697993, + "epoch": 0.3094543814820382, + "flos": 19651239174960.0, + "grad_norm": 1.761342002385297, + "language_loss": 0.67444044, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.69923496, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.16101074, + "step": 5147, + "time_per_iteration": 2.7383108139038086 + }, + { + "auxiliary_loss_clip": 0.01451026, + "auxiliary_loss_mlp": 0.01041952, + "balance_loss_clip": 1.31267059, + "balance_loss_mlp": 1.0255487, + "epoch": 0.30951450473470615, + "flos": 25669032730560.0, + "grad_norm": 1.6421135318655156, + "language_loss": 0.74868095, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77361065, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.1640625, + "step": 5148, + "time_per_iteration": 2.7817623615264893 + }, + { + "auxiliary_loss_clip": 0.0146497, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.31947589, + "balance_loss_mlp": 1.02278411, + "epoch": 0.3095746279873741, + "flos": 12024152534880.0, + "grad_norm": 2.0609619083094044, + "language_loss": 0.73153049, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.75658596, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.17810059, + "step": 5149, + "time_per_iteration": 2.716837167739868 + }, + { + "auxiliary_loss_clip": 0.01449362, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.30887341, + "balance_loss_mlp": 1.01962471, + "epoch": 0.3096347512400421, + "flos": 23628317162040.0, + "grad_norm": 1.997873240755503, + "language_loss": 0.85072947, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87560374, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.18444824, + "step": 5150, + "time_per_iteration": 2.7609570026397705 + }, + { + "auxiliary_loss_clip": 0.0144577, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.30851638, + "balance_loss_mlp": 1.02233887, + "epoch": 0.30969487449271005, + "flos": 22534553121120.0, + "grad_norm": 1.9089176966982413, + "language_loss": 0.79141653, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81627214, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.17456055, + "step": 5151, + "time_per_iteration": 2.805511236190796 + }, + { + "auxiliary_loss_clip": 0.0145382, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.31413472, + "balance_loss_mlp": 1.0226928, + "epoch": 0.309754997745378, + "flos": 15272122175280.0, + "grad_norm": 1.7963763967504607, + "language_loss": 0.6700424, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.6949929, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.1854248, + "step": 5152, + "time_per_iteration": 2.7098379135131836 + }, + { + "auxiliary_loss_clip": 0.01442932, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.30642939, + "balance_loss_mlp": 1.02556896, + "epoch": 0.309815120998046, + "flos": 26984944438560.0, + "grad_norm": 2.023470798725208, + "language_loss": 0.82289553, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84775788, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.17736816, + "step": 5153, + "time_per_iteration": 2.842519521713257 + }, + { + "auxiliary_loss_clip": 0.01446323, + "auxiliary_loss_mlp": 0.01045839, + "balance_loss_clip": 1.30998921, + "balance_loss_mlp": 1.02836239, + "epoch": 0.30987524425071394, + "flos": 21148260479280.0, + "grad_norm": 1.6933292846810701, + "language_loss": 0.7418974, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.766819, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.17456055, + "step": 5154, + "time_per_iteration": 2.7888386249542236 + }, + { + "auxiliary_loss_clip": 0.01439451, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_clip": 1.30410278, + "balance_loss_mlp": 1.02545285, + "epoch": 0.3099353675033819, + "flos": 15272528258880.0, + "grad_norm": 1.7901090257120023, + "language_loss": 0.75850821, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78333217, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.17492676, + "step": 5155, + "time_per_iteration": 2.74513840675354 + }, + { + "auxiliary_loss_clip": 0.01446439, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.30721724, + "balance_loss_mlp": 1.02097106, + "epoch": 0.30999549075604993, + "flos": 15418000696320.0, + "grad_norm": 1.7636860142419826, + "language_loss": 0.79168212, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81652957, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17346191, + "step": 5156, + "time_per_iteration": 2.7061901092529297 + }, + { + "auxiliary_loss_clip": 0.01446604, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.30684352, + "balance_loss_mlp": 1.02160192, + "epoch": 0.3100556140087179, + "flos": 23189666389920.0, + "grad_norm": 1.9481179180977755, + "language_loss": 0.74994695, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77481681, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.18786621, + "step": 5157, + "time_per_iteration": 2.7823920249938965 + }, + { + "auxiliary_loss_clip": 0.01435678, + "auxiliary_loss_mlp": 0.01036098, + "balance_loss_clip": 1.30057061, + "balance_loss_mlp": 1.01977825, + "epoch": 0.31011573726138586, + "flos": 25748103853440.0, + "grad_norm": 1.6103063815717928, + "language_loss": 0.69841033, + "learning_rate": 3.231827567499327e-06, + "loss": 0.72312808, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16320801, + "step": 5158, + "time_per_iteration": 2.833570957183838 + }, + { + "auxiliary_loss_clip": 0.01436487, + "auxiliary_loss_mlp": 0.01047653, + "balance_loss_clip": 1.30177879, + "balance_loss_mlp": 1.03116632, + "epoch": 0.3101758605140538, + "flos": 20016179127720.0, + "grad_norm": 1.8657496832010945, + "language_loss": 0.84746855, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.87230992, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.16491699, + "step": 5159, + "time_per_iteration": 2.7296011447906494 + }, + { + "auxiliary_loss_clip": 0.01441332, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.30360544, + "balance_loss_mlp": 1.02528536, + "epoch": 0.3102359837667218, + "flos": 19140420701160.0, + "grad_norm": 1.7924884045673557, + "language_loss": 0.8501699, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87501329, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.17712402, + "step": 5160, + "time_per_iteration": 4.158666610717773 + }, + { + "auxiliary_loss_clip": 0.01435963, + "auxiliary_loss_mlp": 0.01037615, + "balance_loss_clip": 1.30187428, + "balance_loss_mlp": 1.02100885, + "epoch": 0.31029610701938976, + "flos": 22269499398720.0, + "grad_norm": 2.1470439589449297, + "language_loss": 0.763291, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78802681, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.16589355, + "step": 5161, + "time_per_iteration": 2.7366020679473877 + }, + { + "auxiliary_loss_clip": 0.01444311, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.30472255, + "balance_loss_mlp": 1.02583325, + "epoch": 0.3103562302720577, + "flos": 20809171070640.0, + "grad_norm": 2.0015089742663594, + "language_loss": 0.81617737, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.84106791, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.18896484, + "step": 5162, + "time_per_iteration": 2.740222215652466 + }, + { + "auxiliary_loss_clip": 0.01437494, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.30283165, + "balance_loss_mlp": 1.02085686, + "epoch": 0.3104163535247257, + "flos": 22349017213560.0, + "grad_norm": 1.5822065434288684, + "language_loss": 0.83188051, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.8566252, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16113281, + "step": 5163, + "time_per_iteration": 2.770979642868042 + }, + { + "auxiliary_loss_clip": 0.01448041, + "auxiliary_loss_mlp": 0.01043816, + "balance_loss_clip": 1.30989146, + "balance_loss_mlp": 1.02607799, + "epoch": 0.31047647677739365, + "flos": 21694391245080.0, + "grad_norm": 1.7862207849980234, + "language_loss": 0.76455283, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78947139, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17724609, + "step": 5164, + "time_per_iteration": 2.775237560272217 + }, + { + "auxiliary_loss_clip": 0.0144689, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.30907166, + "balance_loss_mlp": 1.02413249, + "epoch": 0.3105366000300616, + "flos": 18922049611560.0, + "grad_norm": 4.762342870255457, + "language_loss": 0.7484799, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.77336502, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.17504883, + "step": 5165, + "time_per_iteration": 2.804140090942383 + }, + { + "auxiliary_loss_clip": 0.0143811, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.30242574, + "balance_loss_mlp": 1.02286088, + "epoch": 0.3105967232827296, + "flos": 18265068358200.0, + "grad_norm": 1.8359303702304313, + "language_loss": 0.76238847, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78717697, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.17871094, + "step": 5166, + "time_per_iteration": 2.7485713958740234 + }, + { + "auxiliary_loss_clip": 0.01449276, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.31055999, + "balance_loss_mlp": 1.02349114, + "epoch": 0.31065684653539755, + "flos": 17676153362160.0, + "grad_norm": 2.5545582710582857, + "language_loss": 0.74443424, + "learning_rate": 3.229064268360444e-06, + "loss": 0.7693367, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.17492676, + "step": 5167, + "time_per_iteration": 4.1638007164001465 + }, + { + "auxiliary_loss_clip": 0.0125491, + "auxiliary_loss_mlp": 0.01005463, + "balance_loss_clip": 1.19753027, + "balance_loss_mlp": 1.00284028, + "epoch": 0.3107169697880655, + "flos": 68547240397080.0, + "grad_norm": 0.7138325100132047, + "language_loss": 0.53057355, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55317724, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.02624512, + "step": 5168, + "time_per_iteration": 3.3165414333343506 + }, + { + "auxiliary_loss_clip": 0.01449633, + "auxiliary_loss_mlp": 0.01044117, + "balance_loss_clip": 1.30911088, + "balance_loss_mlp": 1.02552032, + "epoch": 0.3107770930407335, + "flos": 13192723820880.0, + "grad_norm": 1.7093821807909457, + "language_loss": 0.7894212, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.81435871, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.18579102, + "step": 5169, + "time_per_iteration": 2.721524715423584 + }, + { + "auxiliary_loss_clip": 0.01447995, + "auxiliary_loss_mlp": 0.01044777, + "balance_loss_clip": 1.30739617, + "balance_loss_mlp": 1.02666879, + "epoch": 0.3108372162934015, + "flos": 31587671006280.0, + "grad_norm": 1.5266217978735808, + "language_loss": 0.64018124, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66510892, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.18115234, + "step": 5170, + "time_per_iteration": 2.8663063049316406 + }, + { + "auxiliary_loss_clip": 0.01444074, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.30594039, + "balance_loss_mlp": 1.02428484, + "epoch": 0.31089733954606946, + "flos": 28735608516120.0, + "grad_norm": 2.287680728870265, + "language_loss": 0.77695274, + "learning_rate": 3.22783492314295e-06, + "loss": 0.80181754, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.18127441, + "step": 5171, + "time_per_iteration": 2.8203108310699463 + }, + { + "auxiliary_loss_clip": 0.01444766, + "auxiliary_loss_mlp": 0.01049054, + "balance_loss_clip": 1.30615842, + "balance_loss_mlp": 1.03139961, + "epoch": 0.3109574627987374, + "flos": 19688094584640.0, + "grad_norm": 1.916164685167908, + "language_loss": 0.83663684, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86157507, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.17663574, + "step": 5172, + "time_per_iteration": 2.7721681594848633 + }, + { + "auxiliary_loss_clip": 0.01446344, + "auxiliary_loss_mlp": 0.01043758, + "balance_loss_clip": 1.30541921, + "balance_loss_mlp": 1.02525663, + "epoch": 0.3110175860514054, + "flos": 14688080182440.0, + "grad_norm": 2.482851058020966, + "language_loss": 0.85012078, + "learning_rate": 3.227219971129842e-06, + "loss": 0.87502182, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.18505859, + "step": 5173, + "time_per_iteration": 4.294744253158569 + }, + { + "auxiliary_loss_clip": 0.01436484, + "auxiliary_loss_mlp": 0.01039935, + "balance_loss_clip": 1.30353022, + "balance_loss_mlp": 1.02281642, + "epoch": 0.31107770930407336, + "flos": 25745261268240.0, + "grad_norm": 1.526236405059745, + "language_loss": 0.82975382, + "learning_rate": 3.226912425313001e-06, + "loss": 0.854518, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.17114258, + "step": 5174, + "time_per_iteration": 2.8249313831329346 + }, + { + "auxiliary_loss_clip": 0.01444449, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_clip": 1.30445766, + "balance_loss_mlp": 1.03212535, + "epoch": 0.3111378325567413, + "flos": 19212953878080.0, + "grad_norm": 1.9042874656606037, + "language_loss": 0.84912139, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87406927, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.18225098, + "step": 5175, + "time_per_iteration": 4.19353461265564 + }, + { + "auxiliary_loss_clip": 0.01444515, + "auxiliary_loss_mlp": 0.01045294, + "balance_loss_clip": 1.30897117, + "balance_loss_mlp": 1.02566028, + "epoch": 0.3111979558094093, + "flos": 23701703114520.0, + "grad_norm": 1.8338357700670525, + "language_loss": 0.83932996, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.86422801, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.1965332, + "step": 5176, + "time_per_iteration": 2.8324601650238037 + }, + { + "auxiliary_loss_clip": 0.01432433, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.29459596, + "balance_loss_mlp": 1.02113616, + "epoch": 0.31125807906207725, + "flos": 21038019117120.0, + "grad_norm": 1.792997537954134, + "language_loss": 0.80858624, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8333143, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.19250488, + "step": 5177, + "time_per_iteration": 2.7887401580810547 + }, + { + "auxiliary_loss_clip": 0.01447942, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.31059206, + "balance_loss_mlp": 1.0273962, + "epoch": 0.3113182023147452, + "flos": 23081942746080.0, + "grad_norm": 1.5721910372442127, + "language_loss": 0.80956435, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.834512, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.19445801, + "step": 5178, + "time_per_iteration": 2.849669933319092 + }, + { + "auxiliary_loss_clip": 0.01445876, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.30453742, + "balance_loss_mlp": 1.02816701, + "epoch": 0.3113783255674132, + "flos": 11842839896760.0, + "grad_norm": 3.925077406509085, + "language_loss": 0.81912994, + "learning_rate": 3.225373998592471e-06, + "loss": 0.84405589, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.18554688, + "step": 5179, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0144566, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_clip": 1.307585, + "balance_loss_mlp": 1.03520286, + "epoch": 0.31143844882008115, + "flos": 16293880947960.0, + "grad_norm": 1.595518825032187, + "language_loss": 0.78540039, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.81039, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.1809082, + "step": 5180, + "time_per_iteration": 2.763005256652832 + }, + { + "auxiliary_loss_clip": 0.01444777, + "auxiliary_loss_mlp": 0.010417, + "balance_loss_clip": 1.30624437, + "balance_loss_mlp": 1.02343702, + "epoch": 0.3114985720727491, + "flos": 23222501571960.0, + "grad_norm": 1.5744705886826202, + "language_loss": 0.83590716, + "learning_rate": 3.22475830255844e-06, + "loss": 0.86077195, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.18273926, + "step": 5181, + "time_per_iteration": 2.7740402221679688 + }, + { + "auxiliary_loss_clip": 0.01441367, + "auxiliary_loss_mlp": 0.01047898, + "balance_loss_clip": 1.30534816, + "balance_loss_mlp": 1.03151882, + "epoch": 0.3115586953254171, + "flos": 30050748665280.0, + "grad_norm": 1.5218358870155577, + "language_loss": 0.74181503, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76670766, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16381836, + "step": 5182, + "time_per_iteration": 2.82161808013916 + }, + { + "auxiliary_loss_clip": 0.01450894, + "auxiliary_loss_mlp": 0.01048373, + "balance_loss_clip": 1.30884552, + "balance_loss_mlp": 1.03008628, + "epoch": 0.3116188185780851, + "flos": 25671875315760.0, + "grad_norm": 2.2612899987723165, + "language_loss": 0.70460606, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72959876, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.18286133, + "step": 5183, + "time_per_iteration": 2.8015992641448975 + }, + { + "auxiliary_loss_clip": 0.01253806, + "auxiliary_loss_mlp": 0.01004169, + "balance_loss_clip": 1.19418669, + "balance_loss_mlp": 1.00161779, + "epoch": 0.31167894183075306, + "flos": 69524509388400.0, + "grad_norm": 0.9484965247638941, + "language_loss": 0.59690428, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61948407, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.0255127, + "step": 5184, + "time_per_iteration": 3.28639817237854 + }, + { + "auxiliary_loss_clip": 0.01448349, + "auxiliary_loss_mlp": 0.01055991, + "balance_loss_clip": 1.30844426, + "balance_loss_mlp": 1.03898001, + "epoch": 0.31173906508342103, + "flos": 14944037632200.0, + "grad_norm": 2.4140839999801518, + "language_loss": 0.7047168, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72976017, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.17004395, + "step": 5185, + "time_per_iteration": 2.756591796875 + }, + { + "auxiliary_loss_clip": 0.01446861, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.30666208, + "balance_loss_mlp": 1.03611088, + "epoch": 0.311799188336089, + "flos": 16179822399960.0, + "grad_norm": 2.4996324914494386, + "language_loss": 0.64415097, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66915774, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17712402, + "step": 5186, + "time_per_iteration": 2.7107129096984863 + }, + { + "auxiliary_loss_clip": 0.01453593, + "auxiliary_loss_mlp": 0.01054771, + "balance_loss_clip": 1.31078124, + "balance_loss_mlp": 1.03655577, + "epoch": 0.31185931158875696, + "flos": 25015097104200.0, + "grad_norm": 3.0368065196246397, + "language_loss": 0.86456108, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88964474, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.18225098, + "step": 5187, + "time_per_iteration": 2.7868387699127197 + }, + { + "auxiliary_loss_clip": 0.01450933, + "auxiliary_loss_mlp": 0.01053463, + "balance_loss_clip": 1.31058407, + "balance_loss_mlp": 1.03592753, + "epoch": 0.3119194348414249, + "flos": 37240037308800.0, + "grad_norm": 1.461662957522141, + "language_loss": 0.6317842, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.65682817, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1751709, + "step": 5188, + "time_per_iteration": 2.919978380203247 + }, + { + "auxiliary_loss_clip": 0.01454424, + "auxiliary_loss_mlp": 0.01052439, + "balance_loss_clip": 1.31436586, + "balance_loss_mlp": 1.03291273, + "epoch": 0.3119795580940929, + "flos": 15016976892720.0, + "grad_norm": 2.03351751120211, + "language_loss": 0.83355093, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85861957, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.19543457, + "step": 5189, + "time_per_iteration": 2.7645068168640137 + }, + { + "auxiliary_loss_clip": 0.01440191, + "auxiliary_loss_mlp": 0.01051771, + "balance_loss_clip": 1.30378461, + "balance_loss_mlp": 1.03466415, + "epoch": 0.31203968134676086, + "flos": 16002529989480.0, + "grad_norm": 1.6913683778106097, + "language_loss": 0.79561728, + "learning_rate": 3.22198537282789e-06, + "loss": 0.82053685, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.17102051, + "step": 5190, + "time_per_iteration": 2.763124704360962 + }, + { + "auxiliary_loss_clip": 0.01450755, + "auxiliary_loss_mlp": 0.01056661, + "balance_loss_clip": 1.31060362, + "balance_loss_mlp": 1.0387435, + "epoch": 0.3120998045994288, + "flos": 23842058898600.0, + "grad_norm": 1.4270591014474179, + "language_loss": 0.75442719, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77950132, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17919922, + "step": 5191, + "time_per_iteration": 2.7926185131073 + }, + { + "auxiliary_loss_clip": 0.01255296, + "auxiliary_loss_mlp": 0.01010584, + "balance_loss_clip": 1.19585776, + "balance_loss_mlp": 1.0078181, + "epoch": 0.3121599278520968, + "flos": 69198942563640.0, + "grad_norm": 0.8428019802478491, + "language_loss": 0.63932109, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66197991, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.02770996, + "step": 5192, + "time_per_iteration": 3.3652706146240234 + }, + { + "auxiliary_loss_clip": 0.01447739, + "auxiliary_loss_mlp": 0.01054679, + "balance_loss_clip": 1.30468035, + "balance_loss_mlp": 1.03548622, + "epoch": 0.31222005110476475, + "flos": 23811782043240.0, + "grad_norm": 1.5072996745663623, + "language_loss": 0.80350107, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82852519, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.19189453, + "step": 5193, + "time_per_iteration": 2.791874647140503 + }, + { + "auxiliary_loss_clip": 0.01449945, + "auxiliary_loss_mlp": 0.01050994, + "balance_loss_clip": 1.30887544, + "balance_loss_mlp": 1.03287435, + "epoch": 0.3122801743574327, + "flos": 25231315950720.0, + "grad_norm": 2.301072735689118, + "language_loss": 0.71817291, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.7431823, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.18115234, + "step": 5194, + "time_per_iteration": 2.860985517501831 + }, + { + "auxiliary_loss_clip": 0.01447962, + "auxiliary_loss_mlp": 0.01050918, + "balance_loss_clip": 1.31061697, + "balance_loss_mlp": 1.03196335, + "epoch": 0.3123402976101007, + "flos": 22971701383920.0, + "grad_norm": 1.4493888224056948, + "language_loss": 0.77098703, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.7959758, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.18969727, + "step": 5195, + "time_per_iteration": 2.8833673000335693 + }, + { + "auxiliary_loss_clip": 0.01453651, + "auxiliary_loss_mlp": 0.01050406, + "balance_loss_clip": 1.31116843, + "balance_loss_mlp": 1.03259623, + "epoch": 0.3124004208627687, + "flos": 25197587384760.0, + "grad_norm": 2.325432589084583, + "language_loss": 0.78387761, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80891824, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.17810059, + "step": 5196, + "time_per_iteration": 2.7983388900756836 + }, + { + "auxiliary_loss_clip": 0.01254833, + "auxiliary_loss_mlp": 0.01008604, + "balance_loss_clip": 1.19560945, + "balance_loss_mlp": 1.00594521, + "epoch": 0.31246054411543667, + "flos": 67501742715000.0, + "grad_norm": 0.7843187362339145, + "language_loss": 0.54814738, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.57078177, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.02661133, + "step": 5197, + "time_per_iteration": 3.3600175380706787 + }, + { + "auxiliary_loss_clip": 0.01442749, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.30463266, + "balance_loss_mlp": 1.0286684, + "epoch": 0.31252066736810463, + "flos": 17863069953960.0, + "grad_norm": 1.583935007825819, + "language_loss": 0.66789877, + "learning_rate": 3.21951739516552e-06, + "loss": 0.69279283, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17993164, + "step": 5198, + "time_per_iteration": 2.762650966644287 + }, + { + "auxiliary_loss_clip": 0.01450359, + "auxiliary_loss_mlp": 0.0104582, + "balance_loss_clip": 1.30837536, + "balance_loss_mlp": 1.02728331, + "epoch": 0.3125807906207726, + "flos": 18479175570000.0, + "grad_norm": 2.0071665467924724, + "language_loss": 0.69834995, + "learning_rate": 3.219208689735857e-06, + "loss": 0.72331172, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.1854248, + "step": 5199, + "time_per_iteration": 4.154151201248169 + }, + { + "auxiliary_loss_clip": 0.01442418, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.30360579, + "balance_loss_mlp": 1.03283739, + "epoch": 0.31264091387344056, + "flos": 18950377265640.0, + "grad_norm": 1.7069790667868714, + "language_loss": 0.78935063, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81428301, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.17980957, + "step": 5200, + "time_per_iteration": 2.771653175354004 + }, + { + "auxiliary_loss_clip": 0.01443898, + "auxiliary_loss_mlp": 0.01038936, + "balance_loss_clip": 1.30782294, + "balance_loss_mlp": 1.02087569, + "epoch": 0.3127010371261085, + "flos": 21473258787000.0, + "grad_norm": 1.887402631723736, + "language_loss": 0.8410455, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86587381, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.18054199, + "step": 5201, + "time_per_iteration": 2.779989004135132 + }, + { + "auxiliary_loss_clip": 0.01445873, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.30631304, + "balance_loss_mlp": 1.03374338, + "epoch": 0.3127611603787765, + "flos": 15339619915560.0, + "grad_norm": 1.9500587538683989, + "language_loss": 0.69233179, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71731728, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.18933105, + "step": 5202, + "time_per_iteration": 2.7807979583740234 + }, + { + "auxiliary_loss_clip": 0.01448281, + "auxiliary_loss_mlp": 0.01044423, + "balance_loss_clip": 1.30741692, + "balance_loss_mlp": 1.02753103, + "epoch": 0.31282128363144446, + "flos": 17607477979440.0, + "grad_norm": 1.6559984184373584, + "language_loss": 0.8437922, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86871922, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16894531, + "step": 5203, + "time_per_iteration": 2.73614764213562 + }, + { + "auxiliary_loss_clip": 0.01455568, + "auxiliary_loss_mlp": 0.01047249, + "balance_loss_clip": 1.31388259, + "balance_loss_mlp": 1.02890313, + "epoch": 0.3128814068841124, + "flos": 26761903387560.0, + "grad_norm": 1.9703485563270986, + "language_loss": 0.61440384, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.63943195, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.18334961, + "step": 5204, + "time_per_iteration": 2.8227202892303467 + }, + { + "auxiliary_loss_clip": 0.01444056, + "auxiliary_loss_mlp": 0.01041808, + "balance_loss_clip": 1.30772805, + "balance_loss_mlp": 1.02511883, + "epoch": 0.3129415301367804, + "flos": 22277377420560.0, + "grad_norm": 1.602865440078457, + "language_loss": 0.65844917, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68330777, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.16687012, + "step": 5205, + "time_per_iteration": 4.221550941467285 + }, + { + "auxiliary_loss_clip": 0.0144939, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_clip": 1.30900097, + "balance_loss_mlp": 1.0288229, + "epoch": 0.31300165338944835, + "flos": 26470024520400.0, + "grad_norm": 1.479775529949224, + "language_loss": 0.76658314, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79156852, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.20324707, + "step": 5206, + "time_per_iteration": 2.785818338394165 + }, + { + "auxiliary_loss_clip": 0.01447462, + "auxiliary_loss_mlp": 0.01039434, + "balance_loss_clip": 1.30933821, + "balance_loss_mlp": 1.02242243, + "epoch": 0.3130617766421163, + "flos": 21949698961080.0, + "grad_norm": 1.9720034206384613, + "language_loss": 0.83282477, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85769367, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.17004395, + "step": 5207, + "time_per_iteration": 2.7631256580352783 + }, + { + "auxiliary_loss_clip": 0.01441297, + "auxiliary_loss_mlp": 0.01043435, + "balance_loss_clip": 1.306723, + "balance_loss_mlp": 1.02618551, + "epoch": 0.3131218998947843, + "flos": 23297755509000.0, + "grad_norm": 1.3827496520510865, + "language_loss": 0.71790814, + "learning_rate": 3.216428261810999e-06, + "loss": 0.74275553, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.17248535, + "step": 5208, + "time_per_iteration": 2.8061206340789795 + }, + { + "auxiliary_loss_clip": 0.01452077, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.31291318, + "balance_loss_mlp": 1.02748299, + "epoch": 0.3131820231474523, + "flos": 21144280860000.0, + "grad_norm": 1.8434212830871297, + "language_loss": 0.7453388, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.7703079, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.17370605, + "step": 5209, + "time_per_iteration": 2.878185272216797 + }, + { + "auxiliary_loss_clip": 0.01445929, + "auxiliary_loss_mlp": 0.01047557, + "balance_loss_clip": 1.30517507, + "balance_loss_mlp": 1.03054595, + "epoch": 0.31324214640012027, + "flos": 23914510858800.0, + "grad_norm": 1.8841017836077834, + "language_loss": 0.77168167, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79661655, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17004395, + "step": 5210, + "time_per_iteration": 2.796722650527954 + }, + { + "auxiliary_loss_clip": 0.01440091, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.30596185, + "balance_loss_mlp": 1.02378726, + "epoch": 0.31330226965278823, + "flos": 22242065128560.0, + "grad_norm": 2.6290110430547666, + "language_loss": 0.79491007, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81972754, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.17858887, + "step": 5211, + "time_per_iteration": 4.345225095748901 + }, + { + "auxiliary_loss_clip": 0.01439442, + "auxiliary_loss_mlp": 0.01039506, + "balance_loss_clip": 1.3021872, + "balance_loss_mlp": 1.02315068, + "epoch": 0.3133623929054562, + "flos": 19758556735200.0, + "grad_norm": 1.6386125824364217, + "language_loss": 0.79479432, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81958377, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16369629, + "step": 5212, + "time_per_iteration": 2.815610408782959 + }, + { + "auxiliary_loss_clip": 0.0145113, + "auxiliary_loss_mlp": 0.01055981, + "balance_loss_clip": 1.31082559, + "balance_loss_mlp": 1.03752708, + "epoch": 0.31342251615812416, + "flos": 27168003236160.0, + "grad_norm": 1.8873208876679137, + "language_loss": 0.71349549, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73856664, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.18444824, + "step": 5213, + "time_per_iteration": 4.367974519729614 + }, + { + "auxiliary_loss_clip": 0.01452493, + "auxiliary_loss_mlp": 0.01048565, + "balance_loss_clip": 1.31205893, + "balance_loss_mlp": 1.03039789, + "epoch": 0.31348263941079213, + "flos": 20234631434040.0, + "grad_norm": 2.0673242285008544, + "language_loss": 0.77868271, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.80369329, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.18164062, + "step": 5214, + "time_per_iteration": 2.851015567779541 + }, + { + "auxiliary_loss_clip": 0.0143812, + "auxiliary_loss_mlp": 0.01038709, + "balance_loss_clip": 1.30471742, + "balance_loss_mlp": 1.02253222, + "epoch": 0.3135427626634601, + "flos": 24612976874880.0, + "grad_norm": 1.6393078445386293, + "language_loss": 0.82830173, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.85307002, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.16186523, + "step": 5215, + "time_per_iteration": 2.802469253540039 + }, + { + "auxiliary_loss_clip": 0.01443119, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_clip": 1.30398381, + "balance_loss_mlp": 1.03167224, + "epoch": 0.31360288591612806, + "flos": 20964836206440.0, + "grad_norm": 2.0797192551676495, + "language_loss": 0.79699802, + "learning_rate": 3.213953633415686e-06, + "loss": 0.82192862, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.18273926, + "step": 5216, + "time_per_iteration": 2.788517475128174 + }, + { + "auxiliary_loss_clip": 0.01452623, + "auxiliary_loss_mlp": 0.01050829, + "balance_loss_clip": 1.31172216, + "balance_loss_mlp": 1.0311352, + "epoch": 0.313663009168796, + "flos": 26986325122800.0, + "grad_norm": 1.6126343051007617, + "language_loss": 0.68260872, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70764327, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.19689941, + "step": 5217, + "time_per_iteration": 2.802896022796631 + }, + { + "auxiliary_loss_clip": 0.01450202, + "auxiliary_loss_mlp": 0.01043119, + "balance_loss_clip": 1.3105073, + "balance_loss_mlp": 1.02553606, + "epoch": 0.313723132421464, + "flos": 18045479017800.0, + "grad_norm": 1.532480849555289, + "language_loss": 0.80979133, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83472455, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.17578125, + "step": 5218, + "time_per_iteration": 2.7530202865600586 + }, + { + "auxiliary_loss_clip": 0.01451339, + "auxiliary_loss_mlp": 0.01048562, + "balance_loss_clip": 1.31172562, + "balance_loss_mlp": 1.03034699, + "epoch": 0.31378325567413196, + "flos": 22493636875440.0, + "grad_norm": 2.1659146544085046, + "language_loss": 0.6971572, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.72215623, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.18212891, + "step": 5219, + "time_per_iteration": 2.7294669151306152 + }, + { + "auxiliary_loss_clip": 0.01446916, + "auxiliary_loss_mlp": 0.0104994, + "balance_loss_clip": 1.30979776, + "balance_loss_mlp": 1.03320312, + "epoch": 0.3138433789267999, + "flos": 22424392975680.0, + "grad_norm": 2.9323916462149797, + "language_loss": 0.79781699, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.8227855, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16723633, + "step": 5220, + "time_per_iteration": 2.8058879375457764 + }, + { + "auxiliary_loss_clip": 0.01449771, + "auxiliary_loss_mlp": 0.01045253, + "balance_loss_clip": 1.31126666, + "balance_loss_mlp": 1.02837265, + "epoch": 0.3139035021794679, + "flos": 13009827456720.0, + "grad_norm": 1.9482223265089824, + "language_loss": 0.73464608, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75959635, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16870117, + "step": 5221, + "time_per_iteration": 2.715855121612549 + }, + { + "auxiliary_loss_clip": 0.01444667, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_clip": 1.30994236, + "balance_loss_mlp": 1.02731955, + "epoch": 0.31396362543213585, + "flos": 16950374901000.0, + "grad_norm": 1.9028135664036756, + "language_loss": 0.82275951, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84764987, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.17041016, + "step": 5222, + "time_per_iteration": 2.7543399333953857 + }, + { + "auxiliary_loss_clip": 0.01453456, + "auxiliary_loss_mlp": 0.01048954, + "balance_loss_clip": 1.31231737, + "balance_loss_mlp": 1.03026187, + "epoch": 0.31402374868480387, + "flos": 20161529740080.0, + "grad_norm": 1.9289591642604393, + "language_loss": 0.70152223, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72654629, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.18701172, + "step": 5223, + "time_per_iteration": 2.753366708755493 + }, + { + "auxiliary_loss_clip": 0.01444979, + "auxiliary_loss_mlp": 0.01045634, + "balance_loss_clip": 1.30780482, + "balance_loss_mlp": 1.0290879, + "epoch": 0.31408387193747184, + "flos": 21255984123120.0, + "grad_norm": 1.5090883195062663, + "language_loss": 0.8043015, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82920766, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.16540527, + "step": 5224, + "time_per_iteration": 2.7546744346618652 + }, + { + "auxiliary_loss_clip": 0.01460226, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_clip": 1.31665874, + "balance_loss_mlp": 1.03695595, + "epoch": 0.3141439951901398, + "flos": 27489224966400.0, + "grad_norm": 2.590248559158262, + "language_loss": 0.5811255, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60628331, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.18591309, + "step": 5225, + "time_per_iteration": 2.872114658355713 + }, + { + "auxiliary_loss_clip": 0.014461, + "auxiliary_loss_mlp": 0.01038984, + "balance_loss_clip": 1.3123008, + "balance_loss_mlp": 1.02276003, + "epoch": 0.31420411844280777, + "flos": 17855963490960.0, + "grad_norm": 1.6952285945590793, + "language_loss": 0.81876713, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84361798, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.16210938, + "step": 5226, + "time_per_iteration": 2.7375802993774414 + }, + { + "auxiliary_loss_clip": 0.01453841, + "auxiliary_loss_mlp": 0.01051049, + "balance_loss_clip": 1.31511474, + "balance_loss_mlp": 1.03254795, + "epoch": 0.31426424169547573, + "flos": 21622061109960.0, + "grad_norm": 1.8375903672304956, + "language_loss": 0.74353433, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76858324, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.18505859, + "step": 5227, + "time_per_iteration": 2.772592782974243 + }, + { + "auxiliary_loss_clip": 0.01456241, + "auxiliary_loss_mlp": 0.01044914, + "balance_loss_clip": 1.31743991, + "balance_loss_mlp": 1.0279268, + "epoch": 0.3143243649481437, + "flos": 30926913175440.0, + "grad_norm": 1.6961385246760323, + "language_loss": 0.68064165, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.70565319, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.1697998, + "step": 5228, + "time_per_iteration": 2.8184051513671875 + }, + { + "auxiliary_loss_clip": 0.01455683, + "auxiliary_loss_mlp": 0.01049142, + "balance_loss_clip": 1.31606197, + "balance_loss_mlp": 1.03247643, + "epoch": 0.31438448820081166, + "flos": 22826716246800.0, + "grad_norm": 1.6353887847803552, + "language_loss": 0.79786801, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82291621, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16650391, + "step": 5229, + "time_per_iteration": 2.7922050952911377 + }, + { + "auxiliary_loss_clip": 0.01454332, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.31843758, + "balance_loss_mlp": 1.02320194, + "epoch": 0.3144446114534796, + "flos": 23297024558520.0, + "grad_norm": 1.798673788431321, + "language_loss": 0.69951129, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72446227, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.17553711, + "step": 5230, + "time_per_iteration": 2.7766315937042236 + }, + { + "auxiliary_loss_clip": 0.01456582, + "auxiliary_loss_mlp": 0.0104423, + "balance_loss_clip": 1.31643593, + "balance_loss_mlp": 1.02633703, + "epoch": 0.3145047347061476, + "flos": 31361990411880.0, + "grad_norm": 1.5783213987422624, + "language_loss": 0.79408038, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81908846, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17883301, + "step": 5231, + "time_per_iteration": 2.889754056930542 + }, + { + "auxiliary_loss_clip": 0.01449901, + "auxiliary_loss_mlp": 0.01045478, + "balance_loss_clip": 1.31293535, + "balance_loss_mlp": 1.0282526, + "epoch": 0.31456485795881556, + "flos": 10893167609040.0, + "grad_norm": 2.0838181578459642, + "language_loss": 0.85258293, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87753671, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.17236328, + "step": 5232, + "time_per_iteration": 2.754223585128784 + }, + { + "auxiliary_loss_clip": 0.01445399, + "auxiliary_loss_mlp": 0.01049101, + "balance_loss_clip": 1.30987287, + "balance_loss_mlp": 1.03180361, + "epoch": 0.3146249812114835, + "flos": 17096903155800.0, + "grad_norm": 1.5928738969984835, + "language_loss": 0.80046552, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82541054, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.17297363, + "step": 5233, + "time_per_iteration": 2.750657796859741 + }, + { + "auxiliary_loss_clip": 0.01457923, + "auxiliary_loss_mlp": 0.0104026, + "balance_loss_clip": 1.31766593, + "balance_loss_mlp": 1.02345109, + "epoch": 0.3146851044641515, + "flos": 55300686872040.0, + "grad_norm": 1.71158175747085, + "language_loss": 0.70930576, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.73428762, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.16809082, + "step": 5234, + "time_per_iteration": 3.0994677543640137 + }, + { + "auxiliary_loss_clip": 0.01454924, + "auxiliary_loss_mlp": 0.01041214, + "balance_loss_clip": 1.31474054, + "balance_loss_mlp": 1.02389276, + "epoch": 0.31474522771681945, + "flos": 27021556198080.0, + "grad_norm": 1.9468264113157958, + "language_loss": 0.72694498, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.75190639, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.17321777, + "step": 5235, + "time_per_iteration": 2.777264356613159 + }, + { + "auxiliary_loss_clip": 0.0145066, + "auxiliary_loss_mlp": 0.01045742, + "balance_loss_clip": 1.31377697, + "balance_loss_mlp": 1.02856362, + "epoch": 0.3148053509694875, + "flos": 21256958723760.0, + "grad_norm": 1.8351662788161107, + "language_loss": 0.79031086, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81527495, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.17175293, + "step": 5236, + "time_per_iteration": 2.796863555908203 + }, + { + "auxiliary_loss_clip": 0.01457626, + "auxiliary_loss_mlp": 0.01040969, + "balance_loss_clip": 1.31535363, + "balance_loss_mlp": 1.02445829, + "epoch": 0.31486547422215544, + "flos": 31254957110160.0, + "grad_norm": 1.4489919673791207, + "language_loss": 0.76052809, + "learning_rate": 3.207443732256881e-06, + "loss": 0.785514, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.16516113, + "step": 5237, + "time_per_iteration": 2.8197429180145264 + }, + { + "auxiliary_loss_clip": 0.01447926, + "auxiliary_loss_mlp": 0.01042109, + "balance_loss_clip": 1.31304169, + "balance_loss_mlp": 1.02607489, + "epoch": 0.3149255974748234, + "flos": 19833404588640.0, + "grad_norm": 1.7374059071887635, + "language_loss": 0.79702926, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82192957, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.16040039, + "step": 5238, + "time_per_iteration": 4.192685604095459 + }, + { + "auxiliary_loss_clip": 0.01282163, + "auxiliary_loss_mlp": 0.01010909, + "balance_loss_clip": 1.2236352, + "balance_loss_mlp": 1.00707018, + "epoch": 0.31498572072749137, + "flos": 67697941640040.0, + "grad_norm": 0.8531333207202847, + "language_loss": 0.67967337, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70260406, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.03833008, + "step": 5239, + "time_per_iteration": 3.244295120239258 + }, + { + "auxiliary_loss_clip": 0.01468275, + "auxiliary_loss_mlp": 0.01047558, + "balance_loss_clip": 1.32436717, + "balance_loss_mlp": 1.02834129, + "epoch": 0.31504584398015933, + "flos": 19798051688280.0, + "grad_norm": 2.5196780009433923, + "language_loss": 0.83270431, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.85786265, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.19213867, + "step": 5240, + "time_per_iteration": 2.7345924377441406 + }, + { + "auxiliary_loss_clip": 0.0145141, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_clip": 1.3138448, + "balance_loss_mlp": 1.02623844, + "epoch": 0.3151059672328273, + "flos": 26621141519880.0, + "grad_norm": 1.7288779511329413, + "language_loss": 0.81585449, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.84080279, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.17175293, + "step": 5241, + "time_per_iteration": 2.8075146675109863 + }, + { + "auxiliary_loss_clip": 0.01453631, + "auxiliary_loss_mlp": 0.01041258, + "balance_loss_clip": 1.31918931, + "balance_loss_mlp": 1.024616, + "epoch": 0.31516609048549526, + "flos": 24209475961320.0, + "grad_norm": 1.5688938039375906, + "language_loss": 0.74491704, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76986593, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.16638184, + "step": 5242, + "time_per_iteration": 2.8109664916992188 + }, + { + "auxiliary_loss_clip": 0.01455225, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.31824732, + "balance_loss_mlp": 1.01950645, + "epoch": 0.31522621373816323, + "flos": 25964363308320.0, + "grad_norm": 2.089995852040701, + "language_loss": 0.7448175, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.76974469, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.17993164, + "step": 5243, + "time_per_iteration": 2.793119192123413 + }, + { + "auxiliary_loss_clip": 0.01451105, + "auxiliary_loss_mlp": 0.01039608, + "balance_loss_clip": 1.31307709, + "balance_loss_mlp": 1.02278733, + "epoch": 0.3152863369908312, + "flos": 21913939977120.0, + "grad_norm": 1.7457847164990423, + "language_loss": 0.64593339, + "learning_rate": 3.205269272758513e-06, + "loss": 0.6708405, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.16821289, + "step": 5244, + "time_per_iteration": 4.209145545959473 + }, + { + "auxiliary_loss_clip": 0.01461576, + "auxiliary_loss_mlp": 0.01044147, + "balance_loss_clip": 1.32115316, + "balance_loss_mlp": 1.02797079, + "epoch": 0.31534646024349916, + "flos": 16284297375000.0, + "grad_norm": 1.9973607109257787, + "language_loss": 0.91403687, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93909407, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.16162109, + "step": 5245, + "time_per_iteration": 2.7172086238861084 + }, + { + "auxiliary_loss_clip": 0.01454286, + "auxiliary_loss_mlp": 0.01049592, + "balance_loss_clip": 1.31548238, + "balance_loss_mlp": 1.03219914, + "epoch": 0.3154065834961671, + "flos": 24722609111640.0, + "grad_norm": 1.854143495777694, + "language_loss": 0.75334871, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77838743, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.1739502, + "step": 5246, + "time_per_iteration": 2.812814950942993 + }, + { + "auxiliary_loss_clip": 0.01455803, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.31730223, + "balance_loss_mlp": 1.03737855, + "epoch": 0.3154667067488351, + "flos": 35377913618280.0, + "grad_norm": 1.5904345410046794, + "language_loss": 0.62058449, + "learning_rate": 3.204336675750321e-06, + "loss": 0.64568883, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.17248535, + "step": 5247, + "time_per_iteration": 2.8807082176208496 + }, + { + "auxiliary_loss_clip": 0.0145638, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_clip": 1.31593633, + "balance_loss_mlp": 1.02533042, + "epoch": 0.31552683000150306, + "flos": 17460868507920.0, + "grad_norm": 1.9349647187926406, + "language_loss": 0.83083665, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.85582459, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.17089844, + "step": 5248, + "time_per_iteration": 2.7124295234680176 + }, + { + "auxiliary_loss_clip": 0.01450354, + "auxiliary_loss_mlp": 0.01045924, + "balance_loss_clip": 1.31270695, + "balance_loss_mlp": 1.02772093, + "epoch": 0.3155869532541711, + "flos": 18410378362200.0, + "grad_norm": 1.9529267056218154, + "language_loss": 0.85173821, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87670094, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.18188477, + "step": 5249, + "time_per_iteration": 2.769292116165161 + }, + { + "auxiliary_loss_clip": 0.01455325, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.31647038, + "balance_loss_mlp": 1.02034807, + "epoch": 0.31564707650683904, + "flos": 21584759008320.0, + "grad_norm": 2.012927998697654, + "language_loss": 0.85715908, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.88209713, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.18127441, + "step": 5250, + "time_per_iteration": 4.335885524749756 + }, + { + "auxiliary_loss_clip": 0.01451981, + "auxiliary_loss_mlp": 0.01049751, + "balance_loss_clip": 1.31478894, + "balance_loss_mlp": 1.03262043, + "epoch": 0.315707199759507, + "flos": 21035704440600.0, + "grad_norm": 2.176328298339521, + "language_loss": 0.68404502, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70906234, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.17126465, + "step": 5251, + "time_per_iteration": 2.7386889457702637 + }, + { + "auxiliary_loss_clip": 0.01451124, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.31410646, + "balance_loss_mlp": 1.02779365, + "epoch": 0.31576732301217497, + "flos": 26833908655800.0, + "grad_norm": 2.0435748777011367, + "language_loss": 0.78860092, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81355548, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.1652832, + "step": 5252, + "time_per_iteration": 4.3746113777160645 + }, + { + "auxiliary_loss_clip": 0.01448454, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_clip": 1.31285226, + "balance_loss_mlp": 1.03547835, + "epoch": 0.31582744626484294, + "flos": 22716353059560.0, + "grad_norm": 1.585908139564609, + "language_loss": 0.73933035, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76433033, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.16070557, + "step": 5253, + "time_per_iteration": 2.831160068511963 + }, + { + "auxiliary_loss_clip": 0.01461622, + "auxiliary_loss_mlp": 0.01045351, + "balance_loss_clip": 1.32227206, + "balance_loss_mlp": 1.02864981, + "epoch": 0.3158875695175109, + "flos": 23957295089040.0, + "grad_norm": 1.701456478250049, + "language_loss": 0.73733437, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.76240414, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.16711426, + "step": 5254, + "time_per_iteration": 2.8626980781555176 + }, + { + "auxiliary_loss_clip": 0.01451296, + "auxiliary_loss_mlp": 0.01049903, + "balance_loss_clip": 1.31070423, + "balance_loss_mlp": 1.0333147, + "epoch": 0.31594769277017887, + "flos": 13265906731560.0, + "grad_norm": 2.8322408553340908, + "language_loss": 0.78491157, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80992353, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.16583252, + "step": 5255, + "time_per_iteration": 2.782061815261841 + }, + { + "auxiliary_loss_clip": 0.01449222, + "auxiliary_loss_mlp": 0.01046471, + "balance_loss_clip": 1.31230628, + "balance_loss_mlp": 1.02813709, + "epoch": 0.31600781602284683, + "flos": 23373577963080.0, + "grad_norm": 2.6072917935071542, + "language_loss": 0.78298926, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80794621, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.18334961, + "step": 5256, + "time_per_iteration": 2.820542573928833 + }, + { + "auxiliary_loss_clip": 0.01441472, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.30998683, + "balance_loss_mlp": 1.02665865, + "epoch": 0.3160679392755148, + "flos": 19833363980280.0, + "grad_norm": 1.5163524142930989, + "language_loss": 0.71864587, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.74347609, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.14892578, + "step": 5257, + "time_per_iteration": 2.79451847076416 + }, + { + "auxiliary_loss_clip": 0.01452699, + "auxiliary_loss_mlp": 0.01048447, + "balance_loss_clip": 1.31421232, + "balance_loss_mlp": 1.0311023, + "epoch": 0.31612806252818276, + "flos": 20198019674520.0, + "grad_norm": 2.0669920774772774, + "language_loss": 0.77112412, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79613554, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.17346191, + "step": 5258, + "time_per_iteration": 2.736445665359497 + }, + { + "auxiliary_loss_clip": 0.01456028, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_clip": 1.31662679, + "balance_loss_mlp": 1.03815126, + "epoch": 0.31618818578085073, + "flos": 24240361942080.0, + "grad_norm": 2.2012692955726703, + "language_loss": 0.73754716, + "learning_rate": 3.200602180731467e-06, + "loss": 0.76266271, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17370605, + "step": 5259, + "time_per_iteration": 2.790738821029663 + }, + { + "auxiliary_loss_clip": 0.01457942, + "auxiliary_loss_mlp": 0.01058339, + "balance_loss_clip": 1.31697202, + "balance_loss_mlp": 1.04192424, + "epoch": 0.3162483090335187, + "flos": 25087224197520.0, + "grad_norm": 1.6375388841427228, + "language_loss": 0.66587836, + "learning_rate": 3.20029067660664e-06, + "loss": 0.69104117, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.16418457, + "step": 5260, + "time_per_iteration": 2.7860360145568848 + }, + { + "auxiliary_loss_clip": 0.01449746, + "auxiliary_loss_mlp": 0.01039901, + "balance_loss_clip": 1.31113589, + "balance_loss_mlp": 1.023772, + "epoch": 0.31630843228618666, + "flos": 26329059610920.0, + "grad_norm": 1.7134710198800525, + "language_loss": 0.72642195, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.75131845, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.16125488, + "step": 5261, + "time_per_iteration": 2.8143036365509033 + }, + { + "auxiliary_loss_clip": 0.01282305, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.22286737, + "balance_loss_mlp": 1.00860786, + "epoch": 0.3163685555388547, + "flos": 66775314520080.0, + "grad_norm": 0.7618198500837311, + "language_loss": 0.50648052, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52942395, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.03442383, + "step": 5262, + "time_per_iteration": 3.28983473777771 + }, + { + "auxiliary_loss_clip": 0.01455772, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.31673646, + "balance_loss_mlp": 1.03535497, + "epoch": 0.31642867879152264, + "flos": 26001381151440.0, + "grad_norm": 1.4890784764595648, + "language_loss": 0.85395789, + "learning_rate": 3.19935589118856e-06, + "loss": 0.8790319, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.16259766, + "step": 5263, + "time_per_iteration": 2.7823705673217773 + }, + { + "auxiliary_loss_clip": 0.0143614, + "auxiliary_loss_mlp": 0.01045079, + "balance_loss_clip": 1.3025521, + "balance_loss_mlp": 1.02942729, + "epoch": 0.3164888020441906, + "flos": 25780411126800.0, + "grad_norm": 1.5101023707212968, + "language_loss": 0.81947422, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84428644, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15637207, + "step": 5264, + "time_per_iteration": 2.803311347961426 + }, + { + "auxiliary_loss_clip": 0.01457095, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.31638205, + "balance_loss_mlp": 1.02189088, + "epoch": 0.3165489252968586, + "flos": 19761074453520.0, + "grad_norm": 1.771299473632701, + "language_loss": 0.79907292, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82403815, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17529297, + "step": 5265, + "time_per_iteration": 2.738220453262329 + }, + { + "auxiliary_loss_clip": 0.01453989, + "auxiliary_loss_mlp": 0.01050326, + "balance_loss_clip": 1.31403804, + "balance_loss_mlp": 1.03313637, + "epoch": 0.31660904854952654, + "flos": 23188976047800.0, + "grad_norm": 2.2469758058123097, + "language_loss": 0.75217599, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77721912, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17211914, + "step": 5266, + "time_per_iteration": 2.797830820083618 + }, + { + "auxiliary_loss_clip": 0.01452447, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.31201506, + "balance_loss_mlp": 1.02878523, + "epoch": 0.3166691718021945, + "flos": 20412857836800.0, + "grad_norm": 2.233623312388555, + "language_loss": 0.79343557, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81841183, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.16381836, + "step": 5267, + "time_per_iteration": 2.7734317779541016 + }, + { + "auxiliary_loss_clip": 0.01276745, + "auxiliary_loss_mlp": 0.01007263, + "balance_loss_clip": 1.21833515, + "balance_loss_mlp": 1.00351965, + "epoch": 0.31672929505486247, + "flos": 70161203442960.0, + "grad_norm": 0.7383265472007815, + "language_loss": 0.57887924, + "learning_rate": 3.197797006055478e-06, + "loss": 0.60171932, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.03735352, + "step": 5268, + "time_per_iteration": 3.2785513401031494 + }, + { + "auxiliary_loss_clip": 0.01451131, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.31123662, + "balance_loss_mlp": 1.0242722, + "epoch": 0.31678941830753043, + "flos": 14359833205920.0, + "grad_norm": 2.282752193618753, + "language_loss": 0.73411179, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75903869, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.17297363, + "step": 5269, + "time_per_iteration": 2.7580151557922363 + }, + { + "auxiliary_loss_clip": 0.01445316, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.30726647, + "balance_loss_mlp": 1.0278461, + "epoch": 0.3168495415601984, + "flos": 22752924210720.0, + "grad_norm": 1.7841502013045898, + "language_loss": 0.79861772, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82351869, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.16943359, + "step": 5270, + "time_per_iteration": 2.76676869392395 + }, + { + "auxiliary_loss_clip": 0.01452382, + "auxiliary_loss_mlp": 0.01045415, + "balance_loss_clip": 1.3119328, + "balance_loss_mlp": 1.02748609, + "epoch": 0.31690966481286637, + "flos": 20119395243600.0, + "grad_norm": 2.4052322233125536, + "language_loss": 0.79970634, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.82468432, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17932129, + "step": 5271, + "time_per_iteration": 2.7735471725463867 + }, + { + "auxiliary_loss_clip": 0.01445908, + "auxiliary_loss_mlp": 0.01041651, + "balance_loss_clip": 1.30794394, + "balance_loss_mlp": 1.0240078, + "epoch": 0.31696978806553433, + "flos": 21183938246520.0, + "grad_norm": 2.179995749199923, + "language_loss": 0.72904134, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75391698, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.1763916, + "step": 5272, + "time_per_iteration": 2.772709846496582 + }, + { + "auxiliary_loss_clip": 0.01455661, + "auxiliary_loss_mlp": 0.01043828, + "balance_loss_clip": 1.31292784, + "balance_loss_mlp": 1.02476621, + "epoch": 0.3170299113182023, + "flos": 43004837824920.0, + "grad_norm": 2.3290111548824255, + "language_loss": 0.69038516, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71538001, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.19067383, + "step": 5273, + "time_per_iteration": 3.0426852703094482 + }, + { + "auxiliary_loss_clip": 0.01447229, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.30933642, + "balance_loss_mlp": 1.02007747, + "epoch": 0.31709003457087026, + "flos": 24465270977640.0, + "grad_norm": 3.3692755155359575, + "language_loss": 0.68439651, + "learning_rate": 3.195924845146795e-06, + "loss": 0.70924985, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.18017578, + "step": 5274, + "time_per_iteration": 2.8473892211914062 + }, + { + "auxiliary_loss_clip": 0.01434725, + "auxiliary_loss_mlp": 0.01042677, + "balance_loss_clip": 1.30076563, + "balance_loss_mlp": 1.02642834, + "epoch": 0.3171501578235382, + "flos": 24140841186960.0, + "grad_norm": 1.4505416682754286, + "language_loss": 0.80969363, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83446765, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.16259766, + "step": 5275, + "time_per_iteration": 2.7875263690948486 + }, + { + "auxiliary_loss_clip": 0.01448596, + "auxiliary_loss_mlp": 0.01052494, + "balance_loss_clip": 1.30837512, + "balance_loss_mlp": 1.03440976, + "epoch": 0.31721028107620625, + "flos": 18884341426320.0, + "grad_norm": 1.7701635379315073, + "language_loss": 0.73387206, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.75888294, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.1809082, + "step": 5276, + "time_per_iteration": 2.752640962600708 + }, + { + "auxiliary_loss_clip": 0.01441679, + "auxiliary_loss_mlp": 0.01039796, + "balance_loss_clip": 1.30687666, + "balance_loss_mlp": 1.02279711, + "epoch": 0.3172704043288742, + "flos": 23152973413680.0, + "grad_norm": 1.4211239089647854, + "language_loss": 0.7811715, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80598623, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.16992188, + "step": 5277, + "time_per_iteration": 4.341200113296509 + }, + { + "auxiliary_loss_clip": 0.01449949, + "auxiliary_loss_mlp": 0.01040505, + "balance_loss_clip": 1.31001687, + "balance_loss_mlp": 1.02187312, + "epoch": 0.3173305275815422, + "flos": 17863069953960.0, + "grad_norm": 1.682085139416844, + "language_loss": 0.79188365, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.8167882, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1862793, + "step": 5278, + "time_per_iteration": 2.7509207725524902 + }, + { + "auxiliary_loss_clip": 0.01284341, + "auxiliary_loss_mlp": 0.01003935, + "balance_loss_clip": 1.22571647, + "balance_loss_mlp": 1.00021577, + "epoch": 0.31739065083421014, + "flos": 59984880455880.0, + "grad_norm": 2.0291652603785115, + "language_loss": 0.62838781, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.65127063, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.03710938, + "step": 5279, + "time_per_iteration": 3.086566209793091 + }, + { + "auxiliary_loss_clip": 0.01455477, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_clip": 1.31320059, + "balance_loss_mlp": 1.02350044, + "epoch": 0.3174507740868781, + "flos": 23805690789240.0, + "grad_norm": 1.5192881398258997, + "language_loss": 0.81469655, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83967459, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.18811035, + "step": 5280, + "time_per_iteration": 2.772193670272827 + }, + { + "auxiliary_loss_clip": 0.01447551, + "auxiliary_loss_mlp": 0.0105249, + "balance_loss_clip": 1.31145227, + "balance_loss_mlp": 1.03512096, + "epoch": 0.31751089733954607, + "flos": 27645011927280.0, + "grad_norm": 1.4234738299196952, + "language_loss": 0.7812953, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80629569, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.17382812, + "step": 5281, + "time_per_iteration": 2.8570713996887207 + }, + { + "auxiliary_loss_clip": 0.01451132, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.31292725, + "balance_loss_mlp": 1.02833724, + "epoch": 0.31757102059221404, + "flos": 23774601766680.0, + "grad_norm": 1.5129297705495566, + "language_loss": 0.78440732, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80938017, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17822266, + "step": 5282, + "time_per_iteration": 2.771253824234009 + }, + { + "auxiliary_loss_clip": 0.01455883, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.31292069, + "balance_loss_mlp": 1.02902496, + "epoch": 0.317631143844882, + "flos": 25269958128240.0, + "grad_norm": 2.4865546210901353, + "language_loss": 0.68471992, + "learning_rate": 3.193113543486061e-06, + "loss": 0.70974791, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.17895508, + "step": 5283, + "time_per_iteration": 4.352840423583984 + }, + { + "auxiliary_loss_clip": 0.01278024, + "auxiliary_loss_mlp": 0.01006238, + "balance_loss_clip": 1.21981478, + "balance_loss_mlp": 1.00256658, + "epoch": 0.31769126709754997, + "flos": 55838347471800.0, + "grad_norm": 0.8431628220719579, + "language_loss": 0.52782756, + "learning_rate": 3.192800950261958e-06, + "loss": 0.55067021, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.03662109, + "step": 5284, + "time_per_iteration": 3.284588098526001 + }, + { + "auxiliary_loss_clip": 0.01457401, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.31452227, + "balance_loss_mlp": 1.02599955, + "epoch": 0.31775139035021793, + "flos": 16694904751560.0, + "grad_norm": 1.7077121865496365, + "language_loss": 0.70968825, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.73469186, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.16955566, + "step": 5285, + "time_per_iteration": 2.749445915222168 + }, + { + "auxiliary_loss_clip": 0.01270705, + "auxiliary_loss_mlp": 0.01005625, + "balance_loss_clip": 1.2132864, + "balance_loss_mlp": 1.00171459, + "epoch": 0.3178115136028859, + "flos": 64241996650200.0, + "grad_norm": 0.8221927714711558, + "language_loss": 0.60606772, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62883103, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.0390625, + "step": 5286, + "time_per_iteration": 3.262467861175537 + }, + { + "auxiliary_loss_clip": 0.01451721, + "auxiliary_loss_mlp": 0.01045724, + "balance_loss_clip": 1.31038356, + "balance_loss_mlp": 1.02742529, + "epoch": 0.31787163685555386, + "flos": 18702094795920.0, + "grad_norm": 1.7064046011875749, + "language_loss": 0.72524607, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.75022054, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.18286133, + "step": 5287, + "time_per_iteration": 2.8534045219421387 + }, + { + "auxiliary_loss_clip": 0.01449127, + "auxiliary_loss_mlp": 0.01050861, + "balance_loss_clip": 1.30688107, + "balance_loss_mlp": 1.03195393, + "epoch": 0.31793176010822183, + "flos": 21329776159200.0, + "grad_norm": 2.7248179479657244, + "language_loss": 0.76091647, + "learning_rate": 3.191550125172792e-06, + "loss": 0.78591639, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.18920898, + "step": 5288, + "time_per_iteration": 2.7564520835876465 + }, + { + "auxiliary_loss_clip": 0.01431901, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.29665661, + "balance_loss_mlp": 1.02495742, + "epoch": 0.31799188336088985, + "flos": 20963496130560.0, + "grad_norm": 1.7687587488910523, + "language_loss": 0.87705278, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.90178704, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.16571045, + "step": 5289, + "time_per_iteration": 2.7669520378112793 + }, + { + "auxiliary_loss_clip": 0.01436555, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.30186629, + "balance_loss_mlp": 1.02190304, + "epoch": 0.3180520066135578, + "flos": 22497047977680.0, + "grad_norm": 1.4604832300017836, + "language_loss": 0.68277824, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70752823, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16552734, + "step": 5290, + "time_per_iteration": 4.270333766937256 + }, + { + "auxiliary_loss_clip": 0.01445437, + "auxiliary_loss_mlp": 0.01055073, + "balance_loss_clip": 1.30308867, + "balance_loss_mlp": 1.03762114, + "epoch": 0.3181121298662258, + "flos": 27241104930120.0, + "grad_norm": 2.136787514502871, + "language_loss": 0.79772043, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82272553, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.17443848, + "step": 5291, + "time_per_iteration": 4.260348796844482 + }, + { + "auxiliary_loss_clip": 0.01438295, + "auxiliary_loss_mlp": 0.01043653, + "balance_loss_clip": 1.29720676, + "balance_loss_mlp": 1.02593875, + "epoch": 0.31817225311889374, + "flos": 23184712170000.0, + "grad_norm": 1.8274724954203108, + "language_loss": 0.79884088, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82366037, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17712402, + "step": 5292, + "time_per_iteration": 2.7802326679229736 + }, + { + "auxiliary_loss_clip": 0.0142959, + "auxiliary_loss_mlp": 0.01045731, + "balance_loss_clip": 1.29624939, + "balance_loss_mlp": 1.0298996, + "epoch": 0.3182323763715617, + "flos": 23264108159760.0, + "grad_norm": 1.545468753146553, + "language_loss": 0.75095832, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77571154, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.1583252, + "step": 5293, + "time_per_iteration": 2.790558099746704 + }, + { + "auxiliary_loss_clip": 0.01442206, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.30584097, + "balance_loss_mlp": 1.03032756, + "epoch": 0.3182924996242297, + "flos": 29021842821240.0, + "grad_norm": 2.1121375242657456, + "language_loss": 0.74225646, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76714581, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.16400146, + "step": 5294, + "time_per_iteration": 2.8287525177001953 + }, + { + "auxiliary_loss_clip": 0.01437434, + "auxiliary_loss_mlp": 0.01039838, + "balance_loss_clip": 1.29880559, + "balance_loss_mlp": 1.02270806, + "epoch": 0.31835262287689764, + "flos": 20454098949360.0, + "grad_norm": 1.9996862191531237, + "language_loss": 0.75741374, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78218645, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.17126465, + "step": 5295, + "time_per_iteration": 2.7621829509735107 + }, + { + "auxiliary_loss_clip": 0.01447332, + "auxiliary_loss_mlp": 0.0104773, + "balance_loss_clip": 1.30440736, + "balance_loss_mlp": 1.03108859, + "epoch": 0.3184127461295656, + "flos": 25125013599480.0, + "grad_norm": 1.4093543428281599, + "language_loss": 0.69797778, + "learning_rate": 3.189046306936296e-06, + "loss": 0.72292835, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.16638184, + "step": 5296, + "time_per_iteration": 2.777071237564087 + }, + { + "auxiliary_loss_clip": 0.01435231, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.29671955, + "balance_loss_mlp": 1.02635539, + "epoch": 0.31847286938223357, + "flos": 25556029999920.0, + "grad_norm": 1.6246766392529943, + "language_loss": 0.77534854, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.80013299, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.1685791, + "step": 5297, + "time_per_iteration": 2.897615432739258 + }, + { + "auxiliary_loss_clip": 0.01431187, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.29494989, + "balance_loss_mlp": 1.01897788, + "epoch": 0.31853299263490154, + "flos": 27788291513280.0, + "grad_norm": 1.77230492953782, + "language_loss": 0.79608762, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.82076544, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.17626953, + "step": 5298, + "time_per_iteration": 2.8320722579956055 + }, + { + "auxiliary_loss_clip": 0.01446075, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.30396223, + "balance_loss_mlp": 1.03057718, + "epoch": 0.3185931158875695, + "flos": 22711358231280.0, + "grad_norm": 1.7264441215490756, + "language_loss": 0.74361062, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76854426, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.16711426, + "step": 5299, + "time_per_iteration": 2.8109071254730225 + }, + { + "auxiliary_loss_clip": 0.01442211, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.29920185, + "balance_loss_mlp": 1.02717292, + "epoch": 0.31865323914023747, + "flos": 24576893024040.0, + "grad_norm": 2.0311554871300306, + "language_loss": 0.78381526, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.8086744, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.16540527, + "step": 5300, + "time_per_iteration": 2.7992875576019287 + }, + { + "auxiliary_loss_clip": 0.01438655, + "auxiliary_loss_mlp": 0.01051406, + "balance_loss_clip": 1.29914176, + "balance_loss_mlp": 1.03160572, + "epoch": 0.31871336239290543, + "flos": 18191113888680.0, + "grad_norm": 1.825564993788973, + "language_loss": 0.8402428, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86514342, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.19812012, + "step": 5301, + "time_per_iteration": 2.7111778259277344 + }, + { + "auxiliary_loss_clip": 0.01433917, + "auxiliary_loss_mlp": 0.01049469, + "balance_loss_clip": 1.29792869, + "balance_loss_mlp": 1.03209996, + "epoch": 0.31877348564557345, + "flos": 21830686193160.0, + "grad_norm": 2.243041000357728, + "language_loss": 0.77295202, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79778588, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.17370605, + "step": 5302, + "time_per_iteration": 2.760563373565674 + }, + { + "auxiliary_loss_clip": 0.01430171, + "auxiliary_loss_mlp": 0.0104701, + "balance_loss_clip": 1.2939297, + "balance_loss_mlp": 1.02756703, + "epoch": 0.3188336088982414, + "flos": 22019998678200.0, + "grad_norm": 1.6400270151801815, + "language_loss": 0.79950386, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.82427561, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.19433594, + "step": 5303, + "time_per_iteration": 2.7559409141540527 + }, + { + "auxiliary_loss_clip": 0.01449209, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.30173874, + "balance_loss_mlp": 1.03081465, + "epoch": 0.3188937321509094, + "flos": 20052750278880.0, + "grad_norm": 3.044201378317254, + "language_loss": 0.73429835, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75928366, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.18505859, + "step": 5304, + "time_per_iteration": 2.7691285610198975 + }, + { + "auxiliary_loss_clip": 0.01431245, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.29494703, + "balance_loss_mlp": 1.0211885, + "epoch": 0.31895385540357735, + "flos": 25853594037480.0, + "grad_norm": 1.7789859975997402, + "language_loss": 0.72026998, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74495119, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.15698242, + "step": 5305, + "time_per_iteration": 2.764420747756958 + }, + { + "auxiliary_loss_clip": 0.01431445, + "auxiliary_loss_mlp": 0.0105015, + "balance_loss_clip": 1.29372358, + "balance_loss_mlp": 1.03362799, + "epoch": 0.3190139786562453, + "flos": 23483047766400.0, + "grad_norm": 1.5467999194411135, + "language_loss": 0.64037406, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66518998, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.1652832, + "step": 5306, + "time_per_iteration": 2.761133909225464 + }, + { + "auxiliary_loss_clip": 0.01433936, + "auxiliary_loss_mlp": 0.01046645, + "balance_loss_clip": 1.29480648, + "balance_loss_mlp": 1.02934766, + "epoch": 0.3190741019089133, + "flos": 29101482461160.0, + "grad_norm": 2.2588788139409046, + "language_loss": 0.78808367, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81288946, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.1730957, + "step": 5307, + "time_per_iteration": 2.7948927879333496 + }, + { + "auxiliary_loss_clip": 0.01427849, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_clip": 1.29250526, + "balance_loss_mlp": 1.028108, + "epoch": 0.31913422516158124, + "flos": 17134083432360.0, + "grad_norm": 3.717264558573081, + "language_loss": 0.77856648, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.8033064, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.18041992, + "step": 5308, + "time_per_iteration": 2.807011127471924 + }, + { + "auxiliary_loss_clip": 0.01453863, + "auxiliary_loss_mlp": 0.01050716, + "balance_loss_clip": 1.30584955, + "balance_loss_mlp": 1.03153563, + "epoch": 0.3191943484142492, + "flos": 16074047957400.0, + "grad_norm": 2.0625867000796343, + "language_loss": 0.74959755, + "learning_rate": 3.184971450390961e-06, + "loss": 0.7746433, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.19177246, + "step": 5309, + "time_per_iteration": 2.771860122680664 + }, + { + "auxiliary_loss_clip": 0.01437148, + "auxiliary_loss_mlp": 0.01044899, + "balance_loss_clip": 1.29866815, + "balance_loss_mlp": 1.02842474, + "epoch": 0.3192544716669172, + "flos": 22971335908680.0, + "grad_norm": 1.935979156565554, + "language_loss": 0.82756257, + "learning_rate": 3.184657685014856e-06, + "loss": 0.85238302, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.16467285, + "step": 5310, + "time_per_iteration": 2.7722487449645996 + }, + { + "auxiliary_loss_clip": 0.01434733, + "auxiliary_loss_mlp": 0.01043741, + "balance_loss_clip": 1.29505098, + "balance_loss_mlp": 1.02714705, + "epoch": 0.31931459491958514, + "flos": 26876043152280.0, + "grad_norm": 1.3962431524685222, + "language_loss": 0.78559077, + "learning_rate": 3.184343874716412e-06, + "loss": 0.81037545, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.16589355, + "step": 5311, + "time_per_iteration": 2.8027477264404297 + }, + { + "auxiliary_loss_clip": 0.01433629, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.29551888, + "balance_loss_mlp": 1.03411853, + "epoch": 0.3193747181722531, + "flos": 21841731667080.0, + "grad_norm": 1.7837529005263837, + "language_loss": 0.84640408, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.87125134, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16967773, + "step": 5312, + "time_per_iteration": 2.744515895843506 + }, + { + "auxiliary_loss_clip": 0.01445493, + "auxiliary_loss_mlp": 0.01050018, + "balance_loss_clip": 1.30077481, + "balance_loss_mlp": 1.03231549, + "epoch": 0.31943484142492107, + "flos": 18328586479200.0, + "grad_norm": 2.157638164035579, + "language_loss": 0.79357922, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.81853437, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.17700195, + "step": 5313, + "time_per_iteration": 2.7347617149353027 + }, + { + "auxiliary_loss_clip": 0.01439941, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.30088091, + "balance_loss_mlp": 1.02890086, + "epoch": 0.31949496467758903, + "flos": 21620761642440.0, + "grad_norm": 2.6491770902945238, + "language_loss": 0.8620317, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88688934, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.16918945, + "step": 5314, + "time_per_iteration": 2.8003246784210205 + }, + { + "auxiliary_loss_clip": 0.01436092, + "auxiliary_loss_mlp": 0.01046469, + "balance_loss_clip": 1.29807687, + "balance_loss_mlp": 1.02960134, + "epoch": 0.31955508793025705, + "flos": 21764975220720.0, + "grad_norm": 1.90098068047116, + "language_loss": 0.7986297, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82345533, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1685791, + "step": 5315, + "time_per_iteration": 2.747086763381958 + }, + { + "auxiliary_loss_clip": 0.01433544, + "auxiliary_loss_mlp": 0.01050134, + "balance_loss_clip": 1.29502881, + "balance_loss_mlp": 1.03225243, + "epoch": 0.319615211182925, + "flos": 17168664773880.0, + "grad_norm": 2.010472957569868, + "language_loss": 0.67327362, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69811034, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.17871094, + "step": 5316, + "time_per_iteration": 4.323371648788452 + }, + { + "auxiliary_loss_clip": 0.01437833, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_clip": 1.29922974, + "balance_loss_mlp": 1.02849698, + "epoch": 0.319675334435593, + "flos": 28118934383040.0, + "grad_norm": 1.470918803705116, + "language_loss": 0.69578207, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.72060388, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.15869141, + "step": 5317, + "time_per_iteration": 2.7991766929626465 + }, + { + "auxiliary_loss_clip": 0.01274196, + "auxiliary_loss_mlp": 0.01005903, + "balance_loss_clip": 1.21440816, + "balance_loss_mlp": 1.00270772, + "epoch": 0.31973545768826095, + "flos": 69519166066080.0, + "grad_norm": 0.7319485475229275, + "language_loss": 0.53167176, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55447274, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.03198242, + "step": 5318, + "time_per_iteration": 3.408421039581299 + }, + { + "auxiliary_loss_clip": 0.01425178, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.28960657, + "balance_loss_mlp": 1.02957523, + "epoch": 0.3197955809409289, + "flos": 13703217427800.0, + "grad_norm": 1.61227638309505, + "language_loss": 0.84756941, + "learning_rate": 3.181831776553012e-06, + "loss": 0.87227535, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.1583252, + "step": 5319, + "time_per_iteration": 2.7490921020507812 + }, + { + "auxiliary_loss_clip": 0.01426988, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.28987956, + "balance_loss_mlp": 1.02468824, + "epoch": 0.3198557041935969, + "flos": 33224560794360.0, + "grad_norm": 1.4542724815910941, + "language_loss": 0.63168317, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65637159, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.17144775, + "step": 5320, + "time_per_iteration": 2.8609869480133057 + }, + { + "auxiliary_loss_clip": 0.0143547, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.2950449, + "balance_loss_mlp": 1.02301717, + "epoch": 0.31991582744626484, + "flos": 23737259056680.0, + "grad_norm": 1.8385262987623654, + "language_loss": 0.70558107, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.73032331, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.15734863, + "step": 5321, + "time_per_iteration": 2.79544997215271 + }, + { + "auxiliary_loss_clip": 0.01447007, + "auxiliary_loss_mlp": 0.01063911, + "balance_loss_clip": 1.300699, + "balance_loss_mlp": 1.04598212, + "epoch": 0.3199759506989328, + "flos": 18555363499320.0, + "grad_norm": 3.269608952560189, + "language_loss": 0.87112141, + "learning_rate": 3.180888999963749e-06, + "loss": 0.89623058, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.17932129, + "step": 5322, + "time_per_iteration": 4.271731615066528 + }, + { + "auxiliary_loss_clip": 0.01432535, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.29383886, + "balance_loss_mlp": 1.02138042, + "epoch": 0.3200360739516008, + "flos": 22423986892080.0, + "grad_norm": 1.6325661981917192, + "language_loss": 0.83091426, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85562217, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.16870117, + "step": 5323, + "time_per_iteration": 2.7505548000335693 + }, + { + "auxiliary_loss_clip": 0.01423878, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.28735685, + "balance_loss_mlp": 1.02614689, + "epoch": 0.32009619720426874, + "flos": 20600180512200.0, + "grad_norm": 1.6234439882912959, + "language_loss": 0.78241348, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80709666, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.18273926, + "step": 5324, + "time_per_iteration": 2.7510969638824463 + }, + { + "auxiliary_loss_clip": 0.014293, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.2914257, + "balance_loss_mlp": 1.02288926, + "epoch": 0.3201563204569367, + "flos": 18151862585760.0, + "grad_norm": 1.614734044865751, + "language_loss": 0.80047572, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82517254, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17492676, + "step": 5325, + "time_per_iteration": 2.7039735317230225 + }, + { + "auxiliary_loss_clip": 0.0143328, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.2946924, + "balance_loss_mlp": 1.02340603, + "epoch": 0.32021644370960467, + "flos": 31690277996760.0, + "grad_norm": 1.8925218985624594, + "language_loss": 0.75046897, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77519774, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16174316, + "step": 5326, + "time_per_iteration": 2.854105234146118 + }, + { + "auxiliary_loss_clip": 0.01424265, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.28890109, + "balance_loss_mlp": 1.02494347, + "epoch": 0.32027656696227264, + "flos": 26871048324000.0, + "grad_norm": 1.4101829628239928, + "language_loss": 0.81256413, + "learning_rate": 3.179316810218701e-06, + "loss": 0.83721972, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16345215, + "step": 5327, + "time_per_iteration": 2.783205032348633 + }, + { + "auxiliary_loss_clip": 0.01435376, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.29259086, + "balance_loss_mlp": 1.02370298, + "epoch": 0.32033669021494066, + "flos": 24175219486680.0, + "grad_norm": 1.4025767989538371, + "language_loss": 0.77709323, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80184972, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.16552734, + "step": 5328, + "time_per_iteration": 4.2572855949401855 + }, + { + "auxiliary_loss_clip": 0.01435434, + "auxiliary_loss_mlp": 0.0104545, + "balance_loss_clip": 1.29523778, + "balance_loss_mlp": 1.02698481, + "epoch": 0.3203968134676086, + "flos": 24465961319760.0, + "grad_norm": 1.520286960363344, + "language_loss": 0.74100107, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76580989, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.18469238, + "step": 5329, + "time_per_iteration": 4.29000449180603 + }, + { + "auxiliary_loss_clip": 0.01414736, + "auxiliary_loss_mlp": 0.01040023, + "balance_loss_clip": 1.28202784, + "balance_loss_mlp": 1.02441835, + "epoch": 0.3204569367202766, + "flos": 18009313950240.0, + "grad_norm": 1.5537468766626847, + "language_loss": 0.71086717, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73541474, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.15612793, + "step": 5330, + "time_per_iteration": 2.78183913230896 + }, + { + "auxiliary_loss_clip": 0.01437632, + "auxiliary_loss_mlp": 0.01049473, + "balance_loss_clip": 1.29349768, + "balance_loss_mlp": 1.03059006, + "epoch": 0.32051705997294455, + "flos": 30595579963560.0, + "grad_norm": 1.6226136132108218, + "language_loss": 0.80262178, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82749283, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.1887207, + "step": 5331, + "time_per_iteration": 2.8607523441314697 + }, + { + "auxiliary_loss_clip": 0.01259344, + "auxiliary_loss_mlp": 0.01007581, + "balance_loss_clip": 1.20060658, + "balance_loss_mlp": 1.0045414, + "epoch": 0.3205771832256125, + "flos": 68432613332040.0, + "grad_norm": 0.8496070831115257, + "language_loss": 0.57839763, + "learning_rate": 3.177743502478447e-06, + "loss": 0.60106695, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.03039551, + "step": 5332, + "time_per_iteration": 3.249335289001465 + }, + { + "auxiliary_loss_clip": 0.01439801, + "auxiliary_loss_mlp": 0.01042348, + "balance_loss_clip": 1.29756367, + "balance_loss_mlp": 1.02514625, + "epoch": 0.3206373064782805, + "flos": 30449863875960.0, + "grad_norm": 1.73895965498318, + "language_loss": 0.73715067, + "learning_rate": 3.177428706902205e-06, + "loss": 0.76197219, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.17199707, + "step": 5333, + "time_per_iteration": 2.878324031829834 + }, + { + "auxiliary_loss_clip": 0.01429644, + "auxiliary_loss_mlp": 0.01046971, + "balance_loss_clip": 1.291255, + "balance_loss_mlp": 1.02937567, + "epoch": 0.32069742973094845, + "flos": 22059615456360.0, + "grad_norm": 2.0729415796212485, + "language_loss": 0.7098031, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.73456925, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.17590332, + "step": 5334, + "time_per_iteration": 2.7722887992858887 + }, + { + "auxiliary_loss_clip": 0.01428339, + "auxiliary_loss_mlp": 0.01041175, + "balance_loss_clip": 1.28895545, + "balance_loss_mlp": 1.02424765, + "epoch": 0.3207575529836164, + "flos": 22059046939320.0, + "grad_norm": 2.861915174303358, + "language_loss": 0.78264087, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.80733609, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.16918945, + "step": 5335, + "time_per_iteration": 2.7646493911743164 + }, + { + "auxiliary_loss_clip": 0.01431806, + "auxiliary_loss_mlp": 0.01041217, + "balance_loss_clip": 1.29345536, + "balance_loss_mlp": 1.02360988, + "epoch": 0.3208176762362844, + "flos": 34064113545000.0, + "grad_norm": 2.4248675113911133, + "language_loss": 0.68486166, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70959187, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17614746, + "step": 5336, + "time_per_iteration": 2.844155788421631 + }, + { + "auxiliary_loss_clip": 0.01433955, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.29329658, + "balance_loss_mlp": 1.02483773, + "epoch": 0.32087779948895234, + "flos": 21803739223320.0, + "grad_norm": 1.6504255359725024, + "language_loss": 0.79006088, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81483674, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.18811035, + "step": 5337, + "time_per_iteration": 2.776676654815674 + }, + { + "auxiliary_loss_clip": 0.01417302, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.28440905, + "balance_loss_mlp": 1.02117753, + "epoch": 0.3209379227416203, + "flos": 21439124137440.0, + "grad_norm": 1.631163377258993, + "language_loss": 0.7473551, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.771905, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.16503906, + "step": 5338, + "time_per_iteration": 2.7830493450164795 + }, + { + "auxiliary_loss_clip": 0.01429969, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.28855467, + "balance_loss_mlp": 1.01748347, + "epoch": 0.3209980459942883, + "flos": 25854690463200.0, + "grad_norm": 1.825552892314297, + "language_loss": 0.63420236, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65885073, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.1739502, + "step": 5339, + "time_per_iteration": 2.8118839263916016 + }, + { + "auxiliary_loss_clip": 0.01431579, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.29224133, + "balance_loss_mlp": 1.02016902, + "epoch": 0.32105816924695624, + "flos": 19103890158360.0, + "grad_norm": 1.8236647962747443, + "language_loss": 0.82362151, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84831083, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17181396, + "step": 5340, + "time_per_iteration": 2.7273218631744385 + }, + { + "auxiliary_loss_clip": 0.01427003, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.28838694, + "balance_loss_mlp": 1.01765251, + "epoch": 0.3211182924996242, + "flos": 16586287723800.0, + "grad_norm": 1.9499090599933757, + "language_loss": 0.76504314, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78965539, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16564941, + "step": 5341, + "time_per_iteration": 2.820007801055908 + }, + { + "auxiliary_loss_clip": 0.01422583, + "auxiliary_loss_mlp": 0.01040505, + "balance_loss_clip": 1.28674233, + "balance_loss_mlp": 1.02320814, + "epoch": 0.3211784157522922, + "flos": 22677061148280.0, + "grad_norm": 1.687259356924238, + "language_loss": 0.79351163, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81814247, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.17297363, + "step": 5342, + "time_per_iteration": 2.8252198696136475 + }, + { + "auxiliary_loss_clip": 0.01435203, + "auxiliary_loss_mlp": 0.01037788, + "balance_loss_clip": 1.29458094, + "balance_loss_mlp": 1.02004957, + "epoch": 0.3212385390049602, + "flos": 20563731186120.0, + "grad_norm": 2.273732251255634, + "language_loss": 0.75575078, + "learning_rate": 3.174278297458438e-06, + "loss": 0.78048062, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17736816, + "step": 5343, + "time_per_iteration": 2.8180489540100098 + }, + { + "auxiliary_loss_clip": 0.01429434, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.29057801, + "balance_loss_mlp": 1.01792288, + "epoch": 0.32129866225762815, + "flos": 24796644797880.0, + "grad_norm": 1.4837130767643365, + "language_loss": 0.82974893, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85439456, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.17211914, + "step": 5344, + "time_per_iteration": 2.8477835655212402 + }, + { + "auxiliary_loss_clip": 0.01431353, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.29031909, + "balance_loss_mlp": 1.0235815, + "epoch": 0.3213587855102961, + "flos": 18371167667640.0, + "grad_norm": 1.8964679337671517, + "language_loss": 0.79545248, + "learning_rate": 3.173647680842262e-06, + "loss": 0.82017452, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.17272949, + "step": 5345, + "time_per_iteration": 2.772542953491211 + }, + { + "auxiliary_loss_clip": 0.01433078, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.29284108, + "balance_loss_mlp": 1.01985312, + "epoch": 0.3214189087629641, + "flos": 27021678023160.0, + "grad_norm": 2.4320351814910564, + "language_loss": 0.83190966, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85660309, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.1640625, + "step": 5346, + "time_per_iteration": 2.8109827041625977 + }, + { + "auxiliary_loss_clip": 0.01437354, + "auxiliary_loss_mlp": 0.01040891, + "balance_loss_clip": 1.29358733, + "balance_loss_mlp": 1.02242577, + "epoch": 0.32147903201563205, + "flos": 23153420105640.0, + "grad_norm": 1.483875173183881, + "language_loss": 0.81604278, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.84082526, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.18444824, + "step": 5347, + "time_per_iteration": 2.778634548187256 + }, + { + "auxiliary_loss_clip": 0.01424356, + "auxiliary_loss_mlp": 0.01048666, + "balance_loss_clip": 1.2867198, + "balance_loss_mlp": 1.03017688, + "epoch": 0.3215391552683, + "flos": 16585028864640.0, + "grad_norm": 1.9021357705113373, + "language_loss": 0.79814017, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82287043, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.18493652, + "step": 5348, + "time_per_iteration": 2.767404556274414 + }, + { + "auxiliary_loss_clip": 0.01435575, + "auxiliary_loss_mlp": 0.0105214, + "balance_loss_clip": 1.2949214, + "balance_loss_mlp": 1.03542733, + "epoch": 0.321599278520968, + "flos": 17826336369360.0, + "grad_norm": 2.379738222611679, + "language_loss": 0.85997379, + "learning_rate": 3.172385913647542e-06, + "loss": 0.88485098, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16711426, + "step": 5349, + "time_per_iteration": 2.7909083366394043 + }, + { + "auxiliary_loss_clip": 0.01432308, + "auxiliary_loss_mlp": 0.01046095, + "balance_loss_clip": 1.29315746, + "balance_loss_mlp": 1.02854705, + "epoch": 0.32165940177363594, + "flos": 16255929112560.0, + "grad_norm": 1.6516245433641001, + "language_loss": 0.81064987, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83543396, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.17553711, + "step": 5350, + "time_per_iteration": 2.767524242401123 + }, + { + "auxiliary_loss_clip": 0.01422677, + "auxiliary_loss_mlp": 0.01051773, + "balance_loss_clip": 1.28590703, + "balance_loss_mlp": 1.03587055, + "epoch": 0.3217195250263039, + "flos": 27605679407640.0, + "grad_norm": 1.598957680565116, + "language_loss": 0.79778099, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.8225255, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.15905762, + "step": 5351, + "time_per_iteration": 2.794942855834961 + }, + { + "auxiliary_loss_clip": 0.01429244, + "auxiliary_loss_mlp": 0.01053044, + "balance_loss_clip": 1.29106474, + "balance_loss_mlp": 1.0358181, + "epoch": 0.3217796482789719, + "flos": 21475614071880.0, + "grad_norm": 1.5935974372348172, + "language_loss": 0.75937116, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.78419399, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.17236328, + "step": 5352, + "time_per_iteration": 2.792588472366333 + }, + { + "auxiliary_loss_clip": 0.0142471, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_clip": 1.28685844, + "balance_loss_mlp": 1.02699065, + "epoch": 0.32183977153163984, + "flos": 21220631222760.0, + "grad_norm": 1.7974848453095893, + "language_loss": 0.82020974, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.84490001, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.17321777, + "step": 5353, + "time_per_iteration": 2.7582945823669434 + }, + { + "auxiliary_loss_clip": 0.01419845, + "auxiliary_loss_mlp": 0.01047187, + "balance_loss_clip": 1.28474605, + "balance_loss_mlp": 1.0302, + "epoch": 0.3218998947843078, + "flos": 24613586000280.0, + "grad_norm": 1.5123529671779683, + "language_loss": 0.73641723, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.7610876, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1697998, + "step": 5354, + "time_per_iteration": 4.180360555648804 + }, + { + "auxiliary_loss_clip": 0.01426325, + "auxiliary_loss_mlp": 0.01051406, + "balance_loss_clip": 1.2868185, + "balance_loss_mlp": 1.03532457, + "epoch": 0.3219600180369758, + "flos": 22275347002560.0, + "grad_norm": 1.4949557712584742, + "language_loss": 0.836245, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.86102229, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.16088867, + "step": 5355, + "time_per_iteration": 2.8189218044281006 + }, + { + "auxiliary_loss_clip": 0.01437223, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_clip": 1.29793119, + "balance_loss_mlp": 1.03482366, + "epoch": 0.3220201412896438, + "flos": 14943225465000.0, + "grad_norm": 1.9539083190783604, + "language_loss": 0.71094263, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73583233, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.16931152, + "step": 5356, + "time_per_iteration": 2.73996639251709 + }, + { + "auxiliary_loss_clip": 0.01449682, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_clip": 1.30427742, + "balance_loss_mlp": 1.03147829, + "epoch": 0.32208026454231176, + "flos": 22671619628040.0, + "grad_norm": 2.456999735101158, + "language_loss": 0.67989123, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70487905, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.17614746, + "step": 5357, + "time_per_iteration": 2.783538579940796 + }, + { + "auxiliary_loss_clip": 0.01269479, + "auxiliary_loss_mlp": 0.0101522, + "balance_loss_clip": 1.20967221, + "balance_loss_mlp": 1.01207256, + "epoch": 0.3221403877949797, + "flos": 64621636829280.0, + "grad_norm": 0.703860913913745, + "language_loss": 0.58259565, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.6054427, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.03149414, + "step": 5358, + "time_per_iteration": 3.3700737953186035 + }, + { + "auxiliary_loss_clip": 0.01429342, + "auxiliary_loss_mlp": 0.0104735, + "balance_loss_clip": 1.28915477, + "balance_loss_mlp": 1.02940953, + "epoch": 0.3222005110476477, + "flos": 20161854606960.0, + "grad_norm": 1.8898410114647803, + "language_loss": 0.83948982, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.86425674, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.17932129, + "step": 5359, + "time_per_iteration": 2.7276158332824707 + }, + { + "auxiliary_loss_clip": 0.0142929, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_clip": 1.29002953, + "balance_loss_mlp": 1.02729726, + "epoch": 0.32226063430031565, + "flos": 22679294608080.0, + "grad_norm": 1.6858772050589046, + "language_loss": 0.79838645, + "learning_rate": 3.168912388464595e-06, + "loss": 0.8231194, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.16699219, + "step": 5360, + "time_per_iteration": 2.8194565773010254 + }, + { + "auxiliary_loss_clip": 0.01262755, + "auxiliary_loss_mlp": 0.01007649, + "balance_loss_clip": 1.20376873, + "balance_loss_mlp": 1.00440681, + "epoch": 0.3223207575529836, + "flos": 63843571781640.0, + "grad_norm": 0.8128620851099647, + "language_loss": 0.57067859, + "learning_rate": 3.168596347256737e-06, + "loss": 0.5933826, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.0324707, + "step": 5361, + "time_per_iteration": 4.47145414352417 + }, + { + "auxiliary_loss_clip": 0.01431233, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.29396796, + "balance_loss_mlp": 1.02857018, + "epoch": 0.3223808808056516, + "flos": 26875799502120.0, + "grad_norm": 1.8030315664706944, + "language_loss": 0.71947509, + "learning_rate": 3.168280261735588e-06, + "loss": 0.74424845, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.17541504, + "step": 5362, + "time_per_iteration": 2.7686245441436768 + }, + { + "auxiliary_loss_clip": 0.01430789, + "auxiliary_loss_mlp": 0.0104475, + "balance_loss_clip": 1.29230642, + "balance_loss_mlp": 1.02779889, + "epoch": 0.32244100405831955, + "flos": 26767101257640.0, + "grad_norm": 1.6733297898013784, + "language_loss": 0.74089319, + "learning_rate": 3.167964131913135e-06, + "loss": 0.7656486, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.16949463, + "step": 5363, + "time_per_iteration": 2.8107504844665527 + }, + { + "auxiliary_loss_clip": 0.01431473, + "auxiliary_loss_mlp": 0.0104847, + "balance_loss_clip": 1.28872156, + "balance_loss_mlp": 1.03015935, + "epoch": 0.3225011273109875, + "flos": 23808005465760.0, + "grad_norm": 2.045378865660729, + "language_loss": 0.76926893, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79406834, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.1829834, + "step": 5364, + "time_per_iteration": 2.755889415740967 + }, + { + "auxiliary_loss_clip": 0.01426829, + "auxiliary_loss_mlp": 0.01041569, + "balance_loss_clip": 1.28882909, + "balance_loss_mlp": 1.02367556, + "epoch": 0.3225612505636555, + "flos": 17278946744400.0, + "grad_norm": 2.411892663330494, + "language_loss": 0.77670014, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.80138409, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.17895508, + "step": 5365, + "time_per_iteration": 2.740872859954834 + }, + { + "auxiliary_loss_clip": 0.01429839, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.2918452, + "balance_loss_mlp": 1.02332067, + "epoch": 0.32262137381632344, + "flos": 23371303894920.0, + "grad_norm": 1.6177713815229002, + "language_loss": 0.76575732, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79045248, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.16369629, + "step": 5366, + "time_per_iteration": 2.777604818344116 + }, + { + "auxiliary_loss_clip": 0.01425183, + "auxiliary_loss_mlp": 0.01038177, + "balance_loss_clip": 1.28788924, + "balance_loss_mlp": 1.02054596, + "epoch": 0.3226814970689914, + "flos": 23264108159760.0, + "grad_norm": 2.1150705960335734, + "language_loss": 0.71929181, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74392545, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1763916, + "step": 5367, + "time_per_iteration": 4.314033031463623 + }, + { + "auxiliary_loss_clip": 0.01422493, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.28618193, + "balance_loss_mlp": 1.01897216, + "epoch": 0.32274162032165943, + "flos": 16399046265120.0, + "grad_norm": 1.6563697322956865, + "language_loss": 0.74708951, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.77166927, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.16516113, + "step": 5368, + "time_per_iteration": 4.254523038864136 + }, + { + "auxiliary_loss_clip": 0.01424889, + "auxiliary_loss_mlp": 0.01040077, + "balance_loss_clip": 1.28900146, + "balance_loss_mlp": 1.02300632, + "epoch": 0.3228017435743274, + "flos": 27860865298560.0, + "grad_norm": 1.4398977408731004, + "language_loss": 0.7847172, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80936682, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.17077637, + "step": 5369, + "time_per_iteration": 2.9141571521759033 + }, + { + "auxiliary_loss_clip": 0.01418232, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.28446293, + "balance_loss_mlp": 1.02301359, + "epoch": 0.32286186682699536, + "flos": 19613409164640.0, + "grad_norm": 1.760254623310299, + "language_loss": 0.82987988, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85445929, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.16687012, + "step": 5370, + "time_per_iteration": 2.7340261936187744 + }, + { + "auxiliary_loss_clip": 0.01425663, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.28893018, + "balance_loss_mlp": 1.02189171, + "epoch": 0.3229219900796633, + "flos": 24139379286000.0, + "grad_norm": 2.1883921502947654, + "language_loss": 0.83066511, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85531044, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.16955566, + "step": 5371, + "time_per_iteration": 2.784573793411255 + }, + { + "auxiliary_loss_clip": 0.01431368, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_clip": 1.2888658, + "balance_loss_mlp": 1.0304153, + "epoch": 0.3229821133323313, + "flos": 17753194067040.0, + "grad_norm": 3.4032704209424702, + "language_loss": 0.88968945, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.91449946, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1920166, + "step": 5372, + "time_per_iteration": 2.7288620471954346 + }, + { + "auxiliary_loss_clip": 0.01428109, + "auxiliary_loss_mlp": 0.0104706, + "balance_loss_clip": 1.29083514, + "balance_loss_mlp": 1.02975106, + "epoch": 0.32304223658499925, + "flos": 22351128848280.0, + "grad_norm": 3.6062429905970768, + "language_loss": 0.72848725, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75323892, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.17297363, + "step": 5373, + "time_per_iteration": 2.797787666320801 + }, + { + "auxiliary_loss_clip": 0.01415692, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.28206468, + "balance_loss_mlp": 1.02149844, + "epoch": 0.3231023598376672, + "flos": 18482870930760.0, + "grad_norm": 2.405223852067629, + "language_loss": 0.81440139, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83893967, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16650391, + "step": 5374, + "time_per_iteration": 2.7888455390930176 + }, + { + "auxiliary_loss_clip": 0.01414651, + "auxiliary_loss_mlp": 0.01041468, + "balance_loss_clip": 1.28085279, + "balance_loss_mlp": 1.02459955, + "epoch": 0.3231624830903352, + "flos": 27642088125360.0, + "grad_norm": 2.249378324617229, + "language_loss": 0.87602353, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.90058476, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.1685791, + "step": 5375, + "time_per_iteration": 2.820795774459839 + }, + { + "auxiliary_loss_clip": 0.01428953, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.28821468, + "balance_loss_mlp": 1.02378774, + "epoch": 0.32322260634300315, + "flos": 21731530913280.0, + "grad_norm": 1.8251859250186422, + "language_loss": 0.7601921, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78490305, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.18359375, + "step": 5376, + "time_per_iteration": 2.786245584487915 + }, + { + "auxiliary_loss_clip": 0.01426167, + "auxiliary_loss_mlp": 0.01037323, + "balance_loss_clip": 1.28881788, + "balance_loss_mlp": 1.01947689, + "epoch": 0.3232827295956711, + "flos": 22642764065280.0, + "grad_norm": 2.0776273497366273, + "language_loss": 0.67159295, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69622779, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.17834473, + "step": 5377, + "time_per_iteration": 2.7722394466400146 + }, + { + "auxiliary_loss_clip": 0.01425436, + "auxiliary_loss_mlp": 0.01063604, + "balance_loss_clip": 1.28927612, + "balance_loss_mlp": 1.04497159, + "epoch": 0.3233428528483391, + "flos": 26328166227000.0, + "grad_norm": 1.412619236286747, + "language_loss": 0.72670913, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.75159949, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.1862793, + "step": 5378, + "time_per_iteration": 2.8077824115753174 + }, + { + "auxiliary_loss_clip": 0.01421445, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.28441608, + "balance_loss_mlp": 1.02146137, + "epoch": 0.32340297610100704, + "flos": 28591151287680.0, + "grad_norm": 1.7810124213219782, + "language_loss": 0.82258224, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84717607, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16491699, + "step": 5379, + "time_per_iteration": 2.8046257495880127 + }, + { + "auxiliary_loss_clip": 0.01430353, + "auxiliary_loss_mlp": 0.010411, + "balance_loss_clip": 1.29000783, + "balance_loss_mlp": 1.02423143, + "epoch": 0.323463099353675, + "flos": 30779938228680.0, + "grad_norm": 1.5200638162801294, + "language_loss": 0.79169315, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81640768, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.16882324, + "step": 5380, + "time_per_iteration": 2.8666422367095947 + }, + { + "auxiliary_loss_clip": 0.01430605, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.29291201, + "balance_loss_mlp": 1.02983069, + "epoch": 0.32352322260634303, + "flos": 25234117927560.0, + "grad_norm": 1.6225841304383959, + "language_loss": 0.7763921, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.80115366, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.15722656, + "step": 5381, + "time_per_iteration": 2.7961201667785645 + }, + { + "auxiliary_loss_clip": 0.01416344, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.28427792, + "balance_loss_mlp": 1.02806127, + "epoch": 0.323583345859011, + "flos": 23335626127680.0, + "grad_norm": 1.6828236174173545, + "language_loss": 0.72021985, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74481881, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.15490723, + "step": 5382, + "time_per_iteration": 2.781486988067627 + }, + { + "auxiliary_loss_clip": 0.0143426, + "auxiliary_loss_mlp": 0.01046318, + "balance_loss_clip": 1.29349244, + "balance_loss_mlp": 1.02916408, + "epoch": 0.32364346911167896, + "flos": 26212320911160.0, + "grad_norm": 2.027496603114915, + "language_loss": 0.70762855, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.73243439, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.17163086, + "step": 5383, + "time_per_iteration": 2.8173227310180664 + }, + { + "auxiliary_loss_clip": 0.01416063, + "auxiliary_loss_mlp": 0.01042321, + "balance_loss_clip": 1.28353584, + "balance_loss_mlp": 1.02632344, + "epoch": 0.3237035923643469, + "flos": 23701012772400.0, + "grad_norm": 1.5389143978351592, + "language_loss": 0.78478122, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80936503, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.16003418, + "step": 5384, + "time_per_iteration": 2.763904333114624 + }, + { + "auxiliary_loss_clip": 0.01435499, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.29477274, + "balance_loss_mlp": 1.03221464, + "epoch": 0.3237637156170149, + "flos": 14432813074800.0, + "grad_norm": 2.0651326190295003, + "language_loss": 0.75396287, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77881885, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.17871094, + "step": 5385, + "time_per_iteration": 2.7558228969573975 + }, + { + "auxiliary_loss_clip": 0.01425062, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.28965354, + "balance_loss_mlp": 1.02733052, + "epoch": 0.32382383886968286, + "flos": 31068812077200.0, + "grad_norm": 1.7879551652758006, + "language_loss": 0.72011721, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74480486, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.16363525, + "step": 5386, + "time_per_iteration": 2.845797300338745 + }, + { + "auxiliary_loss_clip": 0.01430852, + "auxiliary_loss_mlp": 0.0105087, + "balance_loss_clip": 1.29046166, + "balance_loss_mlp": 1.03371584, + "epoch": 0.3238839621223508, + "flos": 23261996525040.0, + "grad_norm": 1.7608821041673561, + "language_loss": 0.94322741, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96804464, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.17163086, + "step": 5387, + "time_per_iteration": 2.8760907649993896 + }, + { + "auxiliary_loss_clip": 0.01441559, + "auxiliary_loss_mlp": 0.01055149, + "balance_loss_clip": 1.30185926, + "balance_loss_mlp": 1.03732729, + "epoch": 0.3239440853750188, + "flos": 22969630357560.0, + "grad_norm": 1.9656915269454567, + "language_loss": 0.7745266, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79949367, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.17822266, + "step": 5388, + "time_per_iteration": 2.7415051460266113 + }, + { + "auxiliary_loss_clip": 0.01424093, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_clip": 1.28551018, + "balance_loss_mlp": 1.02496791, + "epoch": 0.32400420862768675, + "flos": 36253469003040.0, + "grad_norm": 1.8819511867224976, + "language_loss": 0.71744895, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74210745, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16772461, + "step": 5389, + "time_per_iteration": 2.9063172340393066 + }, + { + "auxiliary_loss_clip": 0.01425641, + "auxiliary_loss_mlp": 0.01047342, + "balance_loss_clip": 1.29081166, + "balance_loss_mlp": 1.03002059, + "epoch": 0.3240643318803547, + "flos": 21621533201280.0, + "grad_norm": 1.6306781027365629, + "language_loss": 0.81805265, + "learning_rate": 3.159411924656557e-06, + "loss": 0.8427825, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.1730957, + "step": 5390, + "time_per_iteration": 2.7619011402130127 + }, + { + "auxiliary_loss_clip": 0.01431769, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_clip": 1.29487491, + "balance_loss_mlp": 1.02847767, + "epoch": 0.3241244551330227, + "flos": 23300963569440.0, + "grad_norm": 1.8116501786983183, + "language_loss": 0.73463929, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75941598, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.17431641, + "step": 5391, + "time_per_iteration": 2.821682929992676 + }, + { + "auxiliary_loss_clip": 0.01425863, + "auxiliary_loss_mlp": 0.01046175, + "balance_loss_clip": 1.28939724, + "balance_loss_mlp": 1.02998602, + "epoch": 0.32418457838569065, + "flos": 14100870737520.0, + "grad_norm": 1.5310788720507031, + "language_loss": 0.77395129, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79867172, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.16186523, + "step": 5392, + "time_per_iteration": 2.753399133682251 + }, + { + "auxiliary_loss_clip": 0.01432577, + "auxiliary_loss_mlp": 0.0104895, + "balance_loss_clip": 1.29142952, + "balance_loss_mlp": 1.03040075, + "epoch": 0.3242447016383586, + "flos": 29758747973040.0, + "grad_norm": 1.866966691393777, + "language_loss": 0.62923527, + "learning_rate": 3.158459696652067e-06, + "loss": 0.65405059, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.18554688, + "step": 5393, + "time_per_iteration": 4.357086658477783 + }, + { + "auxiliary_loss_clip": 0.01429496, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.2928462, + "balance_loss_mlp": 1.02494812, + "epoch": 0.3243048248910266, + "flos": 24356369691360.0, + "grad_norm": 1.7291135563477622, + "language_loss": 0.82785732, + "learning_rate": 3.158142199443371e-06, + "loss": 0.85257435, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.17272949, + "step": 5394, + "time_per_iteration": 2.7889721393585205 + }, + { + "auxiliary_loss_clip": 0.01420219, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.28903842, + "balance_loss_mlp": 1.03354573, + "epoch": 0.3243649481436946, + "flos": 24358440717720.0, + "grad_norm": 1.6819256599090664, + "language_loss": 0.81895685, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.84365094, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15649414, + "step": 5395, + "time_per_iteration": 2.803663730621338 + }, + { + "auxiliary_loss_clip": 0.01423415, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.29178786, + "balance_loss_mlp": 1.02762604, + "epoch": 0.32442507139636256, + "flos": 22929810537600.0, + "grad_norm": 1.6429435138081534, + "language_loss": 0.83312738, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85780144, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.16369629, + "step": 5396, + "time_per_iteration": 2.7644739151000977 + }, + { + "auxiliary_loss_clip": 0.01434571, + "auxiliary_loss_mlp": 0.01046634, + "balance_loss_clip": 1.29432106, + "balance_loss_mlp": 1.02996826, + "epoch": 0.32448519464903053, + "flos": 22205290935600.0, + "grad_norm": 1.9684260848701993, + "language_loss": 0.76029813, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.78511024, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.16662598, + "step": 5397, + "time_per_iteration": 2.757512092590332 + }, + { + "auxiliary_loss_clip": 0.01427597, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.29388285, + "balance_loss_mlp": 1.02389479, + "epoch": 0.3245453179016985, + "flos": 18842450580000.0, + "grad_norm": 2.1067306222043176, + "language_loss": 0.67328918, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.69797873, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.17468262, + "step": 5398, + "time_per_iteration": 2.720353841781616 + }, + { + "auxiliary_loss_clip": 0.01428475, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.29266953, + "balance_loss_mlp": 1.01946485, + "epoch": 0.32460544115436646, + "flos": 21183329121120.0, + "grad_norm": 1.3598971683533867, + "language_loss": 0.73051912, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75516707, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.1685791, + "step": 5399, + "time_per_iteration": 4.233730316162109 + }, + { + "auxiliary_loss_clip": 0.01432386, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.29710233, + "balance_loss_mlp": 1.02673388, + "epoch": 0.3246655644070344, + "flos": 21986188895520.0, + "grad_norm": 2.1588807874684566, + "language_loss": 0.71719956, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.74196267, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.171875, + "step": 5400, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.01439051, + "auxiliary_loss_mlp": 0.01041935, + "balance_loss_clip": 1.29969442, + "balance_loss_mlp": 1.02523339, + "epoch": 0.3247256876597024, + "flos": 32165987220360.0, + "grad_norm": 2.379137661068694, + "language_loss": 0.80407536, + "learning_rate": 3.155918489984614e-06, + "loss": 0.8288852, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.16711426, + "step": 5401, + "time_per_iteration": 2.856987953186035 + }, + { + "auxiliary_loss_clip": 0.01439596, + "auxiliary_loss_mlp": 0.01045461, + "balance_loss_clip": 1.30247116, + "balance_loss_mlp": 1.0275197, + "epoch": 0.32478581091237035, + "flos": 21002666216760.0, + "grad_norm": 1.4927703971871902, + "language_loss": 0.87891644, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.90376699, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.17944336, + "step": 5402, + "time_per_iteration": 2.849883794784546 + }, + { + "auxiliary_loss_clip": 0.0142391, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.29275966, + "balance_loss_mlp": 1.02159023, + "epoch": 0.3248459341650383, + "flos": 17928699709680.0, + "grad_norm": 3.0225546823968865, + "language_loss": 0.8488059, + "learning_rate": 3.155282749751332e-06, + "loss": 0.87341642, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15563965, + "step": 5403, + "time_per_iteration": 2.700005531311035 + }, + { + "auxiliary_loss_clip": 0.01421073, + "auxiliary_loss_mlp": 0.01040595, + "balance_loss_clip": 1.2921648, + "balance_loss_mlp": 1.02577186, + "epoch": 0.3249060574177063, + "flos": 24541052823360.0, + "grad_norm": 2.0161747840631294, + "language_loss": 0.8779099, + "learning_rate": 3.154964813916007e-06, + "loss": 0.90252656, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.14825439, + "step": 5404, + "time_per_iteration": 2.7900378704071045 + }, + { + "auxiliary_loss_clip": 0.01427959, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.29566634, + "balance_loss_mlp": 1.02335238, + "epoch": 0.32496618067037425, + "flos": 26000365942440.0, + "grad_norm": 1.6424654458167236, + "language_loss": 0.73239863, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.7570765, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.16455078, + "step": 5405, + "time_per_iteration": 4.298320531845093 + }, + { + "auxiliary_loss_clip": 0.01424445, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.29252863, + "balance_loss_mlp": 1.02469492, + "epoch": 0.3250263039230422, + "flos": 19578584172960.0, + "grad_norm": 1.602502161091347, + "language_loss": 0.83176214, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85641718, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.16357422, + "step": 5406, + "time_per_iteration": 4.351953506469727 + }, + { + "auxiliary_loss_clip": 0.01428469, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.29699123, + "balance_loss_mlp": 1.02192163, + "epoch": 0.3250864271757102, + "flos": 16768006445520.0, + "grad_norm": 1.6349933306486586, + "language_loss": 0.88090587, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90556622, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15649414, + "step": 5407, + "time_per_iteration": 2.7155447006225586 + }, + { + "auxiliary_loss_clip": 0.01426954, + "auxiliary_loss_mlp": 0.0104023, + "balance_loss_clip": 1.29451728, + "balance_loss_mlp": 1.02411318, + "epoch": 0.3251465504283782, + "flos": 27825349964760.0, + "grad_norm": 1.3057221869414826, + "language_loss": 0.70019692, + "learning_rate": 3.153692632731479e-06, + "loss": 0.72486877, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.16113281, + "step": 5408, + "time_per_iteration": 2.8396410942077637 + }, + { + "auxiliary_loss_clip": 0.01436159, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.29682767, + "balance_loss_mlp": 1.02496612, + "epoch": 0.32520667368104617, + "flos": 19067928132600.0, + "grad_norm": 1.645525570403733, + "language_loss": 0.77875459, + "learning_rate": 3.153374478034841e-06, + "loss": 0.80352199, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.15612793, + "step": 5409, + "time_per_iteration": 2.7359957695007324 + }, + { + "auxiliary_loss_clip": 0.01436197, + "auxiliary_loss_mlp": 0.01053292, + "balance_loss_clip": 1.29952693, + "balance_loss_mlp": 1.03704429, + "epoch": 0.32526679693371413, + "flos": 29387432507760.0, + "grad_norm": 1.546333033616845, + "language_loss": 0.83027315, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85516798, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.16247559, + "step": 5410, + "time_per_iteration": 2.821920871734619 + }, + { + "auxiliary_loss_clip": 0.01415974, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.28673208, + "balance_loss_mlp": 1.01934481, + "epoch": 0.3253269201863821, + "flos": 20709447273720.0, + "grad_norm": 1.6405132992698779, + "language_loss": 0.7166248, + "learning_rate": 3.152738037445405e-06, + "loss": 0.74112809, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15002441, + "step": 5411, + "time_per_iteration": 2.8183162212371826 + }, + { + "auxiliary_loss_clip": 0.01426639, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.2941699, + "balance_loss_mlp": 1.02625084, + "epoch": 0.32538704343905006, + "flos": 29099898735120.0, + "grad_norm": 1.473335570242372, + "language_loss": 0.83447862, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85916078, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15319824, + "step": 5412, + "time_per_iteration": 2.8633463382720947 + }, + { + "auxiliary_loss_clip": 0.01433434, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.29760718, + "balance_loss_mlp": 1.02618766, + "epoch": 0.325447166691718, + "flos": 24680880698760.0, + "grad_norm": 2.32194153513843, + "language_loss": 0.81157935, + "learning_rate": 3.152101422008203e-06, + "loss": 0.83634305, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.16748047, + "step": 5413, + "time_per_iteration": 2.7712652683258057 + }, + { + "auxiliary_loss_clip": 0.01431262, + "auxiliary_loss_mlp": 0.01043865, + "balance_loss_clip": 1.29874015, + "balance_loss_mlp": 1.02684236, + "epoch": 0.325507289944386, + "flos": 21548106640440.0, + "grad_norm": 1.5137256170533222, + "language_loss": 0.76898152, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79373276, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.17016602, + "step": 5414, + "time_per_iteration": 2.7499876022338867 + }, + { + "auxiliary_loss_clip": 0.01280044, + "auxiliary_loss_mlp": 0.01013058, + "balance_loss_clip": 1.22412491, + "balance_loss_mlp": 1.0096966, + "epoch": 0.32556741319705396, + "flos": 71533689242400.0, + "grad_norm": 0.9090205990596669, + "language_loss": 0.63987041, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66280144, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.03369141, + "step": 5415, + "time_per_iteration": 3.197542428970337 + }, + { + "auxiliary_loss_clip": 0.01430719, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.29846907, + "balance_loss_mlp": 1.02251863, + "epoch": 0.3256275364497219, + "flos": 23737705748640.0, + "grad_norm": 2.386424850126258, + "language_loss": 0.74501354, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76969934, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15344238, + "step": 5416, + "time_per_iteration": 2.827545642852783 + }, + { + "auxiliary_loss_clip": 0.01278178, + "auxiliary_loss_mlp": 0.01002642, + "balance_loss_clip": 1.22136021, + "balance_loss_mlp": 0.99916154, + "epoch": 0.3256876597023899, + "flos": 67303983691080.0, + "grad_norm": 0.7806205915723791, + "language_loss": 0.57948643, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60229462, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03491211, + "step": 5417, + "time_per_iteration": 3.3366317749023438 + }, + { + "auxiliary_loss_clip": 0.01279361, + "auxiliary_loss_mlp": 0.01006014, + "balance_loss_clip": 1.22288728, + "balance_loss_mlp": 1.00231838, + "epoch": 0.32574778295505785, + "flos": 71299741523760.0, + "grad_norm": 0.8221231593633654, + "language_loss": 0.63529444, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65814817, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03686523, + "step": 5418, + "time_per_iteration": 3.3344573974609375 + }, + { + "auxiliary_loss_clip": 0.01419608, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.28847861, + "balance_loss_mlp": 1.02265358, + "epoch": 0.3258079062077258, + "flos": 20781168283440.0, + "grad_norm": 3.219146893374495, + "language_loss": 0.70257747, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.72715855, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15844727, + "step": 5419, + "time_per_iteration": 2.753199815750122 + }, + { + "auxiliary_loss_clip": 0.0142721, + "auxiliary_loss_mlp": 0.01038902, + "balance_loss_clip": 1.29431343, + "balance_loss_mlp": 1.02305877, + "epoch": 0.3258680294603938, + "flos": 22240400185800.0, + "grad_norm": 1.8953984259022514, + "language_loss": 0.77568007, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.80034119, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.15844727, + "step": 5420, + "time_per_iteration": 2.7994279861450195 + }, + { + "auxiliary_loss_clip": 0.01433618, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_clip": 1.29849458, + "balance_loss_mlp": 1.0328362, + "epoch": 0.3259281527130618, + "flos": 26985715997400.0, + "grad_norm": 1.594962163190284, + "language_loss": 0.80532902, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.83016795, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.17443848, + "step": 5421, + "time_per_iteration": 2.858672618865967 + }, + { + "auxiliary_loss_clip": 0.01423243, + "auxiliary_loss_mlp": 0.01041084, + "balance_loss_clip": 1.29339266, + "balance_loss_mlp": 1.02659416, + "epoch": 0.32598827596572977, + "flos": 26219955282840.0, + "grad_norm": 1.4304517340624463, + "language_loss": 0.7575447, + "learning_rate": 3.149234491389381e-06, + "loss": 0.78218806, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.1449585, + "step": 5422, + "time_per_iteration": 2.80973482131958 + }, + { + "auxiliary_loss_clip": 0.01428575, + "auxiliary_loss_mlp": 0.01049536, + "balance_loss_clip": 1.29518628, + "balance_loss_mlp": 1.03447974, + "epoch": 0.32604839921839773, + "flos": 17644292780760.0, + "grad_norm": 1.8983791856718268, + "language_loss": 0.63444543, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.65922654, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15063477, + "step": 5423, + "time_per_iteration": 2.761909246444702 + }, + { + "auxiliary_loss_clip": 0.01408826, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.28196144, + "balance_loss_mlp": 1.02447438, + "epoch": 0.3261085224710657, + "flos": 23627951686800.0, + "grad_norm": 1.4395604996792928, + "language_loss": 0.75118101, + "learning_rate": 3.148596916016224e-06, + "loss": 0.77565515, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.14117432, + "step": 5424, + "time_per_iteration": 2.793020009994507 + }, + { + "auxiliary_loss_clip": 0.01414843, + "auxiliary_loss_mlp": 0.01046307, + "balance_loss_clip": 1.28523922, + "balance_loss_mlp": 1.0312866, + "epoch": 0.32616864572373366, + "flos": 23265976144320.0, + "grad_norm": 1.6697516163767974, + "language_loss": 0.7708621, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79547358, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15014648, + "step": 5425, + "time_per_iteration": 2.8619511127471924 + }, + { + "auxiliary_loss_clip": 0.01426641, + "auxiliary_loss_mlp": 0.01046169, + "balance_loss_clip": 1.28929913, + "balance_loss_mlp": 1.02868128, + "epoch": 0.32622876897640163, + "flos": 25598976663600.0, + "grad_norm": 2.2324199157041438, + "language_loss": 0.78975868, + "learning_rate": 3.147959166423428e-06, + "loss": 0.81448674, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.17504883, + "step": 5426, + "time_per_iteration": 2.8400075435638428 + }, + { + "auxiliary_loss_clip": 0.01424128, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.29189181, + "balance_loss_mlp": 1.02071869, + "epoch": 0.3262888922290696, + "flos": 22424108717160.0, + "grad_norm": 1.5562594649258348, + "language_loss": 0.75000554, + "learning_rate": 3.147640226324893e-06, + "loss": 0.7746141, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.16015625, + "step": 5427, + "time_per_iteration": 2.867788791656494 + }, + { + "auxiliary_loss_clip": 0.01418786, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.2848413, + "balance_loss_mlp": 1.024382, + "epoch": 0.32634901548173756, + "flos": 19723650526800.0, + "grad_norm": 1.4617213092395116, + "language_loss": 0.7941941, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81878531, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.15930176, + "step": 5428, + "time_per_iteration": 2.7601158618927 + }, + { + "auxiliary_loss_clip": 0.01414825, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_clip": 1.28343225, + "balance_loss_mlp": 1.02559757, + "epoch": 0.3264091387344055, + "flos": 16147149651360.0, + "grad_norm": 1.5680474758396372, + "language_loss": 0.71723199, + "learning_rate": 3.147002215584023e-06, + "loss": 0.7417863, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.15002441, + "step": 5429, + "time_per_iteration": 2.8368473052978516 + }, + { + "auxiliary_loss_clip": 0.0141462, + "auxiliary_loss_mlp": 0.01040913, + "balance_loss_clip": 1.28280187, + "balance_loss_mlp": 1.02616668, + "epoch": 0.3264692619870735, + "flos": 16403431968000.0, + "grad_norm": 1.679117611259117, + "language_loss": 0.78812951, + "learning_rate": 3.146683144965881e-06, + "loss": 0.81268483, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.14746094, + "step": 5430, + "time_per_iteration": 2.771472930908203 + }, + { + "auxiliary_loss_clip": 0.01420557, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.28648043, + "balance_loss_mlp": 1.02377558, + "epoch": 0.32652938523974145, + "flos": 22387415740920.0, + "grad_norm": 1.9887572521858696, + "language_loss": 0.84454596, + "learning_rate": 3.146364030865399e-06, + "loss": 0.869151, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.16162109, + "step": 5431, + "time_per_iteration": 2.9722964763641357 + }, + { + "auxiliary_loss_clip": 0.01414304, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.2835716, + "balance_loss_mlp": 1.02252412, + "epoch": 0.3265895084924094, + "flos": 21913087201560.0, + "grad_norm": 1.4922789713661941, + "language_loss": 0.70217508, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72669429, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15075684, + "step": 5432, + "time_per_iteration": 4.2920143604278564 + }, + { + "auxiliary_loss_clip": 0.01418818, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.28540361, + "balance_loss_mlp": 1.01924169, + "epoch": 0.3266496317450774, + "flos": 16071002330400.0, + "grad_norm": 1.482718841117189, + "language_loss": 0.84114695, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86567688, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.14941406, + "step": 5433, + "time_per_iteration": 2.7524473667144775 + }, + { + "auxiliary_loss_clip": 0.01410357, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.28178036, + "balance_loss_mlp": 1.01716185, + "epoch": 0.3267097549977454, + "flos": 22533334870320.0, + "grad_norm": 1.462718814123076, + "language_loss": 0.85869682, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88311779, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.14587402, + "step": 5434, + "time_per_iteration": 2.779923677444458 + }, + { + "auxiliary_loss_clip": 0.01420278, + "auxiliary_loss_mlp": 0.01038948, + "balance_loss_clip": 1.28557467, + "balance_loss_mlp": 1.02360547, + "epoch": 0.32676987825041337, + "flos": 27276011138520.0, + "grad_norm": 1.814008951865092, + "language_loss": 0.88030612, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.90489841, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15319824, + "step": 5435, + "time_per_iteration": 2.8243916034698486 + }, + { + "auxiliary_loss_clip": 0.01410959, + "auxiliary_loss_mlp": 0.01032847, + "balance_loss_clip": 1.27984929, + "balance_loss_mlp": 1.01808882, + "epoch": 0.32683000150308134, + "flos": 11511750335040.0, + "grad_norm": 2.170235896057635, + "language_loss": 0.76889139, + "learning_rate": 3.144767808551479e-06, + "loss": 0.79332942, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.14758301, + "step": 5436, + "time_per_iteration": 2.7176852226257324 + }, + { + "auxiliary_loss_clip": 0.01411033, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.28098345, + "balance_loss_mlp": 1.01666641, + "epoch": 0.3268901247557493, + "flos": 25635791464920.0, + "grad_norm": 1.5129175571026194, + "language_loss": 0.72262776, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74705404, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.14929199, + "step": 5437, + "time_per_iteration": 2.830456495285034 + }, + { + "auxiliary_loss_clip": 0.01425656, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.28992033, + "balance_loss_mlp": 1.02207303, + "epoch": 0.32695024800841727, + "flos": 24865929306000.0, + "grad_norm": 1.6036165411419754, + "language_loss": 0.63925648, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66390544, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.17163086, + "step": 5438, + "time_per_iteration": 4.2694597244262695 + }, + { + "auxiliary_loss_clip": 0.01411959, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.28160393, + "balance_loss_mlp": 1.02271438, + "epoch": 0.32701037126108523, + "flos": 28845037711080.0, + "grad_norm": 1.6536308898336465, + "language_loss": 0.74511957, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76962084, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15454102, + "step": 5439, + "time_per_iteration": 2.8107244968414307 + }, + { + "auxiliary_loss_clip": 0.0141873, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.2857182, + "balance_loss_mlp": 1.02372324, + "epoch": 0.3270704945137532, + "flos": 27970944227280.0, + "grad_norm": 1.9683317657972057, + "language_loss": 0.74958849, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77417928, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.16638184, + "step": 5440, + "time_per_iteration": 2.860805034637451 + }, + { + "auxiliary_loss_clip": 0.0140999, + "auxiliary_loss_mlp": 0.01039626, + "balance_loss_clip": 1.28036094, + "balance_loss_mlp": 1.02472472, + "epoch": 0.32713061776642116, + "flos": 23695286993640.0, + "grad_norm": 2.899427616091861, + "language_loss": 0.85039103, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.87488723, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14916992, + "step": 5441, + "time_per_iteration": 2.753418207168579 + }, + { + "auxiliary_loss_clip": 0.01418514, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.2854619, + "balance_loss_mlp": 1.02006578, + "epoch": 0.3271907410190891, + "flos": 22460761085040.0, + "grad_norm": 2.595406600908372, + "language_loss": 0.86525977, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88980663, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.16101074, + "step": 5442, + "time_per_iteration": 2.82633113861084 + }, + { + "auxiliary_loss_clip": 0.01426342, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.29132199, + "balance_loss_mlp": 1.01700258, + "epoch": 0.3272508642717571, + "flos": 22825254345840.0, + "grad_norm": 1.7343968543409543, + "language_loss": 0.77700186, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.80160117, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.16601562, + "step": 5443, + "time_per_iteration": 2.7741856575012207 + }, + { + "auxiliary_loss_clip": 0.01420854, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.28538656, + "balance_loss_mlp": 1.0244503, + "epoch": 0.32731098752442506, + "flos": 11805050494800.0, + "grad_norm": 2.0457696856719836, + "language_loss": 0.82181191, + "learning_rate": 3.142211596174343e-06, + "loss": 0.84642565, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.1607666, + "step": 5444, + "time_per_iteration": 4.2440009117126465 + }, + { + "auxiliary_loss_clip": 0.01411275, + "auxiliary_loss_mlp": 0.01039614, + "balance_loss_clip": 1.27994287, + "balance_loss_mlp": 1.02389598, + "epoch": 0.327371110777093, + "flos": 21032090296560.0, + "grad_norm": 2.700962977316135, + "language_loss": 0.59396684, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61847574, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15716553, + "step": 5445, + "time_per_iteration": 4.250964403152466 + }, + { + "auxiliary_loss_clip": 0.01416664, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.28429592, + "balance_loss_mlp": 1.02332783, + "epoch": 0.327431234029761, + "flos": 19066750490160.0, + "grad_norm": 2.051940837730493, + "language_loss": 0.88603789, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.91060287, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.16497803, + "step": 5446, + "time_per_iteration": 2.759270191192627 + }, + { + "auxiliary_loss_clip": 0.01436489, + "auxiliary_loss_mlp": 0.01039768, + "balance_loss_clip": 1.29745066, + "balance_loss_mlp": 1.0222199, + "epoch": 0.32749135728242895, + "flos": 25854852896640.0, + "grad_norm": 1.6870208720497815, + "language_loss": 0.79221386, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81697643, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17529297, + "step": 5447, + "time_per_iteration": 2.8402597904205322 + }, + { + "auxiliary_loss_clip": 0.01411363, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.28032875, + "balance_loss_mlp": 1.02439725, + "epoch": 0.327551480535097, + "flos": 20125364672520.0, + "grad_norm": 1.8952469159658998, + "language_loss": 0.73536116, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75986135, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.14251709, + "step": 5448, + "time_per_iteration": 2.807842969894409 + }, + { + "auxiliary_loss_clip": 0.01416301, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.28565526, + "balance_loss_mlp": 1.02020955, + "epoch": 0.32761160378776494, + "flos": 28809441160560.0, + "grad_norm": 1.381429935679523, + "language_loss": 0.66764796, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69216943, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15637207, + "step": 5449, + "time_per_iteration": 2.804764986038208 + }, + { + "auxiliary_loss_clip": 0.01417097, + "auxiliary_loss_mlp": 0.01043491, + "balance_loss_clip": 1.28463161, + "balance_loss_mlp": 1.02818441, + "epoch": 0.3276717270404329, + "flos": 26942769333720.0, + "grad_norm": 1.5231110793828697, + "language_loss": 0.65893507, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.683541, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15307617, + "step": 5450, + "time_per_iteration": 2.8271069526672363 + }, + { + "auxiliary_loss_clip": 0.01417372, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.28391063, + "balance_loss_mlp": 1.02260423, + "epoch": 0.32773185029310087, + "flos": 25343993814480.0, + "grad_norm": 1.4161081980513486, + "language_loss": 0.77894151, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.80350077, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.1595459, + "step": 5451, + "time_per_iteration": 2.7892491817474365 + }, + { + "auxiliary_loss_clip": 0.01426038, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.29129744, + "balance_loss_mlp": 1.02181041, + "epoch": 0.32779197354576883, + "flos": 26396801001360.0, + "grad_norm": 2.8275090975093566, + "language_loss": 0.70298958, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72763216, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16418457, + "step": 5452, + "time_per_iteration": 2.886620044708252 + }, + { + "auxiliary_loss_clip": 0.01408029, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.27874506, + "balance_loss_mlp": 1.01929641, + "epoch": 0.3278520967984368, + "flos": 24904733916960.0, + "grad_norm": 1.5499227545468122, + "language_loss": 0.78840721, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.81282622, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.14581299, + "step": 5453, + "time_per_iteration": 2.797318458557129 + }, + { + "auxiliary_loss_clip": 0.01415246, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.28333592, + "balance_loss_mlp": 1.02104104, + "epoch": 0.32791222005110476, + "flos": 29759113448280.0, + "grad_norm": 1.770578207677099, + "language_loss": 0.75583673, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.78034782, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.14819336, + "step": 5454, + "time_per_iteration": 2.8337812423706055 + }, + { + "auxiliary_loss_clip": 0.01404796, + "auxiliary_loss_mlp": 0.01037923, + "balance_loss_clip": 1.27727854, + "balance_loss_mlp": 1.02434468, + "epoch": 0.32797234330377273, + "flos": 16512373862640.0, + "grad_norm": 1.8993880107838848, + "language_loss": 0.76754367, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79197085, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.13592529, + "step": 5455, + "time_per_iteration": 2.733771800994873 + }, + { + "auxiliary_loss_clip": 0.01415114, + "auxiliary_loss_mlp": 0.0104112, + "balance_loss_clip": 1.27987909, + "balance_loss_mlp": 1.0248239, + "epoch": 0.3280324665564407, + "flos": 26583514551360.0, + "grad_norm": 1.7877834245222821, + "language_loss": 0.7415508, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76611316, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1628418, + "step": 5456, + "time_per_iteration": 2.8067800998687744 + }, + { + "auxiliary_loss_clip": 0.0141588, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.28385079, + "balance_loss_mlp": 1.02553952, + "epoch": 0.32809258980910866, + "flos": 22935170841120.0, + "grad_norm": 1.5197975263917038, + "language_loss": 0.78819931, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.81276453, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.15100098, + "step": 5457, + "time_per_iteration": 2.783268451690674 + }, + { + "auxiliary_loss_clip": 0.01425207, + "auxiliary_loss_mlp": 0.01038168, + "balance_loss_clip": 1.28782618, + "balance_loss_mlp": 1.0228858, + "epoch": 0.3281527130617766, + "flos": 22789008061560.0, + "grad_norm": 2.1780957066358613, + "language_loss": 0.79510641, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81974018, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15283203, + "step": 5458, + "time_per_iteration": 2.793475389480591 + }, + { + "auxiliary_loss_clip": 0.01409758, + "auxiliary_loss_mlp": 0.01040429, + "balance_loss_clip": 1.27789712, + "balance_loss_mlp": 1.02480102, + "epoch": 0.3282128363144446, + "flos": 21255537431160.0, + "grad_norm": 1.7397753290683173, + "language_loss": 0.73353702, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75803888, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.15661621, + "step": 5459, + "time_per_iteration": 2.762704372406006 + }, + { + "auxiliary_loss_clip": 0.01420394, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.28473234, + "balance_loss_mlp": 1.02181435, + "epoch": 0.32827295956711255, + "flos": 30848735436480.0, + "grad_norm": 1.7702000846984072, + "language_loss": 0.84355021, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86813194, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.15966797, + "step": 5460, + "time_per_iteration": 2.8445186614990234 + }, + { + "auxiliary_loss_clip": 0.0140927, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.27700162, + "balance_loss_mlp": 1.02794421, + "epoch": 0.3283330828197806, + "flos": 25919711093520.0, + "grad_norm": 1.6934001394752358, + "language_loss": 0.76968491, + "learning_rate": 3.136770448642288e-06, + "loss": 0.79420471, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.14758301, + "step": 5461, + "time_per_iteration": 2.7928388118743896 + }, + { + "auxiliary_loss_clip": 0.01412124, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.27967119, + "balance_loss_mlp": 1.01904082, + "epoch": 0.32839320607244854, + "flos": 38589271499160.0, + "grad_norm": 1.920295438466639, + "language_loss": 0.63172758, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65622306, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.18395996, + "step": 5462, + "time_per_iteration": 2.933511734008789 + }, + { + "auxiliary_loss_clip": 0.01401324, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.27156281, + "balance_loss_mlp": 1.02495849, + "epoch": 0.3284533293251165, + "flos": 26656494420240.0, + "grad_norm": 1.4176201053804636, + "language_loss": 0.78741705, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.81183296, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15283203, + "step": 5463, + "time_per_iteration": 2.831855058670044 + }, + { + "auxiliary_loss_clip": 0.01414979, + "auxiliary_loss_mlp": 0.01041851, + "balance_loss_clip": 1.28327274, + "balance_loss_mlp": 1.02439916, + "epoch": 0.32851345257778447, + "flos": 15308327851200.0, + "grad_norm": 1.823583938294417, + "language_loss": 0.69759572, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72216403, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.17456055, + "step": 5464, + "time_per_iteration": 2.740069627761841 + }, + { + "auxiliary_loss_clip": 0.01409205, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.27998781, + "balance_loss_mlp": 1.0222851, + "epoch": 0.32857357583045244, + "flos": 23519009792160.0, + "grad_norm": 1.6064293807556949, + "language_loss": 0.72419453, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74866945, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16003418, + "step": 5465, + "time_per_iteration": 2.7800791263580322 + }, + { + "auxiliary_loss_clip": 0.01412497, + "auxiliary_loss_mlp": 0.01041178, + "balance_loss_clip": 1.2802428, + "balance_loss_mlp": 1.02523971, + "epoch": 0.3286336990831204, + "flos": 21000026673360.0, + "grad_norm": 1.5224355600280475, + "language_loss": 0.83233285, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85686964, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.15917969, + "step": 5466, + "time_per_iteration": 2.789077043533325 + }, + { + "auxiliary_loss_clip": 0.0141608, + "auxiliary_loss_mlp": 0.01040005, + "balance_loss_clip": 1.28217649, + "balance_loss_mlp": 1.02467513, + "epoch": 0.32869382233578837, + "flos": 23663994929280.0, + "grad_norm": 1.9150208499954813, + "language_loss": 0.79365039, + "learning_rate": 3.134847066213879e-06, + "loss": 0.8182112, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.15319824, + "step": 5467, + "time_per_iteration": 2.752811908721924 + }, + { + "auxiliary_loss_clip": 0.01415794, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.28240824, + "balance_loss_mlp": 1.01957095, + "epoch": 0.32875394558845633, + "flos": 25341841571400.0, + "grad_norm": 2.1181000489587216, + "language_loss": 0.74818385, + "learning_rate": 3.134526351787587e-06, + "loss": 0.77270496, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.16748047, + "step": 5468, + "time_per_iteration": 2.8506948947906494 + }, + { + "auxiliary_loss_clip": 0.01427617, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.28947711, + "balance_loss_mlp": 1.0224818, + "epoch": 0.3288140688411243, + "flos": 14907263439240.0, + "grad_norm": 1.7345312917698614, + "language_loss": 0.79090756, + "learning_rate": 3.134205594339942e-06, + "loss": 0.81558669, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.17822266, + "step": 5469, + "time_per_iteration": 2.7680232524871826 + }, + { + "auxiliary_loss_clip": 0.01415435, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.28233123, + "balance_loss_mlp": 1.02621961, + "epoch": 0.32887419209379226, + "flos": 18555932016360.0, + "grad_norm": 1.6618900231049722, + "language_loss": 0.8171066, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84167755, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.15454102, + "step": 5470, + "time_per_iteration": 2.7653050422668457 + }, + { + "auxiliary_loss_clip": 0.01414973, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.27998686, + "balance_loss_mlp": 1.02553296, + "epoch": 0.3289343153464602, + "flos": 48114159597000.0, + "grad_norm": 1.764610643662715, + "language_loss": 0.68175888, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.70633191, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.16796875, + "step": 5471, + "time_per_iteration": 4.374630451202393 + }, + { + "auxiliary_loss_clip": 0.01429584, + "auxiliary_loss_mlp": 0.0105302, + "balance_loss_clip": 1.29026449, + "balance_loss_mlp": 1.03463805, + "epoch": 0.3289944385991282, + "flos": 27606126099600.0, + "grad_norm": 1.5946266516994603, + "language_loss": 0.65182924, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67665529, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.18383789, + "step": 5472, + "time_per_iteration": 2.804840326309204 + }, + { + "auxiliary_loss_clip": 0.01422575, + "auxiliary_loss_mlp": 0.01042423, + "balance_loss_clip": 1.28705025, + "balance_loss_mlp": 1.02485156, + "epoch": 0.32905456185179616, + "flos": 20125121022360.0, + "grad_norm": 1.673153530554181, + "language_loss": 0.88233519, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90698516, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.17553711, + "step": 5473, + "time_per_iteration": 2.7940120697021484 + }, + { + "auxiliary_loss_clip": 0.01422591, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.28588986, + "balance_loss_mlp": 1.0226928, + "epoch": 0.3291146851044642, + "flos": 23184793386720.0, + "grad_norm": 3.0717733315570417, + "language_loss": 0.79405296, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.818685, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.17932129, + "step": 5474, + "time_per_iteration": 2.763259172439575 + }, + { + "auxiliary_loss_clip": 0.01285048, + "auxiliary_loss_mlp": 0.01017083, + "balance_loss_clip": 1.22879219, + "balance_loss_mlp": 1.01420963, + "epoch": 0.32917480835713214, + "flos": 67637306712600.0, + "grad_norm": 0.816931335025034, + "language_loss": 0.60297, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62599128, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.02868652, + "step": 5475, + "time_per_iteration": 3.250826358795166 + }, + { + "auxiliary_loss_clip": 0.01426738, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.28673959, + "balance_loss_mlp": 1.02240121, + "epoch": 0.3292349316098001, + "flos": 27970335101880.0, + "grad_norm": 3.8706459023406925, + "language_loss": 0.77057785, + "learning_rate": 3.131959088630455e-06, + "loss": 0.79524314, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1739502, + "step": 5476, + "time_per_iteration": 2.8439371585845947 + }, + { + "auxiliary_loss_clip": 0.0141622, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.28308749, + "balance_loss_mlp": 1.02544582, + "epoch": 0.3292950548624681, + "flos": 20268156958200.0, + "grad_norm": 1.832989796187697, + "language_loss": 0.75005454, + "learning_rate": 3.131637987449997e-06, + "loss": 0.77462906, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15783691, + "step": 5477, + "time_per_iteration": 4.379128694534302 + }, + { + "auxiliary_loss_clip": 0.01399648, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.27200866, + "balance_loss_mlp": 1.01964951, + "epoch": 0.32935517811513604, + "flos": 20817820651320.0, + "grad_norm": 2.1617002278748765, + "language_loss": 0.75679159, + "learning_rate": 3.131316843357713e-06, + "loss": 0.78112507, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.14044189, + "step": 5478, + "time_per_iteration": 2.7343151569366455 + }, + { + "auxiliary_loss_clip": 0.01405895, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.27503109, + "balance_loss_mlp": 1.01917148, + "epoch": 0.329415301367804, + "flos": 18446421604680.0, + "grad_norm": 1.8011632172741872, + "language_loss": 0.80648285, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.83089697, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16369629, + "step": 5479, + "time_per_iteration": 2.754595994949341 + }, + { + "auxiliary_loss_clip": 0.01275191, + "auxiliary_loss_mlp": 0.01000451, + "balance_loss_clip": 1.21803784, + "balance_loss_mlp": 0.99718422, + "epoch": 0.32947542462047197, + "flos": 66340156066920.0, + "grad_norm": 0.7534562431928485, + "language_loss": 0.56625557, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58901203, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03271484, + "step": 5480, + "time_per_iteration": 3.348768711090088 + }, + { + "auxiliary_loss_clip": 0.0140994, + "auxiliary_loss_mlp": 0.01039115, + "balance_loss_clip": 1.27650607, + "balance_loss_mlp": 1.02267635, + "epoch": 0.32953554787313993, + "flos": 23226806058120.0, + "grad_norm": 1.7054448064427397, + "language_loss": 0.77902102, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.80351162, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16455078, + "step": 5481, + "time_per_iteration": 2.767414093017578 + }, + { + "auxiliary_loss_clip": 0.01416976, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.28037322, + "balance_loss_mlp": 1.02249694, + "epoch": 0.3295956711258079, + "flos": 27014327910000.0, + "grad_norm": 1.5315686767960213, + "language_loss": 0.78400785, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80856538, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.16271973, + "step": 5482, + "time_per_iteration": 4.2691144943237305 + }, + { + "auxiliary_loss_clip": 0.01415528, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.28034067, + "balance_loss_mlp": 1.02538943, + "epoch": 0.32965579437847586, + "flos": 19176382726920.0, + "grad_norm": 1.765370669803955, + "language_loss": 0.74051023, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76509178, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.17224121, + "step": 5483, + "time_per_iteration": 2.824215888977051 + }, + { + "auxiliary_loss_clip": 0.01415193, + "auxiliary_loss_mlp": 0.01040357, + "balance_loss_clip": 1.28148437, + "balance_loss_mlp": 1.02561069, + "epoch": 0.32971591763114383, + "flos": 30488709095280.0, + "grad_norm": 1.569056932402838, + "language_loss": 0.7567727, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.7813282, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.14746094, + "step": 5484, + "time_per_iteration": 4.3933188915252686 + }, + { + "auxiliary_loss_clip": 0.01407898, + "auxiliary_loss_mlp": 0.01044509, + "balance_loss_clip": 1.27538717, + "balance_loss_mlp": 1.02795053, + "epoch": 0.3297760408838118, + "flos": 16293921556320.0, + "grad_norm": 5.802085683750724, + "language_loss": 0.7198776, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74440169, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.16564941, + "step": 5485, + "time_per_iteration": 2.7872345447540283 + }, + { + "auxiliary_loss_clip": 0.01412108, + "auxiliary_loss_mlp": 0.0104278, + "balance_loss_clip": 1.27922225, + "balance_loss_mlp": 1.02746177, + "epoch": 0.32983616413647976, + "flos": 29536600305960.0, + "grad_norm": 1.5980894244169794, + "language_loss": 0.80298847, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82753736, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.15319824, + "step": 5486, + "time_per_iteration": 2.8155336380004883 + }, + { + "auxiliary_loss_clip": 0.01413563, + "auxiliary_loss_mlp": 0.01044862, + "balance_loss_clip": 1.28191113, + "balance_loss_mlp": 1.02899504, + "epoch": 0.3298962873891478, + "flos": 20636304971400.0, + "grad_norm": 2.149753089209199, + "language_loss": 0.84841561, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.87299985, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.15856934, + "step": 5487, + "time_per_iteration": 2.7993853092193604 + }, + { + "auxiliary_loss_clip": 0.01421272, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_clip": 1.28373206, + "balance_loss_mlp": 1.03028035, + "epoch": 0.32995641064181574, + "flos": 14980162091400.0, + "grad_norm": 2.146834804065882, + "language_loss": 0.74170673, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76639295, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.17053223, + "step": 5488, + "time_per_iteration": 2.788973093032837 + }, + { + "auxiliary_loss_clip": 0.01416701, + "auxiliary_loss_mlp": 0.01056122, + "balance_loss_clip": 1.28255892, + "balance_loss_mlp": 1.03917027, + "epoch": 0.3300165338944837, + "flos": 18666132770160.0, + "grad_norm": 2.4857446843395743, + "language_loss": 0.7336936, + "learning_rate": 3.127781429646098e-06, + "loss": 0.7584219, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.16955566, + "step": 5489, + "time_per_iteration": 2.760995864868164 + }, + { + "auxiliary_loss_clip": 0.01414916, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.28146636, + "balance_loss_mlp": 1.02982426, + "epoch": 0.3300766571471517, + "flos": 25588012406400.0, + "grad_norm": 2.6722047838956673, + "language_loss": 0.89240688, + "learning_rate": 3.127459771562238e-06, + "loss": 0.91701341, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.15893555, + "step": 5490, + "time_per_iteration": 2.791734218597412 + }, + { + "auxiliary_loss_clip": 0.01413795, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.28124213, + "balance_loss_mlp": 1.02712035, + "epoch": 0.33013678039981964, + "flos": 11367130673160.0, + "grad_norm": 1.7516121691808253, + "language_loss": 0.8327384, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85729665, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.14910889, + "step": 5491, + "time_per_iteration": 2.7152764797210693 + }, + { + "auxiliary_loss_clip": 0.0140964, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.27618492, + "balance_loss_mlp": 1.03432512, + "epoch": 0.3301969036524876, + "flos": 24825865835880.0, + "grad_norm": 1.7182453207398627, + "language_loss": 0.77827269, + "learning_rate": 3.126816327146554e-06, + "loss": 0.80286801, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.15557861, + "step": 5492, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.01426805, + "auxiliary_loss_mlp": 0.01057302, + "balance_loss_clip": 1.28773212, + "balance_loss_mlp": 1.0402317, + "epoch": 0.33025702690515557, + "flos": 15965430929640.0, + "grad_norm": 2.088859754082377, + "language_loss": 0.75444698, + "learning_rate": 3.12649454083913e-06, + "loss": 0.77928805, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.17047119, + "step": 5493, + "time_per_iteration": 2.7420058250427246 + }, + { + "auxiliary_loss_clip": 0.0127196, + "auxiliary_loss_mlp": 0.01012984, + "balance_loss_clip": 1.21482062, + "balance_loss_mlp": 1.01029003, + "epoch": 0.33031715015782354, + "flos": 59431619599920.0, + "grad_norm": 0.783301710686692, + "language_loss": 0.5394749, + "learning_rate": 3.12617271181492e-06, + "loss": 0.5623244, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.02697754, + "step": 5494, + "time_per_iteration": 3.214735507965088 + }, + { + "auxiliary_loss_clip": 0.014168, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_clip": 1.28245413, + "balance_loss_mlp": 1.03017449, + "epoch": 0.3303772734104915, + "flos": 23189463348120.0, + "grad_norm": 1.4263335943004625, + "language_loss": 0.86880898, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89344931, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.17028809, + "step": 5495, + "time_per_iteration": 2.7804980278015137 + }, + { + "auxiliary_loss_clip": 0.01420812, + "auxiliary_loss_mlp": 0.01052781, + "balance_loss_clip": 1.28297615, + "balance_loss_mlp": 1.03489971, + "epoch": 0.33043739666315947, + "flos": 33078276189720.0, + "grad_norm": 2.192979421841689, + "language_loss": 0.73444659, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75918251, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.17883301, + "step": 5496, + "time_per_iteration": 2.826897621154785 + }, + { + "auxiliary_loss_clip": 0.01412117, + "auxiliary_loss_mlp": 0.01043865, + "balance_loss_clip": 1.27923632, + "balance_loss_mlp": 1.02772367, + "epoch": 0.33049751991582743, + "flos": 24900266997360.0, + "grad_norm": 1.9793704102234064, + "language_loss": 0.72472352, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74928337, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.16137695, + "step": 5497, + "time_per_iteration": 2.7988595962524414 + }, + { + "auxiliary_loss_clip": 0.01419355, + "auxiliary_loss_mlp": 0.01041317, + "balance_loss_clip": 1.28633475, + "balance_loss_mlp": 1.02488983, + "epoch": 0.3305576431684954, + "flos": 29466584847360.0, + "grad_norm": 1.6961471148904324, + "language_loss": 0.80696768, + "learning_rate": 3.124884968794321e-06, + "loss": 0.83157444, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.16418457, + "step": 5498, + "time_per_iteration": 2.793084144592285 + }, + { + "auxiliary_loss_clip": 0.01419307, + "auxiliary_loss_mlp": 0.01048868, + "balance_loss_clip": 1.28330827, + "balance_loss_mlp": 1.0309155, + "epoch": 0.33061776642116336, + "flos": 22636713419640.0, + "grad_norm": 2.3362229077187417, + "language_loss": 0.76410484, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78878665, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.1796875, + "step": 5499, + "time_per_iteration": 2.772425651550293 + }, + { + "auxiliary_loss_clip": 0.01418117, + "auxiliary_loss_mlp": 0.01044683, + "balance_loss_clip": 1.28392553, + "balance_loss_mlp": 1.02789867, + "epoch": 0.3306778896738313, + "flos": 25781954244480.0, + "grad_norm": 1.6364749662052434, + "language_loss": 0.79326832, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81789637, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.16772461, + "step": 5500, + "time_per_iteration": 2.8298988342285156 + }, + { + "auxiliary_loss_clip": 0.01419885, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.28476489, + "balance_loss_mlp": 1.01671255, + "epoch": 0.33073801292649935, + "flos": 36947589924600.0, + "grad_norm": 1.781263949464395, + "language_loss": 0.6633327, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68786722, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1685791, + "step": 5501, + "time_per_iteration": 2.921577215194702 + }, + { + "auxiliary_loss_clip": 0.01421537, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.28526294, + "balance_loss_mlp": 1.02133262, + "epoch": 0.3307981361791673, + "flos": 12971428929360.0, + "grad_norm": 2.0642858826023818, + "language_loss": 0.77501279, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79962033, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.17883301, + "step": 5502, + "time_per_iteration": 2.7318527698516846 + }, + { + "auxiliary_loss_clip": 0.01423936, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.28848135, + "balance_loss_mlp": 1.02352214, + "epoch": 0.3308582594318353, + "flos": 25379874623520.0, + "grad_norm": 1.5053924924372062, + "language_loss": 0.72810137, + "learning_rate": 3.123274330355824e-06, + "loss": 0.7527492, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.17321777, + "step": 5503, + "time_per_iteration": 2.826758623123169 + }, + { + "auxiliary_loss_clip": 0.01415715, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.28170586, + "balance_loss_mlp": 1.0221076, + "epoch": 0.33091838268450324, + "flos": 26474166573120.0, + "grad_norm": 1.4888108978993835, + "language_loss": 0.75028354, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77483261, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.17077637, + "step": 5504, + "time_per_iteration": 2.8664462566375732 + }, + { + "auxiliary_loss_clip": 0.01414753, + "auxiliary_loss_mlp": 0.01040053, + "balance_loss_clip": 1.28053379, + "balance_loss_mlp": 1.02388859, + "epoch": 0.3309785059371712, + "flos": 24976373709960.0, + "grad_norm": 2.0245972988253116, + "language_loss": 0.70349628, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72804439, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.16162109, + "step": 5505, + "time_per_iteration": 2.8166418075561523 + }, + { + "auxiliary_loss_clip": 0.01419246, + "auxiliary_loss_mlp": 0.01043254, + "balance_loss_clip": 1.28591967, + "balance_loss_mlp": 1.02698231, + "epoch": 0.3310386291898392, + "flos": 20451296972520.0, + "grad_norm": 1.8170107901157833, + "language_loss": 0.81998801, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84461308, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.16271973, + "step": 5506, + "time_per_iteration": 2.8672568798065186 + }, + { + "auxiliary_loss_clip": 0.01418241, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.2858808, + "balance_loss_mlp": 1.02328801, + "epoch": 0.33109875244250714, + "flos": 23187554755200.0, + "grad_norm": 1.7290074114598448, + "language_loss": 0.78983986, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81442189, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.16687012, + "step": 5507, + "time_per_iteration": 2.759918451309204 + }, + { + "auxiliary_loss_clip": 0.0141268, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.27886152, + "balance_loss_mlp": 1.02316737, + "epoch": 0.3311588756951751, + "flos": 24173473327200.0, + "grad_norm": 1.476612560096688, + "language_loss": 0.71638477, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.74090517, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.1619873, + "step": 5508, + "time_per_iteration": 2.8794479370117188 + }, + { + "auxiliary_loss_clip": 0.01407577, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.27759123, + "balance_loss_mlp": 1.01749849, + "epoch": 0.33121899894784307, + "flos": 28151404089840.0, + "grad_norm": 1.9631863428287355, + "language_loss": 0.72155213, + "learning_rate": 3.12134015873989e-06, + "loss": 0.7459597, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.15673828, + "step": 5509, + "time_per_iteration": 2.8169872760772705 + }, + { + "auxiliary_loss_clip": 0.01416072, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.28534508, + "balance_loss_mlp": 1.01938391, + "epoch": 0.33127912220051103, + "flos": 29573374498920.0, + "grad_norm": 1.562372728685872, + "language_loss": 0.73496979, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75948489, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.16052246, + "step": 5510, + "time_per_iteration": 4.354275703430176 + }, + { + "auxiliary_loss_clip": 0.01408814, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.27757061, + "balance_loss_mlp": 1.02408433, + "epoch": 0.331339245453179, + "flos": 14432934899880.0, + "grad_norm": 2.3722306758446665, + "language_loss": 0.87925982, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90374714, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15820312, + "step": 5511, + "time_per_iteration": 2.7825841903686523 + }, + { + "auxiliary_loss_clip": 0.01406237, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.28118312, + "balance_loss_mlp": 1.01601553, + "epoch": 0.33139936870584696, + "flos": 20892099987720.0, + "grad_norm": 1.6330933992689363, + "language_loss": 0.73349881, + "learning_rate": 3.12037249872891e-06, + "loss": 0.75786877, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14758301, + "step": 5512, + "time_per_iteration": 2.817033052444458 + }, + { + "auxiliary_loss_clip": 0.01407441, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.27954936, + "balance_loss_mlp": 1.02159858, + "epoch": 0.33145949195851493, + "flos": 36291380230080.0, + "grad_norm": 1.660971209529301, + "language_loss": 0.73053861, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.75499344, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.16442871, + "step": 5513, + "time_per_iteration": 2.9211831092834473 + }, + { + "auxiliary_loss_clip": 0.01419563, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.28706408, + "balance_loss_mlp": 1.01650119, + "epoch": 0.33151961521118295, + "flos": 14283401626440.0, + "grad_norm": 1.9397612659941519, + "language_loss": 0.68764037, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.71216846, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.1673584, + "step": 5514, + "time_per_iteration": 2.863860607147217 + }, + { + "auxiliary_loss_clip": 0.0141749, + "auxiliary_loss_mlp": 0.0104101, + "balance_loss_clip": 1.28414357, + "balance_loss_mlp": 1.02356935, + "epoch": 0.3315797384638509, + "flos": 20778853606920.0, + "grad_norm": 2.260010495374644, + "language_loss": 0.66813302, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.69271803, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.17419434, + "step": 5515, + "time_per_iteration": 4.160696029663086 + }, + { + "auxiliary_loss_clip": 0.01418479, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.28529263, + "balance_loss_mlp": 1.02168751, + "epoch": 0.3316398617165189, + "flos": 24684657276240.0, + "grad_norm": 1.7811441609949308, + "language_loss": 0.69383717, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71839654, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.15759277, + "step": 5516, + "time_per_iteration": 2.796098232269287 + }, + { + "auxiliary_loss_clip": 0.01422282, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.28673899, + "balance_loss_mlp": 1.02253366, + "epoch": 0.33169998496918685, + "flos": 18592584384240.0, + "grad_norm": 1.8801646579060214, + "language_loss": 0.80686796, + "learning_rate": 3.118758882514359e-06, + "loss": 0.83147883, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.16271973, + "step": 5517, + "time_per_iteration": 2.7402172088623047 + }, + { + "auxiliary_loss_clip": 0.01404849, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.27813542, + "balance_loss_mlp": 1.01879907, + "epoch": 0.3317601082218548, + "flos": 20198506974840.0, + "grad_norm": 1.9010208598134575, + "language_loss": 0.74881321, + "learning_rate": 3.118436031952143e-06, + "loss": 0.77319717, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.1473999, + "step": 5518, + "time_per_iteration": 2.7865335941314697 + }, + { + "auxiliary_loss_clip": 0.01281418, + "auxiliary_loss_mlp": 0.01006461, + "balance_loss_clip": 1.22589612, + "balance_loss_mlp": 1.00361204, + "epoch": 0.3318202314745228, + "flos": 68990642347320.0, + "grad_norm": 0.6193413010769775, + "language_loss": 0.54311275, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56599152, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.02844238, + "step": 5519, + "time_per_iteration": 3.477592945098877 + }, + { + "auxiliary_loss_clip": 0.0141431, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.28489637, + "balance_loss_mlp": 1.02296257, + "epoch": 0.33188035472719074, + "flos": 21504225984480.0, + "grad_norm": 2.1684315961338845, + "language_loss": 0.79070365, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81523776, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16149902, + "step": 5520, + "time_per_iteration": 2.7429165840148926 + }, + { + "auxiliary_loss_clip": 0.01412495, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.28376603, + "balance_loss_mlp": 1.02095437, + "epoch": 0.3319404779798587, + "flos": 28875598824960.0, + "grad_norm": 2.4216106272222575, + "language_loss": 0.76518035, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.789671, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15625, + "step": 5521, + "time_per_iteration": 4.224761247634888 + }, + { + "auxiliary_loss_clip": 0.01418565, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.28565431, + "balance_loss_mlp": 1.02671289, + "epoch": 0.33200060123252667, + "flos": 23082267612960.0, + "grad_norm": 2.02281475819462, + "language_loss": 0.70994818, + "learning_rate": 3.117144205713664e-06, + "loss": 0.73457229, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.17114258, + "step": 5522, + "time_per_iteration": 4.340726375579834 + }, + { + "auxiliary_loss_clip": 0.0141082, + "auxiliary_loss_mlp": 0.01039097, + "balance_loss_clip": 1.2816453, + "balance_loss_mlp": 1.02369463, + "epoch": 0.33206072448519464, + "flos": 21147488920440.0, + "grad_norm": 1.6054734351345445, + "language_loss": 0.74197584, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76647502, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.1541748, + "step": 5523, + "time_per_iteration": 2.8267276287078857 + }, + { + "auxiliary_loss_clip": 0.01403701, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.27686214, + "balance_loss_mlp": 1.01947761, + "epoch": 0.3321208477378626, + "flos": 13083050975760.0, + "grad_norm": 1.9104251833981725, + "language_loss": 0.8211174, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84550142, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15209961, + "step": 5524, + "time_per_iteration": 2.8403513431549072 + }, + { + "auxiliary_loss_clip": 0.0140563, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.27763963, + "balance_loss_mlp": 1.01885974, + "epoch": 0.33218097099053057, + "flos": 21220428180960.0, + "grad_norm": 1.6882831408351222, + "language_loss": 0.82452279, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84890717, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.1394043, + "step": 5525, + "time_per_iteration": 2.8311774730682373 + }, + { + "auxiliary_loss_clip": 0.01280684, + "auxiliary_loss_mlp": 0.01018146, + "balance_loss_clip": 1.22433615, + "balance_loss_mlp": 1.01529694, + "epoch": 0.33224109424319853, + "flos": 64365354512640.0, + "grad_norm": 0.7676462891052481, + "language_loss": 0.52607226, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54906058, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.02844238, + "step": 5526, + "time_per_iteration": 3.235513925552368 + }, + { + "auxiliary_loss_clip": 0.01414687, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.28314996, + "balance_loss_mlp": 1.02479982, + "epoch": 0.33230121749586655, + "flos": 17350261670520.0, + "grad_norm": 2.129585551240399, + "language_loss": 0.77590311, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.80045938, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.16137695, + "step": 5527, + "time_per_iteration": 2.8188023567199707 + }, + { + "auxiliary_loss_clip": 0.0140888, + "auxiliary_loss_mlp": 0.01040631, + "balance_loss_clip": 1.28104866, + "balance_loss_mlp": 1.02577782, + "epoch": 0.3323613407485345, + "flos": 21002097699720.0, + "grad_norm": 1.7760702265462653, + "language_loss": 0.71851289, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74300796, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.14855957, + "step": 5528, + "time_per_iteration": 2.834538459777832 + }, + { + "auxiliary_loss_clip": 0.01404394, + "auxiliary_loss_mlp": 0.01037096, + "balance_loss_clip": 1.27416265, + "balance_loss_mlp": 1.02156341, + "epoch": 0.3324214640012025, + "flos": 13156558753320.0, + "grad_norm": 1.7173572092848457, + "language_loss": 0.82975739, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.85417223, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15545654, + "step": 5529, + "time_per_iteration": 2.7777531147003174 + }, + { + "auxiliary_loss_clip": 0.01413224, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.28030193, + "balance_loss_mlp": 1.02277184, + "epoch": 0.33248158725387045, + "flos": 22278676888080.0, + "grad_norm": 2.4385348264986733, + "language_loss": 0.69878119, + "learning_rate": 3.114558520634423e-06, + "loss": 0.72328711, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.14587402, + "step": 5530, + "time_per_iteration": 2.7699623107910156 + }, + { + "auxiliary_loss_clip": 0.01409726, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.27902126, + "balance_loss_mlp": 1.02930295, + "epoch": 0.3325417105065384, + "flos": 20745937208160.0, + "grad_norm": 2.839307364285173, + "language_loss": 0.77307308, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.79762876, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.1652832, + "step": 5531, + "time_per_iteration": 2.741952657699585 + }, + { + "auxiliary_loss_clip": 0.01410001, + "auxiliary_loss_mlp": 0.01042195, + "balance_loss_clip": 1.27900612, + "balance_loss_mlp": 1.0270673, + "epoch": 0.3326018337592064, + "flos": 24795954455760.0, + "grad_norm": 1.7592228655854634, + "language_loss": 0.73426569, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75878763, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15136719, + "step": 5532, + "time_per_iteration": 2.793534994125366 + }, + { + "auxiliary_loss_clip": 0.01406908, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.2779305, + "balance_loss_mlp": 1.02221739, + "epoch": 0.33266195701187434, + "flos": 14505792943680.0, + "grad_norm": 2.011532065430583, + "language_loss": 0.66615146, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.690584, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.14117432, + "step": 5533, + "time_per_iteration": 2.713905096054077 + }, + { + "auxiliary_loss_clip": 0.0140558, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.27474761, + "balance_loss_mlp": 1.02752829, + "epoch": 0.3327220802645423, + "flos": 15308612109720.0, + "grad_norm": 1.6454409953835616, + "language_loss": 0.71173227, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73621535, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.15185547, + "step": 5534, + "time_per_iteration": 2.7919323444366455 + }, + { + "auxiliary_loss_clip": 0.01403, + "auxiliary_loss_mlp": 0.01045018, + "balance_loss_clip": 1.27516568, + "balance_loss_mlp": 1.02934229, + "epoch": 0.3327822035172103, + "flos": 23482844724600.0, + "grad_norm": 1.6895875598748225, + "language_loss": 0.67246997, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.6969502, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.15679932, + "step": 5535, + "time_per_iteration": 2.803539276123047 + }, + { + "auxiliary_loss_clip": 0.01401548, + "auxiliary_loss_mlp": 0.0104333, + "balance_loss_clip": 1.271631, + "balance_loss_mlp": 1.02860761, + "epoch": 0.33284232676987824, + "flos": 25380037056960.0, + "grad_norm": 2.0360351847913525, + "language_loss": 0.7307061, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75515491, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.14727783, + "step": 5536, + "time_per_iteration": 2.8462843894958496 + }, + { + "auxiliary_loss_clip": 0.01405067, + "auxiliary_loss_mlp": 0.01042141, + "balance_loss_clip": 1.27529204, + "balance_loss_mlp": 1.02796674, + "epoch": 0.3329024500225462, + "flos": 23699429046360.0, + "grad_norm": 1.9899632499383242, + "language_loss": 0.81990075, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84437281, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1418457, + "step": 5537, + "time_per_iteration": 2.799093008041382 + }, + { + "auxiliary_loss_clip": 0.01411946, + "auxiliary_loss_mlp": 0.0104388, + "balance_loss_clip": 1.28086519, + "balance_loss_mlp": 1.0283469, + "epoch": 0.33296257327521417, + "flos": 31729529299680.0, + "grad_norm": 1.8180196456627638, + "language_loss": 0.71480227, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73936057, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15551758, + "step": 5538, + "time_per_iteration": 2.843585729598999 + }, + { + "auxiliary_loss_clip": 0.01401506, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.2731787, + "balance_loss_mlp": 1.02057934, + "epoch": 0.33302269652788213, + "flos": 22749513108480.0, + "grad_norm": 1.6918126875998434, + "language_loss": 0.74506098, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76943159, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14990234, + "step": 5539, + "time_per_iteration": 2.799661159515381 + }, + { + "auxiliary_loss_clip": 0.01412376, + "auxiliary_loss_mlp": 0.01045431, + "balance_loss_clip": 1.27907097, + "balance_loss_mlp": 1.02969587, + "epoch": 0.33308281978055015, + "flos": 11477493860400.0, + "grad_norm": 1.7280642787009401, + "language_loss": 0.7136066, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73818469, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15734863, + "step": 5540, + "time_per_iteration": 2.7664380073547363 + }, + { + "auxiliary_loss_clip": 0.01401959, + "auxiliary_loss_mlp": 0.01036758, + "balance_loss_clip": 1.27369785, + "balance_loss_mlp": 1.02267385, + "epoch": 0.3331429430332181, + "flos": 38220392535480.0, + "grad_norm": 1.4873161849424814, + "language_loss": 0.60835683, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.63274395, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.14056396, + "step": 5541, + "time_per_iteration": 2.930830717086792 + }, + { + "auxiliary_loss_clip": 0.01408636, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_clip": 1.27680862, + "balance_loss_mlp": 1.02605391, + "epoch": 0.3332030662858861, + "flos": 22533822170640.0, + "grad_norm": 1.5572540803731338, + "language_loss": 0.6930691, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.71757877, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.1628418, + "step": 5542, + "time_per_iteration": 2.7466881275177 + }, + { + "auxiliary_loss_clip": 0.01403243, + "auxiliary_loss_mlp": 0.01036054, + "balance_loss_clip": 1.27372265, + "balance_loss_mlp": 1.02102149, + "epoch": 0.33326318953855405, + "flos": 16002083297520.0, + "grad_norm": 1.9843148749799886, + "language_loss": 0.75326335, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77765644, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.15026855, + "step": 5543, + "time_per_iteration": 2.8311777114868164 + }, + { + "auxiliary_loss_clip": 0.01413752, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.28185856, + "balance_loss_mlp": 1.01824856, + "epoch": 0.333323312791222, + "flos": 25599098488680.0, + "grad_norm": 1.5778163723511909, + "language_loss": 0.75807476, + "learning_rate": 3.110027066843348e-06, + "loss": 0.78254491, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15014648, + "step": 5544, + "time_per_iteration": 2.820951461791992 + }, + { + "auxiliary_loss_clip": 0.01404118, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.27518106, + "balance_loss_mlp": 1.02016807, + "epoch": 0.33338343604389, + "flos": 25125094816200.0, + "grad_norm": 1.544032962476695, + "language_loss": 0.71500301, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73939079, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14483643, + "step": 5545, + "time_per_iteration": 2.83203125 + }, + { + "auxiliary_loss_clip": 0.01400667, + "auxiliary_loss_mlp": 0.01029298, + "balance_loss_clip": 1.27324557, + "balance_loss_mlp": 1.01485586, + "epoch": 0.33344355929655795, + "flos": 16951836801960.0, + "grad_norm": 1.5967320552108177, + "language_loss": 0.69471312, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7190128, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.14459229, + "step": 5546, + "time_per_iteration": 2.784189462661743 + }, + { + "auxiliary_loss_clip": 0.01411672, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.27850103, + "balance_loss_mlp": 1.01926541, + "epoch": 0.3335036825492259, + "flos": 27895121773200.0, + "grad_norm": 1.592669765570886, + "language_loss": 0.64916033, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.6736266, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.15710449, + "step": 5547, + "time_per_iteration": 2.825753688812256 + }, + { + "auxiliary_loss_clip": 0.01404361, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.2753768, + "balance_loss_mlp": 1.01694548, + "epoch": 0.3335638058018939, + "flos": 16183802019240.0, + "grad_norm": 2.0181202361877126, + "language_loss": 0.85872531, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.88307333, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.13494873, + "step": 5548, + "time_per_iteration": 4.198710680007935 + }, + { + "auxiliary_loss_clip": 0.01411203, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.27926183, + "balance_loss_mlp": 1.01687503, + "epoch": 0.33362392905456184, + "flos": 39903558872760.0, + "grad_norm": 2.172845311617008, + "language_loss": 0.74662882, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.77106583, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.15625, + "step": 5549, + "time_per_iteration": 2.91784405708313 + }, + { + "auxiliary_loss_clip": 0.0140779, + "auxiliary_loss_mlp": 0.01034227, + "balance_loss_clip": 1.27596688, + "balance_loss_mlp": 1.01849151, + "epoch": 0.3336840523072298, + "flos": 44278818078240.0, + "grad_norm": 1.8858307280155395, + "language_loss": 0.6912384, + "learning_rate": 3.108082487713921e-06, + "loss": 0.71565866, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.15722656, + "step": 5550, + "time_per_iteration": 2.944927453994751 + }, + { + "auxiliary_loss_clip": 0.01409875, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.27998853, + "balance_loss_mlp": 1.025437, + "epoch": 0.33374417555989777, + "flos": 15089875544880.0, + "grad_norm": 1.6885167928503055, + "language_loss": 0.60936916, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.63386679, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14459229, + "step": 5551, + "time_per_iteration": 2.7573440074920654 + }, + { + "auxiliary_loss_clip": 0.01402824, + "auxiliary_loss_mlp": 0.01040702, + "balance_loss_clip": 1.27477908, + "balance_loss_mlp": 1.02508545, + "epoch": 0.33380429881256574, + "flos": 15853118541120.0, + "grad_norm": 1.5949465356581307, + "language_loss": 0.70651984, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.73095512, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.15612793, + "step": 5552, + "time_per_iteration": 2.729708194732666 + }, + { + "auxiliary_loss_clip": 0.01408675, + "auxiliary_loss_mlp": 0.01039571, + "balance_loss_clip": 1.27910209, + "balance_loss_mlp": 1.0243597, + "epoch": 0.33386442206523376, + "flos": 13484318429520.0, + "grad_norm": 2.0313940658808893, + "language_loss": 0.83385146, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85833395, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.15222168, + "step": 5553, + "time_per_iteration": 2.7435436248779297 + }, + { + "auxiliary_loss_clip": 0.0141374, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.283494, + "balance_loss_mlp": 1.01919627, + "epoch": 0.3339245453179017, + "flos": 16695026576640.0, + "grad_norm": 2.2195057279822157, + "language_loss": 0.81351542, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83800238, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.15759277, + "step": 5554, + "time_per_iteration": 4.267648220062256 + }, + { + "auxiliary_loss_clip": 0.01412984, + "auxiliary_loss_mlp": 0.01035808, + "balance_loss_clip": 1.28198695, + "balance_loss_mlp": 1.01984608, + "epoch": 0.3339846685705697, + "flos": 24616347368760.0, + "grad_norm": 1.6213367838898016, + "language_loss": 0.82055211, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.84504008, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.1595459, + "step": 5555, + "time_per_iteration": 2.828134059906006 + }, + { + "auxiliary_loss_clip": 0.01406384, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.27737582, + "balance_loss_mlp": 1.02035892, + "epoch": 0.33404479182323765, + "flos": 30959667140760.0, + "grad_norm": 1.792500107056073, + "language_loss": 0.74576223, + "learning_rate": 3.106136395915099e-06, + "loss": 0.77017838, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14874268, + "step": 5556, + "time_per_iteration": 2.8161532878875732 + }, + { + "auxiliary_loss_clip": 0.01407694, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.28030562, + "balance_loss_mlp": 1.02043724, + "epoch": 0.3341049150759056, + "flos": 23518522491840.0, + "grad_norm": 1.4464013033617555, + "language_loss": 0.82472968, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84916055, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14953613, + "step": 5557, + "time_per_iteration": 2.822373390197754 + }, + { + "auxiliary_loss_clip": 0.01417297, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.28625596, + "balance_loss_mlp": 1.01492906, + "epoch": 0.3341650383285736, + "flos": 24032467809360.0, + "grad_norm": 1.942689429976163, + "language_loss": 0.80392766, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82840443, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15454102, + "step": 5558, + "time_per_iteration": 2.8307621479034424 + }, + { + "auxiliary_loss_clip": 0.01416304, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.2851963, + "balance_loss_mlp": 1.01570845, + "epoch": 0.33422516158124155, + "flos": 24907657718880.0, + "grad_norm": 2.421770941894261, + "language_loss": 0.81931591, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84377933, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.14343262, + "step": 5559, + "time_per_iteration": 2.8369104862213135 + }, + { + "auxiliary_loss_clip": 0.01409461, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.28282142, + "balance_loss_mlp": 1.01541424, + "epoch": 0.3342852848339095, + "flos": 18338088835440.0, + "grad_norm": 1.6861067575003255, + "language_loss": 0.71710241, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.7414974, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.1461792, + "step": 5560, + "time_per_iteration": 4.219556570053101 + }, + { + "auxiliary_loss_clip": 0.01426822, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.29307866, + "balance_loss_mlp": 1.0201509, + "epoch": 0.3343454080865775, + "flos": 30052900908360.0, + "grad_norm": 1.3058686106177428, + "language_loss": 0.74973655, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77436304, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15673828, + "step": 5561, + "time_per_iteration": 4.343935489654541 + }, + { + "auxiliary_loss_clip": 0.01415329, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.28575647, + "balance_loss_mlp": 1.01366496, + "epoch": 0.33440553133924544, + "flos": 16403147709480.0, + "grad_norm": 1.659482836707105, + "language_loss": 0.69959807, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.72403419, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.14605713, + "step": 5562, + "time_per_iteration": 2.9361977577209473 + }, + { + "auxiliary_loss_clip": 0.01413486, + "auxiliary_loss_mlp": 0.0102711, + "balance_loss_clip": 1.28466523, + "balance_loss_mlp": 1.01337087, + "epoch": 0.3344656545919134, + "flos": 24247265363280.0, + "grad_norm": 1.9372133691089781, + "language_loss": 0.65469581, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67910171, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.13720703, + "step": 5563, + "time_per_iteration": 2.7910425662994385 + }, + { + "auxiliary_loss_clip": 0.0141986, + "auxiliary_loss_mlp": 0.01039135, + "balance_loss_clip": 1.28793252, + "balance_loss_mlp": 1.02280354, + "epoch": 0.3345257778445814, + "flos": 52127889951960.0, + "grad_norm": 1.6167111825849256, + "language_loss": 0.74205106, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76664102, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.16326904, + "step": 5564, + "time_per_iteration": 3.038796901702881 + }, + { + "auxiliary_loss_clip": 0.01300342, + "auxiliary_loss_mlp": 0.01004845, + "balance_loss_clip": 1.2423954, + "balance_loss_mlp": 1.00188828, + "epoch": 0.33458590109724934, + "flos": 68062597334280.0, + "grad_norm": 0.7875287378043759, + "language_loss": 0.55423117, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57728302, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.02954102, + "step": 5565, + "time_per_iteration": 3.233483076095581 + }, + { + "auxiliary_loss_clip": 0.01414401, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.28665996, + "balance_loss_mlp": 1.02288508, + "epoch": 0.3346460243499173, + "flos": 37421877855600.0, + "grad_norm": 1.8121409015753573, + "language_loss": 0.64939034, + "learning_rate": 3.102889555312721e-06, + "loss": 0.6739102, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.14703369, + "step": 5566, + "time_per_iteration": 2.8650991916656494 + }, + { + "auxiliary_loss_clip": 0.01415431, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.28928804, + "balance_loss_mlp": 1.01692247, + "epoch": 0.3347061476025853, + "flos": 18701972970840.0, + "grad_norm": 1.7470921854041193, + "language_loss": 0.76997948, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79445279, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.14978027, + "step": 5567, + "time_per_iteration": 2.7876572608947754 + }, + { + "auxiliary_loss_clip": 0.01421222, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.2907207, + "balance_loss_mlp": 1.01774001, + "epoch": 0.3347662708552533, + "flos": 13921832167560.0, + "grad_norm": 1.5940818249178392, + "language_loss": 0.76955563, + "learning_rate": 3.102239684937949e-06, + "loss": 0.79409856, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15332031, + "step": 5568, + "time_per_iteration": 2.7228612899780273 + }, + { + "auxiliary_loss_clip": 0.01421827, + "auxiliary_loss_mlp": 0.01040154, + "balance_loss_clip": 1.29130089, + "balance_loss_mlp": 1.02464509, + "epoch": 0.33482639410792125, + "flos": 19754495899200.0, + "grad_norm": 3.5769104756314114, + "language_loss": 0.71292186, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73754174, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.1552124, + "step": 5569, + "time_per_iteration": 2.760824203491211 + }, + { + "auxiliary_loss_clip": 0.0142533, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.29419303, + "balance_loss_mlp": 1.01865768, + "epoch": 0.3348865173605892, + "flos": 16106761314360.0, + "grad_norm": 2.0140912768970574, + "language_loss": 0.89926481, + "learning_rate": 3.10158964737502e-06, + "loss": 0.92386222, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15759277, + "step": 5570, + "time_per_iteration": 2.719911813735962 + }, + { + "auxiliary_loss_clip": 0.01419949, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.29120898, + "balance_loss_mlp": 1.01830411, + "epoch": 0.3349466406132572, + "flos": 25014325545360.0, + "grad_norm": 1.6138314985133313, + "language_loss": 0.7996344, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82417023, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15319824, + "step": 5571, + "time_per_iteration": 2.8448221683502197 + }, + { + "auxiliary_loss_clip": 0.01288672, + "auxiliary_loss_mlp": 0.01001749, + "balance_loss_clip": 1.23185253, + "balance_loss_mlp": 0.99838704, + "epoch": 0.33500676386592515, + "flos": 54335657978280.0, + "grad_norm": 0.8962714492299116, + "language_loss": 0.55967772, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.582582, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.03369141, + "step": 5572, + "time_per_iteration": 3.183202028274536 + }, + { + "auxiliary_loss_clip": 0.01417113, + "auxiliary_loss_mlp": 0.01043769, + "balance_loss_clip": 1.28858733, + "balance_loss_mlp": 1.02908206, + "epoch": 0.3350668871185931, + "flos": 26803022675040.0, + "grad_norm": 2.0620577543828342, + "language_loss": 0.78632295, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.81093174, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.14685059, + "step": 5573, + "time_per_iteration": 2.801987409591675 + }, + { + "auxiliary_loss_clip": 0.01421459, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_clip": 1.29087424, + "balance_loss_mlp": 1.03117073, + "epoch": 0.3351270103712611, + "flos": 33517048786920.0, + "grad_norm": 2.1523877782877876, + "language_loss": 0.73440748, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.75908959, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15576172, + "step": 5574, + "time_per_iteration": 2.8668768405914307 + }, + { + "auxiliary_loss_clip": 0.014121, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.28559351, + "balance_loss_mlp": 1.01700568, + "epoch": 0.33518713362392905, + "flos": 26511996583440.0, + "grad_norm": 1.618166401499122, + "language_loss": 0.88312078, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90755421, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14245605, + "step": 5575, + "time_per_iteration": 2.7949042320251465 + }, + { + "auxiliary_loss_clip": 0.01433056, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.29779863, + "balance_loss_mlp": 1.02809048, + "epoch": 0.335247256876597, + "flos": 17236040689080.0, + "grad_norm": 2.543682529493248, + "language_loss": 0.83322698, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.85799617, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15759277, + "step": 5576, + "time_per_iteration": 2.7567715644836426 + }, + { + "auxiliary_loss_clip": 0.0142342, + "auxiliary_loss_mlp": 0.01038413, + "balance_loss_clip": 1.29053617, + "balance_loss_mlp": 1.02313066, + "epoch": 0.335307380129265, + "flos": 25634857472640.0, + "grad_norm": 2.1749918269984048, + "language_loss": 0.73443407, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75905246, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.15283203, + "step": 5577, + "time_per_iteration": 2.785040855407715 + }, + { + "auxiliary_loss_clip": 0.01427988, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.29786313, + "balance_loss_mlp": 1.02749527, + "epoch": 0.33536750338193294, + "flos": 19684521048960.0, + "grad_norm": 1.735532644402151, + "language_loss": 0.81685305, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.84156227, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15429688, + "step": 5578, + "time_per_iteration": 2.7945570945739746 + }, + { + "auxiliary_loss_clip": 0.01416116, + "auxiliary_loss_mlp": 0.01039889, + "balance_loss_clip": 1.291116, + "balance_loss_mlp": 1.02591729, + "epoch": 0.3354276266346009, + "flos": 18337195451520.0, + "grad_norm": 2.07944235098197, + "language_loss": 0.72429734, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.74885738, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.1395874, + "step": 5579, + "time_per_iteration": 2.782371759414673 + }, + { + "auxiliary_loss_clip": 0.01431759, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.29928982, + "balance_loss_mlp": 1.02438927, + "epoch": 0.3354877498872689, + "flos": 17863110562320.0, + "grad_norm": 1.9183200637293263, + "language_loss": 0.81379092, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83850443, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.15209961, + "step": 5580, + "time_per_iteration": 2.7379801273345947 + }, + { + "auxiliary_loss_clip": 0.01431498, + "auxiliary_loss_mlp": 0.01036264, + "balance_loss_clip": 1.29892206, + "balance_loss_mlp": 1.02009869, + "epoch": 0.3355478731399369, + "flos": 24723177628680.0, + "grad_norm": 1.54952649236479, + "language_loss": 0.78025246, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80492997, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16162109, + "step": 5581, + "time_per_iteration": 2.878873109817505 + }, + { + "auxiliary_loss_clip": 0.0143817, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.30189729, + "balance_loss_mlp": 1.02598858, + "epoch": 0.33560799639260486, + "flos": 16878369632760.0, + "grad_norm": 2.166129841172055, + "language_loss": 0.7462095, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.77102178, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.17077637, + "step": 5582, + "time_per_iteration": 2.74947452545166 + }, + { + "auxiliary_loss_clip": 0.01439011, + "auxiliary_loss_mlp": 0.01043494, + "balance_loss_clip": 1.30508602, + "balance_loss_mlp": 1.02758002, + "epoch": 0.3356681196452728, + "flos": 18338210660520.0, + "grad_norm": 1.5304817836742535, + "language_loss": 0.82078648, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84561157, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15905762, + "step": 5583, + "time_per_iteration": 2.794654130935669 + }, + { + "auxiliary_loss_clip": 0.01432782, + "auxiliary_loss_mlp": 0.0105008, + "balance_loss_clip": 1.30169749, + "balance_loss_mlp": 1.03488076, + "epoch": 0.3357282428979408, + "flos": 34757259865920.0, + "grad_norm": 1.599875517802804, + "language_loss": 0.77689332, + "learning_rate": 3.097034711451581e-06, + "loss": 0.80172193, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15197754, + "step": 5584, + "time_per_iteration": 2.9063804149627686 + }, + { + "auxiliary_loss_clip": 0.01439232, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.30452716, + "balance_loss_mlp": 1.02187681, + "epoch": 0.33578836615060875, + "flos": 21585124483560.0, + "grad_norm": 1.5915282877268724, + "language_loss": 0.75966763, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78443384, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15509033, + "step": 5585, + "time_per_iteration": 2.8131134510040283 + }, + { + "auxiliary_loss_clip": 0.01426588, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.29714704, + "balance_loss_mlp": 1.02208889, + "epoch": 0.3358484894032767, + "flos": 24535042786080.0, + "grad_norm": 1.5070979101099613, + "language_loss": 0.78310478, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.80774283, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.15130615, + "step": 5586, + "time_per_iteration": 2.81158185005188 + }, + { + "auxiliary_loss_clip": 0.01452293, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_clip": 1.31373477, + "balance_loss_mlp": 1.0335021, + "epoch": 0.3359086126559447, + "flos": 22460720476680.0, + "grad_norm": 1.7265658252192708, + "language_loss": 0.81096184, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83599848, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.17858887, + "step": 5587, + "time_per_iteration": 2.857665777206421 + }, + { + "auxiliary_loss_clip": 0.01426599, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.29914212, + "balance_loss_mlp": 1.0234865, + "epoch": 0.33596873590861265, + "flos": 16547889196440.0, + "grad_norm": 2.4805325972353183, + "language_loss": 0.67949581, + "learning_rate": 3.095731802118677e-06, + "loss": 0.70414197, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.14526367, + "step": 5588, + "time_per_iteration": 4.247563123703003 + }, + { + "auxiliary_loss_clip": 0.01442001, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.30879879, + "balance_loss_mlp": 1.02523577, + "epoch": 0.3360288591612806, + "flos": 31182099066360.0, + "grad_norm": 1.947570831697813, + "language_loss": 0.70397198, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72881627, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.17199707, + "step": 5589, + "time_per_iteration": 2.8634607791900635 + }, + { + "auxiliary_loss_clip": 0.01442294, + "auxiliary_loss_mlp": 0.01047494, + "balance_loss_clip": 1.30827069, + "balance_loss_mlp": 1.03099573, + "epoch": 0.3360889824139486, + "flos": 23702312239920.0, + "grad_norm": 1.7553882735236128, + "language_loss": 0.67714876, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.70204663, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.16491699, + "step": 5590, + "time_per_iteration": 2.8158113956451416 + }, + { + "auxiliary_loss_clip": 0.01436807, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.30785871, + "balance_loss_mlp": 1.02483249, + "epoch": 0.33614910566661654, + "flos": 19322870373360.0, + "grad_norm": 2.0428978914283906, + "language_loss": 0.73752999, + "learning_rate": 3.094754183798047e-06, + "loss": 0.76231194, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.16552734, + "step": 5591, + "time_per_iteration": 2.805572986602783 + }, + { + "auxiliary_loss_clip": 0.01437765, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.30702531, + "balance_loss_mlp": 1.02079737, + "epoch": 0.3362092289192845, + "flos": 16476655487040.0, + "grad_norm": 2.005551036118684, + "language_loss": 0.69855297, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72329593, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.1574707, + "step": 5592, + "time_per_iteration": 4.205800771713257 + }, + { + "auxiliary_loss_clip": 0.01438096, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.30801475, + "balance_loss_mlp": 1.01907611, + "epoch": 0.33626935217195253, + "flos": 24248970914400.0, + "grad_norm": 2.7478904238717843, + "language_loss": 0.77051187, + "learning_rate": 3.094102230664423e-06, + "loss": 0.79523629, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.15264893, + "step": 5593, + "time_per_iteration": 2.796391248703003 + }, + { + "auxiliary_loss_clip": 0.01445718, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.30953228, + "balance_loss_mlp": 1.02350664, + "epoch": 0.3363294754246205, + "flos": 19723488093360.0, + "grad_norm": 1.8362900573670564, + "language_loss": 0.71456993, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73943901, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.17687988, + "step": 5594, + "time_per_iteration": 2.792680501937866 + }, + { + "auxiliary_loss_clip": 0.01447949, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_clip": 1.31374872, + "balance_loss_mlp": 1.02480304, + "epoch": 0.33638959867728846, + "flos": 22600751393880.0, + "grad_norm": 2.7371910605922025, + "language_loss": 0.80305886, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82795596, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.16955566, + "step": 5595, + "time_per_iteration": 2.8344805240631104 + }, + { + "auxiliary_loss_clip": 0.01438643, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.30819011, + "balance_loss_mlp": 1.0191282, + "epoch": 0.3364497219299564, + "flos": 20999498764680.0, + "grad_norm": 1.4712229352156962, + "language_loss": 0.8155008, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.84022915, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15063477, + "step": 5596, + "time_per_iteration": 2.8216114044189453 + }, + { + "auxiliary_loss_clip": 0.01442513, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.30952394, + "balance_loss_mlp": 1.02081966, + "epoch": 0.3365098451826244, + "flos": 25234117927560.0, + "grad_norm": 1.5480707929628716, + "language_loss": 0.75953823, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78432435, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.1529541, + "step": 5597, + "time_per_iteration": 2.869065523147583 + }, + { + "auxiliary_loss_clip": 0.01444629, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.31391394, + "balance_loss_mlp": 1.01761663, + "epoch": 0.33656996843529235, + "flos": 24577096065840.0, + "grad_norm": 1.7075025675633273, + "language_loss": 0.78771794, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81249237, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15209961, + "step": 5598, + "time_per_iteration": 4.23722243309021 + }, + { + "auxiliary_loss_clip": 0.01453801, + "auxiliary_loss_mlp": 0.01037731, + "balance_loss_clip": 1.31486714, + "balance_loss_mlp": 1.02052903, + "epoch": 0.3366300916879603, + "flos": 44099007949440.0, + "grad_norm": 1.4492573619493092, + "language_loss": 0.64591771, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.67083299, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.17211914, + "step": 5599, + "time_per_iteration": 3.005692481994629 + }, + { + "auxiliary_loss_clip": 0.01450463, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.31272769, + "balance_loss_mlp": 1.01860285, + "epoch": 0.3366902149406283, + "flos": 13883677290360.0, + "grad_norm": 2.3784732384181266, + "language_loss": 0.82550055, + "learning_rate": 3.091819088459249e-06, + "loss": 0.85036731, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.17614746, + "step": 5600, + "time_per_iteration": 4.266145467758179 + }, + { + "auxiliary_loss_clip": 0.01441409, + "auxiliary_loss_mlp": 0.01047054, + "balance_loss_clip": 1.30599308, + "balance_loss_mlp": 1.02966094, + "epoch": 0.33675033819329625, + "flos": 16257431621880.0, + "grad_norm": 2.35944776704932, + "language_loss": 0.83756673, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.86245137, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.1739502, + "step": 5601, + "time_per_iteration": 2.7823522090911865 + }, + { + "auxiliary_loss_clip": 0.01427145, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.30017614, + "balance_loss_mlp": 1.02155638, + "epoch": 0.3368104614459642, + "flos": 17060250787920.0, + "grad_norm": 1.8297977540453605, + "language_loss": 0.83084631, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85548967, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.15631104, + "step": 5602, + "time_per_iteration": 2.7977547645568848 + }, + { + "auxiliary_loss_clip": 0.01442254, + "auxiliary_loss_mlp": 0.01051017, + "balance_loss_clip": 1.30938172, + "balance_loss_mlp": 1.03466165, + "epoch": 0.3368705846986322, + "flos": 17863191779040.0, + "grad_norm": 1.666576165017637, + "language_loss": 0.7006464, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72557908, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.16333008, + "step": 5603, + "time_per_iteration": 2.755917549133301 + }, + { + "auxiliary_loss_clip": 0.01448964, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.31419802, + "balance_loss_mlp": 1.02125776, + "epoch": 0.33693070795130015, + "flos": 22934642932440.0, + "grad_norm": 1.8870422663230144, + "language_loss": 0.83186108, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8567239, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.1607666, + "step": 5604, + "time_per_iteration": 2.8219571113586426 + }, + { + "auxiliary_loss_clip": 0.0143522, + "auxiliary_loss_mlp": 0.01039552, + "balance_loss_clip": 1.29943848, + "balance_loss_mlp": 1.02292228, + "epoch": 0.3369908312039681, + "flos": 22022475788160.0, + "grad_norm": 1.3183039299080106, + "language_loss": 0.73971134, + "learning_rate": 3.090187030294409e-06, + "loss": 0.76445907, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.16638184, + "step": 5605, + "time_per_iteration": 2.8113818168640137 + }, + { + "auxiliary_loss_clip": 0.0144403, + "auxiliary_loss_mlp": 0.01036887, + "balance_loss_clip": 1.30662775, + "balance_loss_mlp": 1.02011395, + "epoch": 0.33705095445663613, + "flos": 11805740836920.0, + "grad_norm": 2.5920913531436582, + "language_loss": 0.83787286, + "learning_rate": 3.089860494591919e-06, + "loss": 0.8626821, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16772461, + "step": 5606, + "time_per_iteration": 2.9294140338897705 + }, + { + "auxiliary_loss_clip": 0.01435062, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.30236554, + "balance_loss_mlp": 1.01309764, + "epoch": 0.3371110777093041, + "flos": 25051911905520.0, + "grad_norm": 1.5281831882801085, + "language_loss": 0.67552763, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70017266, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.16345215, + "step": 5607, + "time_per_iteration": 2.9357311725616455 + }, + { + "auxiliary_loss_clip": 0.01443565, + "auxiliary_loss_mlp": 0.01040061, + "balance_loss_clip": 1.30633569, + "balance_loss_mlp": 1.02295482, + "epoch": 0.33717120096197206, + "flos": 26584773410520.0, + "grad_norm": 1.7383472757858922, + "language_loss": 0.71126485, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73610103, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.17102051, + "step": 5608, + "time_per_iteration": 2.8481693267822266 + }, + { + "auxiliary_loss_clip": 0.01431914, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.29845774, + "balance_loss_mlp": 1.02180135, + "epoch": 0.33723132421464, + "flos": 15162733588680.0, + "grad_norm": 2.1828659235772507, + "language_loss": 0.79248846, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81718671, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16107178, + "step": 5609, + "time_per_iteration": 2.762399196624756 + }, + { + "auxiliary_loss_clip": 0.01438689, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.30421567, + "balance_loss_mlp": 1.02238297, + "epoch": 0.337291447467308, + "flos": 23440750836480.0, + "grad_norm": 1.591814392272498, + "language_loss": 0.82806545, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.8528415, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16552734, + "step": 5610, + "time_per_iteration": 2.774588108062744 + }, + { + "auxiliary_loss_clip": 0.0142751, + "auxiliary_loss_mlp": 0.01039211, + "balance_loss_clip": 1.29853153, + "balance_loss_mlp": 1.02208114, + "epoch": 0.33735157071997596, + "flos": 17242619243400.0, + "grad_norm": 1.8965684176435673, + "language_loss": 0.82537431, + "learning_rate": 3.088227196412879e-06, + "loss": 0.85004151, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.17126465, + "step": 5611, + "time_per_iteration": 2.841866970062256 + }, + { + "auxiliary_loss_clip": 0.01441637, + "auxiliary_loss_mlp": 0.01041946, + "balance_loss_clip": 1.30774319, + "balance_loss_mlp": 1.02506638, + "epoch": 0.3374116939726439, + "flos": 28263351003120.0, + "grad_norm": 1.7253415722248775, + "language_loss": 0.79630059, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.82113647, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.16882324, + "step": 5612, + "time_per_iteration": 2.8304684162139893 + }, + { + "auxiliary_loss_clip": 0.01438468, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.30284667, + "balance_loss_mlp": 1.02110553, + "epoch": 0.3374718172253119, + "flos": 35926359060600.0, + "grad_norm": 2.207579950135045, + "language_loss": 0.70157528, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7263357, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.16455078, + "step": 5613, + "time_per_iteration": 2.906771183013916 + }, + { + "auxiliary_loss_clip": 0.01437886, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.30187631, + "balance_loss_mlp": 1.02069438, + "epoch": 0.33753194047797985, + "flos": 18191235713760.0, + "grad_norm": 2.6296898237450645, + "language_loss": 0.79673028, + "learning_rate": 3.087246722218144e-06, + "loss": 0.82148254, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.16638184, + "step": 5614, + "time_per_iteration": 2.7697412967681885 + }, + { + "auxiliary_loss_clip": 0.01438198, + "auxiliary_loss_mlp": 0.01045466, + "balance_loss_clip": 1.30338824, + "balance_loss_mlp": 1.02697706, + "epoch": 0.3375920637306478, + "flos": 23154151056120.0, + "grad_norm": 1.6977424967736263, + "language_loss": 0.91668802, + "learning_rate": 3.086919815013031e-06, + "loss": 0.94152462, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.18481445, + "step": 5615, + "time_per_iteration": 2.8010995388031006 + }, + { + "auxiliary_loss_clip": 0.0142899, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.29729295, + "balance_loss_mlp": 1.02317846, + "epoch": 0.3376521869833158, + "flos": 23117417471520.0, + "grad_norm": 1.9039406954047693, + "language_loss": 0.81320244, + "learning_rate": 3.086592866591809e-06, + "loss": 0.83788264, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.15869141, + "step": 5616, + "time_per_iteration": 2.8373019695281982 + }, + { + "auxiliary_loss_clip": 0.01446483, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.3086009, + "balance_loss_mlp": 1.02592778, + "epoch": 0.33771231023598375, + "flos": 19278421200360.0, + "grad_norm": 1.6540096770566284, + "language_loss": 0.84116554, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86608028, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.19055176, + "step": 5617, + "time_per_iteration": 2.7874560356140137 + }, + { + "auxiliary_loss_clip": 0.01430362, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.29720902, + "balance_loss_mlp": 1.02165353, + "epoch": 0.3377724334886517, + "flos": 18154745779320.0, + "grad_norm": 1.443100741133467, + "language_loss": 0.79969656, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82438248, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.16589355, + "step": 5618, + "time_per_iteration": 2.795681953430176 + }, + { + "auxiliary_loss_clip": 0.01442718, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.30775487, + "balance_loss_mlp": 1.02213335, + "epoch": 0.3378325567413197, + "flos": 25781588769240.0, + "grad_norm": 1.562010903118227, + "language_loss": 0.71240914, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73722458, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.16687012, + "step": 5619, + "time_per_iteration": 2.8377273082733154 + }, + { + "auxiliary_loss_clip": 0.0143271, + "auxiliary_loss_mlp": 0.01045979, + "balance_loss_clip": 1.29826164, + "balance_loss_mlp": 1.02988613, + "epoch": 0.3378926799939877, + "flos": 21321126578520.0, + "grad_norm": 2.5045631146157445, + "language_loss": 0.70739317, + "learning_rate": 3.085284660993821e-06, + "loss": 0.73218012, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16088867, + "step": 5620, + "time_per_iteration": 2.8326199054718018 + }, + { + "auxiliary_loss_clip": 0.01431008, + "auxiliary_loss_mlp": 0.01047915, + "balance_loss_clip": 1.29968309, + "balance_loss_mlp": 1.03179789, + "epoch": 0.33795280324665566, + "flos": 24905464867440.0, + "grad_norm": 1.735085494704034, + "language_loss": 0.68614203, + "learning_rate": 3.084957506678058e-06, + "loss": 0.71093118, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.16113281, + "step": 5621, + "time_per_iteration": 2.8229877948760986 + }, + { + "auxiliary_loss_clip": 0.01433373, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.30210733, + "balance_loss_mlp": 1.0303874, + "epoch": 0.33801292649932363, + "flos": 24759342696240.0, + "grad_norm": 2.127024132306869, + "language_loss": 0.82986772, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.8546629, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.1574707, + "step": 5622, + "time_per_iteration": 2.8395135402679443 + }, + { + "auxiliary_loss_clip": 0.01430083, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.30078411, + "balance_loss_mlp": 1.0247314, + "epoch": 0.3380730497519916, + "flos": 26729596114200.0, + "grad_norm": 1.462483640934052, + "language_loss": 0.73833942, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.76305103, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16357422, + "step": 5623, + "time_per_iteration": 2.869997978210449 + }, + { + "auxiliary_loss_clip": 0.01305469, + "auxiliary_loss_mlp": 0.01012411, + "balance_loss_clip": 1.24813843, + "balance_loss_mlp": 1.00883436, + "epoch": 0.33813317300465956, + "flos": 70052220939960.0, + "grad_norm": 0.755242333972275, + "language_loss": 0.5503909, + "learning_rate": 3.083975796930215e-06, + "loss": 0.57356966, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03564453, + "step": 5624, + "time_per_iteration": 3.4011359214782715 + }, + { + "auxiliary_loss_clip": 0.01429986, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.29590464, + "balance_loss_mlp": 1.02772093, + "epoch": 0.3381932962573275, + "flos": 24102645701400.0, + "grad_norm": 2.576706838934813, + "language_loss": 0.73600727, + "learning_rate": 3.083648478122111e-06, + "loss": 0.7607584, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.17407227, + "step": 5625, + "time_per_iteration": 2.8398356437683105 + }, + { + "auxiliary_loss_clip": 0.01436755, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.30044091, + "balance_loss_mlp": 1.02336168, + "epoch": 0.3382534195099955, + "flos": 19282563253080.0, + "grad_norm": 1.9996064726370677, + "language_loss": 0.71452457, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.73929572, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.16992188, + "step": 5626, + "time_per_iteration": 4.248210906982422 + }, + { + "auxiliary_loss_clip": 0.01422565, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.29331601, + "balance_loss_mlp": 1.01755905, + "epoch": 0.33831354276266346, + "flos": 25231437775800.0, + "grad_norm": 1.3873479114233573, + "language_loss": 0.80896431, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83352619, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16070557, + "step": 5627, + "time_per_iteration": 2.8095574378967285 + }, + { + "auxiliary_loss_clip": 0.01433198, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.29765987, + "balance_loss_mlp": 1.02151656, + "epoch": 0.3383736660153314, + "flos": 23117173821360.0, + "grad_norm": 1.8680665467385908, + "language_loss": 0.80617857, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.83089232, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.16650391, + "step": 5628, + "time_per_iteration": 2.8122849464416504 + }, + { + "auxiliary_loss_clip": 0.01428097, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.29391181, + "balance_loss_mlp": 1.02078271, + "epoch": 0.3384337892679994, + "flos": 23482398032640.0, + "grad_norm": 2.272242851778512, + "language_loss": 0.7776866, + "learning_rate": 3.082338792093254e-06, + "loss": 0.8023417, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.16625977, + "step": 5629, + "time_per_iteration": 2.7810442447662354 + }, + { + "auxiliary_loss_clip": 0.01431555, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.29453254, + "balance_loss_mlp": 1.02357483, + "epoch": 0.33849391252066735, + "flos": 19430350367040.0, + "grad_norm": 1.7032586873044648, + "language_loss": 0.84443802, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.86916625, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.17687988, + "step": 5630, + "time_per_iteration": 2.8021206855773926 + }, + { + "auxiliary_loss_clip": 0.01429777, + "auxiliary_loss_mlp": 0.01044479, + "balance_loss_clip": 1.29593062, + "balance_loss_mlp": 1.02877927, + "epoch": 0.3385540357733353, + "flos": 21069229964760.0, + "grad_norm": 2.711365862712245, + "language_loss": 0.71781784, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74256039, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15686035, + "step": 5631, + "time_per_iteration": 2.804684638977051 + }, + { + "auxiliary_loss_clip": 0.01291025, + "auxiliary_loss_mlp": 0.0101674, + "balance_loss_clip": 1.23301411, + "balance_loss_mlp": 1.0128535, + "epoch": 0.3386141590260033, + "flos": 69221097747000.0, + "grad_norm": 0.8731821164034859, + "language_loss": 0.56137973, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58445734, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.03881836, + "step": 5632, + "time_per_iteration": 4.678909540176392 + }, + { + "auxiliary_loss_clip": 0.01424539, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.29053307, + "balance_loss_mlp": 1.0137105, + "epoch": 0.3386742822786713, + "flos": 25524900369000.0, + "grad_norm": 1.5889164772036277, + "language_loss": 0.80185318, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82639891, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.16320801, + "step": 5633, + "time_per_iteration": 2.838550567626953 + }, + { + "auxiliary_loss_clip": 0.01426528, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.29256117, + "balance_loss_mlp": 1.01753139, + "epoch": 0.33873440553133927, + "flos": 23628398378760.0, + "grad_norm": 1.9962899375417082, + "language_loss": 0.5936507, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61824828, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15704346, + "step": 5634, + "time_per_iteration": 2.804184675216675 + }, + { + "auxiliary_loss_clip": 0.01418652, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.28510535, + "balance_loss_mlp": 1.01753664, + "epoch": 0.33879452878400723, + "flos": 17092882928160.0, + "grad_norm": 1.7249071057029988, + "language_loss": 0.92759752, + "learning_rate": 3.080373032026589e-06, + "loss": 0.95210719, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.14758301, + "step": 5635, + "time_per_iteration": 2.774709939956665 + }, + { + "auxiliary_loss_clip": 0.01407868, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.28048372, + "balance_loss_mlp": 1.0196265, + "epoch": 0.3388546520366752, + "flos": 15746288281200.0, + "grad_norm": 1.7284478823180554, + "language_loss": 0.75575936, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.78018486, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15039062, + "step": 5636, + "time_per_iteration": 2.8240394592285156 + }, + { + "auxiliary_loss_clip": 0.01413506, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.28243876, + "balance_loss_mlp": 1.0210371, + "epoch": 0.33891477528934316, + "flos": 22423824458640.0, + "grad_norm": 1.601710940939997, + "language_loss": 0.83734643, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.86184841, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15649414, + "step": 5637, + "time_per_iteration": 2.8032424449920654 + }, + { + "auxiliary_loss_clip": 0.01421631, + "auxiliary_loss_mlp": 0.01040026, + "balance_loss_clip": 1.28695035, + "balance_loss_mlp": 1.0232172, + "epoch": 0.3389748985420111, + "flos": 17279312219640.0, + "grad_norm": 2.2969242529965204, + "language_loss": 0.6957196, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72033614, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.16809082, + "step": 5638, + "time_per_iteration": 5.712960243225098 + }, + { + "auxiliary_loss_clip": 0.01418657, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.28683424, + "balance_loss_mlp": 1.03014398, + "epoch": 0.3390350217946791, + "flos": 27750664544760.0, + "grad_norm": 1.6628871256716449, + "language_loss": 0.81290716, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83755797, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.16271973, + "step": 5639, + "time_per_iteration": 2.8423938751220703 + }, + { + "auxiliary_loss_clip": 0.01423138, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.28631449, + "balance_loss_mlp": 1.03223014, + "epoch": 0.33909514504734706, + "flos": 20344710362760.0, + "grad_norm": 2.64247556880573, + "language_loss": 0.68431473, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70903385, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.16552734, + "step": 5640, + "time_per_iteration": 2.772294521331787 + }, + { + "auxiliary_loss_clip": 0.01413215, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.28169012, + "balance_loss_mlp": 1.02148652, + "epoch": 0.339155268300015, + "flos": 14834161745280.0, + "grad_norm": 1.5986534107693828, + "language_loss": 0.70270437, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.7272023, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.15087891, + "step": 5641, + "time_per_iteration": 2.77687406539917 + }, + { + "auxiliary_loss_clip": 0.01425031, + "auxiliary_loss_mlp": 0.01050542, + "balance_loss_clip": 1.29076982, + "balance_loss_mlp": 1.03518808, + "epoch": 0.339215391552683, + "flos": 26073589461480.0, + "grad_norm": 1.688951334166319, + "language_loss": 0.87907696, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.90383267, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.15368652, + "step": 5642, + "time_per_iteration": 2.8576700687408447 + }, + { + "auxiliary_loss_clip": 0.01398561, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.27364433, + "balance_loss_mlp": 1.01676702, + "epoch": 0.33927551480535095, + "flos": 14578813420920.0, + "grad_norm": 1.8254123376565938, + "language_loss": 0.84404975, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86833912, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.13604736, + "step": 5643, + "time_per_iteration": 2.799968719482422 + }, + { + "auxiliary_loss_clip": 0.01406568, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_clip": 1.27607298, + "balance_loss_mlp": 1.03109598, + "epoch": 0.3393356380580189, + "flos": 23810888659320.0, + "grad_norm": 1.5785317195203792, + "language_loss": 0.77215123, + "learning_rate": 3.077421627435922e-06, + "loss": 0.7966758, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.14782715, + "step": 5644, + "time_per_iteration": 2.838665246963501 + }, + { + "auxiliary_loss_clip": 0.01409793, + "auxiliary_loss_mlp": 0.0104309, + "balance_loss_clip": 1.27739906, + "balance_loss_mlp": 1.02793884, + "epoch": 0.3393957613106869, + "flos": 17352373305240.0, + "grad_norm": 3.2880823540335213, + "language_loss": 0.63640499, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.66093385, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15148926, + "step": 5645, + "time_per_iteration": 2.742386817932129 + }, + { + "auxiliary_loss_clip": 0.01404314, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.27545941, + "balance_loss_mlp": 1.02241206, + "epoch": 0.3394558845633549, + "flos": 28439181512640.0, + "grad_norm": 1.7140240633747736, + "language_loss": 0.76496792, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78938448, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.14935303, + "step": 5646, + "time_per_iteration": 2.835768938064575 + }, + { + "auxiliary_loss_clip": 0.0142078, + "auxiliary_loss_mlp": 0.01041289, + "balance_loss_clip": 1.28455865, + "balance_loss_mlp": 1.02425444, + "epoch": 0.33951600781602287, + "flos": 22091516646120.0, + "grad_norm": 2.926289723190576, + "language_loss": 0.79815406, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.82277477, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.17028809, + "step": 5647, + "time_per_iteration": 2.7532501220703125 + }, + { + "auxiliary_loss_clip": 0.01408185, + "auxiliary_loss_mlp": 0.01036443, + "balance_loss_clip": 1.27604771, + "balance_loss_mlp": 1.02133942, + "epoch": 0.33957613106869083, + "flos": 23883543661320.0, + "grad_norm": 1.8487713583416525, + "language_loss": 0.77769095, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.80213726, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15100098, + "step": 5648, + "time_per_iteration": 2.8783481121063232 + }, + { + "auxiliary_loss_clip": 0.01272491, + "auxiliary_loss_mlp": 0.01017094, + "balance_loss_clip": 1.21136582, + "balance_loss_mlp": 1.01363695, + "epoch": 0.3396362543213588, + "flos": 71259637445280.0, + "grad_norm": 0.7806127583792828, + "language_loss": 0.56419563, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58709145, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03466797, + "step": 5649, + "time_per_iteration": 3.2906651496887207 + }, + { + "auxiliary_loss_clip": 0.01414806, + "auxiliary_loss_mlp": 0.01040853, + "balance_loss_clip": 1.28227091, + "balance_loss_mlp": 1.02544546, + "epoch": 0.33969637757402676, + "flos": 25927183031760.0, + "grad_norm": 1.5358492834372013, + "language_loss": 0.85629666, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.8808533, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15411377, + "step": 5650, + "time_per_iteration": 2.9068994522094727 + }, + { + "auxiliary_loss_clip": 0.01406119, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.27602625, + "balance_loss_mlp": 1.01673234, + "epoch": 0.33975650082669473, + "flos": 35268484423320.0, + "grad_norm": 1.4796426200670934, + "language_loss": 0.70613039, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73050255, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.14367676, + "step": 5651, + "time_per_iteration": 2.9292373657226562 + }, + { + "auxiliary_loss_clip": 0.01407751, + "auxiliary_loss_mlp": 0.01040171, + "balance_loss_clip": 1.27780914, + "balance_loss_mlp": 1.02476931, + "epoch": 0.3398166240793627, + "flos": 16650414970200.0, + "grad_norm": 1.7548017899738801, + "language_loss": 0.81226486, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83674407, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15405273, + "step": 5652, + "time_per_iteration": 2.7455317974090576 + }, + { + "auxiliary_loss_clip": 0.01420077, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.28560901, + "balance_loss_mlp": 1.02501047, + "epoch": 0.33987674733203066, + "flos": 24067698884640.0, + "grad_norm": 1.7211373436489918, + "language_loss": 0.77756983, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.80218357, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1628418, + "step": 5653, + "time_per_iteration": 2.8102452754974365 + }, + { + "auxiliary_loss_clip": 0.0140776, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.27686858, + "balance_loss_mlp": 1.01975727, + "epoch": 0.3399368705846986, + "flos": 13253236923240.0, + "grad_norm": 5.038446179662613, + "language_loss": 0.8622781, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.88670683, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.15344238, + "step": 5654, + "time_per_iteration": 2.7241363525390625 + }, + { + "auxiliary_loss_clip": 0.0140752, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.27648962, + "balance_loss_mlp": 1.01714182, + "epoch": 0.3399969938373666, + "flos": 27018104487480.0, + "grad_norm": 2.4251830929906673, + "language_loss": 0.66054666, + "learning_rate": 3.073809861919351e-06, + "loss": 0.68494761, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15466309, + "step": 5655, + "time_per_iteration": 2.8190560340881348 + }, + { + "auxiliary_loss_clip": 0.01412519, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.28063738, + "balance_loss_mlp": 1.01756275, + "epoch": 0.34005711709003456, + "flos": 28556204470920.0, + "grad_norm": 1.4955848184638383, + "language_loss": 0.76564884, + "learning_rate": 3.073481275036697e-06, + "loss": 0.79009187, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.14221191, + "step": 5656, + "time_per_iteration": 2.841458559036255 + }, + { + "auxiliary_loss_clip": 0.01418359, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.281322, + "balance_loss_mlp": 1.01834774, + "epoch": 0.3401172403427025, + "flos": 21622101718320.0, + "grad_norm": 1.5733792407156086, + "language_loss": 0.83420587, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85874057, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.1673584, + "step": 5657, + "time_per_iteration": 2.766378879547119 + }, + { + "auxiliary_loss_clip": 0.01406588, + "auxiliary_loss_mlp": 0.01038686, + "balance_loss_clip": 1.27524698, + "balance_loss_mlp": 1.02383268, + "epoch": 0.3401773635953705, + "flos": 25891424047800.0, + "grad_norm": 1.838728776032296, + "language_loss": 0.85896444, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.88341719, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.1484375, + "step": 5658, + "time_per_iteration": 2.8473024368286133 + }, + { + "auxiliary_loss_clip": 0.01270772, + "auxiliary_loss_mlp": 0.0100382, + "balance_loss_clip": 1.20819116, + "balance_loss_mlp": 1.00067317, + "epoch": 0.3402374868480385, + "flos": 65523408233400.0, + "grad_norm": 0.8237473928605511, + "language_loss": 0.60121888, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62396479, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03149414, + "step": 5659, + "time_per_iteration": 3.2349605560302734 + }, + { + "auxiliary_loss_clip": 0.01399927, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.27366102, + "balance_loss_mlp": 1.02260733, + "epoch": 0.34029761010070647, + "flos": 24065871508440.0, + "grad_norm": 1.8903788312456196, + "language_loss": 0.68051279, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70488447, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.14630127, + "step": 5660, + "time_per_iteration": 2.801405906677246 + }, + { + "auxiliary_loss_clip": 0.01408507, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.27816117, + "balance_loss_mlp": 1.02294731, + "epoch": 0.34035773335337444, + "flos": 27605476365840.0, + "grad_norm": 1.5822306546674882, + "language_loss": 0.67689061, + "learning_rate": 3.071837730274918e-06, + "loss": 0.70136088, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15563965, + "step": 5661, + "time_per_iteration": 2.8351869583129883 + }, + { + "auxiliary_loss_clip": 0.01402163, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.27341747, + "balance_loss_mlp": 1.02191281, + "epoch": 0.3404178566060424, + "flos": 20817577001160.0, + "grad_norm": 1.5978984946599246, + "language_loss": 0.79127789, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81566393, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.14526367, + "step": 5662, + "time_per_iteration": 2.7621076107025146 + }, + { + "auxiliary_loss_clip": 0.01403513, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.27306056, + "balance_loss_mlp": 1.02188158, + "epoch": 0.34047797985871037, + "flos": 26839228350960.0, + "grad_norm": 1.8142658873814366, + "language_loss": 0.74054074, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.76495892, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.16430664, + "step": 5663, + "time_per_iteration": 2.831754446029663 + }, + { + "auxiliary_loss_clip": 0.01396417, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.26951468, + "balance_loss_mlp": 1.02021599, + "epoch": 0.34053810311137833, + "flos": 19687404242520.0, + "grad_norm": 1.586661102822281, + "language_loss": 0.86514318, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88944948, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.13995361, + "step": 5664, + "time_per_iteration": 4.2346978187561035 + }, + { + "auxiliary_loss_clip": 0.01411213, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.2768507, + "balance_loss_mlp": 1.02347565, + "epoch": 0.3405982263640463, + "flos": 21730759354440.0, + "grad_norm": 1.7638659915275867, + "language_loss": 0.69308579, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71758008, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.1473999, + "step": 5665, + "time_per_iteration": 2.7666640281677246 + }, + { + "auxiliary_loss_clip": 0.01410722, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.2771194, + "balance_loss_mlp": 1.02315366, + "epoch": 0.34065834961671426, + "flos": 18046291185000.0, + "grad_norm": 3.0996635716765275, + "language_loss": 0.72773409, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.75223732, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.16442871, + "step": 5666, + "time_per_iteration": 2.8181071281433105 + }, + { + "auxiliary_loss_clip": 0.01410909, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.2773248, + "balance_loss_mlp": 1.02541864, + "epoch": 0.3407184728693822, + "flos": 21402309336120.0, + "grad_norm": 1.4971022634206548, + "language_loss": 0.73592877, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.76044703, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15515137, + "step": 5667, + "time_per_iteration": 2.7750320434570312 + }, + { + "auxiliary_loss_clip": 0.01267757, + "auxiliary_loss_mlp": 0.01007826, + "balance_loss_clip": 1.20707798, + "balance_loss_mlp": 1.00470281, + "epoch": 0.3407785961220502, + "flos": 68704692300720.0, + "grad_norm": 0.856297223111716, + "language_loss": 0.63332462, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65608037, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.03125, + "step": 5668, + "time_per_iteration": 3.487741470336914 + }, + { + "auxiliary_loss_clip": 0.01402737, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.27074742, + "balance_loss_mlp": 1.03588009, + "epoch": 0.34083871937471816, + "flos": 14068482247440.0, + "grad_norm": 1.982008410729447, + "language_loss": 0.7306006, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.75514489, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15820312, + "step": 5669, + "time_per_iteration": 2.726746082305908 + }, + { + "auxiliary_loss_clip": 0.01411268, + "auxiliary_loss_mlp": 0.01037541, + "balance_loss_clip": 1.27721763, + "balance_loss_mlp": 1.02035093, + "epoch": 0.3408988426273861, + "flos": 17088984525600.0, + "grad_norm": 1.7994144134645278, + "language_loss": 0.80374217, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82823026, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.17199707, + "step": 5670, + "time_per_iteration": 4.166428327560425 + }, + { + "auxiliary_loss_clip": 0.01410246, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_clip": 1.27438855, + "balance_loss_mlp": 1.02813661, + "epoch": 0.3409589658800541, + "flos": 24029909482680.0, + "grad_norm": 1.660654176183545, + "language_loss": 0.77339149, + "learning_rate": 3.068547593996078e-06, + "loss": 0.797925, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.14978027, + "step": 5671, + "time_per_iteration": 2.8789379596710205 + }, + { + "auxiliary_loss_clip": 0.01409215, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.27664304, + "balance_loss_mlp": 1.02127337, + "epoch": 0.34101908913272205, + "flos": 21146961011760.0, + "grad_norm": 1.7088927263286404, + "language_loss": 0.74356973, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7680428, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16796875, + "step": 5672, + "time_per_iteration": 2.763965129852295 + }, + { + "auxiliary_loss_clip": 0.01412689, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.27783585, + "balance_loss_mlp": 1.01909697, + "epoch": 0.3410792123853901, + "flos": 15705778119120.0, + "grad_norm": 2.855460377574351, + "language_loss": 0.73534203, + "learning_rate": 3.06788908010777e-06, + "loss": 0.7598086, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.14880371, + "step": 5673, + "time_per_iteration": 2.7533843517303467 + }, + { + "auxiliary_loss_clip": 0.01400426, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.27078629, + "balance_loss_mlp": 1.01746416, + "epoch": 0.34113933563805804, + "flos": 23040782850240.0, + "grad_norm": 1.7352578203254339, + "language_loss": 0.80050993, + "learning_rate": 3.067559762415682e-06, + "loss": 0.82484269, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.15405273, + "step": 5674, + "time_per_iteration": 2.7517740726470947 + }, + { + "auxiliary_loss_clip": 0.01267184, + "auxiliary_loss_mlp": 0.01015558, + "balance_loss_clip": 1.2076354, + "balance_loss_mlp": 1.01229119, + "epoch": 0.341199458890726, + "flos": 69628009762800.0, + "grad_norm": 0.7942413759234048, + "language_loss": 0.56108212, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58390951, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.03271484, + "step": 5675, + "time_per_iteration": 3.4387779235839844 + }, + { + "auxiliary_loss_clip": 0.01397058, + "auxiliary_loss_mlp": 0.01041927, + "balance_loss_clip": 1.26827252, + "balance_loss_mlp": 1.02660894, + "epoch": 0.34125958214339397, + "flos": 22351494323520.0, + "grad_norm": 1.611637467409799, + "language_loss": 0.79640418, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.82079399, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.15332031, + "step": 5676, + "time_per_iteration": 5.671878099441528 + }, + { + "auxiliary_loss_clip": 0.01411565, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.27748561, + "balance_loss_mlp": 1.01827896, + "epoch": 0.34131970539606193, + "flos": 21877003350720.0, + "grad_norm": 1.7256281498692227, + "language_loss": 0.8615557, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.88601649, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.16247559, + "step": 5677, + "time_per_iteration": 2.8256406784057617 + }, + { + "auxiliary_loss_clip": 0.01410391, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.27826905, + "balance_loss_mlp": 1.01836038, + "epoch": 0.3413798286487299, + "flos": 24941061417960.0, + "grad_norm": 1.8784842374195891, + "language_loss": 0.79557705, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.82002687, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.16223145, + "step": 5678, + "time_per_iteration": 2.8038859367370605 + }, + { + "auxiliary_loss_clip": 0.01407137, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.27379513, + "balance_loss_mlp": 1.01585746, + "epoch": 0.34143995190139786, + "flos": 25379874623520.0, + "grad_norm": 1.7631319095135085, + "language_loss": 0.75053316, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.77491498, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15185547, + "step": 5679, + "time_per_iteration": 2.817688226699829 + }, + { + "auxiliary_loss_clip": 0.01267541, + "auxiliary_loss_mlp": 0.01013295, + "balance_loss_clip": 1.20757878, + "balance_loss_mlp": 1.01048172, + "epoch": 0.34150007515406583, + "flos": 67799915877960.0, + "grad_norm": 0.7248974114341232, + "language_loss": 0.59503067, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61783904, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.02807617, + "step": 5680, + "time_per_iteration": 3.334468364715576 + }, + { + "auxiliary_loss_clip": 0.01405977, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.27672076, + "balance_loss_mlp": 1.01957297, + "epoch": 0.3415601984067338, + "flos": 20307205219320.0, + "grad_norm": 2.15605623617389, + "language_loss": 0.72516632, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74957305, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.15130615, + "step": 5681, + "time_per_iteration": 2.7226500511169434 + }, + { + "auxiliary_loss_clip": 0.01404824, + "auxiliary_loss_mlp": 0.01044108, + "balance_loss_clip": 1.27566516, + "balance_loss_mlp": 1.02826476, + "epoch": 0.34162032165940176, + "flos": 26036774660160.0, + "grad_norm": 1.865984526934798, + "language_loss": 0.71699119, + "learning_rate": 3.064923764577233e-06, + "loss": 0.74148047, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15844727, + "step": 5682, + "time_per_iteration": 2.7924644947052 + }, + { + "auxiliary_loss_clip": 0.01408959, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.27734983, + "balance_loss_mlp": 1.02372491, + "epoch": 0.3416804449120697, + "flos": 28809197510400.0, + "grad_norm": 1.4495522253896203, + "language_loss": 0.84163654, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86612296, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.1595459, + "step": 5683, + "time_per_iteration": 2.809633255004883 + }, + { + "auxiliary_loss_clip": 0.01415526, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.28154278, + "balance_loss_mlp": 1.02064621, + "epoch": 0.3417405681647377, + "flos": 22606761431160.0, + "grad_norm": 1.7201483234319834, + "language_loss": 0.71080697, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.73534954, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.18066406, + "step": 5684, + "time_per_iteration": 2.7639565467834473 + }, + { + "auxiliary_loss_clip": 0.01405906, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.27614379, + "balance_loss_mlp": 1.01854408, + "epoch": 0.34180069141740566, + "flos": 24721472077560.0, + "grad_norm": 1.3651354286401833, + "language_loss": 0.75061834, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77501225, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.14929199, + "step": 5685, + "time_per_iteration": 2.778085470199585 + }, + { + "auxiliary_loss_clip": 0.01401986, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.27369869, + "balance_loss_mlp": 1.01826227, + "epoch": 0.3418608146700737, + "flos": 30524914771200.0, + "grad_norm": 1.5724863948161267, + "language_loss": 0.71120894, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.73555541, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14398193, + "step": 5686, + "time_per_iteration": 2.8274316787719727 + }, + { + "auxiliary_loss_clip": 0.01409943, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.2772162, + "balance_loss_mlp": 1.02105272, + "epoch": 0.34192093792274164, + "flos": 15126568521120.0, + "grad_norm": 1.7581412145303064, + "language_loss": 0.77576685, + "learning_rate": 3.06327495310661e-06, + "loss": 0.80023885, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.1619873, + "step": 5687, + "time_per_iteration": 2.75369930267334 + }, + { + "auxiliary_loss_clip": 0.01406269, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.2786485, + "balance_loss_mlp": 1.02216828, + "epoch": 0.3419810611754096, + "flos": 13191668003520.0, + "grad_norm": 2.0406720497689226, + "language_loss": 0.86719251, + "learning_rate": 3.062945069803981e-06, + "loss": 0.89163554, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15881348, + "step": 5688, + "time_per_iteration": 2.763838768005371 + }, + { + "auxiliary_loss_clip": 0.01421828, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.28477538, + "balance_loss_mlp": 1.02189207, + "epoch": 0.34204118442807757, + "flos": 19540957204440.0, + "grad_norm": 2.255133079840729, + "language_loss": 0.7978211, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82242811, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.1697998, + "step": 5689, + "time_per_iteration": 2.760938882827759 + }, + { + "auxiliary_loss_clip": 0.01419756, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.28654122, + "balance_loss_mlp": 1.02530015, + "epoch": 0.34210130768074554, + "flos": 15199020481320.0, + "grad_norm": 1.7125484736929328, + "language_loss": 0.74017715, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.7647953, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.16748047, + "step": 5690, + "time_per_iteration": 2.786515951156616 + }, + { + "auxiliary_loss_clip": 0.01411456, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.27836299, + "balance_loss_mlp": 1.01796293, + "epoch": 0.3421614309334135, + "flos": 24941629935000.0, + "grad_norm": 1.9972537005884006, + "language_loss": 0.76072907, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78517449, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15136719, + "step": 5691, + "time_per_iteration": 2.845845937728882 + }, + { + "auxiliary_loss_clip": 0.01409019, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.27949166, + "balance_loss_mlp": 1.01764131, + "epoch": 0.34222155418608147, + "flos": 21913736935320.0, + "grad_norm": 1.66290709912935, + "language_loss": 0.68840814, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7128309, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15612793, + "step": 5692, + "time_per_iteration": 2.7520265579223633 + }, + { + "auxiliary_loss_clip": 0.01416692, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.28357887, + "balance_loss_mlp": 1.01651633, + "epoch": 0.34228167743874943, + "flos": 18118661928480.0, + "grad_norm": 1.9751016710955396, + "language_loss": 0.73153573, + "learning_rate": 3.06129504893632e-06, + "loss": 0.75602925, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.16131592, + "step": 5693, + "time_per_iteration": 2.749396324157715 + }, + { + "auxiliary_loss_clip": 0.01406401, + "auxiliary_loss_mlp": 0.01038554, + "balance_loss_clip": 1.27660286, + "balance_loss_mlp": 1.02414179, + "epoch": 0.3423418006914174, + "flos": 21293651700000.0, + "grad_norm": 1.7022355026986613, + "language_loss": 0.76227337, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.7867229, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.14416504, + "step": 5694, + "time_per_iteration": 2.8784422874450684 + }, + { + "auxiliary_loss_clip": 0.01413517, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.28411627, + "balance_loss_mlp": 1.02362633, + "epoch": 0.34240192394408536, + "flos": 19827597593160.0, + "grad_norm": 1.6919340653871142, + "language_loss": 0.797916, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82242465, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.137146, + "step": 5695, + "time_per_iteration": 2.781287908554077 + }, + { + "auxiliary_loss_clip": 0.0141854, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.2852695, + "balance_loss_mlp": 1.02366352, + "epoch": 0.3424620471967533, + "flos": 24540971606640.0, + "grad_norm": 2.064037827469549, + "language_loss": 0.74011093, + "learning_rate": 3.060304553382635e-06, + "loss": 0.76468939, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15649414, + "step": 5696, + "time_per_iteration": 2.8077428340911865 + }, + { + "auxiliary_loss_clip": 0.01414707, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.28247452, + "balance_loss_mlp": 1.03113878, + "epoch": 0.3425221704494213, + "flos": 25854487421400.0, + "grad_norm": 1.7821912632261945, + "language_loss": 0.71416628, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.7387799, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.1550293, + "step": 5697, + "time_per_iteration": 2.791093349456787 + }, + { + "auxiliary_loss_clip": 0.0141536, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.28618169, + "balance_loss_mlp": 1.02215314, + "epoch": 0.34258229370208926, + "flos": 21545142230160.0, + "grad_norm": 1.7037695237307804, + "language_loss": 0.81951439, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84403324, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.14379883, + "step": 5698, + "time_per_iteration": 2.801330327987671 + }, + { + "auxiliary_loss_clip": 0.01416109, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_clip": 1.28246176, + "balance_loss_mlp": 1.03235316, + "epoch": 0.3426424169547573, + "flos": 23657172724800.0, + "grad_norm": 1.9110268352020094, + "language_loss": 0.69029927, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.7149502, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.1661377, + "step": 5699, + "time_per_iteration": 2.7984402179718018 + }, + { + "auxiliary_loss_clip": 0.01418491, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.2855742, + "balance_loss_mlp": 1.02692151, + "epoch": 0.34270254020742524, + "flos": 24650319584880.0, + "grad_norm": 2.934016631091197, + "language_loss": 0.72770095, + "learning_rate": 3.058983329806877e-06, + "loss": 0.75230598, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.15106201, + "step": 5700, + "time_per_iteration": 2.8487277030944824 + }, + { + "auxiliary_loss_clip": 0.01419577, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.28889346, + "balance_loss_mlp": 1.02526641, + "epoch": 0.3427626634600932, + "flos": 21001772832840.0, + "grad_norm": 1.9770515909269082, + "language_loss": 0.82368457, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.84828293, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.14996338, + "step": 5701, + "time_per_iteration": 2.7705769538879395 + }, + { + "auxiliary_loss_clip": 0.01419751, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.28553617, + "balance_loss_mlp": 1.0275259, + "epoch": 0.3428227867127612, + "flos": 21438758662200.0, + "grad_norm": 1.7411866245176972, + "language_loss": 0.72046894, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.74508905, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.1472168, + "step": 5702, + "time_per_iteration": 2.8261992931365967 + }, + { + "auxiliary_loss_clip": 0.01279279, + "auxiliary_loss_mlp": 0.01016339, + "balance_loss_clip": 1.21768475, + "balance_loss_mlp": 1.01312006, + "epoch": 0.34288290996542914, + "flos": 55746176829840.0, + "grad_norm": 0.7833985355917884, + "language_loss": 0.57555932, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59851545, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03222656, + "step": 5703, + "time_per_iteration": 4.551512241363525 + }, + { + "auxiliary_loss_clip": 0.01416231, + "auxiliary_loss_mlp": 0.01042048, + "balance_loss_clip": 1.28355086, + "balance_loss_mlp": 1.02578783, + "epoch": 0.3429430332180971, + "flos": 20161610956800.0, + "grad_norm": 1.9879863516446075, + "language_loss": 0.75435907, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77894187, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.16259766, + "step": 5704, + "time_per_iteration": 2.756659746170044 + }, + { + "auxiliary_loss_clip": 0.01412968, + "auxiliary_loss_mlp": 0.0104046, + "balance_loss_clip": 1.28290236, + "balance_loss_mlp": 1.02659607, + "epoch": 0.34300315647076507, + "flos": 17970143864040.0, + "grad_norm": 1.9668255380697617, + "language_loss": 0.73026061, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75479496, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.13879395, + "step": 5705, + "time_per_iteration": 2.747927188873291 + }, + { + "auxiliary_loss_clip": 0.01422858, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.29006195, + "balance_loss_mlp": 1.02680373, + "epoch": 0.34306327972343303, + "flos": 22091394821040.0, + "grad_norm": 2.2603026495134326, + "language_loss": 0.79591131, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82055992, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.15197754, + "step": 5706, + "time_per_iteration": 2.769930124282837 + }, + { + "auxiliary_loss_clip": 0.01423781, + "auxiliary_loss_mlp": 0.01040302, + "balance_loss_clip": 1.28866208, + "balance_loss_mlp": 1.02511477, + "epoch": 0.343123402976101, + "flos": 18447355596960.0, + "grad_norm": 2.0355948003235422, + "language_loss": 0.83358264, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85822356, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1519165, + "step": 5707, + "time_per_iteration": 2.7373955249786377 + }, + { + "auxiliary_loss_clip": 0.01423157, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.2921952, + "balance_loss_mlp": 1.01895177, + "epoch": 0.34318352622876896, + "flos": 17167933823400.0, + "grad_norm": 3.1607826475133214, + "language_loss": 0.75539994, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77996361, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.14257812, + "step": 5708, + "time_per_iteration": 2.736661195755005 + }, + { + "auxiliary_loss_clip": 0.01414137, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.28421688, + "balance_loss_mlp": 1.02373683, + "epoch": 0.34324364948143693, + "flos": 26693877738600.0, + "grad_norm": 1.6167719469394484, + "language_loss": 0.80789405, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83241594, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.14318848, + "step": 5709, + "time_per_iteration": 4.339643478393555 + }, + { + "auxiliary_loss_clip": 0.01423604, + "auxiliary_loss_mlp": 0.01039526, + "balance_loss_clip": 1.2893095, + "balance_loss_mlp": 1.02323055, + "epoch": 0.3433037727341049, + "flos": 21256552640160.0, + "grad_norm": 1.931475513498338, + "language_loss": 0.79354703, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81817836, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.16296387, + "step": 5710, + "time_per_iteration": 2.7785279750823975 + }, + { + "auxiliary_loss_clip": 0.01424249, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.2883184, + "balance_loss_mlp": 1.02051401, + "epoch": 0.34336389598677286, + "flos": 20634071511600.0, + "grad_norm": 1.8690856081823644, + "language_loss": 0.70915383, + "learning_rate": 3.055346654453996e-06, + "loss": 0.73376358, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.16223145, + "step": 5711, + "time_per_iteration": 2.747908353805542 + }, + { + "auxiliary_loss_clip": 0.01414088, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.279971, + "balance_loss_mlp": 1.02533579, + "epoch": 0.3434240192394409, + "flos": 14542729570080.0, + "grad_norm": 4.185973065700901, + "language_loss": 0.67947555, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70402759, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.15771484, + "step": 5712, + "time_per_iteration": 2.777451753616333 + }, + { + "auxiliary_loss_clip": 0.01265049, + "auxiliary_loss_mlp": 0.01007004, + "balance_loss_clip": 1.20369005, + "balance_loss_mlp": 1.00401211, + "epoch": 0.34348414249210885, + "flos": 58064233472640.0, + "grad_norm": 0.8846134654780301, + "language_loss": 0.581487, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60420746, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.02990723, + "step": 5713, + "time_per_iteration": 3.3004629611968994 + }, + { + "auxiliary_loss_clip": 0.01419499, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.28587329, + "balance_loss_mlp": 1.02067542, + "epoch": 0.3435442657447768, + "flos": 20709366057000.0, + "grad_norm": 1.558367963007853, + "language_loss": 0.81074905, + "learning_rate": 3.054353992805076e-06, + "loss": 0.83529329, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.14251709, + "step": 5714, + "time_per_iteration": 4.453989744186401 + }, + { + "auxiliary_loss_clip": 0.01416389, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.28294444, + "balance_loss_mlp": 1.0239743, + "epoch": 0.3436043889974448, + "flos": 22935495708000.0, + "grad_norm": 1.7309786177603153, + "language_loss": 0.72114384, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74571049, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.1628418, + "step": 5715, + "time_per_iteration": 4.494873523712158 + }, + { + "auxiliary_loss_clip": 0.01260191, + "auxiliary_loss_mlp": 0.01007833, + "balance_loss_clip": 1.19911885, + "balance_loss_mlp": 1.00487673, + "epoch": 0.34366451225011274, + "flos": 58419078924960.0, + "grad_norm": 0.9024603993364742, + "language_loss": 0.6602236, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68290389, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.02954102, + "step": 5716, + "time_per_iteration": 3.2853057384490967 + }, + { + "auxiliary_loss_clip": 0.01402802, + "auxiliary_loss_mlp": 0.01039889, + "balance_loss_clip": 1.27323449, + "balance_loss_mlp": 1.02531612, + "epoch": 0.3437246355027807, + "flos": 15600531585240.0, + "grad_norm": 1.7105206917325397, + "language_loss": 0.74315923, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76758611, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.14556885, + "step": 5717, + "time_per_iteration": 2.787956714630127 + }, + { + "auxiliary_loss_clip": 0.01406534, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.27393711, + "balance_loss_mlp": 1.02045369, + "epoch": 0.34378475875544867, + "flos": 27677968934400.0, + "grad_norm": 1.8239485237529425, + "language_loss": 0.75748038, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.78188831, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.13800049, + "step": 5718, + "time_per_iteration": 2.8478035926818848 + }, + { + "auxiliary_loss_clip": 0.01409279, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.27456808, + "balance_loss_mlp": 1.03099656, + "epoch": 0.34384488200811664, + "flos": 31437853474320.0, + "grad_norm": 1.7975601939118768, + "language_loss": 0.64022398, + "learning_rate": 3.052698757266734e-06, + "loss": 0.6647802, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.15356445, + "step": 5719, + "time_per_iteration": 3.0833230018615723 + }, + { + "auxiliary_loss_clip": 0.01416429, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.2799629, + "balance_loss_mlp": 1.02370977, + "epoch": 0.3439050052607846, + "flos": 24905343042360.0, + "grad_norm": 1.6452983361247233, + "language_loss": 0.73097736, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75554872, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.17004395, + "step": 5720, + "time_per_iteration": 2.9278006553649902 + }, + { + "auxiliary_loss_clip": 0.01410433, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.27840972, + "balance_loss_mlp": 1.0244875, + "epoch": 0.34396512851345257, + "flos": 18154623954240.0, + "grad_norm": 1.6507187903476286, + "language_loss": 0.7432279, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.7677331, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15612793, + "step": 5721, + "time_per_iteration": 2.8461992740631104 + }, + { + "auxiliary_loss_clip": 0.01414959, + "auxiliary_loss_mlp": 0.01044093, + "balance_loss_clip": 1.28044045, + "balance_loss_mlp": 1.02884626, + "epoch": 0.34402525176612053, + "flos": 16038776273760.0, + "grad_norm": 2.401989778554955, + "language_loss": 0.80397606, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82856661, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.15246582, + "step": 5722, + "time_per_iteration": 2.746981382369995 + }, + { + "auxiliary_loss_clip": 0.01406658, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.2744391, + "balance_loss_mlp": 1.02003145, + "epoch": 0.3440853750187885, + "flos": 21183613379640.0, + "grad_norm": 1.5851301594749483, + "language_loss": 0.81689209, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84130096, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.14208984, + "step": 5723, + "time_per_iteration": 2.736405611038208 + }, + { + "auxiliary_loss_clip": 0.0141045, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.27595973, + "balance_loss_mlp": 1.0299859, + "epoch": 0.34414549827145646, + "flos": 12681783522000.0, + "grad_norm": 2.2939580713015375, + "language_loss": 0.81377316, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83833474, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15716553, + "step": 5724, + "time_per_iteration": 2.799691915512085 + }, + { + "auxiliary_loss_clip": 0.01409947, + "auxiliary_loss_mlp": 0.01039748, + "balance_loss_clip": 1.27576637, + "balance_loss_mlp": 1.02477527, + "epoch": 0.3442056215241244, + "flos": 31290716094120.0, + "grad_norm": 1.7886150358902497, + "language_loss": 0.69496286, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71945977, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.14971924, + "step": 5725, + "time_per_iteration": 2.8194971084594727 + }, + { + "auxiliary_loss_clip": 0.01421757, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.28203416, + "balance_loss_mlp": 1.03184652, + "epoch": 0.34426574477679245, + "flos": 23372481537360.0, + "grad_norm": 1.4144469219603997, + "language_loss": 0.69391251, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71862155, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.17285156, + "step": 5726, + "time_per_iteration": 2.8529765605926514 + }, + { + "auxiliary_loss_clip": 0.01420242, + "auxiliary_loss_mlp": 0.01044326, + "balance_loss_clip": 1.28563762, + "balance_loss_mlp": 1.02929401, + "epoch": 0.3443258680294604, + "flos": 24540849781560.0, + "grad_norm": 1.6556333108689198, + "language_loss": 0.72995389, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75459957, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.15020752, + "step": 5727, + "time_per_iteration": 2.8103256225585938 + }, + { + "auxiliary_loss_clip": 0.01408119, + "auxiliary_loss_mlp": 0.01042203, + "balance_loss_clip": 1.27483487, + "balance_loss_mlp": 1.02605045, + "epoch": 0.3443859912821284, + "flos": 20234996909280.0, + "grad_norm": 1.7778294796250762, + "language_loss": 0.8878212, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.91232443, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.16162109, + "step": 5728, + "time_per_iteration": 2.8114736080169678 + }, + { + "auxiliary_loss_clip": 0.01413527, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.28128934, + "balance_loss_mlp": 1.02463818, + "epoch": 0.34444611453479634, + "flos": 24321910174920.0, + "grad_norm": 3.3836067436463124, + "language_loss": 0.70225114, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72678608, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15307617, + "step": 5729, + "time_per_iteration": 2.9008982181549072 + }, + { + "auxiliary_loss_clip": 0.01406586, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.27457106, + "balance_loss_mlp": 1.01985157, + "epoch": 0.3445062377874643, + "flos": 16987920652800.0, + "grad_norm": 4.203029730581894, + "language_loss": 0.74066877, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76507783, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.14453125, + "step": 5730, + "time_per_iteration": 2.7812585830688477 + }, + { + "auxiliary_loss_clip": 0.0141506, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.28082919, + "balance_loss_mlp": 1.02332246, + "epoch": 0.3445663610401323, + "flos": 20307530086200.0, + "grad_norm": 2.5763625749023014, + "language_loss": 0.79697615, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82151335, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.15338135, + "step": 5731, + "time_per_iteration": 2.8057785034179688 + }, + { + "auxiliary_loss_clip": 0.01413231, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.27916336, + "balance_loss_mlp": 1.02484691, + "epoch": 0.34462648429280024, + "flos": 15892369844040.0, + "grad_norm": 2.1005562649389486, + "language_loss": 0.78896976, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81350797, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15740967, + "step": 5732, + "time_per_iteration": 2.747728109359741 + }, + { + "auxiliary_loss_clip": 0.01256817, + "auxiliary_loss_mlp": 0.01011993, + "balance_loss_clip": 1.1954366, + "balance_loss_mlp": 1.00872648, + "epoch": 0.3446866075454682, + "flos": 59326373066040.0, + "grad_norm": 0.7500809052215736, + "language_loss": 0.53572464, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55841273, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.03271484, + "step": 5733, + "time_per_iteration": 3.29729962348938 + }, + { + "auxiliary_loss_clip": 0.01409941, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.27706432, + "balance_loss_mlp": 1.02202654, + "epoch": 0.34474673079813617, + "flos": 22348611129960.0, + "grad_norm": 1.9101237464593404, + "language_loss": 0.83987951, + "learning_rate": 3.047727069167207e-06, + "loss": 0.86436081, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.16162109, + "step": 5734, + "time_per_iteration": 2.820547580718994 + }, + { + "auxiliary_loss_clip": 0.01412956, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.2790736, + "balance_loss_mlp": 1.01596928, + "epoch": 0.34480685405080413, + "flos": 27675573041160.0, + "grad_norm": 1.722668012582058, + "language_loss": 0.93282956, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.95727706, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15808105, + "step": 5735, + "time_per_iteration": 2.853433847427368 + }, + { + "auxiliary_loss_clip": 0.01422668, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.28627968, + "balance_loss_mlp": 1.02411222, + "epoch": 0.3448669773034721, + "flos": 22461207777000.0, + "grad_norm": 1.7746733058242083, + "language_loss": 0.77480763, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.79943788, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.16241455, + "step": 5736, + "time_per_iteration": 2.8559086322784424 + }, + { + "auxiliary_loss_clip": 0.01415175, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_clip": 1.27970755, + "balance_loss_mlp": 1.02276099, + "epoch": 0.34492710055614006, + "flos": 24941589326640.0, + "grad_norm": 1.6526497054093776, + "language_loss": 0.79312241, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81766617, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.16442871, + "step": 5737, + "time_per_iteration": 2.7892234325408936 + }, + { + "auxiliary_loss_clip": 0.01423668, + "auxiliary_loss_mlp": 0.01037938, + "balance_loss_clip": 1.28580129, + "balance_loss_mlp": 1.02071214, + "epoch": 0.34498722380880803, + "flos": 20125689539400.0, + "grad_norm": 1.9282046930221661, + "language_loss": 0.71779102, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.74240708, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.17211914, + "step": 5738, + "time_per_iteration": 2.9298291206359863 + }, + { + "auxiliary_loss_clip": 0.01419605, + "auxiliary_loss_mlp": 0.01040231, + "balance_loss_clip": 1.28310287, + "balance_loss_mlp": 1.02387536, + "epoch": 0.34504734706147605, + "flos": 28443892082400.0, + "grad_norm": 1.8485139204414223, + "language_loss": 0.82417828, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84877664, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.16357422, + "step": 5739, + "time_per_iteration": 2.8442087173461914 + }, + { + "auxiliary_loss_clip": 0.01412677, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.2771486, + "balance_loss_mlp": 1.02183032, + "epoch": 0.345107470314144, + "flos": 22679619474960.0, + "grad_norm": 1.7215763309418062, + "language_loss": 0.83290076, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.8574115, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.16564941, + "step": 5740, + "time_per_iteration": 2.895540237426758 + }, + { + "auxiliary_loss_clip": 0.01421082, + "auxiliary_loss_mlp": 0.01036803, + "balance_loss_clip": 1.28747559, + "balance_loss_mlp": 1.02060282, + "epoch": 0.345167593566812, + "flos": 20635533412560.0, + "grad_norm": 2.0822904489678464, + "language_loss": 0.77756071, + "learning_rate": 3.045403886269181e-06, + "loss": 0.80213952, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.16210938, + "step": 5741, + "time_per_iteration": 2.92891263961792 + }, + { + "auxiliary_loss_clip": 0.01419669, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.28148746, + "balance_loss_mlp": 1.01693225, + "epoch": 0.34522771681947995, + "flos": 26220077107920.0, + "grad_norm": 1.603712281448769, + "language_loss": 0.77063477, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79515243, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.15155029, + "step": 5742, + "time_per_iteration": 4.3095808029174805 + }, + { + "auxiliary_loss_clip": 0.01413454, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.27979112, + "balance_loss_mlp": 1.02634382, + "epoch": 0.3452878400721479, + "flos": 19067034748680.0, + "grad_norm": 1.9676624937616416, + "language_loss": 0.76491427, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78947884, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.16650391, + "step": 5743, + "time_per_iteration": 2.82155179977417 + }, + { + "auxiliary_loss_clip": 0.01407829, + "auxiliary_loss_mlp": 0.01043757, + "balance_loss_clip": 1.27541947, + "balance_loss_mlp": 1.02789021, + "epoch": 0.3453479633248159, + "flos": 27935753760360.0, + "grad_norm": 1.5856347543042888, + "language_loss": 0.70335823, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72787404, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15869141, + "step": 5744, + "time_per_iteration": 2.821601152420044 + }, + { + "auxiliary_loss_clip": 0.01405783, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.27490091, + "balance_loss_mlp": 1.02095675, + "epoch": 0.34540808657748384, + "flos": 19610404146000.0, + "grad_norm": 1.6741360627371569, + "language_loss": 0.80107129, + "learning_rate": 3.044075480787665e-06, + "loss": 0.82549608, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15734863, + "step": 5745, + "time_per_iteration": 2.742877960205078 + }, + { + "auxiliary_loss_clip": 0.01417455, + "auxiliary_loss_mlp": 0.01038129, + "balance_loss_clip": 1.28064561, + "balance_loss_mlp": 1.02158308, + "epoch": 0.3454682098301518, + "flos": 20416431372480.0, + "grad_norm": 1.7446582720546393, + "language_loss": 0.89363015, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91818601, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.16540527, + "step": 5746, + "time_per_iteration": 2.790757179260254 + }, + { + "auxiliary_loss_clip": 0.01428055, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.28892982, + "balance_loss_mlp": 1.01926827, + "epoch": 0.34552833308281977, + "flos": 21330182242800.0, + "grad_norm": 5.89799298082557, + "language_loss": 0.64999628, + "learning_rate": 3.043411040447849e-06, + "loss": 0.67463452, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.16491699, + "step": 5747, + "time_per_iteration": 2.7428224086761475 + }, + { + "auxiliary_loss_clip": 0.0141218, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.27743411, + "balance_loss_mlp": 1.02160692, + "epoch": 0.34558845633548774, + "flos": 36250098509160.0, + "grad_norm": 1.6729594628284925, + "language_loss": 0.73150134, + "learning_rate": 3.043078760922264e-06, + "loss": 0.7559908, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1517334, + "step": 5748, + "time_per_iteration": 2.9169275760650635 + }, + { + "auxiliary_loss_clip": 0.0140622, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.27571726, + "balance_loss_mlp": 1.01884103, + "epoch": 0.3456485795881557, + "flos": 22455035306280.0, + "grad_norm": 1.507886332030975, + "language_loss": 0.75826085, + "learning_rate": 3.042746441843029e-06, + "loss": 0.78265655, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.1451416, + "step": 5749, + "time_per_iteration": 4.268859624862671 + }, + { + "auxiliary_loss_clip": 0.01253413, + "auxiliary_loss_mlp": 0.01003316, + "balance_loss_clip": 1.19266891, + "balance_loss_mlp": 1.00024033, + "epoch": 0.34570870284082367, + "flos": 62019115668000.0, + "grad_norm": 0.8861176054521781, + "language_loss": 0.62789774, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.65046501, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03063965, + "step": 5750, + "time_per_iteration": 3.1892449855804443 + }, + { + "auxiliary_loss_clip": 0.01402125, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.27445281, + "balance_loss_mlp": 1.0199964, + "epoch": 0.34576882609349163, + "flos": 22787140077000.0, + "grad_norm": 1.6833180387550566, + "language_loss": 0.8089422, + "learning_rate": 3.042081685074012e-06, + "loss": 0.8333149, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15148926, + "step": 5751, + "time_per_iteration": 2.7740061283111572 + }, + { + "auxiliary_loss_clip": 0.01405377, + "auxiliary_loss_mlp": 0.010374, + "balance_loss_clip": 1.27454853, + "balance_loss_mlp": 1.0213784, + "epoch": 0.34582894934615965, + "flos": 12352602553200.0, + "grad_norm": 2.0023449430053692, + "language_loss": 0.83975708, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86418486, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16015625, + "step": 5752, + "time_per_iteration": 2.744370937347412 + }, + { + "auxiliary_loss_clip": 0.01252507, + "auxiliary_loss_mlp": 0.01006812, + "balance_loss_clip": 1.19269335, + "balance_loss_mlp": 1.00397456, + "epoch": 0.3458890725988276, + "flos": 70182343417320.0, + "grad_norm": 0.7323129601417449, + "language_loss": 0.63123268, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65382588, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.02832031, + "step": 5753, + "time_per_iteration": 6.186391353607178 + }, + { + "auxiliary_loss_clip": 0.0140705, + "auxiliary_loss_mlp": 0.01037969, + "balance_loss_clip": 1.27509654, + "balance_loss_mlp": 1.0203737, + "epoch": 0.3459491958514956, + "flos": 17097065589240.0, + "grad_norm": 1.8255585923119197, + "language_loss": 0.71086586, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73531604, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.17590332, + "step": 5754, + "time_per_iteration": 2.7454845905303955 + }, + { + "auxiliary_loss_clip": 0.01420609, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.28356457, + "balance_loss_mlp": 1.01968181, + "epoch": 0.34600931910416355, + "flos": 16654881889800.0, + "grad_norm": 1.7143941394853204, + "language_loss": 0.73110664, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75567162, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.1619873, + "step": 5755, + "time_per_iteration": 2.772258996963501 + }, + { + "auxiliary_loss_clip": 0.01410385, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.27864349, + "balance_loss_mlp": 1.01649404, + "epoch": 0.3460694423568315, + "flos": 38553471906840.0, + "grad_norm": 1.505589583026718, + "language_loss": 0.72224963, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74666727, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.14886475, + "step": 5756, + "time_per_iteration": 2.9373462200164795 + }, + { + "auxiliary_loss_clip": 0.01251947, + "auxiliary_loss_mlp": 0.01003703, + "balance_loss_clip": 1.19147885, + "balance_loss_mlp": 1.00068712, + "epoch": 0.3461295656094995, + "flos": 72098540462160.0, + "grad_norm": 0.7249028669814971, + "language_loss": 0.62600517, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64856166, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.03015137, + "step": 5757, + "time_per_iteration": 3.2427196502685547 + }, + { + "auxiliary_loss_clip": 0.01250468, + "auxiliary_loss_mlp": 0.01005695, + "balance_loss_clip": 1.19102693, + "balance_loss_mlp": 1.00286949, + "epoch": 0.34618968886216744, + "flos": 65474004840480.0, + "grad_norm": 0.8247469196208012, + "language_loss": 0.59377831, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61633992, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.02819824, + "step": 5758, + "time_per_iteration": 3.188013792037964 + }, + { + "auxiliary_loss_clip": 0.01403471, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.27378762, + "balance_loss_mlp": 1.02399993, + "epoch": 0.3462498121148354, + "flos": 23477200162560.0, + "grad_norm": 1.5827022207570332, + "language_loss": 0.71555042, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.7399686, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14343262, + "step": 5759, + "time_per_iteration": 2.828526735305786 + }, + { + "auxiliary_loss_clip": 0.01400999, + "auxiliary_loss_mlp": 0.01043673, + "balance_loss_clip": 1.26979208, + "balance_loss_mlp": 1.02806854, + "epoch": 0.3463099353675034, + "flos": 24176194087320.0, + "grad_norm": 2.2354755855788113, + "language_loss": 0.83445048, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85889721, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15600586, + "step": 5760, + "time_per_iteration": 2.8520476818084717 + }, + { + "auxiliary_loss_clip": 0.01249482, + "auxiliary_loss_mlp": 0.01004479, + "balance_loss_clip": 1.18843722, + "balance_loss_mlp": 1.00178492, + "epoch": 0.34637005862017134, + "flos": 63713229281280.0, + "grad_norm": 0.8392533091600536, + "language_loss": 0.56544363, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58798325, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.02697754, + "step": 5761, + "time_per_iteration": 3.28535795211792 + }, + { + "auxiliary_loss_clip": 0.01394398, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.26446104, + "balance_loss_mlp": 1.02034473, + "epoch": 0.3464301818728393, + "flos": 13148640123120.0, + "grad_norm": 1.9751425959444475, + "language_loss": 0.95023859, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97453731, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15124512, + "step": 5762, + "time_per_iteration": 2.8064017295837402 + }, + { + "auxiliary_loss_clip": 0.01411252, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.27382743, + "balance_loss_mlp": 1.01881862, + "epoch": 0.34649030512550727, + "flos": 29321234235000.0, + "grad_norm": 2.0242781277645676, + "language_loss": 0.69495416, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71940947, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.15454102, + "step": 5763, + "time_per_iteration": 2.839337110519409 + }, + { + "auxiliary_loss_clip": 0.01408136, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.2711854, + "balance_loss_mlp": 1.0181067, + "epoch": 0.34655042837817523, + "flos": 23735959589160.0, + "grad_norm": 1.8516120189777172, + "language_loss": 0.84265983, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86709404, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.171875, + "step": 5764, + "time_per_iteration": 2.782972574234009 + }, + { + "auxiliary_loss_clip": 0.01400058, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.26776958, + "balance_loss_mlp": 1.02291274, + "epoch": 0.34661055163084326, + "flos": 22059290589480.0, + "grad_norm": 2.171361753756009, + "language_loss": 0.68132204, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.70569861, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.14691162, + "step": 5765, + "time_per_iteration": 2.787768840789795 + }, + { + "auxiliary_loss_clip": 0.01398878, + "auxiliary_loss_mlp": 0.01035807, + "balance_loss_clip": 1.26746213, + "balance_loss_mlp": 1.02008379, + "epoch": 0.3466706748835112, + "flos": 21804307740360.0, + "grad_norm": 1.9605676587150063, + "language_loss": 0.77318317, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79753006, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15722656, + "step": 5766, + "time_per_iteration": 2.7560036182403564 + }, + { + "auxiliary_loss_clip": 0.01400024, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.26768315, + "balance_loss_mlp": 1.01983202, + "epoch": 0.3467307981361792, + "flos": 19466149959360.0, + "grad_norm": 1.520906509356721, + "language_loss": 0.72981048, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75415373, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.14453125, + "step": 5767, + "time_per_iteration": 2.825415849685669 + }, + { + "auxiliary_loss_clip": 0.01400121, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.26854539, + "balance_loss_mlp": 1.02201819, + "epoch": 0.34679092138884715, + "flos": 24832931690520.0, + "grad_norm": 2.040460725096476, + "language_loss": 0.78545034, + "learning_rate": 3.036424880912893e-06, + "loss": 0.80982894, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15722656, + "step": 5768, + "time_per_iteration": 2.7940666675567627 + }, + { + "auxiliary_loss_clip": 0.01252548, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.18801188, + "balance_loss_mlp": 0.99966735, + "epoch": 0.3468510446415151, + "flos": 63249905607480.0, + "grad_norm": 0.7651466443242476, + "language_loss": 0.57576632, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59831381, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.02539062, + "step": 5769, + "time_per_iteration": 3.2647616863250732 + }, + { + "auxiliary_loss_clip": 0.01424264, + "auxiliary_loss_mlp": 0.0104471, + "balance_loss_clip": 1.28263688, + "balance_loss_mlp": 1.02728152, + "epoch": 0.3469111678941831, + "flos": 12123267206400.0, + "grad_norm": 3.1016059344067757, + "language_loss": 0.86067766, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.88536739, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.17431641, + "step": 5770, + "time_per_iteration": 2.805542469024658 + }, + { + "auxiliary_loss_clip": 0.0125556, + "auxiliary_loss_mlp": 0.01009137, + "balance_loss_clip": 1.1898222, + "balance_loss_mlp": 1.00656211, + "epoch": 0.34697129114685105, + "flos": 65948049121320.0, + "grad_norm": 0.7692799420347628, + "language_loss": 0.59841287, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.62105989, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.02575684, + "step": 5771, + "time_per_iteration": 3.018451452255249 + }, + { + "auxiliary_loss_clip": 0.01404429, + "auxiliary_loss_mlp": 0.01047332, + "balance_loss_clip": 1.27135074, + "balance_loss_mlp": 1.03184652, + "epoch": 0.347031414399519, + "flos": 34460223737040.0, + "grad_norm": 2.4806804380729077, + "language_loss": 0.71944386, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.74396145, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15460205, + "step": 5772, + "time_per_iteration": 2.903944253921509 + }, + { + "auxiliary_loss_clip": 0.01397745, + "auxiliary_loss_mlp": 0.01040197, + "balance_loss_clip": 1.26449871, + "balance_loss_mlp": 1.0233885, + "epoch": 0.347091537652187, + "flos": 26949957013440.0, + "grad_norm": 2.173340312265114, + "language_loss": 0.76650423, + "learning_rate": 3.034758950632507e-06, + "loss": 0.79088366, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.16796875, + "step": 5773, + "time_per_iteration": 2.809016704559326 + }, + { + "auxiliary_loss_clip": 0.01401709, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.26709199, + "balance_loss_mlp": 1.01770353, + "epoch": 0.34715166090485494, + "flos": 21147204661920.0, + "grad_norm": 2.3037494881621927, + "language_loss": 0.70426887, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72861922, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.15606689, + "step": 5774, + "time_per_iteration": 2.812803030014038 + }, + { + "auxiliary_loss_clip": 0.01389116, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.25772786, + "balance_loss_mlp": 1.02092576, + "epoch": 0.3472117841575229, + "flos": 23483372633280.0, + "grad_norm": 3.376103002981415, + "language_loss": 0.76663864, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.79088616, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.1472168, + "step": 5775, + "time_per_iteration": 2.8016552925109863 + }, + { + "auxiliary_loss_clip": 0.01410928, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.27239776, + "balance_loss_mlp": 1.02272367, + "epoch": 0.34727190741019087, + "flos": 17497196008920.0, + "grad_norm": 1.97608689694058, + "language_loss": 0.77741915, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.80191797, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.16241455, + "step": 5776, + "time_per_iteration": 2.7986226081848145 + }, + { + "auxiliary_loss_clip": 0.01252539, + "auxiliary_loss_mlp": 0.01009889, + "balance_loss_clip": 1.19049573, + "balance_loss_mlp": 1.00727832, + "epoch": 0.34733203066285884, + "flos": 65282557093560.0, + "grad_norm": 0.8380942545436084, + "language_loss": 0.63414657, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65677083, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.02612305, + "step": 5777, + "time_per_iteration": 3.3240575790405273 + }, + { + "auxiliary_loss_clip": 0.01410373, + "auxiliary_loss_mlp": 0.01041015, + "balance_loss_clip": 1.27511215, + "balance_loss_mlp": 1.02531528, + "epoch": 0.3473921539155268, + "flos": 28664740281960.0, + "grad_norm": 1.8956418481483164, + "language_loss": 0.65180564, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67631954, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15698242, + "step": 5778, + "time_per_iteration": 2.811870813369751 + }, + { + "auxiliary_loss_clip": 0.01403939, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.26725221, + "balance_loss_mlp": 1.02672029, + "epoch": 0.3474522771681948, + "flos": 40843444545720.0, + "grad_norm": 1.9069073228780837, + "language_loss": 0.72439051, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.74885023, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.15319824, + "step": 5779, + "time_per_iteration": 2.9764819145202637 + }, + { + "auxiliary_loss_clip": 0.01413168, + "auxiliary_loss_mlp": 0.01044138, + "balance_loss_clip": 1.27539933, + "balance_loss_mlp": 1.0287478, + "epoch": 0.3475124004208628, + "flos": 24614032692240.0, + "grad_norm": 2.189123181646595, + "language_loss": 0.6273219, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.65189493, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.15393066, + "step": 5780, + "time_per_iteration": 2.8080403804779053 + }, + { + "auxiliary_loss_clip": 0.01408264, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_clip": 1.27468514, + "balance_loss_mlp": 1.02879691, + "epoch": 0.34757252367353075, + "flos": 22716515493000.0, + "grad_norm": 1.734650988630227, + "language_loss": 0.72454858, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74906957, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15045166, + "step": 5781, + "time_per_iteration": 2.879387855529785 + }, + { + "auxiliary_loss_clip": 0.01411511, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_clip": 1.27547789, + "balance_loss_mlp": 1.03177071, + "epoch": 0.3476326469261987, + "flos": 19832917288320.0, + "grad_norm": 2.517964731175941, + "language_loss": 0.77506757, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79967344, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.1730957, + "step": 5782, + "time_per_iteration": 4.182196617126465 + }, + { + "auxiliary_loss_clip": 0.01404448, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.27025473, + "balance_loss_mlp": 1.01700282, + "epoch": 0.3476927701788667, + "flos": 19942915000320.0, + "grad_norm": 2.0128033576162228, + "language_loss": 0.63322741, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.65758634, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.14434814, + "step": 5783, + "time_per_iteration": 2.7416720390319824 + }, + { + "auxiliary_loss_clip": 0.0139817, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.2663281, + "balance_loss_mlp": 1.01849008, + "epoch": 0.34775289343153465, + "flos": 20739277437120.0, + "grad_norm": 3.4362288311420004, + "language_loss": 0.88574076, + "learning_rate": 3.031090453282605e-06, + "loss": 0.91004848, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.14117432, + "step": 5784, + "time_per_iteration": 2.761253595352173 + }, + { + "auxiliary_loss_clip": 0.01407776, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.27609229, + "balance_loss_mlp": 1.0241313, + "epoch": 0.3478130166842026, + "flos": 19359644566320.0, + "grad_norm": 1.6160536049905638, + "language_loss": 0.81400621, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.8384757, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.15032959, + "step": 5785, + "time_per_iteration": 2.7732605934143066 + }, + { + "auxiliary_loss_clip": 0.01411963, + "auxiliary_loss_mlp": 0.01041466, + "balance_loss_clip": 1.2788794, + "balance_loss_mlp": 1.02717912, + "epoch": 0.3478731399368706, + "flos": 22055879487240.0, + "grad_norm": 1.7549409146967898, + "language_loss": 0.80490988, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82944417, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.1428833, + "step": 5786, + "time_per_iteration": 2.840548276901245 + }, + { + "auxiliary_loss_clip": 0.01404114, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.27311456, + "balance_loss_mlp": 1.02370858, + "epoch": 0.34793326318953854, + "flos": 18046088143200.0, + "grad_norm": 1.7496828731608878, + "language_loss": 0.7519697, + "learning_rate": 3.030089132216836e-06, + "loss": 0.7764042, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15618896, + "step": 5787, + "time_per_iteration": 4.184311151504517 + }, + { + "auxiliary_loss_clip": 0.01412988, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.27843034, + "balance_loss_mlp": 1.02222729, + "epoch": 0.3479933864422065, + "flos": 29320665717960.0, + "grad_norm": 1.6158793660853865, + "language_loss": 0.81617808, + "learning_rate": 3.029755280389203e-06, + "loss": 0.84068656, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.15612793, + "step": 5788, + "time_per_iteration": 2.8326847553253174 + }, + { + "auxiliary_loss_clip": 0.01426971, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.28899777, + "balance_loss_mlp": 1.01850915, + "epoch": 0.3480535096948745, + "flos": 20125567714320.0, + "grad_norm": 1.7266502140715436, + "language_loss": 0.86178452, + "learning_rate": 3.029421389513147e-06, + "loss": 0.88640523, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.16595459, + "step": 5789, + "time_per_iteration": 2.797480583190918 + }, + { + "auxiliary_loss_clip": 0.0142503, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_clip": 1.28820324, + "balance_loss_mlp": 1.03017616, + "epoch": 0.34811363294754244, + "flos": 18553251864600.0, + "grad_norm": 2.0862197878552524, + "language_loss": 0.85168958, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87639892, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.1572876, + "step": 5790, + "time_per_iteration": 2.797193765640259 + }, + { + "auxiliary_loss_clip": 0.01415664, + "auxiliary_loss_mlp": 0.01040828, + "balance_loss_clip": 1.28256822, + "balance_loss_mlp": 1.02447271, + "epoch": 0.3481737562002104, + "flos": 26876002543920.0, + "grad_norm": 2.0163422855497926, + "language_loss": 0.81554246, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.84010738, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.16333008, + "step": 5791, + "time_per_iteration": 2.826486110687256 + }, + { + "auxiliary_loss_clip": 0.01422231, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.28398895, + "balance_loss_mlp": 1.02154875, + "epoch": 0.3482338794528784, + "flos": 28913997352320.0, + "grad_norm": 1.7284152042098078, + "language_loss": 0.78109604, + "learning_rate": 3.028419482721056e-06, + "loss": 0.80569857, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.16461182, + "step": 5792, + "time_per_iteration": 4.277633428573608 + }, + { + "auxiliary_loss_clip": 0.01421786, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.28682518, + "balance_loss_mlp": 1.01713967, + "epoch": 0.3482940027055464, + "flos": 22205940669360.0, + "grad_norm": 1.5150064617367025, + "language_loss": 0.81948936, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.84403586, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.15692139, + "step": 5793, + "time_per_iteration": 4.33390736579895 + }, + { + "auxiliary_loss_clip": 0.0142104, + "auxiliary_loss_mlp": 0.01049052, + "balance_loss_clip": 1.28459954, + "balance_loss_mlp": 1.032565, + "epoch": 0.34835412595821436, + "flos": 20307530086200.0, + "grad_norm": 2.534247566051099, + "language_loss": 0.76167917, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78638005, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.16491699, + "step": 5794, + "time_per_iteration": 2.7453973293304443 + }, + { + "auxiliary_loss_clip": 0.01416737, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.28287208, + "balance_loss_mlp": 1.02069116, + "epoch": 0.3484142492108823, + "flos": 20454667466400.0, + "grad_norm": 2.262446158086705, + "language_loss": 0.57869428, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.60323775, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.16931152, + "step": 5795, + "time_per_iteration": 2.821516752243042 + }, + { + "auxiliary_loss_clip": 0.01419471, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.28642547, + "balance_loss_mlp": 1.02360082, + "epoch": 0.3484743724635503, + "flos": 24358318892640.0, + "grad_norm": 1.53940562162945, + "language_loss": 0.82343203, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84802109, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.15808105, + "step": 5796, + "time_per_iteration": 2.8896546363830566 + }, + { + "auxiliary_loss_clip": 0.01414511, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.28407145, + "balance_loss_mlp": 1.01927364, + "epoch": 0.34853449571621825, + "flos": 24358400109360.0, + "grad_norm": 2.4360837371028774, + "language_loss": 0.83625448, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.86074322, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.15075684, + "step": 5797, + "time_per_iteration": 2.825547695159912 + }, + { + "auxiliary_loss_clip": 0.01418257, + "auxiliary_loss_mlp": 0.01038647, + "balance_loss_clip": 1.2850368, + "balance_loss_mlp": 1.02201772, + "epoch": 0.3485946189688862, + "flos": 27272924903160.0, + "grad_norm": 2.4946390122782685, + "language_loss": 0.73641783, + "learning_rate": 3.026414616539167e-06, + "loss": 0.76098686, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.16625977, + "step": 5798, + "time_per_iteration": 2.8314120769500732 + }, + { + "auxiliary_loss_clip": 0.0142679, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.29060662, + "balance_loss_mlp": 1.02440763, + "epoch": 0.3486547422215542, + "flos": 20161489131720.0, + "grad_norm": 1.883264870437881, + "language_loss": 0.76226056, + "learning_rate": 3.026080335875485e-06, + "loss": 0.7869451, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.17260742, + "step": 5799, + "time_per_iteration": 2.73717999458313 + }, + { + "auxiliary_loss_clip": 0.01416982, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.28314281, + "balance_loss_mlp": 1.02254128, + "epoch": 0.34871486547422215, + "flos": 20235240559440.0, + "grad_norm": 1.7184885689419502, + "language_loss": 0.76114053, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7856825, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.14672852, + "step": 5800, + "time_per_iteration": 2.7758822441101074 + }, + { + "auxiliary_loss_clip": 0.01426067, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.28797507, + "balance_loss_mlp": 1.02474022, + "epoch": 0.3487749887268901, + "flos": 44060934288960.0, + "grad_norm": 1.7698697619837485, + "language_loss": 0.67392194, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69860184, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.17199707, + "step": 5801, + "time_per_iteration": 2.973480463027954 + }, + { + "auxiliary_loss_clip": 0.01412485, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.27960014, + "balance_loss_mlp": 1.02528572, + "epoch": 0.3488351119795581, + "flos": 23300435660760.0, + "grad_norm": 1.771019435733949, + "language_loss": 0.77251327, + "learning_rate": 3.025077260480735e-06, + "loss": 0.79704165, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.15063477, + "step": 5802, + "time_per_iteration": 2.7651522159576416 + }, + { + "auxiliary_loss_clip": 0.014034, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.27535963, + "balance_loss_mlp": 1.01928294, + "epoch": 0.34889523523222604, + "flos": 19939219639560.0, + "grad_norm": 1.6692648190230934, + "language_loss": 0.79233402, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81671476, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15405273, + "step": 5803, + "time_per_iteration": 2.7506234645843506 + }, + { + "auxiliary_loss_clip": 0.01419536, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.2812593, + "balance_loss_mlp": 1.02111197, + "epoch": 0.348955358484894, + "flos": 30452381594280.0, + "grad_norm": 5.823274165940977, + "language_loss": 0.67954671, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.70410794, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.15478516, + "step": 5804, + "time_per_iteration": 2.8335835933685303 + }, + { + "auxiliary_loss_clip": 0.01403559, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.27423811, + "balance_loss_mlp": 1.02444696, + "epoch": 0.349015481737562, + "flos": 18003953646720.0, + "grad_norm": 1.819262697594681, + "language_loss": 0.76724893, + "learning_rate": 3.024073835246702e-06, + "loss": 0.79169112, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16192627, + "step": 5805, + "time_per_iteration": 2.762545108795166 + }, + { + "auxiliary_loss_clip": 0.01412082, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.27818418, + "balance_loss_mlp": 1.02417636, + "epoch": 0.34907560499023, + "flos": 27204208912080.0, + "grad_norm": 7.85505957316793, + "language_loss": 0.68210423, + "learning_rate": 3.023739282485814e-06, + "loss": 0.70662916, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.16235352, + "step": 5806, + "time_per_iteration": 2.866774559020996 + }, + { + "auxiliary_loss_clip": 0.01408874, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_clip": 1.27349782, + "balance_loss_mlp": 1.02840841, + "epoch": 0.34913572824289796, + "flos": 30232995295680.0, + "grad_norm": 1.4815735033633917, + "language_loss": 0.72270107, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74723268, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.15869141, + "step": 5807, + "time_per_iteration": 2.89766001701355 + }, + { + "auxiliary_loss_clip": 0.01414405, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_clip": 1.27795529, + "balance_loss_mlp": 1.02875638, + "epoch": 0.3491958514955659, + "flos": 29978053054920.0, + "grad_norm": 3.208409119095868, + "language_loss": 0.74085867, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.7654531, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.16271973, + "step": 5808, + "time_per_iteration": 2.843451499938965 + }, + { + "auxiliary_loss_clip": 0.01404505, + "auxiliary_loss_mlp": 0.01059099, + "balance_loss_clip": 1.27481115, + "balance_loss_mlp": 1.04280281, + "epoch": 0.3492559747482339, + "flos": 22788358327800.0, + "grad_norm": 1.650474031053714, + "language_loss": 0.847633, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.87226903, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.16296387, + "step": 5809, + "time_per_iteration": 2.849575996398926 + }, + { + "auxiliary_loss_clip": 0.01398253, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.26919949, + "balance_loss_mlp": 1.03339148, + "epoch": 0.34931609800090185, + "flos": 26073670678200.0, + "grad_norm": 3.7561907714164144, + "language_loss": 0.80889422, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.83336687, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15625, + "step": 5810, + "time_per_iteration": 2.8042473793029785 + }, + { + "auxiliary_loss_clip": 0.01409568, + "auxiliary_loss_mlp": 0.01056142, + "balance_loss_clip": 1.27582479, + "balance_loss_mlp": 1.04062724, + "epoch": 0.3493762212535698, + "flos": 29248051324320.0, + "grad_norm": 2.0818719561238113, + "language_loss": 0.7612046, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.78586173, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.1552124, + "step": 5811, + "time_per_iteration": 2.8704614639282227 + }, + { + "auxiliary_loss_clip": 0.01414057, + "auxiliary_loss_mlp": 0.0106064, + "balance_loss_clip": 1.27700388, + "balance_loss_mlp": 1.04401016, + "epoch": 0.3494363445062378, + "flos": 27131838168600.0, + "grad_norm": 1.5319001652933526, + "language_loss": 0.80278569, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82753277, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.16638184, + "step": 5812, + "time_per_iteration": 2.843576669692993 + }, + { + "auxiliary_loss_clip": 0.0140699, + "auxiliary_loss_mlp": 0.01056584, + "balance_loss_clip": 1.27199376, + "balance_loss_mlp": 1.03951323, + "epoch": 0.34949646775890575, + "flos": 12280028767920.0, + "grad_norm": 1.7973590204518568, + "language_loss": 0.70337456, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7280103, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.1706543, + "step": 5813, + "time_per_iteration": 2.7197964191436768 + }, + { + "auxiliary_loss_clip": 0.0139973, + "auxiliary_loss_mlp": 0.01052063, + "balance_loss_clip": 1.26828074, + "balance_loss_mlp": 1.03570783, + "epoch": 0.3495565910115737, + "flos": 17170004849760.0, + "grad_norm": 1.9825979307708925, + "language_loss": 0.76442403, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78894198, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.16345215, + "step": 5814, + "time_per_iteration": 2.7565088272094727 + }, + { + "auxiliary_loss_clip": 0.01418525, + "auxiliary_loss_mlp": 0.01046211, + "balance_loss_clip": 1.28241014, + "balance_loss_mlp": 1.02840114, + "epoch": 0.3496167142642417, + "flos": 26470877295960.0, + "grad_norm": 1.550924905944458, + "language_loss": 0.84677964, + "learning_rate": 3.020726562247328e-06, + "loss": 0.87142706, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.17822266, + "step": 5815, + "time_per_iteration": 2.878626823425293 + }, + { + "auxiliary_loss_clip": 0.01416772, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.28018403, + "balance_loss_mlp": 1.02130795, + "epoch": 0.34967683751690964, + "flos": 17418571578000.0, + "grad_norm": 2.1808653214167886, + "language_loss": 0.78015423, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.80468833, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.15332031, + "step": 5816, + "time_per_iteration": 2.889007091522217 + }, + { + "auxiliary_loss_clip": 0.01419374, + "auxiliary_loss_mlp": 0.01044946, + "balance_loss_clip": 1.28189957, + "balance_loss_mlp": 1.02872157, + "epoch": 0.3497369607695776, + "flos": 22604974663320.0, + "grad_norm": 1.962049611479077, + "language_loss": 0.59440482, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.619048, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.16223145, + "step": 5817, + "time_per_iteration": 2.8399882316589355 + }, + { + "auxiliary_loss_clip": 0.01284602, + "auxiliary_loss_mlp": 0.01013572, + "balance_loss_clip": 1.22253597, + "balance_loss_mlp": 1.01023388, + "epoch": 0.34979708402224563, + "flos": 68544275986800.0, + "grad_norm": 0.8861967348380699, + "language_loss": 0.59896988, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.62195158, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03344727, + "step": 5818, + "time_per_iteration": 3.3459525108337402 + }, + { + "auxiliary_loss_clip": 0.01408216, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.27743292, + "balance_loss_mlp": 1.0210371, + "epoch": 0.3498572072749136, + "flos": 18994339138320.0, + "grad_norm": 1.6970133377120873, + "language_loss": 0.83843207, + "learning_rate": 3.019386568567123e-06, + "loss": 0.86288458, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15991211, + "step": 5819, + "time_per_iteration": 2.8222200870513916 + }, + { + "auxiliary_loss_clip": 0.01415606, + "auxiliary_loss_mlp": 0.01034263, + "balance_loss_clip": 1.2809484, + "balance_loss_mlp": 1.01936817, + "epoch": 0.34991733052758156, + "flos": 27824700231000.0, + "grad_norm": 1.6563500247352678, + "language_loss": 0.71104133, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.73554003, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.14916992, + "step": 5820, + "time_per_iteration": 4.34880518913269 + }, + { + "auxiliary_loss_clip": 0.01416878, + "auxiliary_loss_mlp": 0.0103286, + "balance_loss_clip": 1.28145766, + "balance_loss_mlp": 1.01849496, + "epoch": 0.3499774537802495, + "flos": 33590840823000.0, + "grad_norm": 3.567445911243787, + "language_loss": 0.70265931, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7271567, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.14355469, + "step": 5821, + "time_per_iteration": 2.9688446521759033 + }, + { + "auxiliary_loss_clip": 0.01425713, + "auxiliary_loss_mlp": 0.01049108, + "balance_loss_clip": 1.28469431, + "balance_loss_mlp": 1.03237152, + "epoch": 0.3500375770329175, + "flos": 23481951340680.0, + "grad_norm": 13.37303217475, + "language_loss": 0.7479943, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.77274251, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.16748047, + "step": 5822, + "time_per_iteration": 2.788431167602539 + }, + { + "auxiliary_loss_clip": 0.01414803, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_clip": 1.27843809, + "balance_loss_mlp": 1.02863431, + "epoch": 0.35009770028558546, + "flos": 19030747856040.0, + "grad_norm": 1.547089931186329, + "language_loss": 0.78575426, + "learning_rate": 3.018045956403094e-06, + "loss": 0.81035173, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.16320801, + "step": 5823, + "time_per_iteration": 2.7666966915130615 + }, + { + "auxiliary_loss_clip": 0.0128071, + "auxiliary_loss_mlp": 0.01020721, + "balance_loss_clip": 1.21915174, + "balance_loss_mlp": 1.01754987, + "epoch": 0.3501578235382534, + "flos": 68367389659920.0, + "grad_norm": 0.7228715347972446, + "language_loss": 0.59233713, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61535144, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.03173828, + "step": 5824, + "time_per_iteration": 3.3115947246551514 + }, + { + "auxiliary_loss_clip": 0.01414123, + "auxiliary_loss_mlp": 0.01038687, + "balance_loss_clip": 1.27918327, + "balance_loss_mlp": 1.0221349, + "epoch": 0.3502179467909214, + "flos": 21255781081320.0, + "grad_norm": 2.0464859559830004, + "language_loss": 0.84712148, + "learning_rate": 3.017375418643811e-06, + "loss": 0.87164962, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.16546631, + "step": 5825, + "time_per_iteration": 2.7867276668548584 + }, + { + "auxiliary_loss_clip": 0.01414968, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.28084397, + "balance_loss_mlp": 1.03115129, + "epoch": 0.35027807004358935, + "flos": 11946096621000.0, + "grad_norm": 2.5943604143267267, + "language_loss": 0.83403301, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85865092, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.15673828, + "step": 5826, + "time_per_iteration": 4.218889474868774 + }, + { + "auxiliary_loss_clip": 0.01419227, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.28091109, + "balance_loss_mlp": 1.0332551, + "epoch": 0.3503381932962573, + "flos": 21475979547120.0, + "grad_norm": 1.7809411273826943, + "language_loss": 0.81008488, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83476257, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1529541, + "step": 5827, + "time_per_iteration": 2.828524589538574 + }, + { + "auxiliary_loss_clip": 0.01416703, + "auxiliary_loss_mlp": 0.01047241, + "balance_loss_clip": 1.28191042, + "balance_loss_mlp": 1.03261447, + "epoch": 0.3503983165489253, + "flos": 21256024731480.0, + "grad_norm": 2.2172390156055255, + "language_loss": 0.7049455, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.72958499, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.14624023, + "step": 5828, + "time_per_iteration": 2.7417702674865723 + }, + { + "auxiliary_loss_clip": 0.0142048, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.28329086, + "balance_loss_mlp": 1.03520799, + "epoch": 0.35045843980159325, + "flos": 27821126695320.0, + "grad_norm": 1.7819770691872199, + "language_loss": 0.79579419, + "learning_rate": 3.016033880279248e-06, + "loss": 0.82052612, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.17492676, + "step": 5829, + "time_per_iteration": 2.8367366790771484 + }, + { + "auxiliary_loss_clip": 0.01425452, + "auxiliary_loss_mlp": 0.01044409, + "balance_loss_clip": 1.28517199, + "balance_loss_mlp": 1.02757668, + "epoch": 0.3505185630542612, + "flos": 25926655123080.0, + "grad_norm": 1.9644467899140345, + "language_loss": 0.72979712, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.75449574, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.16821289, + "step": 5830, + "time_per_iteration": 4.366157054901123 + }, + { + "auxiliary_loss_clip": 0.01412071, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.27777457, + "balance_loss_mlp": 1.02599168, + "epoch": 0.35057868630692923, + "flos": 20526632126280.0, + "grad_norm": 1.7885651698850384, + "language_loss": 0.88566256, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.9102031, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.15997314, + "step": 5831, + "time_per_iteration": 4.280826091766357 + }, + { + "auxiliary_loss_clip": 0.01417957, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.27994776, + "balance_loss_mlp": 1.02374291, + "epoch": 0.3506388095595972, + "flos": 20453489823960.0, + "grad_norm": 1.871874071065761, + "language_loss": 0.78968579, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.81425512, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.15209961, + "step": 5832, + "time_per_iteration": 2.8091259002685547 + }, + { + "auxiliary_loss_clip": 0.01420164, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.28052092, + "balance_loss_mlp": 1.0177145, + "epoch": 0.35069893281226516, + "flos": 23114534277960.0, + "grad_norm": 2.088116740951234, + "language_loss": 0.709252, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73379683, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.16589355, + "step": 5833, + "time_per_iteration": 2.847440719604492 + }, + { + "auxiliary_loss_clip": 0.01406243, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.27329707, + "balance_loss_mlp": 1.01545107, + "epoch": 0.35075905606493313, + "flos": 27277797906360.0, + "grad_norm": 1.4869679892958048, + "language_loss": 0.80923396, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83360374, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.15270996, + "step": 5834, + "time_per_iteration": 2.8098747730255127 + }, + { + "auxiliary_loss_clip": 0.01416081, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.27885449, + "balance_loss_mlp": 1.02135181, + "epoch": 0.3508191793176011, + "flos": 19132420854240.0, + "grad_norm": 2.0013621195004174, + "language_loss": 0.83824694, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86278659, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.16540527, + "step": 5835, + "time_per_iteration": 2.8050479888916016 + }, + { + "auxiliary_loss_clip": 0.01408013, + "auxiliary_loss_mlp": 0.01037712, + "balance_loss_clip": 1.27226639, + "balance_loss_mlp": 1.02321625, + "epoch": 0.35087930257026906, + "flos": 25563217679640.0, + "grad_norm": 1.4991587242513873, + "language_loss": 0.77152276, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.79597998, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.14483643, + "step": 5836, + "time_per_iteration": 2.7911157608032227 + }, + { + "auxiliary_loss_clip": 0.01406888, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.27358508, + "balance_loss_mlp": 1.01791608, + "epoch": 0.350939425822937, + "flos": 18008786041560.0, + "grad_norm": 2.0221593672756843, + "language_loss": 0.77764809, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.80204821, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15203857, + "step": 5837, + "time_per_iteration": 2.8060410022735596 + }, + { + "auxiliary_loss_clip": 0.01407387, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.27221429, + "balance_loss_mlp": 1.01850414, + "epoch": 0.350999549075605, + "flos": 22278311412840.0, + "grad_norm": 1.7082274487929106, + "language_loss": 0.67952609, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70393848, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.15344238, + "step": 5838, + "time_per_iteration": 2.7976715564727783 + }, + { + "auxiliary_loss_clip": 0.0140137, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.26607478, + "balance_loss_mlp": 1.01723027, + "epoch": 0.35105967232827295, + "flos": 14396363748720.0, + "grad_norm": 2.1723554835061014, + "language_loss": 0.83661592, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.86096478, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.1628418, + "step": 5839, + "time_per_iteration": 2.7765262126922607 + }, + { + "auxiliary_loss_clip": 0.01409189, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.26799226, + "balance_loss_mlp": 1.01947498, + "epoch": 0.3511197955809409, + "flos": 25087792714560.0, + "grad_norm": 1.6737988317581511, + "language_loss": 0.59508848, + "learning_rate": 3.012341473657572e-06, + "loss": 0.61953866, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.16357422, + "step": 5840, + "time_per_iteration": 2.8500354290008545 + }, + { + "auxiliary_loss_clip": 0.01410379, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.27191126, + "balance_loss_mlp": 1.01804304, + "epoch": 0.3511799188336089, + "flos": 25889596671600.0, + "grad_norm": 2.157529188172638, + "language_loss": 0.87615454, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.90059733, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.15844727, + "step": 5841, + "time_per_iteration": 2.8164377212524414 + }, + { + "auxiliary_loss_clip": 0.01416069, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.27474856, + "balance_loss_mlp": 1.02097678, + "epoch": 0.35124004208627685, + "flos": 20088509262840.0, + "grad_norm": 1.8671263712536024, + "language_loss": 0.75713396, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.78168011, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.17590332, + "step": 5842, + "time_per_iteration": 2.8140316009521484 + }, + { + "auxiliary_loss_clip": 0.01410902, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.27152729, + "balance_loss_mlp": 1.01911092, + "epoch": 0.3513001653389448, + "flos": 17787694191840.0, + "grad_norm": 1.8293013156823592, + "language_loss": 0.6871919, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.71166337, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.17138672, + "step": 5843, + "time_per_iteration": 2.778726577758789 + }, + { + "auxiliary_loss_clip": 0.01404638, + "auxiliary_loss_mlp": 0.0103651, + "balance_loss_clip": 1.26880133, + "balance_loss_mlp": 1.0204165, + "epoch": 0.3513602885916128, + "flos": 29393077069800.0, + "grad_norm": 3.0553178756728863, + "language_loss": 0.66003484, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68444633, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.16088867, + "step": 5844, + "time_per_iteration": 2.8326902389526367 + }, + { + "auxiliary_loss_clip": 0.01407626, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_clip": 1.26951861, + "balance_loss_mlp": 1.02616119, + "epoch": 0.3514204118442808, + "flos": 16184167494480.0, + "grad_norm": 2.060114096746499, + "language_loss": 0.75739908, + "learning_rate": 3.010661570469245e-06, + "loss": 0.78190506, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.16821289, + "step": 5845, + "time_per_iteration": 2.7966055870056152 + }, + { + "auxiliary_loss_clip": 0.01402732, + "auxiliary_loss_mlp": 0.01037648, + "balance_loss_clip": 1.26792598, + "balance_loss_mlp": 1.02125692, + "epoch": 0.35148053509694877, + "flos": 23839013271600.0, + "grad_norm": 3.0985142509485413, + "language_loss": 0.73561049, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.7600143, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.16394043, + "step": 5846, + "time_per_iteration": 2.7459235191345215 + }, + { + "auxiliary_loss_clip": 0.01411272, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.27298462, + "balance_loss_mlp": 1.02069175, + "epoch": 0.35154065834961673, + "flos": 20995600362120.0, + "grad_norm": 1.7027584067545498, + "language_loss": 0.75647336, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78095859, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.16577148, + "step": 5847, + "time_per_iteration": 2.8282628059387207 + }, + { + "auxiliary_loss_clip": 0.01410335, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.27113402, + "balance_loss_mlp": 1.01461864, + "epoch": 0.3516007816022847, + "flos": 33262025329440.0, + "grad_norm": 3.25480147534474, + "language_loss": 0.72512257, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74953258, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.16052246, + "step": 5848, + "time_per_iteration": 2.8808867931365967 + }, + { + "auxiliary_loss_clip": 0.01408866, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.26885223, + "balance_loss_mlp": 1.02568519, + "epoch": 0.35166090485495266, + "flos": 11730446291520.0, + "grad_norm": 2.537395478827271, + "language_loss": 0.90211034, + "learning_rate": 3.009316958003178e-06, + "loss": 0.92662567, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.16967773, + "step": 5849, + "time_per_iteration": 2.7666420936584473 + }, + { + "auxiliary_loss_clip": 0.01401678, + "auxiliary_loss_mlp": 0.01036017, + "balance_loss_clip": 1.26547456, + "balance_loss_mlp": 1.0203644, + "epoch": 0.3517210281076206, + "flos": 22643657449200.0, + "grad_norm": 2.0355077987156873, + "language_loss": 0.75371021, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.77808714, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.15649414, + "step": 5850, + "time_per_iteration": 2.783615827560425 + }, + { + "auxiliary_loss_clip": 0.01398944, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.26418865, + "balance_loss_mlp": 1.01933551, + "epoch": 0.3517811513602886, + "flos": 21327502091040.0, + "grad_norm": 1.4873465515036068, + "language_loss": 0.76120317, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.78554821, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.16223145, + "step": 5851, + "time_per_iteration": 2.7876219749450684 + }, + { + "auxiliary_loss_clip": 0.01406566, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.27038705, + "balance_loss_mlp": 1.01683402, + "epoch": 0.35184127461295656, + "flos": 21037856683680.0, + "grad_norm": 1.9132919894239064, + "language_loss": 0.87678397, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.9011845, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.16638184, + "step": 5852, + "time_per_iteration": 2.7602250576019287 + }, + { + "auxiliary_loss_clip": 0.01392105, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.25763369, + "balance_loss_mlp": 1.01834929, + "epoch": 0.3519013978656245, + "flos": 22460355001440.0, + "grad_norm": 20.02382543347409, + "language_loss": 0.68027556, + "learning_rate": 3.007971733162737e-06, + "loss": 0.7045275, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.1473999, + "step": 5853, + "time_per_iteration": 2.8094773292541504 + }, + { + "auxiliary_loss_clip": 0.01399827, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.26139283, + "balance_loss_mlp": 1.02157331, + "epoch": 0.3519615211182925, + "flos": 13119216043320.0, + "grad_norm": 1.7815639001501298, + "language_loss": 0.81298923, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83736962, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.16625977, + "step": 5854, + "time_per_iteration": 2.7479937076568604 + }, + { + "auxiliary_loss_clip": 0.01392479, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.25845206, + "balance_loss_mlp": 1.01696801, + "epoch": 0.35202164437096045, + "flos": 19139811575760.0, + "grad_norm": 1.517289632875678, + "language_loss": 0.73477316, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75901747, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.14978027, + "step": 5855, + "time_per_iteration": 2.8265819549560547 + }, + { + "auxiliary_loss_clip": 0.0138966, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.2561574, + "balance_loss_mlp": 1.0178647, + "epoch": 0.3520817676236284, + "flos": 26547877392480.0, + "grad_norm": 2.160632451551218, + "language_loss": 0.7132653, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73748696, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.14648438, + "step": 5856, + "time_per_iteration": 2.8638410568237305 + }, + { + "auxiliary_loss_clip": 0.01405903, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.26578188, + "balance_loss_mlp": 1.02247262, + "epoch": 0.3521418908762964, + "flos": 44901421031880.0, + "grad_norm": 1.774928170046542, + "language_loss": 0.61267096, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63712943, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.17480469, + "step": 5857, + "time_per_iteration": 2.988046646118164 + }, + { + "auxiliary_loss_clip": 0.01397421, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.26080179, + "balance_loss_mlp": 1.01947522, + "epoch": 0.3522020141289644, + "flos": 20191441120200.0, + "grad_norm": 2.2844526230869624, + "language_loss": 0.73077691, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75510561, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.15979004, + "step": 5858, + "time_per_iteration": 2.7811124324798584 + }, + { + "auxiliary_loss_clip": 0.01393679, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.25560701, + "balance_loss_mlp": 1.01632416, + "epoch": 0.35226213738163237, + "flos": 27569757990240.0, + "grad_norm": 1.5855763726559475, + "language_loss": 0.76224971, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78651094, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16113281, + "step": 5859, + "time_per_iteration": 4.264425754547119 + }, + { + "auxiliary_loss_clip": 0.01407314, + "auxiliary_loss_mlp": 0.01039587, + "balance_loss_clip": 1.26357198, + "balance_loss_mlp": 1.0223496, + "epoch": 0.35232226063430033, + "flos": 22971498342120.0, + "grad_norm": 2.363962789689223, + "language_loss": 0.72055048, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.7450195, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.17248535, + "step": 5860, + "time_per_iteration": 2.7676138877868652 + }, + { + "auxiliary_loss_clip": 0.01399934, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.25799155, + "balance_loss_mlp": 1.01729584, + "epoch": 0.3523823838869683, + "flos": 19172524932720.0, + "grad_norm": 2.5246285889046396, + "language_loss": 0.67142707, + "learning_rate": 3.005279449623811e-06, + "loss": 0.6957714, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.17211914, + "step": 5861, + "time_per_iteration": 2.7685768604278564 + }, + { + "auxiliary_loss_clip": 0.01388564, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.25293243, + "balance_loss_mlp": 1.01404965, + "epoch": 0.35244250713963626, + "flos": 17935481305800.0, + "grad_norm": 2.248707743940746, + "language_loss": 0.66987664, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.6940583, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.15563965, + "step": 5862, + "time_per_iteration": 2.716538429260254 + }, + { + "auxiliary_loss_clip": 0.01395369, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.25520039, + "balance_loss_mlp": 1.01977658, + "epoch": 0.35250263039230423, + "flos": 21437337369600.0, + "grad_norm": 1.9450455821393455, + "language_loss": 0.77093929, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79526824, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.17749023, + "step": 5863, + "time_per_iteration": 2.8641343116760254 + }, + { + "auxiliary_loss_clip": 0.01389753, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.25103903, + "balance_loss_mlp": 1.01414132, + "epoch": 0.3525627536449722, + "flos": 27422539393320.0, + "grad_norm": 2.2898086309019066, + "language_loss": 0.75339127, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77758849, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.1583252, + "step": 5864, + "time_per_iteration": 4.19609522819519 + }, + { + "auxiliary_loss_clip": 0.01389716, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.25050282, + "balance_loss_mlp": 1.02065635, + "epoch": 0.35262287689764016, + "flos": 24795142288560.0, + "grad_norm": 3.788331355396357, + "language_loss": 0.79332578, + "learning_rate": 3.003932392558793e-06, + "loss": 0.8175841, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.15466309, + "step": 5865, + "time_per_iteration": 2.8128201961517334 + }, + { + "auxiliary_loss_clip": 0.01401748, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.25833535, + "balance_loss_mlp": 1.02258766, + "epoch": 0.3526830001503081, + "flos": 17826092719200.0, + "grad_norm": 4.1510478164140805, + "language_loss": 0.81343704, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83785307, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.17236328, + "step": 5866, + "time_per_iteration": 2.7909255027770996 + }, + { + "auxiliary_loss_clip": 0.01403788, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.25769377, + "balance_loss_mlp": 1.01826453, + "epoch": 0.3527431234029761, + "flos": 18082781119440.0, + "grad_norm": 2.065102792551892, + "language_loss": 0.84700406, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.87140161, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.17700195, + "step": 5867, + "time_per_iteration": 2.812173843383789 + }, + { + "auxiliary_loss_clip": 0.01392883, + "auxiliary_loss_mlp": 0.01042296, + "balance_loss_clip": 1.25166416, + "balance_loss_mlp": 1.02620316, + "epoch": 0.35280324665564405, + "flos": 19432096526520.0, + "grad_norm": 2.007816127600762, + "language_loss": 0.74377924, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76813102, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.16088867, + "step": 5868, + "time_per_iteration": 2.8282041549682617 + }, + { + "auxiliary_loss_clip": 0.01399448, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.25748301, + "balance_loss_mlp": 1.02097023, + "epoch": 0.352863369908312, + "flos": 21508652295720.0, + "grad_norm": 2.06148321330587, + "language_loss": 0.62380773, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.64817798, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.16601562, + "step": 5869, + "time_per_iteration": 4.345341444015503 + }, + { + "auxiliary_loss_clip": 0.01396437, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.25414252, + "balance_loss_mlp": 1.01997066, + "epoch": 0.35292349316098, + "flos": 22314598305480.0, + "grad_norm": 1.9955188350441686, + "language_loss": 0.75033128, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.77466083, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.16552734, + "step": 5870, + "time_per_iteration": 2.7654144763946533 + }, + { + "auxiliary_loss_clip": 0.01391271, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.25136352, + "balance_loss_mlp": 1.01556981, + "epoch": 0.352983616413648, + "flos": 33116309241840.0, + "grad_norm": 1.5672269290297054, + "language_loss": 0.72125965, + "learning_rate": 3.001910665140316e-06, + "loss": 0.74549747, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.16943359, + "step": 5871, + "time_per_iteration": 4.330395698547363 + }, + { + "auxiliary_loss_clip": 0.0138109, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.24529505, + "balance_loss_mlp": 1.01415062, + "epoch": 0.35304373966631597, + "flos": 18701120195280.0, + "grad_norm": 2.4673805974347345, + "language_loss": 0.73311543, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.7572189, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15100098, + "step": 5872, + "time_per_iteration": 2.869532823562622 + }, + { + "auxiliary_loss_clip": 0.01397057, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.25729942, + "balance_loss_mlp": 1.01975763, + "epoch": 0.35310386291898394, + "flos": 23369841993960.0, + "grad_norm": 2.983203032139284, + "language_loss": 0.82378423, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84811234, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.15991211, + "step": 5873, + "time_per_iteration": 2.802253484725952 + }, + { + "auxiliary_loss_clip": 0.01403353, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.25941491, + "balance_loss_mlp": 1.0211221, + "epoch": 0.3531639861716519, + "flos": 24467301395640.0, + "grad_norm": 1.8762484448371024, + "language_loss": 0.66125691, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68567216, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.17053223, + "step": 5874, + "time_per_iteration": 2.836623191833496 + }, + { + "auxiliary_loss_clip": 0.01298112, + "auxiliary_loss_mlp": 0.01010102, + "balance_loss_clip": 1.22415566, + "balance_loss_mlp": 1.00578678, + "epoch": 0.35322410942431987, + "flos": 70326963079200.0, + "grad_norm": 0.8446283055179508, + "language_loss": 0.61579472, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63887686, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.04321289, + "step": 5875, + "time_per_iteration": 3.1957430839538574 + }, + { + "auxiliary_loss_clip": 0.01395536, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.25615144, + "balance_loss_mlp": 1.02713251, + "epoch": 0.35328423267698783, + "flos": 19824673791240.0, + "grad_norm": 2.635148111467391, + "language_loss": 0.8012929, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82567531, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.15576172, + "step": 5876, + "time_per_iteration": 2.7467563152313232 + }, + { + "auxiliary_loss_clip": 0.01280136, + "auxiliary_loss_mlp": 0.01009417, + "balance_loss_clip": 1.20862234, + "balance_loss_mlp": 1.00502968, + "epoch": 0.3533443559296558, + "flos": 60839661341520.0, + "grad_norm": 0.6752238203676996, + "language_loss": 0.56792814, + "learning_rate": 2.999887569990088e-06, + "loss": 0.59082365, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.04394531, + "step": 5877, + "time_per_iteration": 3.3137094974517822 + }, + { + "auxiliary_loss_clip": 0.01401944, + "auxiliary_loss_mlp": 0.01034247, + "balance_loss_clip": 1.26174569, + "balance_loss_mlp": 1.01792693, + "epoch": 0.35340447918232376, + "flos": 24761413722600.0, + "grad_norm": 1.4696391233166846, + "language_loss": 0.72176051, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74612242, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.16320801, + "step": 5878, + "time_per_iteration": 2.8284199237823486 + }, + { + "auxiliary_loss_clip": 0.01396011, + "auxiliary_loss_mlp": 0.01040508, + "balance_loss_clip": 1.2563343, + "balance_loss_mlp": 1.02495074, + "epoch": 0.3534646024349917, + "flos": 21801018463200.0, + "grad_norm": 2.023380480155957, + "language_loss": 0.78766382, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.812029, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.15563965, + "step": 5879, + "time_per_iteration": 2.830026149749756 + }, + { + "auxiliary_loss_clip": 0.0141348, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.26877952, + "balance_loss_mlp": 1.0243175, + "epoch": 0.3535247256876597, + "flos": 20017072511640.0, + "grad_norm": 2.4004517920541937, + "language_loss": 0.63444734, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65900397, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.17871094, + "step": 5880, + "time_per_iteration": 2.824613094329834 + }, + { + "auxiliary_loss_clip": 0.01405161, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.26437569, + "balance_loss_mlp": 1.02563405, + "epoch": 0.35358484894032766, + "flos": 18192657006360.0, + "grad_norm": 3.9528887797036774, + "language_loss": 0.6657176, + "learning_rate": 2.998538081402727e-06, + "loss": 0.69018435, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.15869141, + "step": 5881, + "time_per_iteration": 2.739872455596924 + }, + { + "auxiliary_loss_clip": 0.01391344, + "auxiliary_loss_mlp": 0.01036975, + "balance_loss_clip": 1.25640702, + "balance_loss_mlp": 1.02266383, + "epoch": 0.3536449721929956, + "flos": 22825538604360.0, + "grad_norm": 1.4573169732407192, + "language_loss": 0.75722647, + "learning_rate": 2.998200614562239e-06, + "loss": 0.78150964, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.14312744, + "step": 5882, + "time_per_iteration": 2.7770063877105713 + }, + { + "auxiliary_loss_clip": 0.01400942, + "auxiliary_loss_mlp": 0.01048318, + "balance_loss_clip": 1.26027846, + "balance_loss_mlp": 1.03209376, + "epoch": 0.3537050954456636, + "flos": 26437960897200.0, + "grad_norm": 4.0123300400008075, + "language_loss": 0.70912117, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.73361373, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.16223145, + "step": 5883, + "time_per_iteration": 2.8679349422454834 + }, + { + "auxiliary_loss_clip": 0.01407565, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.26491034, + "balance_loss_mlp": 1.03164685, + "epoch": 0.3537652186983316, + "flos": 17201012655600.0, + "grad_norm": 2.3558026360600675, + "language_loss": 0.78383827, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80838442, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.1541748, + "step": 5884, + "time_per_iteration": 2.7446296215057373 + }, + { + "auxiliary_loss_clip": 0.01401201, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.26412559, + "balance_loss_mlp": 1.03343534, + "epoch": 0.3538253419509996, + "flos": 19541525721480.0, + "grad_norm": 1.990742341402196, + "language_loss": 0.75606292, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.78055644, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.14697266, + "step": 5885, + "time_per_iteration": 2.751105546951294 + }, + { + "auxiliary_loss_clip": 0.01410829, + "auxiliary_loss_mlp": 0.01050393, + "balance_loss_clip": 1.26894665, + "balance_loss_mlp": 1.03432405, + "epoch": 0.35388546520366754, + "flos": 12132160437240.0, + "grad_norm": 2.76756772336265, + "language_loss": 0.83911979, + "learning_rate": 2.996850368809606e-06, + "loss": 0.86373204, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.16064453, + "step": 5886, + "time_per_iteration": 2.7868194580078125 + }, + { + "auxiliary_loss_clip": 0.01400721, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.26496577, + "balance_loss_mlp": 1.02730608, + "epoch": 0.3539455884563355, + "flos": 19682328197520.0, + "grad_norm": 2.3248452578193803, + "language_loss": 0.79323006, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.81767333, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16296387, + "step": 5887, + "time_per_iteration": 2.7990288734436035 + }, + { + "auxiliary_loss_clip": 0.01400754, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.2627902, + "balance_loss_mlp": 1.03160453, + "epoch": 0.35400571170900347, + "flos": 18075918306600.0, + "grad_norm": 5.236651883912283, + "language_loss": 0.65182602, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67630106, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.15148926, + "step": 5888, + "time_per_iteration": 2.799866199493408 + }, + { + "auxiliary_loss_clip": 0.01413532, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_clip": 1.27617037, + "balance_loss_mlp": 1.02824557, + "epoch": 0.35406583496167143, + "flos": 26073711286560.0, + "grad_norm": 1.7195654296799867, + "language_loss": 0.77235126, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79691362, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.14477539, + "step": 5889, + "time_per_iteration": 2.880239486694336 + }, + { + "auxiliary_loss_clip": 0.01412617, + "auxiliary_loss_mlp": 0.01050445, + "balance_loss_clip": 1.2760005, + "balance_loss_mlp": 1.03521013, + "epoch": 0.3541259582143394, + "flos": 19797280129440.0, + "grad_norm": 1.8584088058844053, + "language_loss": 0.81161052, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.83624113, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.15246582, + "step": 5890, + "time_per_iteration": 2.78286075592041 + }, + { + "auxiliary_loss_clip": 0.014075, + "auxiliary_loss_mlp": 0.01039231, + "balance_loss_clip": 1.27055919, + "balance_loss_mlp": 1.02502108, + "epoch": 0.35418608146700736, + "flos": 24027107505840.0, + "grad_norm": 1.7021852514811304, + "language_loss": 0.79460377, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81907105, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.14208984, + "step": 5891, + "time_per_iteration": 2.8992667198181152 + }, + { + "auxiliary_loss_clip": 0.01409381, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.27249908, + "balance_loss_mlp": 1.0315783, + "epoch": 0.35424620471967533, + "flos": 12389620396320.0, + "grad_norm": 1.8275644128866797, + "language_loss": 0.73455697, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75912321, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.15673828, + "step": 5892, + "time_per_iteration": 2.8125765323638916 + }, + { + "auxiliary_loss_clip": 0.0141513, + "auxiliary_loss_mlp": 0.01042447, + "balance_loss_clip": 1.27610767, + "balance_loss_mlp": 1.02627039, + "epoch": 0.3543063279723433, + "flos": 19677211544160.0, + "grad_norm": 2.2053740870989103, + "language_loss": 0.66770613, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69228196, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.16162109, + "step": 5893, + "time_per_iteration": 2.735658645629883 + }, + { + "auxiliary_loss_clip": 0.01410165, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.27202702, + "balance_loss_mlp": 1.02466178, + "epoch": 0.35436645122501126, + "flos": 21914386669080.0, + "grad_norm": 2.4107365431030146, + "language_loss": 0.69934171, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.72384936, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.15966797, + "step": 5894, + "time_per_iteration": 2.8324761390686035 + }, + { + "auxiliary_loss_clip": 0.01407261, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.27150989, + "balance_loss_mlp": 1.02220774, + "epoch": 0.3544265744776792, + "flos": 21723734108160.0, + "grad_norm": 2.9097245196910877, + "language_loss": 0.7463485, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77079213, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.14886475, + "step": 5895, + "time_per_iteration": 2.83943510055542 + }, + { + "auxiliary_loss_clip": 0.01414303, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.27630281, + "balance_loss_mlp": 1.02282226, + "epoch": 0.3544866977303472, + "flos": 21217951071000.0, + "grad_norm": 1.8370688386978944, + "language_loss": 0.8352989, + "learning_rate": 2.993472110174491e-06, + "loss": 0.8598218, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.15161133, + "step": 5896, + "time_per_iteration": 2.750702142715454 + }, + { + "auxiliary_loss_clip": 0.01416941, + "auxiliary_loss_mlp": 0.01046947, + "balance_loss_clip": 1.27942264, + "balance_loss_mlp": 1.03083014, + "epoch": 0.35454682098301515, + "flos": 29316970357200.0, + "grad_norm": 2.2688168412457785, + "language_loss": 0.70114017, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.72577906, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.16125488, + "step": 5897, + "time_per_iteration": 4.234845876693726 + }, + { + "auxiliary_loss_clip": 0.01411897, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.27329445, + "balance_loss_mlp": 1.02443731, + "epoch": 0.3546069442356832, + "flos": 24321991391640.0, + "grad_norm": 3.1136345821478124, + "language_loss": 0.81802356, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84254253, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.15576172, + "step": 5898, + "time_per_iteration": 2.7962546348571777 + }, + { + "auxiliary_loss_clip": 0.01405467, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.27010655, + "balance_loss_mlp": 1.02249765, + "epoch": 0.35466706748835114, + "flos": 22862515839120.0, + "grad_norm": 1.9548132180474058, + "language_loss": 0.74591762, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.77034682, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.14959717, + "step": 5899, + "time_per_iteration": 2.779707193374634 + }, + { + "auxiliary_loss_clip": 0.01416204, + "auxiliary_loss_mlp": 0.01034128, + "balance_loss_clip": 1.27633619, + "balance_loss_mlp": 1.0185349, + "epoch": 0.3547271907410191, + "flos": 28336046613480.0, + "grad_norm": 1.792576656312032, + "language_loss": 0.79804993, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.82255328, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.15606689, + "step": 5900, + "time_per_iteration": 2.792363166809082 + }, + { + "auxiliary_loss_clip": 0.0141137, + "auxiliary_loss_mlp": 0.01039074, + "balance_loss_clip": 1.27278519, + "balance_loss_mlp": 1.02292728, + "epoch": 0.35478731399368707, + "flos": 23519415875760.0, + "grad_norm": 2.0242652038447355, + "language_loss": 0.81812048, + "learning_rate": 2.991781567335093e-06, + "loss": 0.8426249, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16143799, + "step": 5901, + "time_per_iteration": 2.80485200881958 + }, + { + "auxiliary_loss_clip": 0.01420469, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.27865934, + "balance_loss_mlp": 1.01972699, + "epoch": 0.35484743724635504, + "flos": 18628911885240.0, + "grad_norm": 1.8652351201994093, + "language_loss": 0.76157963, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.78614616, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.16442871, + "step": 5902, + "time_per_iteration": 2.7425377368927 + }, + { + "auxiliary_loss_clip": 0.01408753, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.27066803, + "balance_loss_mlp": 1.01884866, + "epoch": 0.354907560499023, + "flos": 17389025673120.0, + "grad_norm": 4.429922456757452, + "language_loss": 0.70727623, + "learning_rate": 2.991105086850381e-06, + "loss": 0.73170537, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.1529541, + "step": 5903, + "time_per_iteration": 4.202639818191528 + }, + { + "auxiliary_loss_clip": 0.01415162, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.27245164, + "balance_loss_mlp": 1.01595521, + "epoch": 0.35496768375169097, + "flos": 19213278744960.0, + "grad_norm": 7.790684744894402, + "language_loss": 0.74806833, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77254122, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.16168213, + "step": 5904, + "time_per_iteration": 2.804037570953369 + }, + { + "auxiliary_loss_clip": 0.0141787, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.27538276, + "balance_loss_mlp": 1.02436566, + "epoch": 0.35502780700435893, + "flos": 18337398493320.0, + "grad_norm": 2.0139414565654734, + "language_loss": 0.78771037, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81229299, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.16027832, + "step": 5905, + "time_per_iteration": 2.713047504425049 + }, + { + "auxiliary_loss_clip": 0.01399613, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.26792908, + "balance_loss_mlp": 1.02232003, + "epoch": 0.3550879302570269, + "flos": 15452541429480.0, + "grad_norm": 2.1560189605382307, + "language_loss": 0.72773969, + "learning_rate": 2.990090084284356e-06, + "loss": 0.75209427, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.13531494, + "step": 5906, + "time_per_iteration": 2.7550618648529053 + }, + { + "auxiliary_loss_clip": 0.01416285, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.27302396, + "balance_loss_mlp": 1.01908374, + "epoch": 0.35514805350969486, + "flos": 21983996044080.0, + "grad_norm": 2.1973005144576208, + "language_loss": 0.75128973, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.77580947, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.1661377, + "step": 5907, + "time_per_iteration": 2.7591593265533447 + }, + { + "auxiliary_loss_clip": 0.01411237, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.27199697, + "balance_loss_mlp": 1.02135086, + "epoch": 0.3552081767623628, + "flos": 29868014734560.0, + "grad_norm": 2.4720898528402815, + "language_loss": 0.76165575, + "learning_rate": 2.989413228164047e-06, + "loss": 0.78613746, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.15588379, + "step": 5908, + "time_per_iteration": 4.361016273498535 + }, + { + "auxiliary_loss_clip": 0.01410493, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.27082205, + "balance_loss_mlp": 1.02102876, + "epoch": 0.3552683000150308, + "flos": 26437879680480.0, + "grad_norm": 2.054603230298074, + "language_loss": 0.68587714, + "learning_rate": 2.989074743819502e-06, + "loss": 0.71034443, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.15197754, + "step": 5909, + "time_per_iteration": 2.867171049118042 + }, + { + "auxiliary_loss_clip": 0.01401394, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.26805568, + "balance_loss_mlp": 1.01802623, + "epoch": 0.35532842326769876, + "flos": 19789970624640.0, + "grad_norm": 1.7290081864479705, + "language_loss": 0.78248501, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80682349, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.14416504, + "step": 5910, + "time_per_iteration": 2.7808454036712646 + }, + { + "auxiliary_loss_clip": 0.01417872, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.27511144, + "balance_loss_mlp": 1.01722574, + "epoch": 0.3553885465203668, + "flos": 17243878102560.0, + "grad_norm": 1.7805143523902982, + "language_loss": 0.71158326, + "learning_rate": 2.98839766262581e-06, + "loss": 0.73609835, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.16412354, + "step": 5911, + "time_per_iteration": 2.784252166748047 + }, + { + "auxiliary_loss_clip": 0.01404162, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.26608622, + "balance_loss_mlp": 1.02127671, + "epoch": 0.35544866977303474, + "flos": 14937824553120.0, + "grad_norm": 2.439668804143765, + "language_loss": 0.86779958, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89220768, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.15368652, + "step": 5912, + "time_per_iteration": 2.7318129539489746 + }, + { + "auxiliary_loss_clip": 0.01406412, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.26833546, + "balance_loss_mlp": 1.02126062, + "epoch": 0.3555087930257027, + "flos": 19760871411720.0, + "grad_norm": 8.772357247967102, + "language_loss": 0.77285355, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79727495, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.14465332, + "step": 5913, + "time_per_iteration": 2.770038366317749 + }, + { + "auxiliary_loss_clip": 0.01408785, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.27288198, + "balance_loss_mlp": 1.02151489, + "epoch": 0.3555689162783707, + "flos": 21073087758960.0, + "grad_norm": 1.372649784544044, + "language_loss": 0.82925063, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.85370457, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.15112305, + "step": 5914, + "time_per_iteration": 2.817852258682251 + }, + { + "auxiliary_loss_clip": 0.01410069, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.27162361, + "balance_loss_mlp": 1.01989818, + "epoch": 0.35562903953103864, + "flos": 33075717863040.0, + "grad_norm": 2.9864287893195374, + "language_loss": 0.70857978, + "learning_rate": 2.98704305057949e-06, + "loss": 0.73304403, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16467285, + "step": 5915, + "time_per_iteration": 2.8870623111724854 + }, + { + "auxiliary_loss_clip": 0.01407499, + "auxiliary_loss_mlp": 0.01037361, + "balance_loss_clip": 1.26825261, + "balance_loss_mlp": 1.02254343, + "epoch": 0.3556891627837066, + "flos": 20562675368760.0, + "grad_norm": 1.6825250271197532, + "language_loss": 0.76614994, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.79059851, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.14819336, + "step": 5916, + "time_per_iteration": 2.779937505722046 + }, + { + "auxiliary_loss_clip": 0.01411063, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.27261376, + "balance_loss_mlp": 1.01716483, + "epoch": 0.35574928603637457, + "flos": 20708107197840.0, + "grad_norm": 1.7327176140901908, + "language_loss": 0.88363171, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90806472, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.1506958, + "step": 5917, + "time_per_iteration": 2.731405019760132 + }, + { + "auxiliary_loss_clip": 0.01409541, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.27148342, + "balance_loss_mlp": 1.01419806, + "epoch": 0.35580940928904253, + "flos": 15198736222800.0, + "grad_norm": 2.9806669361337033, + "language_loss": 0.75321823, + "learning_rate": 2.98602669849771e-06, + "loss": 0.77760291, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.14715576, + "step": 5918, + "time_per_iteration": 2.7720835208892822 + }, + { + "auxiliary_loss_clip": 0.01303762, + "auxiliary_loss_mlp": 0.01061758, + "balance_loss_clip": 1.23428988, + "balance_loss_mlp": 1.05846834, + "epoch": 0.3558695325417105, + "flos": 58652417518200.0, + "grad_norm": 0.9621635486320183, + "language_loss": 0.63970506, + "learning_rate": 2.985687839672857e-06, + "loss": 0.6633603, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.03295898, + "step": 5919, + "time_per_iteration": 3.011960506439209 + }, + { + "auxiliary_loss_clip": 0.0141657, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.27380037, + "balance_loss_mlp": 1.01914227, + "epoch": 0.35592965579437846, + "flos": 22023369172080.0, + "grad_norm": 2.1436027495564915, + "language_loss": 0.74266636, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76717323, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.14978027, + "step": 5920, + "time_per_iteration": 2.7600553035736084 + }, + { + "auxiliary_loss_clip": 0.0141748, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.27887142, + "balance_loss_mlp": 1.02166188, + "epoch": 0.35598977904704643, + "flos": 23372928229320.0, + "grad_norm": 1.8130209479698176, + "language_loss": 0.77344728, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79798979, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.15093994, + "step": 5921, + "time_per_iteration": 2.742852210998535 + }, + { + "auxiliary_loss_clip": 0.01411822, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.27194524, + "balance_loss_mlp": 1.01831436, + "epoch": 0.3560499022997144, + "flos": 17789846434920.0, + "grad_norm": 2.1525758924590983, + "language_loss": 0.67946947, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.70392168, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.15075684, + "step": 5922, + "time_per_iteration": 2.7538421154022217 + }, + { + "auxiliary_loss_clip": 0.01416945, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.27776289, + "balance_loss_mlp": 1.01956034, + "epoch": 0.35611002555238236, + "flos": 20745328082760.0, + "grad_norm": 2.1499982442626777, + "language_loss": 0.79453468, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81904626, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.14660645, + "step": 5923, + "time_per_iteration": 2.771759510040283 + }, + { + "auxiliary_loss_clip": 0.01415452, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.27601993, + "balance_loss_mlp": 1.01796174, + "epoch": 0.3561701488050504, + "flos": 19466759084760.0, + "grad_norm": 2.366728500435229, + "language_loss": 0.85365999, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87814856, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.15429688, + "step": 5924, + "time_per_iteration": 2.75945782661438 + }, + { + "auxiliary_loss_clip": 0.01409215, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.27159286, + "balance_loss_mlp": 1.02238464, + "epoch": 0.35623027205771834, + "flos": 30781237696200.0, + "grad_norm": 2.1426928317569156, + "language_loss": 0.7829237, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.8073988, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.15893555, + "step": 5925, + "time_per_iteration": 2.8932852745056152 + }, + { + "auxiliary_loss_clip": 0.01411968, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.27176225, + "balance_loss_mlp": 1.02137327, + "epoch": 0.3562903953103863, + "flos": 16985281109400.0, + "grad_norm": 3.5080332178346736, + "language_loss": 0.76467514, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78916478, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.15643311, + "step": 5926, + "time_per_iteration": 2.7145752906799316 + }, + { + "auxiliary_loss_clip": 0.01413668, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.26902568, + "balance_loss_mlp": 1.023579, + "epoch": 0.3563505185630543, + "flos": 23844860875440.0, + "grad_norm": 2.1574676764534146, + "language_loss": 0.70377779, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.72831535, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.16516113, + "step": 5927, + "time_per_iteration": 2.8191287517547607 + }, + { + "auxiliary_loss_clip": 0.01404069, + "auxiliary_loss_mlp": 0.01042069, + "balance_loss_clip": 1.26651752, + "balance_loss_mlp": 1.02780533, + "epoch": 0.35641064181572224, + "flos": 22278595671360.0, + "grad_norm": 3.391328642434926, + "language_loss": 0.7984823, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.82294369, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.14251709, + "step": 5928, + "time_per_iteration": 2.794201612472534 + }, + { + "auxiliary_loss_clip": 0.01406904, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.26784277, + "balance_loss_mlp": 1.02318859, + "epoch": 0.3564707650683902, + "flos": 23006242117080.0, + "grad_norm": 1.4452323841709964, + "language_loss": 0.81756592, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84201729, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.15032959, + "step": 5929, + "time_per_iteration": 2.815084457397461 + }, + { + "auxiliary_loss_clip": 0.01397964, + "auxiliary_loss_mlp": 0.010353, + "balance_loss_clip": 1.26263642, + "balance_loss_mlp": 1.02139437, + "epoch": 0.35653088832105817, + "flos": 14688405049320.0, + "grad_norm": 1.6679602339258062, + "language_loss": 0.7098608, + "learning_rate": 2.981957928520201e-06, + "loss": 0.73419344, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.13916016, + "step": 5930, + "time_per_iteration": 2.7196273803710938 + }, + { + "auxiliary_loss_clip": 0.01413373, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.27202249, + "balance_loss_mlp": 1.03504419, + "epoch": 0.35659101157372614, + "flos": 23482316815920.0, + "grad_norm": 2.7102524057123873, + "language_loss": 0.68431652, + "learning_rate": 2.981618622015244e-06, + "loss": 0.70896709, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.16638184, + "step": 5931, + "time_per_iteration": 2.810537338256836 + }, + { + "auxiliary_loss_clip": 0.01401058, + "auxiliary_loss_mlp": 0.0104342, + "balance_loss_clip": 1.26492763, + "balance_loss_mlp": 1.02875757, + "epoch": 0.3566511348263941, + "flos": 26584367326920.0, + "grad_norm": 1.8066429013802714, + "language_loss": 0.68079674, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70524156, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.14660645, + "step": 5932, + "time_per_iteration": 2.8333194255828857 + }, + { + "auxiliary_loss_clip": 0.01397459, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.26306462, + "balance_loss_mlp": 1.02209437, + "epoch": 0.35671125807906207, + "flos": 13118891176440.0, + "grad_norm": 2.3851858097783794, + "language_loss": 0.79947418, + "learning_rate": 2.980939897348969e-06, + "loss": 0.82381642, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.14685059, + "step": 5933, + "time_per_iteration": 2.724439859390259 + }, + { + "auxiliary_loss_clip": 0.01407129, + "auxiliary_loss_mlp": 0.01050734, + "balance_loss_clip": 1.26665282, + "balance_loss_mlp": 1.03570199, + "epoch": 0.35677138133173003, + "flos": 33007164305400.0, + "grad_norm": 1.4421507368661255, + "language_loss": 0.69914162, + "learning_rate": 2.980600479213388e-06, + "loss": 0.72372019, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.15026855, + "step": 5934, + "time_per_iteration": 2.9393138885498047 + }, + { + "auxiliary_loss_clip": 0.01420101, + "auxiliary_loss_mlp": 0.01046166, + "balance_loss_clip": 1.27511215, + "balance_loss_mlp": 1.02936983, + "epoch": 0.356831504584398, + "flos": 20782833226200.0, + "grad_norm": 1.737550287858709, + "language_loss": 0.71467817, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73934078, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.16784668, + "step": 5935, + "time_per_iteration": 2.8683876991271973 + }, + { + "auxiliary_loss_clip": 0.0140611, + "auxiliary_loss_mlp": 0.01039261, + "balance_loss_clip": 1.26745498, + "balance_loss_mlp": 1.02315569, + "epoch": 0.35689162783706596, + "flos": 12169218888720.0, + "grad_norm": 2.355687757410572, + "language_loss": 0.79087174, + "learning_rate": 2.979921531401692e-06, + "loss": 0.81532544, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16088867, + "step": 5936, + "time_per_iteration": 4.181439399719238 + }, + { + "auxiliary_loss_clip": 0.01402701, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_clip": 1.26551938, + "balance_loss_mlp": 1.02651024, + "epoch": 0.356951751089734, + "flos": 23846728860000.0, + "grad_norm": 1.5037187286210452, + "language_loss": 0.6495223, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.67396712, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.15270996, + "step": 5937, + "time_per_iteration": 2.753164291381836 + }, + { + "auxiliary_loss_clip": 0.0140717, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.26839805, + "balance_loss_mlp": 1.02346087, + "epoch": 0.35701187434240195, + "flos": 11725045379640.0, + "grad_norm": 2.3976230865999457, + "language_loss": 0.78750277, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.81196362, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.15472412, + "step": 5938, + "time_per_iteration": 2.678982734680176 + }, + { + "auxiliary_loss_clip": 0.01404998, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.26818514, + "balance_loss_mlp": 1.02885604, + "epoch": 0.3570719975950699, + "flos": 24904165399920.0, + "grad_norm": 2.221997630214585, + "language_loss": 0.80320603, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.8276931, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.14868164, + "step": 5939, + "time_per_iteration": 2.774052381515503 + }, + { + "auxiliary_loss_clip": 0.01414582, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.26933157, + "balance_loss_mlp": 1.02168703, + "epoch": 0.3571321208477379, + "flos": 26000447159160.0, + "grad_norm": 1.8552646330260802, + "language_loss": 0.79405391, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81857818, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.16174316, + "step": 5940, + "time_per_iteration": 2.7878048419952393 + }, + { + "auxiliary_loss_clip": 0.01405396, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.26630259, + "balance_loss_mlp": 1.01946163, + "epoch": 0.35719224410040584, + "flos": 14505752335320.0, + "grad_norm": 2.037909887937856, + "language_loss": 0.72328246, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74768746, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.15643311, + "step": 5941, + "time_per_iteration": 2.7520272731781006 + }, + { + "auxiliary_loss_clip": 0.01407979, + "auxiliary_loss_mlp": 0.01040465, + "balance_loss_clip": 1.27041769, + "balance_loss_mlp": 1.02462816, + "epoch": 0.3572523673530738, + "flos": 31181692982760.0, + "grad_norm": 2.459078162685734, + "language_loss": 0.64337355, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.667858, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15826416, + "step": 5942, + "time_per_iteration": 4.314575433731079 + }, + { + "auxiliary_loss_clip": 0.01406573, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.26952314, + "balance_loss_mlp": 1.01823986, + "epoch": 0.3573124906057418, + "flos": 15856285993200.0, + "grad_norm": 6.272974545624549, + "language_loss": 0.73637384, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76077616, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.15423584, + "step": 5943, + "time_per_iteration": 2.7082839012145996 + }, + { + "auxiliary_loss_clip": 0.0129822, + "auxiliary_loss_mlp": 0.01047173, + "balance_loss_clip": 1.23307967, + "balance_loss_mlp": 1.04393053, + "epoch": 0.35737261385840974, + "flos": 60835559897160.0, + "grad_norm": 0.8137377950015441, + "language_loss": 0.60769683, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.63115072, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.0324707, + "step": 5944, + "time_per_iteration": 3.3364827632904053 + }, + { + "auxiliary_loss_clip": 0.01401014, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.26511383, + "balance_loss_mlp": 1.01716256, + "epoch": 0.3574327371110777, + "flos": 18848460617280.0, + "grad_norm": 1.766319977629576, + "language_loss": 0.7297039, + "learning_rate": 2.976864428379655e-06, + "loss": 0.75404263, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.15698242, + "step": 5945, + "time_per_iteration": 2.7671196460723877 + }, + { + "auxiliary_loss_clip": 0.01397592, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.26187301, + "balance_loss_mlp": 1.01713586, + "epoch": 0.35749286036374567, + "flos": 23554890601200.0, + "grad_norm": 1.5801549339113916, + "language_loss": 0.81267929, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83698523, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15856934, + "step": 5946, + "time_per_iteration": 2.898817300796509 + }, + { + "auxiliary_loss_clip": 0.01409838, + "auxiliary_loss_mlp": 0.01037515, + "balance_loss_clip": 1.27253008, + "balance_loss_mlp": 1.02111173, + "epoch": 0.35755298361641363, + "flos": 21110430468960.0, + "grad_norm": 1.9258105827226355, + "language_loss": 0.69218135, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.7166549, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.16394043, + "step": 5947, + "time_per_iteration": 5.713286399841309 + }, + { + "auxiliary_loss_clip": 0.01395787, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.26280367, + "balance_loss_mlp": 1.01541615, + "epoch": 0.3576131068690816, + "flos": 19249768679400.0, + "grad_norm": 1.7804345597980102, + "language_loss": 0.7574259, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.78169376, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.15576172, + "step": 5948, + "time_per_iteration": 2.759399175643921 + }, + { + "auxiliary_loss_clip": 0.01401462, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.26487684, + "balance_loss_mlp": 1.02161801, + "epoch": 0.35767323012174956, + "flos": 28660192145640.0, + "grad_norm": 1.8091377071547639, + "language_loss": 0.70860916, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.73300016, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.16027832, + "step": 5949, + "time_per_iteration": 2.814056634902954 + }, + { + "auxiliary_loss_clip": 0.01403325, + "auxiliary_loss_mlp": 0.01037333, + "balance_loss_clip": 1.26691389, + "balance_loss_mlp": 1.02011967, + "epoch": 0.35773335337441753, + "flos": 17088903308880.0, + "grad_norm": 2.1029504933037226, + "language_loss": 0.77868629, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.8030929, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.17236328, + "step": 5950, + "time_per_iteration": 2.8298721313476562 + }, + { + "auxiliary_loss_clip": 0.01408673, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.27131701, + "balance_loss_mlp": 1.01924515, + "epoch": 0.35779347662708555, + "flos": 15892938361080.0, + "grad_norm": 1.9210432361506158, + "language_loss": 0.73169446, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.75614059, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.16687012, + "step": 5951, + "time_per_iteration": 2.80130934715271 + }, + { + "auxiliary_loss_clip": 0.01412442, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.27183533, + "balance_loss_mlp": 1.02035594, + "epoch": 0.3578535998797535, + "flos": 28664862107040.0, + "grad_norm": 3.781706627921848, + "language_loss": 0.70292085, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.7274043, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.15539551, + "step": 5952, + "time_per_iteration": 2.8140432834625244 + }, + { + "auxiliary_loss_clip": 0.0140019, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.2655189, + "balance_loss_mlp": 1.02152967, + "epoch": 0.3579137231324215, + "flos": 37859554027080.0, + "grad_norm": 1.9203633180564923, + "language_loss": 0.70502758, + "learning_rate": 2.974144484269449e-06, + "loss": 0.7293973, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.15258789, + "step": 5953, + "time_per_iteration": 2.983337163925171 + }, + { + "auxiliary_loss_clip": 0.01401194, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.2668463, + "balance_loss_mlp": 1.01790321, + "epoch": 0.35797384638508944, + "flos": 22352062840560.0, + "grad_norm": 1.647170343786839, + "language_loss": 0.66914314, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.69348371, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.14978027, + "step": 5954, + "time_per_iteration": 2.8177947998046875 + }, + { + "auxiliary_loss_clip": 0.01401356, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_clip": 1.26898909, + "balance_loss_mlp": 1.0269978, + "epoch": 0.3580339696377574, + "flos": 13593707016120.0, + "grad_norm": 1.6513200383785842, + "language_loss": 0.75004315, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.77447927, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15258789, + "step": 5955, + "time_per_iteration": 2.7450735569000244 + }, + { + "auxiliary_loss_clip": 0.01400254, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.26855731, + "balance_loss_mlp": 1.02016115, + "epoch": 0.3580940928904254, + "flos": 23773424124240.0, + "grad_norm": 2.7081806675580413, + "language_loss": 0.76653707, + "learning_rate": 2.973123895369182e-06, + "loss": 0.79088247, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.14135742, + "step": 5956, + "time_per_iteration": 2.791069984436035 + }, + { + "auxiliary_loss_clip": 0.01397802, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.26772928, + "balance_loss_mlp": 1.02317464, + "epoch": 0.35815421614309334, + "flos": 19468627069320.0, + "grad_norm": 1.8299107782635213, + "language_loss": 0.72858369, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75293815, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.14477539, + "step": 5957, + "time_per_iteration": 2.751323699951172 + }, + { + "auxiliary_loss_clip": 0.01411723, + "auxiliary_loss_mlp": 0.01044311, + "balance_loss_clip": 1.27849555, + "balance_loss_mlp": 1.02960062, + "epoch": 0.3582143393957613, + "flos": 23373496746360.0, + "grad_norm": 1.928888389276484, + "language_loss": 0.71317989, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73774016, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.14709473, + "step": 5958, + "time_per_iteration": 2.7667577266693115 + }, + { + "auxiliary_loss_clip": 0.0140202, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.26930356, + "balance_loss_mlp": 1.02026248, + "epoch": 0.35827446264842927, + "flos": 26328937785840.0, + "grad_norm": 1.647086197258698, + "language_loss": 0.88516909, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90953255, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.14074707, + "step": 5959, + "time_per_iteration": 2.7761852741241455 + }, + { + "auxiliary_loss_clip": 0.01409321, + "auxiliary_loss_mlp": 0.01047346, + "balance_loss_clip": 1.27613568, + "balance_loss_mlp": 1.0322839, + "epoch": 0.35833458590109724, + "flos": 30452868894600.0, + "grad_norm": 1.5238678713919145, + "language_loss": 0.58214629, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60671294, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15045166, + "step": 5960, + "time_per_iteration": 2.849061965942383 + }, + { + "auxiliary_loss_clip": 0.01413368, + "auxiliary_loss_mlp": 0.01047194, + "balance_loss_clip": 1.27725911, + "balance_loss_mlp": 1.03037357, + "epoch": 0.3583947091537652, + "flos": 14833999311840.0, + "grad_norm": 2.65036886007815, + "language_loss": 0.76424271, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78884834, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.16833496, + "step": 5961, + "time_per_iteration": 2.702617645263672 + }, + { + "auxiliary_loss_clip": 0.01416192, + "auxiliary_loss_mlp": 0.01047911, + "balance_loss_clip": 1.28106499, + "balance_loss_mlp": 1.03199625, + "epoch": 0.35845483240643317, + "flos": 34247334776040.0, + "grad_norm": 1.6460113939351209, + "language_loss": 0.70464158, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72928262, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15917969, + "step": 5962, + "time_per_iteration": 2.93725848197937 + }, + { + "auxiliary_loss_clip": 0.01412779, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_clip": 1.27968061, + "balance_loss_mlp": 1.03612924, + "epoch": 0.35851495565910113, + "flos": 20965079856600.0, + "grad_norm": 1.9263096549087, + "language_loss": 0.74802196, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.77265495, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.14373779, + "step": 5963, + "time_per_iteration": 2.788940668106079 + }, + { + "auxiliary_loss_clip": 0.01415459, + "auxiliary_loss_mlp": 0.01051406, + "balance_loss_clip": 1.28342462, + "balance_loss_mlp": 1.03627813, + "epoch": 0.35857507891176915, + "flos": 22315004389080.0, + "grad_norm": 1.6062503115405187, + "language_loss": 0.7869212, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.81158984, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.15112305, + "step": 5964, + "time_per_iteration": 2.8045573234558105 + }, + { + "auxiliary_loss_clip": 0.01415577, + "auxiliary_loss_mlp": 0.0105104, + "balance_loss_clip": 1.27825534, + "balance_loss_mlp": 1.03557849, + "epoch": 0.3586352021644371, + "flos": 23373131271120.0, + "grad_norm": 2.2961232437969237, + "language_loss": 0.66504538, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68971157, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15466309, + "step": 5965, + "time_per_iteration": 2.747271776199341 + }, + { + "auxiliary_loss_clip": 0.01414388, + "auxiliary_loss_mlp": 0.01050547, + "balance_loss_clip": 1.28069961, + "balance_loss_mlp": 1.03487051, + "epoch": 0.3586953254171051, + "flos": 27854246135880.0, + "grad_norm": 1.506137445797419, + "language_loss": 0.79446268, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81911194, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15686035, + "step": 5966, + "time_per_iteration": 2.834279775619507 + }, + { + "auxiliary_loss_clip": 0.0142118, + "auxiliary_loss_mlp": 0.01053504, + "balance_loss_clip": 1.28615212, + "balance_loss_mlp": 1.03766084, + "epoch": 0.35875544866977305, + "flos": 19505279437200.0, + "grad_norm": 2.0875654412344575, + "language_loss": 0.90804338, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93279022, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.15844727, + "step": 5967, + "time_per_iteration": 2.75486421585083 + }, + { + "auxiliary_loss_clip": 0.01423555, + "auxiliary_loss_mlp": 0.01056609, + "balance_loss_clip": 1.28634596, + "balance_loss_mlp": 1.04039693, + "epoch": 0.358815571922441, + "flos": 21476223197280.0, + "grad_norm": 1.6565137106090924, + "language_loss": 0.80259383, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82739556, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.1619873, + "step": 5968, + "time_per_iteration": 2.748225450515747 + }, + { + "auxiliary_loss_clip": 0.0142154, + "auxiliary_loss_mlp": 0.01062715, + "balance_loss_clip": 1.28366065, + "balance_loss_mlp": 1.04650307, + "epoch": 0.358875695175109, + "flos": 21840472807920.0, + "grad_norm": 2.036260093575732, + "language_loss": 0.84457642, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86941898, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.16210938, + "step": 5969, + "time_per_iteration": 2.7564961910247803 + }, + { + "auxiliary_loss_clip": 0.01414496, + "auxiliary_loss_mlp": 0.01054977, + "balance_loss_clip": 1.28168035, + "balance_loss_mlp": 1.04064226, + "epoch": 0.35893581842777694, + "flos": 32017590981000.0, + "grad_norm": 1.8698263512124147, + "language_loss": 0.72128129, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74597609, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.14337158, + "step": 5970, + "time_per_iteration": 2.841315507888794 + }, + { + "auxiliary_loss_clip": 0.01419638, + "auxiliary_loss_mlp": 0.01056955, + "balance_loss_clip": 1.28651881, + "balance_loss_mlp": 1.04291821, + "epoch": 0.3589959416804449, + "flos": 20490751317240.0, + "grad_norm": 1.7237095801201492, + "language_loss": 0.8030448, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.8278107, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.14044189, + "step": 5971, + "time_per_iteration": 2.75866436958313 + }, + { + "auxiliary_loss_clip": 0.01425101, + "auxiliary_loss_mlp": 0.01055563, + "balance_loss_clip": 1.28572917, + "balance_loss_mlp": 1.03982782, + "epoch": 0.3590560649331129, + "flos": 16185060878400.0, + "grad_norm": 1.8281452246049597, + "language_loss": 0.78615034, + "learning_rate": 2.967675154124696e-06, + "loss": 0.81095695, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.15722656, + "step": 5972, + "time_per_iteration": 2.7369518280029297 + }, + { + "auxiliary_loss_clip": 0.01423215, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_clip": 1.28720653, + "balance_loss_mlp": 1.03227782, + "epoch": 0.35911618818578084, + "flos": 20380185088200.0, + "grad_norm": 2.054298593923788, + "language_loss": 0.81875682, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.84345794, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.14611816, + "step": 5973, + "time_per_iteration": 2.7917256355285645 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_clip": 1.23286843, + "balance_loss_mlp": 1.04786515, + "epoch": 0.3591763114384488, + "flos": 41247855824400.0, + "grad_norm": 0.9341734259126829, + "language_loss": 0.56800455, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.59147185, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.03369141, + "step": 5974, + "time_per_iteration": 3.137005567550659 + }, + { + "auxiliary_loss_clip": 0.01416971, + "auxiliary_loss_mlp": 0.01046276, + "balance_loss_clip": 1.28240967, + "balance_loss_mlp": 1.03257251, + "epoch": 0.35923643469111677, + "flos": 18699820727760.0, + "grad_norm": 1.7494705655174014, + "language_loss": 0.69493514, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7195676, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.13702393, + "step": 5975, + "time_per_iteration": 2.8037970066070557 + }, + { + "auxiliary_loss_clip": 0.01419993, + "auxiliary_loss_mlp": 0.01040508, + "balance_loss_clip": 1.28585279, + "balance_loss_mlp": 1.02603006, + "epoch": 0.35929655794378473, + "flos": 25015462579440.0, + "grad_norm": 1.9624432233711202, + "language_loss": 0.79984456, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82444954, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.14483643, + "step": 5976, + "time_per_iteration": 4.253443479537964 + }, + { + "auxiliary_loss_clip": 0.01417435, + "auxiliary_loss_mlp": 0.01045339, + "balance_loss_clip": 1.28402841, + "balance_loss_mlp": 1.03010964, + "epoch": 0.35935668119645275, + "flos": 14979674791080.0, + "grad_norm": 1.7913311649859875, + "language_loss": 0.78711659, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81174433, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15246582, + "step": 5977, + "time_per_iteration": 2.7534477710723877 + }, + { + "auxiliary_loss_clip": 0.01414466, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.28120327, + "balance_loss_mlp": 1.0259968, + "epoch": 0.3594168044491207, + "flos": 21183044862600.0, + "grad_norm": 1.808628774803596, + "language_loss": 0.80715293, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.83170086, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.14318848, + "step": 5978, + "time_per_iteration": 2.7574219703674316 + }, + { + "auxiliary_loss_clip": 0.01417554, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.28225327, + "balance_loss_mlp": 1.01991701, + "epoch": 0.3594769277017887, + "flos": 27677847109320.0, + "grad_norm": 1.6413041100499572, + "language_loss": 0.67789257, + "learning_rate": 2.965288372816436e-06, + "loss": 0.70241296, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.14575195, + "step": 5979, + "time_per_iteration": 2.8522632122039795 + }, + { + "auxiliary_loss_clip": 0.01419296, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.28585863, + "balance_loss_mlp": 1.02313876, + "epoch": 0.35953705095445665, + "flos": 23007460367880.0, + "grad_norm": 1.9697492183491818, + "language_loss": 0.6725418, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69712061, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15441895, + "step": 5980, + "time_per_iteration": 4.245231628417969 + }, + { + "auxiliary_loss_clip": 0.01428477, + "auxiliary_loss_mlp": 0.01045179, + "balance_loss_clip": 1.28861356, + "balance_loss_mlp": 1.0285728, + "epoch": 0.3595971742071246, + "flos": 25518687289920.0, + "grad_norm": 1.875078825455634, + "language_loss": 0.713691, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73842758, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.16601562, + "step": 5981, + "time_per_iteration": 2.772460699081421 + }, + { + "auxiliary_loss_clip": 0.0142517, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_clip": 1.28779387, + "balance_loss_mlp": 1.02503967, + "epoch": 0.3596572974597926, + "flos": 29868705076680.0, + "grad_norm": 1.7260433441113656, + "language_loss": 0.71300554, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73766953, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1619873, + "step": 5982, + "time_per_iteration": 2.8084046840667725 + }, + { + "auxiliary_loss_clip": 0.01409467, + "auxiliary_loss_mlp": 0.01042891, + "balance_loss_clip": 1.27976775, + "balance_loss_mlp": 1.02785873, + "epoch": 0.35971742071246054, + "flos": 23117782946760.0, + "grad_norm": 1.7692664915148062, + "language_loss": 0.75954914, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78407276, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15039062, + "step": 5983, + "time_per_iteration": 2.7980339527130127 + }, + { + "auxiliary_loss_clip": 0.01426204, + "auxiliary_loss_mlp": 0.01048703, + "balance_loss_clip": 1.28806734, + "balance_loss_mlp": 1.03175139, + "epoch": 0.3597775439651285, + "flos": 16729486093080.0, + "grad_norm": 1.7977047203289684, + "language_loss": 0.76471013, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78945923, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.16943359, + "step": 5984, + "time_per_iteration": 2.7417008876800537 + }, + { + "auxiliary_loss_clip": 0.01411808, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.2801795, + "balance_loss_mlp": 1.02273107, + "epoch": 0.3598376672177965, + "flos": 19724300260560.0, + "grad_norm": 2.158759730237965, + "language_loss": 0.86854672, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.89304143, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.14929199, + "step": 5985, + "time_per_iteration": 2.759324789047241 + }, + { + "auxiliary_loss_clip": 0.01414284, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.28184223, + "balance_loss_mlp": 1.02305961, + "epoch": 0.35989779047046444, + "flos": 17316045804240.0, + "grad_norm": 1.346970250202925, + "language_loss": 0.72569966, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.75023091, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15795898, + "step": 5986, + "time_per_iteration": 5.7164928913116455 + }, + { + "auxiliary_loss_clip": 0.01430285, + "auxiliary_loss_mlp": 0.01038966, + "balance_loss_clip": 1.29122829, + "balance_loss_mlp": 1.02325404, + "epoch": 0.3599579137231324, + "flos": 22716434276280.0, + "grad_norm": 1.706185194045754, + "language_loss": 0.73493779, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75963026, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.15722656, + "step": 5987, + "time_per_iteration": 2.8901805877685547 + }, + { + "auxiliary_loss_clip": 0.01428127, + "auxiliary_loss_mlp": 0.01042494, + "balance_loss_clip": 1.29215884, + "balance_loss_mlp": 1.02609134, + "epoch": 0.36001803697580037, + "flos": 20964836206440.0, + "grad_norm": 2.447725252923799, + "language_loss": 0.70020932, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.7249155, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16381836, + "step": 5988, + "time_per_iteration": 2.741407871246338 + }, + { + "auxiliary_loss_clip": 0.01427914, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.29031086, + "balance_loss_mlp": 1.02571607, + "epoch": 0.36007816022846834, + "flos": 20490467058720.0, + "grad_norm": 2.0480775092259855, + "language_loss": 0.73319781, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.7578916, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15740967, + "step": 5989, + "time_per_iteration": 2.8014075756073 + }, + { + "auxiliary_loss_clip": 0.01422899, + "auxiliary_loss_mlp": 0.01036625, + "balance_loss_clip": 1.28949332, + "balance_loss_mlp": 1.02169371, + "epoch": 0.36013828348113636, + "flos": 28007068686480.0, + "grad_norm": 2.369897196079371, + "language_loss": 0.80122745, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82582259, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.14935303, + "step": 5990, + "time_per_iteration": 2.8055617809295654 + }, + { + "auxiliary_loss_clip": 0.01420722, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.2855556, + "balance_loss_mlp": 1.0248704, + "epoch": 0.3601984067338043, + "flos": 20086763103360.0, + "grad_norm": 1.7819115331359483, + "language_loss": 0.84025121, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86485946, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.15246582, + "step": 5991, + "time_per_iteration": 2.7550604343414307 + }, + { + "auxiliary_loss_clip": 0.01432591, + "auxiliary_loss_mlp": 0.01039273, + "balance_loss_clip": 1.29327857, + "balance_loss_mlp": 1.02309644, + "epoch": 0.3602585299864723, + "flos": 18621318121920.0, + "grad_norm": 2.048451340779969, + "language_loss": 0.76127827, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.78599691, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.16174316, + "step": 5992, + "time_per_iteration": 2.7260332107543945 + }, + { + "auxiliary_loss_clip": 0.01423084, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.28938043, + "balance_loss_mlp": 1.02667487, + "epoch": 0.36031865323914025, + "flos": 19577731397400.0, + "grad_norm": 2.1121015885152206, + "language_loss": 0.78135371, + "learning_rate": 2.960509433875627e-06, + "loss": 0.80600876, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.1574707, + "step": 5993, + "time_per_iteration": 2.756042242050171 + }, + { + "auxiliary_loss_clip": 0.01433563, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.29604363, + "balance_loss_mlp": 1.0222435, + "epoch": 0.3603787764918082, + "flos": 17494718898960.0, + "grad_norm": 1.7758896728402984, + "language_loss": 0.74791908, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.77263236, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.1552124, + "step": 5994, + "time_per_iteration": 2.7423603534698486 + }, + { + "auxiliary_loss_clip": 0.01430751, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.2925154, + "balance_loss_mlp": 1.02573466, + "epoch": 0.3604388997444762, + "flos": 15527632933080.0, + "grad_norm": 2.102749588186138, + "language_loss": 0.694049, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71877146, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.15771484, + "step": 5995, + "time_per_iteration": 2.695180892944336 + }, + { + "auxiliary_loss_clip": 0.01432064, + "auxiliary_loss_mlp": 0.0104884, + "balance_loss_clip": 1.29335856, + "balance_loss_mlp": 1.03294969, + "epoch": 0.36049902299714415, + "flos": 17315314853760.0, + "grad_norm": 2.046350155502703, + "language_loss": 0.83012259, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.85493159, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.15869141, + "step": 5996, + "time_per_iteration": 2.739109516143799 + }, + { + "auxiliary_loss_clip": 0.01428742, + "auxiliary_loss_mlp": 0.01042613, + "balance_loss_clip": 1.29301679, + "balance_loss_mlp": 1.02736056, + "epoch": 0.3605591462498121, + "flos": 17060494438080.0, + "grad_norm": 2.8959352608512603, + "language_loss": 0.73820806, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76292157, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.15252686, + "step": 5997, + "time_per_iteration": 2.71126127243042 + }, + { + "auxiliary_loss_clip": 0.01421646, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.28906655, + "balance_loss_mlp": 1.02698922, + "epoch": 0.3606192695024801, + "flos": 16841270572920.0, + "grad_norm": 2.077862943526953, + "language_loss": 0.69880706, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.7234354, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.14208984, + "step": 5998, + "time_per_iteration": 2.7838637828826904 + }, + { + "auxiliary_loss_clip": 0.01428633, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.29441571, + "balance_loss_mlp": 1.02885962, + "epoch": 0.36067939275514804, + "flos": 12133378688040.0, + "grad_norm": 2.450091244758286, + "language_loss": 0.77829748, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.80301625, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.14367676, + "step": 5999, + "time_per_iteration": 2.808117628097534 + }, + { + "auxiliary_loss_clip": 0.01439087, + "auxiliary_loss_mlp": 0.01053318, + "balance_loss_clip": 1.30333376, + "balance_loss_mlp": 1.03823829, + "epoch": 0.360739516007816, + "flos": 18046088143200.0, + "grad_norm": 1.7285499931296076, + "language_loss": 0.7836917, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.8086158, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15081787, + "step": 6000, + "time_per_iteration": 2.8083977699279785 + }, + { + "auxiliary_loss_clip": 0.01429234, + "auxiliary_loss_mlp": 0.01046284, + "balance_loss_clip": 1.29494405, + "balance_loss_mlp": 1.0314784, + "epoch": 0.360799639260484, + "flos": 18554185856880.0, + "grad_norm": 1.974821432561366, + "language_loss": 0.7832979, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80805314, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.14794922, + "step": 6001, + "time_per_iteration": 2.74980092048645 + }, + { + "auxiliary_loss_clip": 0.01425057, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_clip": 1.29364395, + "balance_loss_mlp": 1.03508162, + "epoch": 0.36085976251315194, + "flos": 19686713900400.0, + "grad_norm": 1.9922705613667915, + "language_loss": 0.83290935, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85765326, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.14245605, + "step": 6002, + "time_per_iteration": 2.733339786529541 + }, + { + "auxiliary_loss_clip": 0.0141886, + "auxiliary_loss_mlp": 0.0105093, + "balance_loss_clip": 1.29082465, + "balance_loss_mlp": 1.03746557, + "epoch": 0.3609198857658199, + "flos": 24203262882240.0, + "grad_norm": 2.321034294801734, + "language_loss": 0.91341054, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.93810844, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.13464355, + "step": 6003, + "time_per_iteration": 2.796977996826172 + }, + { + "auxiliary_loss_clip": 0.01350193, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_clip": 1.28634286, + "balance_loss_mlp": 1.05948579, + "epoch": 0.3609800090184879, + "flos": 57130276620240.0, + "grad_norm": 0.9112825034531069, + "language_loss": 0.53431314, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55844021, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03027344, + "step": 6004, + "time_per_iteration": 3.2487194538116455 + }, + { + "auxiliary_loss_clip": 0.0143435, + "auxiliary_loss_mlp": 0.01056518, + "balance_loss_clip": 1.29713321, + "balance_loss_mlp": 1.04099691, + "epoch": 0.3610401322711559, + "flos": 20815830841680.0, + "grad_norm": 1.8430017589663603, + "language_loss": 0.77945763, + "learning_rate": 2.956407517225883e-06, + "loss": 0.80436629, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.15527344, + "step": 6005, + "time_per_iteration": 2.9115121364593506 + }, + { + "auxiliary_loss_clip": 0.01426209, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.29414761, + "balance_loss_mlp": 1.03655791, + "epoch": 0.36110025552382385, + "flos": 13703176819440.0, + "grad_norm": 2.376903189941978, + "language_loss": 0.7925812, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81734335, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.13458252, + "step": 6006, + "time_per_iteration": 2.8826751708984375 + }, + { + "auxiliary_loss_clip": 0.01427545, + "auxiliary_loss_mlp": 0.01042005, + "balance_loss_clip": 1.29342234, + "balance_loss_mlp": 1.02621007, + "epoch": 0.3611603787764918, + "flos": 22460070742920.0, + "grad_norm": 2.6122779566051393, + "language_loss": 0.85153455, + "learning_rate": 2.955723356106876e-06, + "loss": 0.87623, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.15795898, + "step": 6007, + "time_per_iteration": 2.887118101119995 + }, + { + "auxiliary_loss_clip": 0.01443589, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_clip": 1.30217719, + "balance_loss_mlp": 1.02677357, + "epoch": 0.3612205020291598, + "flos": 20891572079040.0, + "grad_norm": 2.079146554939093, + "language_loss": 0.7308085, + "learning_rate": 2.955381221179198e-06, + "loss": 0.75567281, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.16040039, + "step": 6008, + "time_per_iteration": 2.9107816219329834 + }, + { + "auxiliary_loss_clip": 0.0142306, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.28897393, + "balance_loss_mlp": 1.02729237, + "epoch": 0.36128062528182775, + "flos": 15746207064480.0, + "grad_norm": 1.7450995000792566, + "language_loss": 0.83568728, + "learning_rate": 2.955039050023368e-06, + "loss": 0.86033964, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.14880371, + "step": 6009, + "time_per_iteration": 2.8910787105560303 + }, + { + "auxiliary_loss_clip": 0.01424558, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.2909143, + "balance_loss_mlp": 1.02627373, + "epoch": 0.3613407485344957, + "flos": 16768981046160.0, + "grad_norm": 1.7608178985071146, + "language_loss": 0.76811141, + "learning_rate": 2.954696842652362e-06, + "loss": 0.79276437, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.14453125, + "step": 6010, + "time_per_iteration": 2.7643377780914307 + }, + { + "auxiliary_loss_clip": 0.01420057, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.28606868, + "balance_loss_mlp": 1.02318263, + "epoch": 0.3614008717871637, + "flos": 20375433910080.0, + "grad_norm": 1.6929368417820676, + "language_loss": 0.83000153, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85457355, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.13946533, + "step": 6011, + "time_per_iteration": 2.7825284004211426 + }, + { + "auxiliary_loss_clip": 0.01430847, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.29111779, + "balance_loss_mlp": 1.0242188, + "epoch": 0.36146099503983165, + "flos": 22780886389560.0, + "grad_norm": 1.9537465502155713, + "language_loss": 0.62589109, + "learning_rate": 2.954012319316727e-06, + "loss": 0.65059596, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.1541748, + "step": 6012, + "time_per_iteration": 2.836592435836792 + }, + { + "auxiliary_loss_clip": 0.01412918, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.28079295, + "balance_loss_mlp": 1.01628387, + "epoch": 0.3615211182924996, + "flos": 23001044247000.0, + "grad_norm": 1.6491457806572278, + "language_loss": 0.8401314, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86456347, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.14001465, + "step": 6013, + "time_per_iteration": 2.784041404724121 + }, + { + "auxiliary_loss_clip": 0.01416655, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.28252149, + "balance_loss_mlp": 1.01897311, + "epoch": 0.3615812415451676, + "flos": 16651633221000.0, + "grad_norm": 1.8868037854945023, + "language_loss": 0.91896093, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.94346905, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15179443, + "step": 6014, + "time_per_iteration": 4.186485767364502 + }, + { + "auxiliary_loss_clip": 0.01407076, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.27535117, + "balance_loss_mlp": 1.02105975, + "epoch": 0.36164136479783554, + "flos": 21324497072400.0, + "grad_norm": 1.884163609183147, + "language_loss": 0.7409451, + "learning_rate": 2.95298526302391e-06, + "loss": 0.76536906, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.14257812, + "step": 6015, + "time_per_iteration": 2.7734873294830322 + }, + { + "auxiliary_loss_clip": 0.0141518, + "auxiliary_loss_mlp": 0.01032348, + "balance_loss_clip": 1.28045046, + "balance_loss_mlp": 1.01733959, + "epoch": 0.3617014880505035, + "flos": 24174813403080.0, + "grad_norm": 1.8608656886162034, + "language_loss": 0.65784925, + "learning_rate": 2.9526428386344e-06, + "loss": 0.68232453, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15002441, + "step": 6016, + "time_per_iteration": 2.745110034942627 + }, + { + "auxiliary_loss_clip": 0.01415743, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.28013635, + "balance_loss_mlp": 1.02055287, + "epoch": 0.3617616113031715, + "flos": 39021424933680.0, + "grad_norm": 1.6626648506595825, + "language_loss": 0.71870244, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74323398, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.16882324, + "step": 6017, + "time_per_iteration": 2.9156572818756104 + }, + { + "auxiliary_loss_clip": 0.01416294, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.27902114, + "balance_loss_mlp": 1.02221239, + "epoch": 0.3618217345558395, + "flos": 12134718763920.0, + "grad_norm": 2.0528696643229445, + "language_loss": 0.74120903, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.76574922, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.1550293, + "step": 6018, + "time_per_iteration": 2.7589569091796875 + }, + { + "auxiliary_loss_clip": 0.01397202, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.27036858, + "balance_loss_mlp": 1.01817322, + "epoch": 0.36188185780850746, + "flos": 24940249250760.0, + "grad_norm": 1.7542594674941467, + "language_loss": 0.69208884, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71639132, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.14868164, + "step": 6019, + "time_per_iteration": 4.255979776382446 + }, + { + "auxiliary_loss_clip": 0.01412632, + "auxiliary_loss_mlp": 0.01039286, + "balance_loss_clip": 1.27573395, + "balance_loss_mlp": 1.02295423, + "epoch": 0.3619419810611754, + "flos": 20963658564000.0, + "grad_norm": 1.431853748372649, + "language_loss": 0.76150143, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78602058, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16345215, + "step": 6020, + "time_per_iteration": 2.7721939086914062 + }, + { + "auxiliary_loss_clip": 0.01411917, + "auxiliary_loss_mlp": 0.01042378, + "balance_loss_clip": 1.27689767, + "balance_loss_mlp": 1.02649951, + "epoch": 0.3620021043138434, + "flos": 22534065820800.0, + "grad_norm": 1.7775554261247157, + "language_loss": 0.73707473, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76161766, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.15893555, + "step": 6021, + "time_per_iteration": 2.764911651611328 + }, + { + "auxiliary_loss_clip": 0.01404742, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.27106714, + "balance_loss_mlp": 1.02186394, + "epoch": 0.36206222756651135, + "flos": 15600978277200.0, + "grad_norm": 2.389715843279306, + "language_loss": 0.81034589, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83475888, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.14697266, + "step": 6022, + "time_per_iteration": 2.704664945602417 + }, + { + "auxiliary_loss_clip": 0.01396234, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.26768637, + "balance_loss_mlp": 1.01709557, + "epoch": 0.3621223508191793, + "flos": 23592517569720.0, + "grad_norm": 1.7631307982658448, + "language_loss": 0.81909895, + "learning_rate": 2.950244857154417e-06, + "loss": 0.84337121, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.13891602, + "step": 6023, + "time_per_iteration": 4.390122413635254 + }, + { + "auxiliary_loss_clip": 0.01408382, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.27235532, + "balance_loss_mlp": 1.02033806, + "epoch": 0.3621824740718473, + "flos": 22315044997440.0, + "grad_norm": 2.09032272682129, + "language_loss": 0.79795611, + "learning_rate": 2.9499021441341e-06, + "loss": 0.82239628, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15283203, + "step": 6024, + "time_per_iteration": 2.8835558891296387 + }, + { + "auxiliary_loss_clip": 0.01390144, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.26095843, + "balance_loss_mlp": 1.0188818, + "epoch": 0.36224259732451525, + "flos": 16768006445520.0, + "grad_norm": 1.9455130201606752, + "language_loss": 0.75248033, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.77670765, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.13708496, + "step": 6025, + "time_per_iteration": 4.147401332855225 + }, + { + "auxiliary_loss_clip": 0.01396174, + "auxiliary_loss_mlp": 0.01028767, + "balance_loss_clip": 1.26330853, + "balance_loss_mlp": 1.01448596, + "epoch": 0.3623027205771832, + "flos": 23155085048400.0, + "grad_norm": 1.7175748684903605, + "language_loss": 0.72776288, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.75201225, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.14282227, + "step": 6026, + "time_per_iteration": 2.7984626293182373 + }, + { + "auxiliary_loss_clip": 0.01412894, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.27483773, + "balance_loss_mlp": 1.02809751, + "epoch": 0.3623628438298512, + "flos": 28555148653560.0, + "grad_norm": 1.9246770321305566, + "language_loss": 0.79231071, + "learning_rate": 2.948873789002833e-06, + "loss": 0.8168776, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.15710449, + "step": 6027, + "time_per_iteration": 2.7953155040740967 + }, + { + "auxiliary_loss_clip": 0.01398139, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.26294184, + "balance_loss_mlp": 1.02156842, + "epoch": 0.36242296708251914, + "flos": 25490603286000.0, + "grad_norm": 1.8495118829671475, + "language_loss": 0.68100512, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.70535648, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15405273, + "step": 6028, + "time_per_iteration": 2.856306552886963 + }, + { + "auxiliary_loss_clip": 0.01396104, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.26427352, + "balance_loss_mlp": 1.01663852, + "epoch": 0.3624830903351871, + "flos": 16294814940240.0, + "grad_norm": 1.761760997884565, + "language_loss": 0.85822862, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.88249445, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.13861084, + "step": 6029, + "time_per_iteration": 2.8193323612213135 + }, + { + "auxiliary_loss_clip": 0.01399215, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.2673192, + "balance_loss_mlp": 1.02206612, + "epoch": 0.36254321358785513, + "flos": 18300908558880.0, + "grad_norm": 1.512947046113689, + "language_loss": 0.72679603, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.75115252, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.14355469, + "step": 6030, + "time_per_iteration": 2.762016534805298 + }, + { + "auxiliary_loss_clip": 0.0140811, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.27022159, + "balance_loss_mlp": 1.01922452, + "epoch": 0.3626033368405231, + "flos": 14869270995480.0, + "grad_norm": 2.1653555125932593, + "language_loss": 0.75344312, + "learning_rate": 2.94750214514905e-06, + "loss": 0.77788472, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.16809082, + "step": 6031, + "time_per_iteration": 2.722975492477417 + }, + { + "auxiliary_loss_clip": 0.01396179, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.2630198, + "balance_loss_mlp": 1.02357817, + "epoch": 0.36266346009319106, + "flos": 22311309028320.0, + "grad_norm": 1.7088748140002517, + "language_loss": 0.73937768, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.76371729, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.14208984, + "step": 6032, + "time_per_iteration": 2.7565665245056152 + }, + { + "auxiliary_loss_clip": 0.0140028, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_clip": 1.26539612, + "balance_loss_mlp": 1.03065276, + "epoch": 0.362723583345859, + "flos": 18226791655920.0, + "grad_norm": 1.9196439842423403, + "language_loss": 0.78255111, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80700207, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.14172363, + "step": 6033, + "time_per_iteration": 2.7345376014709473 + }, + { + "auxiliary_loss_clip": 0.01290591, + "auxiliary_loss_mlp": 0.01007674, + "balance_loss_clip": 1.2294637, + "balance_loss_mlp": 1.00404978, + "epoch": 0.362783706598527, + "flos": 68514892515360.0, + "grad_norm": 0.7965401979765382, + "language_loss": 0.64877135, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.67175406, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03613281, + "step": 6034, + "time_per_iteration": 3.3169748783111572 + }, + { + "auxiliary_loss_clip": 0.01395983, + "auxiliary_loss_mlp": 0.01038493, + "balance_loss_clip": 1.26478016, + "balance_loss_mlp": 1.02435446, + "epoch": 0.36284382985119495, + "flos": 26582012042040.0, + "grad_norm": 1.7035583910462022, + "language_loss": 0.89878559, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92313033, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.14141846, + "step": 6035, + "time_per_iteration": 2.8172755241394043 + }, + { + "auxiliary_loss_clip": 0.014055, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.26834416, + "balance_loss_mlp": 1.02863169, + "epoch": 0.3629039531038629, + "flos": 20161570348440.0, + "grad_norm": 2.7300699664281414, + "language_loss": 0.73796618, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76246023, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.15283203, + "step": 6036, + "time_per_iteration": 2.7636454105377197 + }, + { + "auxiliary_loss_clip": 0.0140965, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_clip": 1.27106249, + "balance_loss_mlp": 1.02630329, + "epoch": 0.3629640763565309, + "flos": 18630495611280.0, + "grad_norm": 2.198549642718971, + "language_loss": 0.76126194, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78577048, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.14904785, + "step": 6037, + "time_per_iteration": 2.705598831176758 + }, + { + "auxiliary_loss_clip": 0.0139558, + "auxiliary_loss_mlp": 0.01055267, + "balance_loss_clip": 1.26454282, + "balance_loss_mlp": 1.04010952, + "epoch": 0.36302419960919885, + "flos": 19575985237920.0, + "grad_norm": 1.6803645460466672, + "language_loss": 0.78639615, + "learning_rate": 2.945100385624828e-06, + "loss": 0.81090462, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15142822, + "step": 6038, + "time_per_iteration": 2.741363525390625 + }, + { + "auxiliary_loss_clip": 0.01282217, + "auxiliary_loss_mlp": 0.01003988, + "balance_loss_clip": 1.22113121, + "balance_loss_mlp": 1.00091219, + "epoch": 0.3630843228618668, + "flos": 63813376143000.0, + "grad_norm": 0.8329795313319978, + "language_loss": 0.63488173, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65774369, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03076172, + "step": 6039, + "time_per_iteration": 3.3339202404022217 + }, + { + "auxiliary_loss_clip": 0.01395827, + "auxiliary_loss_mlp": 0.01050159, + "balance_loss_clip": 1.26300621, + "balance_loss_mlp": 1.03619397, + "epoch": 0.3631444461145348, + "flos": 21840188549400.0, + "grad_norm": 2.2673698654556764, + "language_loss": 0.71131146, + "learning_rate": 2.944413845878002e-06, + "loss": 0.7357713, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.13952637, + "step": 6040, + "time_per_iteration": 2.776599168777466 + }, + { + "auxiliary_loss_clip": 0.01408929, + "auxiliary_loss_mlp": 0.0105404, + "balance_loss_clip": 1.27109337, + "balance_loss_mlp": 1.03907907, + "epoch": 0.36320456936720275, + "flos": 21726576693360.0, + "grad_norm": 1.6381369455120076, + "language_loss": 0.81805962, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.84268928, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.1496582, + "step": 6041, + "time_per_iteration": 2.7295174598693848 + }, + { + "auxiliary_loss_clip": 0.01398668, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.26382959, + "balance_loss_mlp": 1.02853465, + "epoch": 0.3632646926198707, + "flos": 17023476594960.0, + "grad_norm": 2.2318362309996562, + "language_loss": 0.8434813, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86790973, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.15625, + "step": 6042, + "time_per_iteration": 2.700942277908325 + }, + { + "auxiliary_loss_clip": 0.013968, + "auxiliary_loss_mlp": 0.01045262, + "balance_loss_clip": 1.26375759, + "balance_loss_mlp": 1.03129685, + "epoch": 0.36332481587253873, + "flos": 23336397686520.0, + "grad_norm": 1.7571417472568824, + "language_loss": 0.78071856, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80513918, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.13964844, + "step": 6043, + "time_per_iteration": 2.8248071670532227 + }, + { + "auxiliary_loss_clip": 0.01394885, + "auxiliary_loss_mlp": 0.0104762, + "balance_loss_clip": 1.26449287, + "balance_loss_mlp": 1.0325458, + "epoch": 0.3633849391252067, + "flos": 10747126654560.0, + "grad_norm": 2.2705047025894856, + "language_loss": 0.66115916, + "learning_rate": 2.943040336741298e-06, + "loss": 0.68558419, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15063477, + "step": 6044, + "time_per_iteration": 2.7410619258880615 + }, + { + "auxiliary_loss_clip": 0.01400508, + "auxiliary_loss_mlp": 0.01043536, + "balance_loss_clip": 1.26905835, + "balance_loss_mlp": 1.02935052, + "epoch": 0.36344506237787466, + "flos": 25854771679920.0, + "grad_norm": 1.7102854856975949, + "language_loss": 0.81207782, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83651829, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.14190674, + "step": 6045, + "time_per_iteration": 2.8246753215789795 + }, + { + "auxiliary_loss_clip": 0.01409709, + "auxiliary_loss_mlp": 0.01049771, + "balance_loss_clip": 1.27542031, + "balance_loss_mlp": 1.03431582, + "epoch": 0.3635051856305426, + "flos": 30160177860240.0, + "grad_norm": 1.8362643836131314, + "language_loss": 0.65406752, + "learning_rate": 2.942353367559755e-06, + "loss": 0.6786623, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.15460205, + "step": 6046, + "time_per_iteration": 2.8093461990356445 + }, + { + "auxiliary_loss_clip": 0.01403611, + "auxiliary_loss_mlp": 0.01044874, + "balance_loss_clip": 1.26993573, + "balance_loss_mlp": 1.03053355, + "epoch": 0.3635653088832106, + "flos": 22203260517600.0, + "grad_norm": 2.3793622261646097, + "language_loss": 0.77617443, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.8006593, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.14343262, + "step": 6047, + "time_per_iteration": 2.8447251319885254 + }, + { + "auxiliary_loss_clip": 0.0140792, + "auxiliary_loss_mlp": 0.0105008, + "balance_loss_clip": 1.26893294, + "balance_loss_mlp": 1.03428483, + "epoch": 0.36362543213587856, + "flos": 24792015444840.0, + "grad_norm": 1.6651747219679074, + "language_loss": 0.79588538, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.82046545, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.15808105, + "step": 6048, + "time_per_iteration": 2.7832190990448 + }, + { + "auxiliary_loss_clip": 0.01281938, + "auxiliary_loss_mlp": 0.01015218, + "balance_loss_clip": 1.22124481, + "balance_loss_mlp": 1.01238048, + "epoch": 0.3636855553885465, + "flos": 62542725775200.0, + "grad_norm": 0.7606530758254186, + "language_loss": 0.52561367, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54858518, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.02832031, + "step": 6049, + "time_per_iteration": 3.3260750770568848 + }, + { + "auxiliary_loss_clip": 0.01405908, + "auxiliary_loss_mlp": 0.01040347, + "balance_loss_clip": 1.27184522, + "balance_loss_mlp": 1.02577329, + "epoch": 0.3637456786412145, + "flos": 24065952725160.0, + "grad_norm": 1.784235734589446, + "language_loss": 0.86853969, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.89300227, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.14587402, + "step": 6050, + "time_per_iteration": 2.9039862155914307 + }, + { + "auxiliary_loss_clip": 0.01396326, + "auxiliary_loss_mlp": 0.01043669, + "balance_loss_clip": 1.26530766, + "balance_loss_mlp": 1.02969742, + "epoch": 0.36380580189388245, + "flos": 16695960568920.0, + "grad_norm": 1.6711424584088614, + "language_loss": 0.78597575, + "learning_rate": 2.940635319486546e-06, + "loss": 0.81037569, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.13964844, + "step": 6051, + "time_per_iteration": 2.8850595951080322 + }, + { + "auxiliary_loss_clip": 0.01395879, + "auxiliary_loss_mlp": 0.01032678, + "balance_loss_clip": 1.2630924, + "balance_loss_mlp": 1.0190165, + "epoch": 0.3638659251465504, + "flos": 25118922345480.0, + "grad_norm": 1.8749239604419812, + "language_loss": 0.82662511, + "learning_rate": 2.940291602812822e-06, + "loss": 0.85091066, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.13677979, + "step": 6052, + "time_per_iteration": 2.844122886657715 + }, + { + "auxiliary_loss_clip": 0.01388501, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.26028383, + "balance_loss_mlp": 1.01675367, + "epoch": 0.3639260483992184, + "flos": 23008231926720.0, + "grad_norm": 1.7167899203583634, + "language_loss": 0.72505593, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74924505, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.13659668, + "step": 6053, + "time_per_iteration": 4.27782416343689 + }, + { + "auxiliary_loss_clip": 0.01278802, + "auxiliary_loss_mlp": 0.01003225, + "balance_loss_clip": 1.21696281, + "balance_loss_mlp": 1.00035214, + "epoch": 0.36398617165188635, + "flos": 70730870076360.0, + "grad_norm": 0.7670084488573174, + "language_loss": 0.61266196, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63548219, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.02868652, + "step": 6054, + "time_per_iteration": 3.272719621658325 + }, + { + "auxiliary_loss_clip": 0.01405046, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.27184474, + "balance_loss_mlp": 1.01822877, + "epoch": 0.3640462949045543, + "flos": 22240400185800.0, + "grad_norm": 2.3144053101745543, + "language_loss": 0.75997192, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78435701, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15246582, + "step": 6055, + "time_per_iteration": 2.782649278640747 + }, + { + "auxiliary_loss_clip": 0.01397182, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.26617301, + "balance_loss_mlp": 1.01832676, + "epoch": 0.3641064181572223, + "flos": 21548553332400.0, + "grad_norm": 1.7077478396594017, + "language_loss": 0.75814748, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7824533, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15063477, + "step": 6056, + "time_per_iteration": 2.74163818359375 + }, + { + "auxiliary_loss_clip": 0.01400166, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.26940274, + "balance_loss_mlp": 1.01806521, + "epoch": 0.3641665414098903, + "flos": 22278433237920.0, + "grad_norm": 2.138910326695024, + "language_loss": 0.80355835, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82788014, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.13952637, + "step": 6057, + "time_per_iteration": 2.764305591583252 + }, + { + "auxiliary_loss_clip": 0.01395628, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.26653802, + "balance_loss_mlp": 1.01678956, + "epoch": 0.36422666466255826, + "flos": 28335559313160.0, + "grad_norm": 1.9064303940999525, + "language_loss": 0.80525565, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82952529, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14544678, + "step": 6058, + "time_per_iteration": 2.797253370285034 + }, + { + "auxiliary_loss_clip": 0.01398282, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.26528847, + "balance_loss_mlp": 1.01566589, + "epoch": 0.36428678791522623, + "flos": 24175869220440.0, + "grad_norm": 1.8972935973662173, + "language_loss": 0.85373366, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87802124, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.14801025, + "step": 6059, + "time_per_iteration": 4.197679758071899 + }, + { + "auxiliary_loss_clip": 0.01401046, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.26748812, + "balance_loss_mlp": 1.01759493, + "epoch": 0.3643469111678942, + "flos": 22533578520480.0, + "grad_norm": 1.9272115819288607, + "language_loss": 0.87944973, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90378928, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15332031, + "step": 6060, + "time_per_iteration": 2.847960948944092 + }, + { + "auxiliary_loss_clip": 0.01405153, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.26945972, + "balance_loss_mlp": 1.01779366, + "epoch": 0.36440703442056216, + "flos": 19431243750960.0, + "grad_norm": 2.5441803765499755, + "language_loss": 0.67123663, + "learning_rate": 2.937196549795971e-06, + "loss": 0.6956262, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.16015625, + "step": 6061, + "time_per_iteration": 2.7255141735076904 + }, + { + "auxiliary_loss_clip": 0.01405927, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.27094722, + "balance_loss_mlp": 1.01413357, + "epoch": 0.3644671576732301, + "flos": 18045032325840.0, + "grad_norm": 2.2641285963996767, + "language_loss": 0.76341057, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.78776187, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1506958, + "step": 6062, + "time_per_iteration": 2.8125667572021484 + }, + { + "auxiliary_loss_clip": 0.01395748, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.26480818, + "balance_loss_mlp": 1.01572669, + "epoch": 0.3645272809258981, + "flos": 21547741165200.0, + "grad_norm": 1.70308853142303, + "language_loss": 0.72624636, + "learning_rate": 2.936508368977432e-06, + "loss": 0.75051653, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.15533447, + "step": 6063, + "time_per_iteration": 4.349549770355225 + }, + { + "auxiliary_loss_clip": 0.01390032, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.26078534, + "balance_loss_mlp": 1.01979148, + "epoch": 0.36458740417856605, + "flos": 22751949610080.0, + "grad_norm": 4.327409325978169, + "language_loss": 0.68461668, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70886433, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.14959717, + "step": 6064, + "time_per_iteration": 4.227888584136963 + }, + { + "auxiliary_loss_clip": 0.01400574, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.26606584, + "balance_loss_mlp": 1.02307963, + "epoch": 0.364647527431234, + "flos": 26146285071840.0, + "grad_norm": 1.91997368017333, + "language_loss": 0.74788439, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.77226502, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.14404297, + "step": 6065, + "time_per_iteration": 2.795886278152466 + }, + { + "auxiliary_loss_clip": 0.01401927, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.26560843, + "balance_loss_mlp": 1.01884341, + "epoch": 0.364707650683902, + "flos": 31036179936960.0, + "grad_norm": 2.114253959703197, + "language_loss": 0.75652742, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.7808913, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.15625, + "step": 6066, + "time_per_iteration": 2.8685975074768066 + }, + { + "auxiliary_loss_clip": 0.01391069, + "auxiliary_loss_mlp": 0.01027134, + "balance_loss_clip": 1.26084113, + "balance_loss_mlp": 1.01349652, + "epoch": 0.36476777393656995, + "flos": 19577609572320.0, + "grad_norm": 2.748310525540571, + "language_loss": 0.76749051, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79167259, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.13635254, + "step": 6067, + "time_per_iteration": 2.8087141513824463 + }, + { + "auxiliary_loss_clip": 0.01389824, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.26158202, + "balance_loss_mlp": 1.0160774, + "epoch": 0.3648278971892379, + "flos": 17753356500480.0, + "grad_norm": 2.459643689134991, + "language_loss": 0.71434939, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73854399, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.13549805, + "step": 6068, + "time_per_iteration": 2.8182883262634277 + }, + { + "auxiliary_loss_clip": 0.01401305, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.26442862, + "balance_loss_mlp": 1.01890993, + "epoch": 0.3648880204419059, + "flos": 17935684347600.0, + "grad_norm": 1.8557223809685035, + "language_loss": 0.7456497, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.77000302, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.15130615, + "step": 6069, + "time_per_iteration": 2.7691287994384766 + }, + { + "auxiliary_loss_clip": 0.01402239, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.26651609, + "balance_loss_mlp": 1.01968217, + "epoch": 0.3649481436945739, + "flos": 22643454407400.0, + "grad_norm": 1.880561754553702, + "language_loss": 0.66067636, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68505168, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.15612793, + "step": 6070, + "time_per_iteration": 2.854325532913208 + }, + { + "auxiliary_loss_clip": 0.01389084, + "auxiliary_loss_mlp": 0.01026559, + "balance_loss_clip": 1.25812316, + "balance_loss_mlp": 1.0128144, + "epoch": 0.36500826694724187, + "flos": 21584637183240.0, + "grad_norm": 1.8457705424918618, + "language_loss": 0.74344611, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76760262, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.1373291, + "step": 6071, + "time_per_iteration": 2.928161144256592 + }, + { + "auxiliary_loss_clip": 0.01392867, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.26093566, + "balance_loss_mlp": 1.01394463, + "epoch": 0.36506839019990983, + "flos": 13776603380280.0, + "grad_norm": 2.7984391563419035, + "language_loss": 0.88897777, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.91319418, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.14813232, + "step": 6072, + "time_per_iteration": 2.719559907913208 + }, + { + "auxiliary_loss_clip": 0.01392253, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.25955033, + "balance_loss_mlp": 1.01588774, + "epoch": 0.3651285134525778, + "flos": 17279474653080.0, + "grad_norm": 1.9255675048885432, + "language_loss": 0.73035002, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.7545718, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.14044189, + "step": 6073, + "time_per_iteration": 2.8228118419647217 + }, + { + "auxiliary_loss_clip": 0.01403401, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.26877236, + "balance_loss_mlp": 1.01418304, + "epoch": 0.36518863670524576, + "flos": 21913005984840.0, + "grad_norm": 2.025976668647204, + "language_loss": 0.67014384, + "learning_rate": 2.932720838132236e-06, + "loss": 0.6944741, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.15429688, + "step": 6074, + "time_per_iteration": 2.7341439723968506 + }, + { + "auxiliary_loss_clip": 0.01394753, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.26247168, + "balance_loss_mlp": 1.0195241, + "epoch": 0.3652487599579137, + "flos": 27127452465720.0, + "grad_norm": 1.6387903719744217, + "language_loss": 0.72928649, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75356972, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.14050293, + "step": 6075, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.01406216, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.26931024, + "balance_loss_mlp": 1.0175246, + "epoch": 0.3653088832105817, + "flos": 19760343503040.0, + "grad_norm": 2.2160213041760217, + "language_loss": 0.89931607, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.92370349, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.14996338, + "step": 6076, + "time_per_iteration": 2.7049221992492676 + }, + { + "auxiliary_loss_clip": 0.01397549, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.2652564, + "balance_loss_mlp": 1.01948643, + "epoch": 0.36536900646324966, + "flos": 13118891176440.0, + "grad_norm": 2.147591207816599, + "language_loss": 0.7043401, + "learning_rate": 2.931687131696872e-06, + "loss": 0.728661, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15057373, + "step": 6077, + "time_per_iteration": 2.7279813289642334 + }, + { + "auxiliary_loss_clip": 0.01254926, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.1907928, + "balance_loss_mlp": 1.02417171, + "epoch": 0.3654291297159176, + "flos": 71117941585320.0, + "grad_norm": 0.7613187905519718, + "language_loss": 0.61775917, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.64058638, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03613281, + "step": 6078, + "time_per_iteration": 3.346635103225708 + }, + { + "auxiliary_loss_clip": 0.01398785, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.26422417, + "balance_loss_mlp": 1.02132106, + "epoch": 0.3654892529685856, + "flos": 23622022866240.0, + "grad_norm": 2.1900743492054473, + "language_loss": 0.78811562, + "learning_rate": 2.930997817403173e-06, + "loss": 0.81245607, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.13970947, + "step": 6079, + "time_per_iteration": 2.7512142658233643 + }, + { + "auxiliary_loss_clip": 0.01402004, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.26670814, + "balance_loss_mlp": 1.02210486, + "epoch": 0.36554937622125355, + "flos": 43478597847240.0, + "grad_norm": 2.1467289299005614, + "language_loss": 0.62775028, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65214372, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15258789, + "step": 6080, + "time_per_iteration": 2.9961259365081787 + }, + { + "auxiliary_loss_clip": 0.01409366, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.27291632, + "balance_loss_mlp": 1.02504134, + "epoch": 0.3656094994739215, + "flos": 23299867143720.0, + "grad_norm": 2.2587847904398712, + "language_loss": 0.68265307, + "learning_rate": 2.930308361895352e-06, + "loss": 0.70715278, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.15563965, + "step": 6081, + "time_per_iteration": 2.7941365242004395 + }, + { + "auxiliary_loss_clip": 0.01415204, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.27548325, + "balance_loss_mlp": 1.03002262, + "epoch": 0.3656696227265895, + "flos": 24577420932720.0, + "grad_norm": 1.5298469360175677, + "language_loss": 0.74999118, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.7745899, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.14642334, + "step": 6082, + "time_per_iteration": 2.8134357929229736 + }, + { + "auxiliary_loss_clip": 0.01403949, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.26876307, + "balance_loss_mlp": 1.01940799, + "epoch": 0.3657297459792575, + "flos": 27934210642680.0, + "grad_norm": 2.546200658120452, + "language_loss": 0.82879132, + "learning_rate": 2.929618765277987e-06, + "loss": 0.85316163, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.13671875, + "step": 6083, + "time_per_iteration": 2.8377678394317627 + }, + { + "auxiliary_loss_clip": 0.01254134, + "auxiliary_loss_mlp": 0.01004419, + "balance_loss_clip": 1.19309986, + "balance_loss_mlp": 1.00065196, + "epoch": 0.36578986923192547, + "flos": 67406631289920.0, + "grad_norm": 0.8254567493408952, + "language_loss": 0.59348041, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61606598, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.03759766, + "step": 6084, + "time_per_iteration": 3.470303773880005 + }, + { + "auxiliary_loss_clip": 0.0140599, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.27171993, + "balance_loss_mlp": 1.027354, + "epoch": 0.36584999248459343, + "flos": 20232032499000.0, + "grad_norm": 1.7160113015559841, + "language_loss": 0.72610903, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75058961, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.14715576, + "step": 6085, + "time_per_iteration": 2.7616822719573975 + }, + { + "auxiliary_loss_clip": 0.01405343, + "auxiliary_loss_mlp": 0.01044421, + "balance_loss_clip": 1.27062798, + "balance_loss_mlp": 1.03045523, + "epoch": 0.3659101157372614, + "flos": 19067156573760.0, + "grad_norm": 1.8320707563275822, + "language_loss": 0.78528583, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80978346, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.13970947, + "step": 6086, + "time_per_iteration": 2.7300262451171875 + }, + { + "auxiliary_loss_clip": 0.01395591, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_clip": 1.26628947, + "balance_loss_mlp": 1.03048265, + "epoch": 0.36597023898992936, + "flos": 30816915463440.0, + "grad_norm": 1.995143631096416, + "language_loss": 0.76984, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.7942372, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.13641357, + "step": 6087, + "time_per_iteration": 2.8682408332824707 + }, + { + "auxiliary_loss_clip": 0.01407495, + "auxiliary_loss_mlp": 0.010494, + "balance_loss_clip": 1.2711904, + "balance_loss_mlp": 1.03494549, + "epoch": 0.36603036224259733, + "flos": 20526672734640.0, + "grad_norm": 2.227518629018573, + "language_loss": 0.71071815, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.73528707, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.14459229, + "step": 6088, + "time_per_iteration": 2.726475477218628 + }, + { + "auxiliary_loss_clip": 0.0142043, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_clip": 1.27741241, + "balance_loss_mlp": 1.03315592, + "epoch": 0.3660904854952653, + "flos": 38336928193440.0, + "grad_norm": 1.7415757620037955, + "language_loss": 0.80083036, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82552695, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.1605835, + "step": 6089, + "time_per_iteration": 2.9035379886627197 + }, + { + "auxiliary_loss_clip": 0.01403629, + "auxiliary_loss_mlp": 0.01047417, + "balance_loss_clip": 1.27133012, + "balance_loss_mlp": 1.03373766, + "epoch": 0.36615060874793326, + "flos": 21840635241360.0, + "grad_norm": 2.4822045784178264, + "language_loss": 0.71166855, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73617899, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.13677979, + "step": 6090, + "time_per_iteration": 2.7618730068206787 + }, + { + "auxiliary_loss_clip": 0.01398936, + "auxiliary_loss_mlp": 0.01051969, + "balance_loss_clip": 1.27069223, + "balance_loss_mlp": 1.03883815, + "epoch": 0.3662107320006012, + "flos": 16585922248560.0, + "grad_norm": 2.375339438311171, + "language_loss": 0.73941183, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.7639209, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.13116455, + "step": 6091, + "time_per_iteration": 4.144473075866699 + }, + { + "auxiliary_loss_clip": 0.01407246, + "auxiliary_loss_mlp": 0.01050707, + "balance_loss_clip": 1.27414155, + "balance_loss_mlp": 1.03577018, + "epoch": 0.3662708552532692, + "flos": 20963130655320.0, + "grad_norm": 2.2790254011947075, + "language_loss": 0.73190022, + "learning_rate": 2.926513837074284e-06, + "loss": 0.75647974, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.14935303, + "step": 6092, + "time_per_iteration": 2.867685556411743 + }, + { + "auxiliary_loss_clip": 0.01413361, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_clip": 1.27876925, + "balance_loss_mlp": 1.03923869, + "epoch": 0.36633097850593715, + "flos": 21907036555920.0, + "grad_norm": 2.1667031984384164, + "language_loss": 0.78263003, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80730021, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.14416504, + "step": 6093, + "time_per_iteration": 2.739598512649536 + }, + { + "auxiliary_loss_clip": 0.01408868, + "auxiliary_loss_mlp": 0.01046469, + "balance_loss_clip": 1.2738409, + "balance_loss_mlp": 1.0321753, + "epoch": 0.3663911017586051, + "flos": 32860636050600.0, + "grad_norm": 2.2373197457169143, + "language_loss": 0.75003862, + "learning_rate": 2.925823466224696e-06, + "loss": 0.77459204, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.14300537, + "step": 6094, + "time_per_iteration": 2.8591537475585938 + }, + { + "auxiliary_loss_clip": 0.01414956, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.27831137, + "balance_loss_mlp": 1.04151714, + "epoch": 0.3664512250112731, + "flos": 27277513647840.0, + "grad_norm": 1.7664728204478621, + "language_loss": 0.79477066, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.8194831, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.14752197, + "step": 6095, + "time_per_iteration": 2.887002468109131 + }, + { + "auxiliary_loss_clip": 0.01420082, + "auxiliary_loss_mlp": 0.01046924, + "balance_loss_clip": 1.28285444, + "balance_loss_mlp": 1.0311408, + "epoch": 0.3665113482639411, + "flos": 17788831225920.0, + "grad_norm": 2.120711379495169, + "language_loss": 0.73024678, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75491691, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15783691, + "step": 6096, + "time_per_iteration": 4.198402166366577 + }, + { + "auxiliary_loss_clip": 0.01416522, + "auxiliary_loss_mlp": 0.01044104, + "balance_loss_clip": 1.27913499, + "balance_loss_mlp": 1.02872634, + "epoch": 0.36657147151660907, + "flos": 27860337389880.0, + "grad_norm": 2.3617870778746948, + "language_loss": 0.67076719, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69537342, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.15393066, + "step": 6097, + "time_per_iteration": 2.789485216140747 + }, + { + "auxiliary_loss_clip": 0.01414155, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.2791611, + "balance_loss_mlp": 1.02969396, + "epoch": 0.36663159476927704, + "flos": 25379468539920.0, + "grad_norm": 1.4567806787572748, + "language_loss": 0.77886498, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.80345142, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.14782715, + "step": 6098, + "time_per_iteration": 2.7777321338653564 + }, + { + "auxiliary_loss_clip": 0.01409765, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.2771945, + "balance_loss_mlp": 1.02633739, + "epoch": 0.366691718021945, + "flos": 21361717957320.0, + "grad_norm": 1.897198329891941, + "language_loss": 0.73419476, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75869894, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.14324951, + "step": 6099, + "time_per_iteration": 2.7129507064819336 + }, + { + "auxiliary_loss_clip": 0.01401593, + "auxiliary_loss_mlp": 0.01040762, + "balance_loss_clip": 1.2713424, + "balance_loss_mlp": 1.02720213, + "epoch": 0.36675184127461297, + "flos": 16804780638480.0, + "grad_norm": 1.915160573101209, + "language_loss": 0.8476519, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.87207544, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.13555908, + "step": 6100, + "time_per_iteration": 2.737196683883667 + }, + { + "auxiliary_loss_clip": 0.0141624, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.27723455, + "balance_loss_mlp": 1.02405715, + "epoch": 0.36681196452728093, + "flos": 21911341042080.0, + "grad_norm": 1.7952862996987977, + "language_loss": 0.70575297, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.73030967, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.15380859, + "step": 6101, + "time_per_iteration": 4.31298565864563 + }, + { + "auxiliary_loss_clip": 0.01407536, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.27266288, + "balance_loss_mlp": 1.02232718, + "epoch": 0.3668720877799489, + "flos": 17716947782760.0, + "grad_norm": 2.7169746417217473, + "language_loss": 0.76384199, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78828508, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.14453125, + "step": 6102, + "time_per_iteration": 2.724578619003296 + }, + { + "auxiliary_loss_clip": 0.01424295, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.28373826, + "balance_loss_mlp": 1.02539587, + "epoch": 0.36693221103261686, + "flos": 47053596213360.0, + "grad_norm": 1.4905788545106178, + "language_loss": 0.70379698, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72845995, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.16625977, + "step": 6103, + "time_per_iteration": 4.406079053878784 + }, + { + "auxiliary_loss_clip": 0.01413494, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.27741611, + "balance_loss_mlp": 1.02202582, + "epoch": 0.3669923342852848, + "flos": 15965187279480.0, + "grad_norm": 1.7658622715305912, + "language_loss": 0.71812934, + "learning_rate": 2.922369507632716e-06, + "loss": 0.74263799, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.15344238, + "step": 6104, + "time_per_iteration": 2.772650718688965 + }, + { + "auxiliary_loss_clip": 0.01411284, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.27517486, + "balance_loss_mlp": 1.01789379, + "epoch": 0.3670524575379528, + "flos": 19979283109680.0, + "grad_norm": 2.81745232988607, + "language_loss": 0.81420738, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83865738, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.15820312, + "step": 6105, + "time_per_iteration": 2.901381254196167 + }, + { + "auxiliary_loss_clip": 0.0142356, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.28321004, + "balance_loss_mlp": 1.01936388, + "epoch": 0.36711258079062076, + "flos": 25708527683640.0, + "grad_norm": 1.7014638657844872, + "language_loss": 0.80894393, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83353263, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.1595459, + "step": 6106, + "time_per_iteration": 2.8710076808929443 + }, + { + "auxiliary_loss_clip": 0.01255164, + "auxiliary_loss_mlp": 0.01016167, + "balance_loss_clip": 1.18693233, + "balance_loss_mlp": 1.01168489, + "epoch": 0.3671727040432887, + "flos": 60788569378680.0, + "grad_norm": 0.6935486975014434, + "language_loss": 0.59228218, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61499554, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.04492188, + "step": 6107, + "time_per_iteration": 3.3173775672912598 + }, + { + "auxiliary_loss_clip": 0.01403834, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.27021801, + "balance_loss_mlp": 1.01840138, + "epoch": 0.3672328272959567, + "flos": 18665970336720.0, + "grad_norm": 1.593313961365187, + "language_loss": 0.74671012, + "learning_rate": 2.92098694412469e-06, + "loss": 0.77107346, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.14093018, + "step": 6108, + "time_per_iteration": 2.8377761840820312 + }, + { + "auxiliary_loss_clip": 0.01414645, + "auxiliary_loss_mlp": 0.01038204, + "balance_loss_clip": 1.27634394, + "balance_loss_mlp": 1.02275467, + "epoch": 0.3672929505486247, + "flos": 15053223177000.0, + "grad_norm": 2.5110099730113205, + "language_loss": 0.73480392, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75933242, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.15454102, + "step": 6109, + "time_per_iteration": 2.7293405532836914 + }, + { + "auxiliary_loss_clip": 0.01404753, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.2705071, + "balance_loss_mlp": 1.02204406, + "epoch": 0.3673530738012927, + "flos": 20593520741160.0, + "grad_norm": 1.9980705244764294, + "language_loss": 0.53208584, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55650294, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.14923096, + "step": 6110, + "time_per_iteration": 2.79753041267395 + }, + { + "auxiliary_loss_clip": 0.01405589, + "auxiliary_loss_mlp": 0.01040461, + "balance_loss_clip": 1.27150118, + "balance_loss_mlp": 1.02511871, + "epoch": 0.36741319705396064, + "flos": 21694919153760.0, + "grad_norm": 1.953001756767843, + "language_loss": 0.80312496, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82758546, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.15344238, + "step": 6111, + "time_per_iteration": 2.8932769298553467 + }, + { + "auxiliary_loss_clip": 0.01408137, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.27519083, + "balance_loss_mlp": 1.02317595, + "epoch": 0.3674733203066286, + "flos": 29868298993080.0, + "grad_norm": 1.5546869733001087, + "language_loss": 0.72912848, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.7535885, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.14691162, + "step": 6112, + "time_per_iteration": 2.8461923599243164 + }, + { + "auxiliary_loss_clip": 0.01402249, + "auxiliary_loss_mlp": 0.01038566, + "balance_loss_clip": 1.26862049, + "balance_loss_mlp": 1.02384317, + "epoch": 0.36753344355929657, + "flos": 18261251172360.0, + "grad_norm": 2.5998263593400304, + "language_loss": 0.85715115, + "learning_rate": 2.919257954049892e-06, + "loss": 0.88155925, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.14709473, + "step": 6113, + "time_per_iteration": 2.769486427307129 + }, + { + "auxiliary_loss_clip": 0.01411464, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.27344966, + "balance_loss_mlp": 1.02727056, + "epoch": 0.36759356681196453, + "flos": 25306610496120.0, + "grad_norm": 1.959844439241172, + "language_loss": 0.78985482, + "learning_rate": 2.918912051407413e-06, + "loss": 0.8144021, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.15991211, + "step": 6114, + "time_per_iteration": 2.7671656608581543 + }, + { + "auxiliary_loss_clip": 0.01411728, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_clip": 1.27168083, + "balance_loss_mlp": 1.0310241, + "epoch": 0.3676536900646325, + "flos": 21037937900400.0, + "grad_norm": 1.783242837944854, + "language_loss": 0.66960198, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69419539, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.16589355, + "step": 6115, + "time_per_iteration": 2.791149377822876 + }, + { + "auxiliary_loss_clip": 0.01400449, + "auxiliary_loss_mlp": 0.01038027, + "balance_loss_clip": 1.26901817, + "balance_loss_mlp": 1.02366221, + "epoch": 0.36771381331730046, + "flos": 16292703305520.0, + "grad_norm": 2.7220467428810644, + "language_loss": 0.77132183, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.79570663, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.14367676, + "step": 6116, + "time_per_iteration": 2.696755886077881 + }, + { + "auxiliary_loss_clip": 0.01401835, + "auxiliary_loss_mlp": 0.01039611, + "balance_loss_clip": 1.2663033, + "balance_loss_mlp": 1.02498412, + "epoch": 0.36777393656996843, + "flos": 22315126214160.0, + "grad_norm": 2.00152796401488, + "language_loss": 0.63026643, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65468085, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.14624023, + "step": 6117, + "time_per_iteration": 2.858116388320923 + }, + { + "auxiliary_loss_clip": 0.01399101, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.2651031, + "balance_loss_mlp": 1.01921034, + "epoch": 0.3678340598226364, + "flos": 26839796868000.0, + "grad_norm": 1.781302795310384, + "language_loss": 0.73083878, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75516623, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.14428711, + "step": 6118, + "time_per_iteration": 2.8068580627441406 + }, + { + "auxiliary_loss_clip": 0.01415691, + "auxiliary_loss_mlp": 0.01041472, + "balance_loss_clip": 1.27421212, + "balance_loss_mlp": 1.02477121, + "epoch": 0.36789418307530436, + "flos": 21766680771840.0, + "grad_norm": 1.499475847590137, + "language_loss": 0.72769648, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.75226814, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.16723633, + "step": 6119, + "time_per_iteration": 2.758620023727417 + }, + { + "auxiliary_loss_clip": 0.01400547, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.2660768, + "balance_loss_mlp": 1.02230406, + "epoch": 0.3679543063279723, + "flos": 15928413086520.0, + "grad_norm": 3.825313658040653, + "language_loss": 0.80777633, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.83215111, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.14642334, + "step": 6120, + "time_per_iteration": 2.7778375148773193 + }, + { + "auxiliary_loss_clip": 0.01401956, + "auxiliary_loss_mlp": 0.01039639, + "balance_loss_clip": 1.26622295, + "balance_loss_mlp": 1.02521491, + "epoch": 0.3680144295806403, + "flos": 24280344195480.0, + "grad_norm": 1.8154238371106075, + "language_loss": 0.64639068, + "learning_rate": 2.916489757978126e-06, + "loss": 0.67080665, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.14422607, + "step": 6121, + "time_per_iteration": 2.7571043968200684 + }, + { + "auxiliary_loss_clip": 0.01401556, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.26608217, + "balance_loss_mlp": 1.02737176, + "epoch": 0.36807455283330826, + "flos": 26109754529040.0, + "grad_norm": 2.2175853848639595, + "language_loss": 0.71828932, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.74272847, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.14990234, + "step": 6122, + "time_per_iteration": 2.893479347229004 + }, + { + "auxiliary_loss_clip": 0.01398711, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.26682508, + "balance_loss_mlp": 1.01820505, + "epoch": 0.3681346760859763, + "flos": 24650441409960.0, + "grad_norm": 2.5023521286010544, + "language_loss": 0.69542968, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71974587, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.14709473, + "step": 6123, + "time_per_iteration": 2.777056932449341 + }, + { + "auxiliary_loss_clip": 0.01411173, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.27113879, + "balance_loss_mlp": 1.02737021, + "epoch": 0.36819479933864424, + "flos": 23883665486400.0, + "grad_norm": 2.1388333521362064, + "language_loss": 0.74505031, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.76959819, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.16223145, + "step": 6124, + "time_per_iteration": 2.7917392253875732 + }, + { + "auxiliary_loss_clip": 0.01401963, + "auxiliary_loss_mlp": 0.01034257, + "balance_loss_clip": 1.26522517, + "balance_loss_mlp": 1.0184741, + "epoch": 0.3682549225913122, + "flos": 25559156843640.0, + "grad_norm": 1.8257151244734884, + "language_loss": 0.74622023, + "learning_rate": 2.915104825441114e-06, + "loss": 0.77058244, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.15783691, + "step": 6125, + "time_per_iteration": 2.7726168632507324 + }, + { + "auxiliary_loss_clip": 0.01414083, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.27449, + "balance_loss_mlp": 1.02344406, + "epoch": 0.36831504584398017, + "flos": 16951065243120.0, + "grad_norm": 2.2345400348153257, + "language_loss": 0.78891611, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.81346196, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.17053223, + "step": 6126, + "time_per_iteration": 2.749267578125 + }, + { + "auxiliary_loss_clip": 0.01412863, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_clip": 1.27134418, + "balance_loss_mlp": 1.02809548, + "epoch": 0.36837516909664814, + "flos": 19869853914720.0, + "grad_norm": 2.0214884625008596, + "language_loss": 0.66998225, + "learning_rate": 2.914412150914888e-06, + "loss": 0.69456923, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.17736816, + "step": 6127, + "time_per_iteration": 2.76670503616333 + }, + { + "auxiliary_loss_clip": 0.01412921, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.27323294, + "balance_loss_mlp": 1.0241009, + "epoch": 0.3684352923493161, + "flos": 37633223698920.0, + "grad_norm": 1.7985151019312682, + "language_loss": 0.70356125, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72809422, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.16271973, + "step": 6128, + "time_per_iteration": 2.9065463542938232 + }, + { + "auxiliary_loss_clip": 0.01403793, + "auxiliary_loss_mlp": 0.01044337, + "balance_loss_clip": 1.26764011, + "balance_loss_mlp": 1.02937603, + "epoch": 0.36849541560198407, + "flos": 14469627876120.0, + "grad_norm": 1.8720950605795856, + "language_loss": 0.75700724, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.78148854, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.1496582, + "step": 6129, + "time_per_iteration": 2.8743247985839844 + }, + { + "auxiliary_loss_clip": 0.01407547, + "auxiliary_loss_mlp": 0.01039436, + "balance_loss_clip": 1.26994634, + "balance_loss_mlp": 1.02368855, + "epoch": 0.36855553885465203, + "flos": 25775659948680.0, + "grad_norm": 2.214053469748881, + "language_loss": 0.84947246, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.87394232, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.15759277, + "step": 6130, + "time_per_iteration": 4.244299650192261 + }, + { + "auxiliary_loss_clip": 0.01253597, + "auxiliary_loss_mlp": 0.01009716, + "balance_loss_clip": 1.17984772, + "balance_loss_mlp": 1.00399399, + "epoch": 0.36861566210732, + "flos": 65066825547360.0, + "grad_norm": 0.8109461792742636, + "language_loss": 0.60280257, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62543571, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.05712891, + "step": 6131, + "time_per_iteration": 3.442643404006958 + }, + { + "auxiliary_loss_clip": 0.01400562, + "auxiliary_loss_mlp": 0.01037772, + "balance_loss_clip": 1.2657547, + "balance_loss_mlp": 1.02210784, + "epoch": 0.36867578535998796, + "flos": 30960438699600.0, + "grad_norm": 1.6344738219624275, + "language_loss": 0.73342019, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75780356, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.15673828, + "step": 6132, + "time_per_iteration": 2.93546724319458 + }, + { + "auxiliary_loss_clip": 0.01418125, + "auxiliary_loss_mlp": 0.01042432, + "balance_loss_clip": 1.27568841, + "balance_loss_mlp": 1.02632725, + "epoch": 0.3687359086126559, + "flos": 28843575810120.0, + "grad_norm": 1.516023053697016, + "language_loss": 0.74381721, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76842284, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.16088867, + "step": 6133, + "time_per_iteration": 2.797581911087036 + }, + { + "auxiliary_loss_clip": 0.01398313, + "auxiliary_loss_mlp": 0.01045567, + "balance_loss_clip": 1.26531315, + "balance_loss_mlp": 1.02985561, + "epoch": 0.3687960318653239, + "flos": 21401700210720.0, + "grad_norm": 1.6201614919713778, + "language_loss": 0.71505547, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73949432, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15722656, + "step": 6134, + "time_per_iteration": 2.768476724624634 + }, + { + "auxiliary_loss_clip": 0.01406635, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.2707485, + "balance_loss_mlp": 1.01978016, + "epoch": 0.36885615511799186, + "flos": 20270837109960.0, + "grad_norm": 1.531990742736496, + "language_loss": 0.75306463, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77748203, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.15332031, + "step": 6135, + "time_per_iteration": 4.242196798324585 + }, + { + "auxiliary_loss_clip": 0.01252166, + "auxiliary_loss_mlp": 0.01012812, + "balance_loss_clip": 1.18028569, + "balance_loss_mlp": 1.00687492, + "epoch": 0.3689162783706599, + "flos": 63102483968760.0, + "grad_norm": 0.8141595029114352, + "language_loss": 0.5880065, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.61065626, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.05932617, + "step": 6136, + "time_per_iteration": 3.2147693634033203 + }, + { + "auxiliary_loss_clip": 0.01403246, + "auxiliary_loss_mlp": 0.01044469, + "balance_loss_clip": 1.2679286, + "balance_loss_mlp": 1.02923357, + "epoch": 0.36897640162332784, + "flos": 10965538352520.0, + "grad_norm": 1.9003544215471877, + "language_loss": 0.79540604, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81988323, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.15240479, + "step": 6137, + "time_per_iteration": 2.759615659713745 + }, + { + "auxiliary_loss_clip": 0.01405715, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.26846719, + "balance_loss_mlp": 1.02638376, + "epoch": 0.3690365248759958, + "flos": 20709203623560.0, + "grad_norm": 1.8063891603675404, + "language_loss": 0.74510652, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76958197, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.15441895, + "step": 6138, + "time_per_iteration": 2.79386568069458 + }, + { + "auxiliary_loss_clip": 0.01415055, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.27435946, + "balance_loss_mlp": 1.02178216, + "epoch": 0.3690966481286638, + "flos": 31831933248360.0, + "grad_norm": 2.2356563261786064, + "language_loss": 0.65206325, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67658567, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.15393066, + "step": 6139, + "time_per_iteration": 4.452153921127319 + }, + { + "auxiliary_loss_clip": 0.01396427, + "auxiliary_loss_mlp": 0.01048852, + "balance_loss_clip": 1.26210964, + "balance_loss_mlp": 1.03262806, + "epoch": 0.36915677138133174, + "flos": 13118809959720.0, + "grad_norm": 1.9508610716893942, + "language_loss": 0.72195125, + "learning_rate": 2.909906390418006e-06, + "loss": 0.74640405, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.16223145, + "step": 6140, + "time_per_iteration": 4.251903057098389 + }, + { + "auxiliary_loss_clip": 0.01253327, + "auxiliary_loss_mlp": 0.01019009, + "balance_loss_clip": 1.18183339, + "balance_loss_mlp": 1.01416922, + "epoch": 0.3692168946339997, + "flos": 68703311616480.0, + "grad_norm": 0.7502910578846529, + "language_loss": 0.59328032, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61600363, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.04833984, + "step": 6141, + "time_per_iteration": 3.4419312477111816 + }, + { + "auxiliary_loss_clip": 0.01405239, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.26772296, + "balance_loss_mlp": 1.02098656, + "epoch": 0.36927701788666767, + "flos": 22022963088480.0, + "grad_norm": 1.7066131892879444, + "language_loss": 0.75333965, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77775246, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.15075684, + "step": 6142, + "time_per_iteration": 2.8584635257720947 + }, + { + "auxiliary_loss_clip": 0.01393036, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.25925684, + "balance_loss_mlp": 1.02522349, + "epoch": 0.36933714113933563, + "flos": 21840675849720.0, + "grad_norm": 1.6424205486582408, + "language_loss": 0.77081621, + "learning_rate": 2.908865770392555e-06, + "loss": 0.7951411, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.14221191, + "step": 6143, + "time_per_iteration": 2.816743850708008 + }, + { + "auxiliary_loss_clip": 0.01398606, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.26479769, + "balance_loss_mlp": 1.01953769, + "epoch": 0.3693972643920036, + "flos": 23696424027720.0, + "grad_norm": 1.518375025511408, + "language_loss": 0.81629527, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84061813, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.14154053, + "step": 6144, + "time_per_iteration": 2.8065593242645264 + }, + { + "auxiliary_loss_clip": 0.01397764, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.26092148, + "balance_loss_mlp": 1.02527261, + "epoch": 0.36945738764467156, + "flos": 22861906713720.0, + "grad_norm": 6.538663399622165, + "language_loss": 0.78108889, + "learning_rate": 2.908171851365593e-06, + "loss": 0.80546844, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.14916992, + "step": 6145, + "time_per_iteration": 2.7589006423950195 + }, + { + "auxiliary_loss_clip": 0.01406369, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.26826167, + "balance_loss_mlp": 1.02188909, + "epoch": 0.36951751089733953, + "flos": 16620056898120.0, + "grad_norm": 1.7892602328274798, + "language_loss": 0.76617187, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79060769, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.15307617, + "step": 6146, + "time_per_iteration": 2.785392999649048 + }, + { + "auxiliary_loss_clip": 0.0140444, + "auxiliary_loss_mlp": 0.01040784, + "balance_loss_clip": 1.26654434, + "balance_loss_mlp": 1.02512002, + "epoch": 0.3695776341500075, + "flos": 18918963376200.0, + "grad_norm": 1.746031172731547, + "language_loss": 0.80652267, + "learning_rate": 2.907477794586761e-06, + "loss": 0.83097488, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.15661621, + "step": 6147, + "time_per_iteration": 2.744720935821533 + }, + { + "auxiliary_loss_clip": 0.01399382, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.26077604, + "balance_loss_mlp": 1.02106285, + "epoch": 0.36963775740267546, + "flos": 20812703997960.0, + "grad_norm": 1.8032515388303696, + "language_loss": 0.84044892, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.86479568, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.14233398, + "step": 6148, + "time_per_iteration": 2.771007537841797 + }, + { + "auxiliary_loss_clip": 0.01402766, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.26860869, + "balance_loss_mlp": 1.02056062, + "epoch": 0.3696978806553435, + "flos": 26066726648640.0, + "grad_norm": 2.0045886475465853, + "language_loss": 0.74217033, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76655769, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.15423584, + "step": 6149, + "time_per_iteration": 2.7717185020446777 + }, + { + "auxiliary_loss_clip": 0.01408502, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.26992643, + "balance_loss_mlp": 1.02456212, + "epoch": 0.36975800390801145, + "flos": 26839634434560.0, + "grad_norm": 2.108482451155977, + "language_loss": 0.71479303, + "learning_rate": 2.906436451364054e-06, + "loss": 0.7392869, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.16320801, + "step": 6150, + "time_per_iteration": 2.9153192043304443 + }, + { + "auxiliary_loss_clip": 0.01403241, + "auxiliary_loss_mlp": 0.01038339, + "balance_loss_clip": 1.267313, + "balance_loss_mlp": 1.02392685, + "epoch": 0.3698181271606794, + "flos": 21147651353880.0, + "grad_norm": 1.7918712133855546, + "language_loss": 0.81450105, + "learning_rate": 2.906089268194611e-06, + "loss": 0.8389169, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.14422607, + "step": 6151, + "time_per_iteration": 2.806135654449463 + }, + { + "auxiliary_loss_clip": 0.01245749, + "auxiliary_loss_mlp": 0.01017829, + "balance_loss_clip": 1.17789221, + "balance_loss_mlp": 1.01339424, + "epoch": 0.3698782504133474, + "flos": 66757446841680.0, + "grad_norm": 0.8173003288017008, + "language_loss": 0.63124651, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65388227, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.04443359, + "step": 6152, + "time_per_iteration": 3.3827130794525146 + }, + { + "auxiliary_loss_clip": 0.01393139, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.26156831, + "balance_loss_mlp": 1.02084398, + "epoch": 0.36993837366601534, + "flos": 24316265612880.0, + "grad_norm": 5.193838001624354, + "language_loss": 0.70583045, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.73011613, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.14593506, + "step": 6153, + "time_per_iteration": 2.8200113773345947 + }, + { + "auxiliary_loss_clip": 0.01402527, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.26499414, + "balance_loss_mlp": 1.01835012, + "epoch": 0.3699984969186833, + "flos": 24354095623200.0, + "grad_norm": 2.254033964668554, + "language_loss": 0.73213083, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.75649154, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.15197754, + "step": 6154, + "time_per_iteration": 2.805643320083618 + }, + { + "auxiliary_loss_clip": 0.01404009, + "auxiliary_loss_mlp": 0.01030491, + "balance_loss_clip": 1.26765323, + "balance_loss_mlp": 1.01547623, + "epoch": 0.37005862017135127, + "flos": 19834013714040.0, + "grad_norm": 1.9317008870981447, + "language_loss": 0.67828214, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70262712, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.15008545, + "step": 6155, + "time_per_iteration": 2.766923666000366 + }, + { + "auxiliary_loss_clip": 0.01394294, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.25919211, + "balance_loss_mlp": 1.01610494, + "epoch": 0.37011874342401924, + "flos": 19578624781320.0, + "grad_norm": 1.7052120878709047, + "language_loss": 0.68517971, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.70943618, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.15246582, + "step": 6156, + "time_per_iteration": 2.8884541988372803 + }, + { + "auxiliary_loss_clip": 0.01391839, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.25943923, + "balance_loss_mlp": 1.01753247, + "epoch": 0.3701788666766872, + "flos": 20379007445760.0, + "grad_norm": 2.1260164737830216, + "language_loss": 0.82257223, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84680629, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.14025879, + "step": 6157, + "time_per_iteration": 2.79396390914917 + }, + { + "auxiliary_loss_clip": 0.01407369, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.26681805, + "balance_loss_mlp": 1.02018929, + "epoch": 0.37023898992935517, + "flos": 15345142652520.0, + "grad_norm": 2.3073956556609323, + "language_loss": 0.7772367, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.80167836, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.1661377, + "step": 6158, + "time_per_iteration": 2.7258479595184326 + }, + { + "auxiliary_loss_clip": 0.01406595, + "auxiliary_loss_mlp": 0.01036457, + "balance_loss_clip": 1.26782107, + "balance_loss_mlp": 1.02031589, + "epoch": 0.37029911318202313, + "flos": 19578787214760.0, + "grad_norm": 2.219046265516969, + "language_loss": 0.68746871, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71189922, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.16149902, + "step": 6159, + "time_per_iteration": 2.74493670463562 + }, + { + "auxiliary_loss_clip": 0.01395862, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.26084661, + "balance_loss_mlp": 1.0252049, + "epoch": 0.3703592364346911, + "flos": 26218940073840.0, + "grad_norm": 1.8985641962700044, + "language_loss": 0.71469796, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73904788, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.13934326, + "step": 6160, + "time_per_iteration": 2.835991382598877 + }, + { + "auxiliary_loss_clip": 0.01389667, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.25692379, + "balance_loss_mlp": 1.01804972, + "epoch": 0.37041935968735906, + "flos": 20053237579200.0, + "grad_norm": 3.0102251151013304, + "language_loss": 0.79692292, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.82113981, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.13970947, + "step": 6161, + "time_per_iteration": 2.764310359954834 + }, + { + "auxiliary_loss_clip": 0.01396482, + "auxiliary_loss_mlp": 0.01044637, + "balance_loss_clip": 1.2600466, + "balance_loss_mlp": 1.02875257, + "epoch": 0.3704794829400271, + "flos": 24139135635840.0, + "grad_norm": 3.5430914983008086, + "language_loss": 0.79765052, + "learning_rate": 2.902267988534295e-06, + "loss": 0.82206172, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.15893555, + "step": 6162, + "time_per_iteration": 2.916583776473999 + }, + { + "auxiliary_loss_clip": 0.0140119, + "auxiliary_loss_mlp": 0.01043866, + "balance_loss_clip": 1.26625586, + "balance_loss_mlp": 1.0293349, + "epoch": 0.37053960619269505, + "flos": 14871057763320.0, + "grad_norm": 1.9077678007368823, + "language_loss": 0.79855955, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.82301009, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.14526367, + "step": 6163, + "time_per_iteration": 2.855095624923706 + }, + { + "auxiliary_loss_clip": 0.01404373, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.26870728, + "balance_loss_mlp": 1.02664447, + "epoch": 0.370599729445363, + "flos": 21366550352160.0, + "grad_norm": 8.214571875834718, + "language_loss": 0.68197179, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70643348, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.15148926, + "step": 6164, + "time_per_iteration": 2.8165876865386963 + }, + { + "auxiliary_loss_clip": 0.01404894, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.26883209, + "balance_loss_mlp": 1.02429438, + "epoch": 0.370659852698031, + "flos": 26834111697600.0, + "grad_norm": 2.8984643399349657, + "language_loss": 0.83698606, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.86143184, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.15393066, + "step": 6165, + "time_per_iteration": 2.900562047958374 + }, + { + "auxiliary_loss_clip": 0.01413, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_clip": 1.27282012, + "balance_loss_mlp": 1.03140271, + "epoch": 0.37071997595069894, + "flos": 19103727724920.0, + "grad_norm": 2.0054809787275674, + "language_loss": 0.69295096, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71756375, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.16894531, + "step": 6166, + "time_per_iteration": 2.810539722442627 + }, + { + "auxiliary_loss_clip": 0.0125699, + "auxiliary_loss_mlp": 0.0102193, + "balance_loss_clip": 1.18983006, + "balance_loss_mlp": 1.01742387, + "epoch": 0.3707800992033669, + "flos": 52190450453520.0, + "grad_norm": 0.7930478846982064, + "language_loss": 0.57050526, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59329438, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.04516602, + "step": 6167, + "time_per_iteration": 3.1542091369628906 + }, + { + "auxiliary_loss_clip": 0.01401919, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.26919365, + "balance_loss_mlp": 1.03253841, + "epoch": 0.3708402224560349, + "flos": 19906668716040.0, + "grad_norm": 2.029726353437735, + "language_loss": 0.75444281, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77892703, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.13964844, + "step": 6168, + "time_per_iteration": 2.786098003387451 + }, + { + "auxiliary_loss_clip": 0.01406606, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.27319574, + "balance_loss_mlp": 1.02430201, + "epoch": 0.37090034570870284, + "flos": 20011996466640.0, + "grad_norm": 1.7315129242306284, + "language_loss": 0.73815203, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76260418, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.14306641, + "step": 6169, + "time_per_iteration": 4.199568033218384 + }, + { + "auxiliary_loss_clip": 0.01403122, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.27144122, + "balance_loss_mlp": 1.02606297, + "epoch": 0.3709604689613708, + "flos": 24140150844840.0, + "grad_norm": 1.4832049496641222, + "language_loss": 0.79776758, + "learning_rate": 2.899486274782127e-06, + "loss": 0.82219738, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.13793945, + "step": 6170, + "time_per_iteration": 2.8091225624084473 + }, + { + "auxiliary_loss_clip": 0.01404234, + "auxiliary_loss_mlp": 0.01042315, + "balance_loss_clip": 1.26957154, + "balance_loss_mlp": 1.02609098, + "epoch": 0.37102059221403877, + "flos": 23881066551360.0, + "grad_norm": 1.4914095535153833, + "language_loss": 0.76513171, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78959715, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.16204834, + "step": 6171, + "time_per_iteration": 2.7613680362701416 + }, + { + "auxiliary_loss_clip": 0.01412147, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.27719474, + "balance_loss_mlp": 1.02250779, + "epoch": 0.37108071546670673, + "flos": 14505305643360.0, + "grad_norm": 2.005272375536879, + "language_loss": 0.8024857, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82697523, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1427002, + "step": 6172, + "time_per_iteration": 2.7849414348602295 + }, + { + "auxiliary_loss_clip": 0.01418568, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.28107905, + "balance_loss_mlp": 1.02601373, + "epoch": 0.3711408387193747, + "flos": 34568515897920.0, + "grad_norm": 2.2276992867615317, + "language_loss": 0.60386562, + "learning_rate": 2.89844256897035e-06, + "loss": 0.62846494, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.15350342, + "step": 6173, + "time_per_iteration": 2.9837276935577393 + }, + { + "auxiliary_loss_clip": 0.01410235, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.27498007, + "balance_loss_mlp": 1.02496362, + "epoch": 0.37120096197204266, + "flos": 17315111811960.0, + "grad_norm": 1.9271628988013667, + "language_loss": 0.81111854, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83562207, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15161133, + "step": 6174, + "time_per_iteration": 4.206800699234009 + }, + { + "auxiliary_loss_clip": 0.01403176, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.27210712, + "balance_loss_mlp": 1.019521, + "epoch": 0.37126108522471063, + "flos": 30670143558480.0, + "grad_norm": 2.351351636347905, + "language_loss": 0.79987007, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82423246, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.13531494, + "step": 6175, + "time_per_iteration": 2.8588452339172363 + }, + { + "auxiliary_loss_clip": 0.0141642, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.28229213, + "balance_loss_mlp": 1.02480483, + "epoch": 0.37132120847737865, + "flos": 25161138058680.0, + "grad_norm": 1.7746632069361843, + "language_loss": 0.88747418, + "learning_rate": 2.89739855653729e-06, + "loss": 0.91203177, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.14544678, + "step": 6176, + "time_per_iteration": 2.780109167098999 + }, + { + "auxiliary_loss_clip": 0.01415749, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.2813921, + "balance_loss_mlp": 1.01922965, + "epoch": 0.3713813317300466, + "flos": 21218235329520.0, + "grad_norm": 3.108747655272559, + "language_loss": 0.73594558, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.76044285, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.14746094, + "step": 6177, + "time_per_iteration": 2.8216476440429688 + }, + { + "auxiliary_loss_clip": 0.01412198, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.27864599, + "balance_loss_mlp": 1.02557695, + "epoch": 0.3714414549827146, + "flos": 21621695634720.0, + "grad_norm": 2.258496127665914, + "language_loss": 0.75206053, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77658987, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.15161133, + "step": 6178, + "time_per_iteration": 4.309784173965454 + }, + { + "auxiliary_loss_clip": 0.0140719, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.27558947, + "balance_loss_mlp": 1.01943612, + "epoch": 0.37150157823538255, + "flos": 19976805999720.0, + "grad_norm": 9.029625738645878, + "language_loss": 0.7214644, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74587911, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.14837646, + "step": 6179, + "time_per_iteration": 4.273470640182495 + }, + { + "auxiliary_loss_clip": 0.0141513, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.27896941, + "balance_loss_mlp": 1.0185833, + "epoch": 0.3715617014880505, + "flos": 24865604439120.0, + "grad_norm": 1.8907758316977303, + "language_loss": 0.70195854, + "learning_rate": 2.896006063609283e-06, + "loss": 0.7264514, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.15576172, + "step": 6180, + "time_per_iteration": 2.805670976638794 + }, + { + "auxiliary_loss_clip": 0.01407594, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.27516055, + "balance_loss_mlp": 1.01537943, + "epoch": 0.3716218247407185, + "flos": 20453977124280.0, + "grad_norm": 1.9118194584747707, + "language_loss": 0.78080487, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80518258, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.14801025, + "step": 6181, + "time_per_iteration": 2.822779893875122 + }, + { + "auxiliary_loss_clip": 0.01410075, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.27634001, + "balance_loss_mlp": 1.01946366, + "epoch": 0.37168194799338644, + "flos": 24138770160600.0, + "grad_norm": 1.7162032050257674, + "language_loss": 0.78511387, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80956894, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15966797, + "step": 6182, + "time_per_iteration": 2.960674285888672 + }, + { + "auxiliary_loss_clip": 0.01261758, + "auxiliary_loss_mlp": 0.01015175, + "balance_loss_clip": 1.19487214, + "balance_loss_mlp": 1.00940537, + "epoch": 0.3717420712460544, + "flos": 67425067485360.0, + "grad_norm": 0.7944891203474862, + "language_loss": 0.57449889, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59726822, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.05761719, + "step": 6183, + "time_per_iteration": 3.380215883255005 + }, + { + "auxiliary_loss_clip": 0.01418597, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.27898049, + "balance_loss_mlp": 1.02014601, + "epoch": 0.37180219449872237, + "flos": 22381283878560.0, + "grad_norm": 1.8736321597982464, + "language_loss": 0.77311361, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79766238, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.16125488, + "step": 6184, + "time_per_iteration": 2.965763568878174 + }, + { + "auxiliary_loss_clip": 0.01404125, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.27149117, + "balance_loss_mlp": 1.02388644, + "epoch": 0.37186231775139034, + "flos": 21874688674200.0, + "grad_norm": 2.2781203238958314, + "language_loss": 0.72300804, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74743396, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.14575195, + "step": 6185, + "time_per_iteration": 2.830071449279785 + }, + { + "auxiliary_loss_clip": 0.01398608, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.26758742, + "balance_loss_mlp": 1.01448584, + "epoch": 0.3719224410040583, + "flos": 22419844839360.0, + "grad_norm": 1.6690260949242475, + "language_loss": 0.76854122, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79282439, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15240479, + "step": 6186, + "time_per_iteration": 2.8219189643859863 + }, + { + "auxiliary_loss_clip": 0.01412758, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.27464128, + "balance_loss_mlp": 1.01845157, + "epoch": 0.37198256425672627, + "flos": 25156143230400.0, + "grad_norm": 1.8822007868247028, + "language_loss": 0.83811569, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.86260247, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.17468262, + "step": 6187, + "time_per_iteration": 2.8704748153686523 + }, + { + "auxiliary_loss_clip": 0.01398935, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.26711118, + "balance_loss_mlp": 1.01534224, + "epoch": 0.37204268750939423, + "flos": 21142778350680.0, + "grad_norm": 1.8988636449095189, + "language_loss": 0.84789991, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87219393, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.15130615, + "step": 6188, + "time_per_iteration": 2.912733554840088 + }, + { + "auxiliary_loss_clip": 0.01399842, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.26664019, + "balance_loss_mlp": 1.02164674, + "epoch": 0.37210281076206225, + "flos": 21511819747800.0, + "grad_norm": 1.6791183189173484, + "language_loss": 0.65815485, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.68252313, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.15344238, + "step": 6189, + "time_per_iteration": 2.7847416400909424 + }, + { + "auxiliary_loss_clip": 0.01404247, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.26995623, + "balance_loss_mlp": 1.01476443, + "epoch": 0.3721629340147302, + "flos": 17352332696880.0, + "grad_norm": 1.9251554933754718, + "language_loss": 0.84040797, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86475575, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.15759277, + "step": 6190, + "time_per_iteration": 2.7808713912963867 + }, + { + "auxiliary_loss_clip": 0.01410939, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.27312732, + "balance_loss_mlp": 1.0147965, + "epoch": 0.3722230572673982, + "flos": 16436348366760.0, + "grad_norm": 2.6237349789289834, + "language_loss": 0.8867892, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.91119486, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.14825439, + "step": 6191, + "time_per_iteration": 2.7327818870544434 + }, + { + "auxiliary_loss_clip": 0.01410613, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.27305698, + "balance_loss_mlp": 1.01481557, + "epoch": 0.37228318052006615, + "flos": 22679903733480.0, + "grad_norm": 1.5694655207895876, + "language_loss": 0.73938811, + "learning_rate": 2.891825326449073e-06, + "loss": 0.7638166, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.17419434, + "step": 6192, + "time_per_iteration": 2.766523599624634 + }, + { + "auxiliary_loss_clip": 0.01405757, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.27191687, + "balance_loss_mlp": 1.01694, + "epoch": 0.3723433037727341, + "flos": 25271054553960.0, + "grad_norm": 2.219980287857366, + "language_loss": 0.80654621, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.83093232, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.15917969, + "step": 6193, + "time_per_iteration": 2.8082244396209717 + }, + { + "auxiliary_loss_clip": 0.01405426, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.2681601, + "balance_loss_mlp": 1.0151186, + "epoch": 0.3724034270254021, + "flos": 10528227656280.0, + "grad_norm": 1.8854541289124511, + "language_loss": 0.84391248, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86825931, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.14147949, + "step": 6194, + "time_per_iteration": 2.7338719367980957 + }, + { + "auxiliary_loss_clip": 0.01405713, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.27100396, + "balance_loss_mlp": 1.01796174, + "epoch": 0.37246355027807004, + "flos": 20271040151760.0, + "grad_norm": 2.409440725107593, + "language_loss": 0.77619141, + "learning_rate": 2.890779380359646e-06, + "loss": 0.80058515, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.15704346, + "step": 6195, + "time_per_iteration": 2.9910359382629395 + }, + { + "auxiliary_loss_clip": 0.01402363, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.27068162, + "balance_loss_mlp": 1.01677012, + "epoch": 0.372523673530738, + "flos": 19505360653920.0, + "grad_norm": 1.6656681483128002, + "language_loss": 0.79562652, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81996894, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15112305, + "step": 6196, + "time_per_iteration": 2.77325177192688 + }, + { + "auxiliary_loss_clip": 0.01401966, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.26986551, + "balance_loss_mlp": 1.0165453, + "epoch": 0.372583796783406, + "flos": 16768696787640.0, + "grad_norm": 2.0314568633528043, + "language_loss": 0.83781314, + "learning_rate": 2.890081914052443e-06, + "loss": 0.86213708, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.13879395, + "step": 6197, + "time_per_iteration": 2.756154775619507 + }, + { + "auxiliary_loss_clip": 0.01396032, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.26559782, + "balance_loss_mlp": 1.01724243, + "epoch": 0.37264392003607394, + "flos": 22643129540520.0, + "grad_norm": 2.195526763481387, + "language_loss": 0.64503747, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66932535, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15509033, + "step": 6198, + "time_per_iteration": 2.774627208709717 + }, + { + "auxiliary_loss_clip": 0.01402617, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.27092195, + "balance_loss_mlp": 1.02418494, + "epoch": 0.3727040432887419, + "flos": 19977739992000.0, + "grad_norm": 1.7474386431106528, + "language_loss": 0.74140048, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76582021, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.15161133, + "step": 6199, + "time_per_iteration": 2.844442129135132 + }, + { + "auxiliary_loss_clip": 0.01401909, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.27048004, + "balance_loss_mlp": 1.01544738, + "epoch": 0.37276416654140987, + "flos": 63909103339440.0, + "grad_norm": 1.6341151778407788, + "language_loss": 0.81202364, + "learning_rate": 2.889035461484742e-06, + "loss": 0.83634067, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.14349365, + "step": 6200, + "time_per_iteration": 3.241468906402588 + }, + { + "auxiliary_loss_clip": 0.01402047, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.26990747, + "balance_loss_mlp": 1.01982093, + "epoch": 0.37282428979407783, + "flos": 39793317510600.0, + "grad_norm": 2.5390540634965126, + "language_loss": 0.59975815, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62411797, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.14123535, + "step": 6201, + "time_per_iteration": 2.992727518081665 + }, + { + "auxiliary_loss_clip": 0.01411297, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.27555823, + "balance_loss_mlp": 1.02086103, + "epoch": 0.37288441304674586, + "flos": 22713957166320.0, + "grad_norm": 1.7617403459755183, + "language_loss": 0.72998059, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75445068, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.14855957, + "step": 6202, + "time_per_iteration": 2.781405210494995 + }, + { + "auxiliary_loss_clip": 0.01401813, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.2693212, + "balance_loss_mlp": 1.01671624, + "epoch": 0.3729445362994138, + "flos": 18774912231360.0, + "grad_norm": 2.2661029474229997, + "language_loss": 0.74422711, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76856673, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.15423584, + "step": 6203, + "time_per_iteration": 2.759225368499756 + }, + { + "auxiliary_loss_clip": 0.01402909, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.2721312, + "balance_loss_mlp": 1.01799738, + "epoch": 0.3730046595520818, + "flos": 22461410818800.0, + "grad_norm": 2.3049688397493187, + "language_loss": 0.81744611, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.84178948, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.13409424, + "step": 6204, + "time_per_iteration": 2.8229918479919434 + }, + { + "auxiliary_loss_clip": 0.0140972, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.27423763, + "balance_loss_mlp": 1.02246237, + "epoch": 0.37306478280474975, + "flos": 24321747741480.0, + "grad_norm": 1.5777528842537913, + "language_loss": 0.75278699, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77726549, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.15637207, + "step": 6205, + "time_per_iteration": 2.8119702339172363 + }, + { + "auxiliary_loss_clip": 0.01399787, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.26752758, + "balance_loss_mlp": 1.01870275, + "epoch": 0.3731249060574177, + "flos": 15819430583520.0, + "grad_norm": 2.0787685553873434, + "language_loss": 0.7866658, + "learning_rate": 2.886941646474128e-06, + "loss": 0.81100869, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.15820312, + "step": 6206, + "time_per_iteration": 2.8073642253875732 + }, + { + "auxiliary_loss_clip": 0.01405846, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.27235198, + "balance_loss_mlp": 1.01516998, + "epoch": 0.3731850293100857, + "flos": 19832795463240.0, + "grad_norm": 2.1042282171325857, + "language_loss": 0.93377662, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95813972, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.1529541, + "step": 6207, + "time_per_iteration": 2.8816721439361572 + }, + { + "auxiliary_loss_clip": 0.01413446, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.27716398, + "balance_loss_mlp": 1.01866794, + "epoch": 0.37324515256275365, + "flos": 19067034748680.0, + "grad_norm": 2.43794186325847, + "language_loss": 0.82712078, + "learning_rate": 2.886243438932759e-06, + "loss": 0.85158658, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.14465332, + "step": 6208, + "time_per_iteration": 4.269459009170532 + }, + { + "auxiliary_loss_clip": 0.01405879, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.27028966, + "balance_loss_mlp": 1.0172832, + "epoch": 0.3733052758154216, + "flos": 20709244231920.0, + "grad_norm": 1.7662320773318885, + "language_loss": 0.73177207, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75616282, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.15930176, + "step": 6209, + "time_per_iteration": 2.772303342819214 + }, + { + "auxiliary_loss_clip": 0.01407103, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.27455246, + "balance_loss_mlp": 1.01883924, + "epoch": 0.3733653990680896, + "flos": 20198344541400.0, + "grad_norm": 1.6068017722168402, + "language_loss": 0.70436001, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72878397, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16461182, + "step": 6210, + "time_per_iteration": 2.7996199131011963 + }, + { + "auxiliary_loss_clip": 0.01407678, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.27230859, + "balance_loss_mlp": 1.01375556, + "epoch": 0.37342552232075754, + "flos": 20344547929320.0, + "grad_norm": 1.868671488600711, + "language_loss": 0.77637786, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.80074728, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.1550293, + "step": 6211, + "time_per_iteration": 2.7796120643615723 + }, + { + "auxiliary_loss_clip": 0.01412739, + "auxiliary_loss_mlp": 0.01035634, + "balance_loss_clip": 1.27746594, + "balance_loss_mlp": 1.0202142, + "epoch": 0.3734856455734255, + "flos": 35524766739960.0, + "grad_norm": 3.3436488850237733, + "language_loss": 0.73034555, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75482929, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15405273, + "step": 6212, + "time_per_iteration": 2.9839212894439697 + }, + { + "auxiliary_loss_clip": 0.01428862, + "auxiliary_loss_mlp": 0.0104329, + "balance_loss_clip": 1.28735995, + "balance_loss_mlp": 1.02731621, + "epoch": 0.37354576882609347, + "flos": 21147285878640.0, + "grad_norm": 1.8122814781069365, + "language_loss": 0.82275867, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84748018, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.15991211, + "step": 6213, + "time_per_iteration": 4.2522478103637695 + }, + { + "auxiliary_loss_clip": 0.01409708, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.27583885, + "balance_loss_mlp": 1.02312231, + "epoch": 0.37360589207876144, + "flos": 21511779139440.0, + "grad_norm": 2.049646748924876, + "language_loss": 0.78405476, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.8085438, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.16088867, + "step": 6214, + "time_per_iteration": 2.7541282176971436 + }, + { + "auxiliary_loss_clip": 0.01402975, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.27045739, + "balance_loss_mlp": 1.02519321, + "epoch": 0.37366601533142946, + "flos": 38442946286160.0, + "grad_norm": 1.653541233076874, + "language_loss": 0.84664214, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87106448, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.140625, + "step": 6215, + "time_per_iteration": 2.9493749141693115 + }, + { + "auxiliary_loss_clip": 0.01416662, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.2797482, + "balance_loss_mlp": 1.02210355, + "epoch": 0.3737261385840974, + "flos": 18445650045840.0, + "grad_norm": 1.6974485585915746, + "language_loss": 0.68071628, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70526373, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.15966797, + "step": 6216, + "time_per_iteration": 2.7721946239471436 + }, + { + "auxiliary_loss_clip": 0.0140466, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.27025831, + "balance_loss_mlp": 1.02385485, + "epoch": 0.3737862618367654, + "flos": 22935049016040.0, + "grad_norm": 2.798113478726731, + "language_loss": 0.66844618, + "learning_rate": 2.883099843007303e-06, + "loss": 0.69289315, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.16186523, + "step": 6217, + "time_per_iteration": 4.453213930130005 + }, + { + "auxiliary_loss_clip": 0.014124, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.27606142, + "balance_loss_mlp": 1.01816225, + "epoch": 0.37384638508943335, + "flos": 15413330734920.0, + "grad_norm": 1.8080163241823228, + "language_loss": 0.809618, + "learning_rate": 2.88275038695833e-06, + "loss": 0.8340826, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.15905762, + "step": 6218, + "time_per_iteration": 4.332667589187622 + }, + { + "auxiliary_loss_clip": 0.01396365, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.26671636, + "balance_loss_mlp": 1.02254796, + "epoch": 0.3739065083421013, + "flos": 24286313624400.0, + "grad_norm": 1.5394732592742912, + "language_loss": 0.78825879, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.81259453, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14666748, + "step": 6219, + "time_per_iteration": 2.8608555793762207 + }, + { + "auxiliary_loss_clip": 0.01399361, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.26857138, + "balance_loss_mlp": 1.02243733, + "epoch": 0.3739666315947693, + "flos": 23008028884920.0, + "grad_norm": 1.760294022591362, + "language_loss": 0.77375662, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79812407, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.14941406, + "step": 6220, + "time_per_iteration": 2.8229422569274902 + }, + { + "auxiliary_loss_clip": 0.01413275, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.27855909, + "balance_loss_mlp": 1.01928174, + "epoch": 0.37402675484743725, + "flos": 19395890850600.0, + "grad_norm": 1.9782802996099493, + "language_loss": 0.82751596, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85199869, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.15722656, + "step": 6221, + "time_per_iteration": 2.8193955421447754 + }, + { + "auxiliary_loss_clip": 0.01405002, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.27232945, + "balance_loss_mlp": 1.02174497, + "epoch": 0.3740868781001052, + "flos": 17130266246520.0, + "grad_norm": 3.1583313270852584, + "language_loss": 0.76879978, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.79321516, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.14794922, + "step": 6222, + "time_per_iteration": 2.756520986557007 + }, + { + "auxiliary_loss_clip": 0.01411095, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.27890205, + "balance_loss_mlp": 1.01936412, + "epoch": 0.3741470013527732, + "flos": 20047796058960.0, + "grad_norm": 1.9335120124911618, + "language_loss": 0.70625609, + "learning_rate": 2.881002604868789e-06, + "loss": 0.73071229, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.1517334, + "step": 6223, + "time_per_iteration": 2.839486837387085 + }, + { + "auxiliary_loss_clip": 0.01406646, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.27418971, + "balance_loss_mlp": 1.01865327, + "epoch": 0.37420712460544114, + "flos": 36903425010120.0, + "grad_norm": 2.454923287415256, + "language_loss": 0.69129562, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71568835, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.13946533, + "step": 6224, + "time_per_iteration": 2.9472827911376953 + }, + { + "auxiliary_loss_clip": 0.01398218, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.26757789, + "balance_loss_mlp": 1.01807117, + "epoch": 0.3742672478581091, + "flos": 22206427969680.0, + "grad_norm": 1.8539893453456282, + "language_loss": 0.70163769, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72594714, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.14642334, + "step": 6225, + "time_per_iteration": 2.8956401348114014 + }, + { + "auxiliary_loss_clip": 0.01399825, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.27099419, + "balance_loss_mlp": 1.02111578, + "epoch": 0.3743273711107771, + "flos": 24687012561120.0, + "grad_norm": 2.0154124412325407, + "language_loss": 0.79395795, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81833088, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.16345215, + "step": 6226, + "time_per_iteration": 2.8406827449798584 + }, + { + "auxiliary_loss_clip": 0.01410095, + "auxiliary_loss_mlp": 0.0103728, + "balance_loss_clip": 1.27697611, + "balance_loss_mlp": 1.02162814, + "epoch": 0.37438749436344504, + "flos": 24464661852240.0, + "grad_norm": 1.712351555089301, + "language_loss": 0.6811496, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70562339, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15649414, + "step": 6227, + "time_per_iteration": 2.8159830570220947 + }, + { + "auxiliary_loss_clip": 0.0140222, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.27296019, + "balance_loss_mlp": 1.01896882, + "epoch": 0.374447617616113, + "flos": 21803779831680.0, + "grad_norm": 1.8291468204820425, + "language_loss": 0.83063674, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85500056, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15185547, + "step": 6228, + "time_per_iteration": 2.765535354614258 + }, + { + "auxiliary_loss_clip": 0.01406947, + "auxiliary_loss_mlp": 0.01038587, + "balance_loss_clip": 1.27574909, + "balance_loss_mlp": 1.0233407, + "epoch": 0.374507740868781, + "flos": 17972783407440.0, + "grad_norm": 1.5545866662637327, + "language_loss": 0.74691129, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.7713666, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15258789, + "step": 6229, + "time_per_iteration": 2.811088800430298 + }, + { + "auxiliary_loss_clip": 0.01409552, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.2761085, + "balance_loss_mlp": 1.02551985, + "epoch": 0.374567864121449, + "flos": 16109969374800.0, + "grad_norm": 1.757451831202886, + "language_loss": 0.83544147, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85994929, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15710449, + "step": 6230, + "time_per_iteration": 2.7639832496643066 + }, + { + "auxiliary_loss_clip": 0.01409489, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.27645516, + "balance_loss_mlp": 1.02604675, + "epoch": 0.37462798737411696, + "flos": 25778380708800.0, + "grad_norm": 1.675816591960887, + "language_loss": 0.73385489, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75837171, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.16149902, + "step": 6231, + "time_per_iteration": 2.8570680618286133 + }, + { + "auxiliary_loss_clip": 0.01414161, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.27922273, + "balance_loss_mlp": 1.02225447, + "epoch": 0.3746881106267849, + "flos": 16658902117440.0, + "grad_norm": 1.9735556575272362, + "language_loss": 0.73794603, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.76246798, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.15783691, + "step": 6232, + "time_per_iteration": 2.726994752883911 + }, + { + "auxiliary_loss_clip": 0.01409971, + "auxiliary_loss_mlp": 0.01042296, + "balance_loss_clip": 1.27650011, + "balance_loss_mlp": 1.02638113, + "epoch": 0.3747482338794529, + "flos": 26183708998560.0, + "grad_norm": 1.6531044718642496, + "language_loss": 0.77116305, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79568571, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.15905762, + "step": 6233, + "time_per_iteration": 2.8126566410064697 + }, + { + "auxiliary_loss_clip": 0.01409427, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.27533674, + "balance_loss_mlp": 1.02274024, + "epoch": 0.37480835713212085, + "flos": 12024883485360.0, + "grad_norm": 1.7543206671767497, + "language_loss": 0.69355226, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71801627, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.14233398, + "step": 6234, + "time_per_iteration": 2.7394583225250244 + }, + { + "auxiliary_loss_clip": 0.01409402, + "auxiliary_loss_mlp": 0.01048269, + "balance_loss_clip": 1.27925396, + "balance_loss_mlp": 1.03391671, + "epoch": 0.3748684803847888, + "flos": 19683587056680.0, + "grad_norm": 2.432690301093353, + "language_loss": 0.8305555, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.85513222, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.14337158, + "step": 6235, + "time_per_iteration": 2.7713050842285156 + }, + { + "auxiliary_loss_clip": 0.01411929, + "auxiliary_loss_mlp": 0.0104203, + "balance_loss_clip": 1.27794254, + "balance_loss_mlp": 1.02618718, + "epoch": 0.3749286036374568, + "flos": 20525982392520.0, + "grad_norm": 1.8411729536586008, + "language_loss": 0.78038561, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8049252, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15844727, + "step": 6236, + "time_per_iteration": 2.7976012229919434 + }, + { + "auxiliary_loss_clip": 0.01415056, + "auxiliary_loss_mlp": 0.01040542, + "balance_loss_clip": 1.2790854, + "balance_loss_mlp": 1.02382898, + "epoch": 0.37498872689012475, + "flos": 20709569098800.0, + "grad_norm": 2.7072451513757207, + "language_loss": 0.73714793, + "learning_rate": 2.876104377085234e-06, + "loss": 0.76170385, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.16699219, + "step": 6237, + "time_per_iteration": 2.7750327587127686 + }, + { + "auxiliary_loss_clip": 0.01418817, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.28207171, + "balance_loss_mlp": 1.01729834, + "epoch": 0.3750488501427927, + "flos": 21579195663000.0, + "grad_norm": 1.9721713185441168, + "language_loss": 0.9331696, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95769089, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.16003418, + "step": 6238, + "time_per_iteration": 2.7453813552856445 + }, + { + "auxiliary_loss_clip": 0.01406269, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.27241528, + "balance_loss_mlp": 1.02411115, + "epoch": 0.3751089733954607, + "flos": 15928047611280.0, + "grad_norm": 2.050940956525469, + "language_loss": 0.70952147, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73398513, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.15979004, + "step": 6239, + "time_per_iteration": 2.7468814849853516 + }, + { + "auxiliary_loss_clip": 0.01415139, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.27928483, + "balance_loss_mlp": 1.01825023, + "epoch": 0.37516909664812864, + "flos": 36291705096960.0, + "grad_norm": 2.810000008570107, + "language_loss": 0.65933484, + "learning_rate": 2.875053908444895e-06, + "loss": 0.68382645, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.15771484, + "step": 6240, + "time_per_iteration": 2.917111396789551 + }, + { + "auxiliary_loss_clip": 0.0140995, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.27555466, + "balance_loss_mlp": 1.01731277, + "epoch": 0.3752292199007966, + "flos": 13519793154960.0, + "grad_norm": 2.093797390969884, + "language_loss": 0.76185453, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.78628314, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.15588379, + "step": 6241, + "time_per_iteration": 2.828129529953003 + }, + { + "auxiliary_loss_clip": 0.01414451, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.27989197, + "balance_loss_mlp": 1.02015495, + "epoch": 0.3752893431534646, + "flos": 27203762220120.0, + "grad_norm": 2.167640715537817, + "language_loss": 0.8366611, + "learning_rate": 2.874353430085213e-06, + "loss": 0.86117381, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16662598, + "step": 6242, + "time_per_iteration": 2.823423147201538 + }, + { + "auxiliary_loss_clip": 0.01413992, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_clip": 1.27976871, + "balance_loss_mlp": 1.02882719, + "epoch": 0.3753494664061326, + "flos": 30013730822160.0, + "grad_norm": 2.1030072134080036, + "language_loss": 0.68353266, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70810944, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.14855957, + "step": 6243, + "time_per_iteration": 2.8203742504119873 + }, + { + "auxiliary_loss_clip": 0.01409964, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.27619171, + "balance_loss_mlp": 1.02333736, + "epoch": 0.37540958965880056, + "flos": 24467301395640.0, + "grad_norm": 2.388609046071115, + "language_loss": 0.84135699, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.8658576, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.16760254, + "step": 6244, + "time_per_iteration": 2.809711217880249 + }, + { + "auxiliary_loss_clip": 0.01402967, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.27339506, + "balance_loss_mlp": 1.01606476, + "epoch": 0.3754697129114685, + "flos": 16512739337880.0, + "grad_norm": 2.4917587584762617, + "language_loss": 0.82615161, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.8504951, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.15319824, + "step": 6245, + "time_per_iteration": 2.726799249649048 + }, + { + "auxiliary_loss_clip": 0.01404885, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.27120876, + "balance_loss_mlp": 1.01937413, + "epoch": 0.3755298361641365, + "flos": 19395647200440.0, + "grad_norm": 2.8920775458200545, + "language_loss": 0.63824022, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66264188, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.15869141, + "step": 6246, + "time_per_iteration": 2.950958490371704 + }, + { + "auxiliary_loss_clip": 0.01417286, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.27958012, + "balance_loss_mlp": 1.02282977, + "epoch": 0.37558995941680445, + "flos": 14724488900160.0, + "grad_norm": 1.9966940285471635, + "language_loss": 0.75277036, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77733409, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.16271973, + "step": 6247, + "time_per_iteration": 4.230332136154175 + }, + { + "auxiliary_loss_clip": 0.01408462, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.27336168, + "balance_loss_mlp": 1.02079666, + "epoch": 0.3756500826694724, + "flos": 21695000370480.0, + "grad_norm": 4.268912214435139, + "language_loss": 0.56036955, + "learning_rate": 2.872251199697598e-06, + "loss": 0.58482176, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.1595459, + "step": 6248, + "time_per_iteration": 2.790173053741455 + }, + { + "auxiliary_loss_clip": 0.01403416, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.27112865, + "balance_loss_mlp": 1.02000105, + "epoch": 0.3757102059221404, + "flos": 26511346849680.0, + "grad_norm": 1.7472668649403904, + "language_loss": 0.84445322, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86884272, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.15527344, + "step": 6249, + "time_per_iteration": 2.81089186668396 + }, + { + "auxiliary_loss_clip": 0.014091, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.27579188, + "balance_loss_mlp": 1.01749158, + "epoch": 0.37577032917480835, + "flos": 37344674717280.0, + "grad_norm": 1.5927918482050916, + "language_loss": 0.68094617, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70536578, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.15374756, + "step": 6250, + "time_per_iteration": 2.9564716815948486 + }, + { + "auxiliary_loss_clip": 0.01407898, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.27441859, + "balance_loss_mlp": 1.02089357, + "epoch": 0.3758304524274763, + "flos": 21913696326960.0, + "grad_norm": 2.0255116504017336, + "language_loss": 0.77954626, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.80398208, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.14794922, + "step": 6251, + "time_per_iteration": 4.3065619468688965 + }, + { + "auxiliary_loss_clip": 0.01404638, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.27239227, + "balance_loss_mlp": 1.01529145, + "epoch": 0.3758905756801443, + "flos": 36575502900480.0, + "grad_norm": 2.0024326026213797, + "language_loss": 0.5822469, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60659862, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.15234375, + "step": 6252, + "time_per_iteration": 2.9708635807037354 + }, + { + "auxiliary_loss_clip": 0.01404065, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.26854253, + "balance_loss_mlp": 1.02143121, + "epoch": 0.37595069893281224, + "flos": 24533337234960.0, + "grad_norm": 3.1888765559044523, + "language_loss": 0.89595914, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.92037082, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.15661621, + "step": 6253, + "time_per_iteration": 2.806364059448242 + }, + { + "auxiliary_loss_clip": 0.0140286, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.27200174, + "balance_loss_mlp": 1.01929951, + "epoch": 0.3760108221854802, + "flos": 16439312777040.0, + "grad_norm": 2.110878987468704, + "language_loss": 0.77251256, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.79688495, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15087891, + "step": 6254, + "time_per_iteration": 2.7478549480438232 + }, + { + "auxiliary_loss_clip": 0.01410301, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.2753346, + "balance_loss_mlp": 1.02745426, + "epoch": 0.37607094543814823, + "flos": 13775709996360.0, + "grad_norm": 2.02099836676107, + "language_loss": 0.62658489, + "learning_rate": 2.869797092829169e-06, + "loss": 0.65112466, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.16223145, + "step": 6255, + "time_per_iteration": 2.7415196895599365 + }, + { + "auxiliary_loss_clip": 0.0141517, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.27828312, + "balance_loss_mlp": 1.01696372, + "epoch": 0.3761310686908162, + "flos": 19861772851080.0, + "grad_norm": 2.443170580981162, + "language_loss": 0.74130714, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76579678, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.16845703, + "step": 6256, + "time_per_iteration": 5.780294179916382 + }, + { + "auxiliary_loss_clip": 0.01408213, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.27315116, + "balance_loss_mlp": 1.02120936, + "epoch": 0.37619119194348416, + "flos": 12754722782520.0, + "grad_norm": 3.298949447830592, + "language_loss": 0.70992672, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.73438919, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.16821289, + "step": 6257, + "time_per_iteration": 2.7530109882354736 + }, + { + "auxiliary_loss_clip": 0.01405501, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.27273703, + "balance_loss_mlp": 1.01926291, + "epoch": 0.3762513151961521, + "flos": 17535107235960.0, + "grad_norm": 1.5954188371935323, + "language_loss": 0.84367532, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86806929, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.14660645, + "step": 6258, + "time_per_iteration": 2.855046510696411 + }, + { + "auxiliary_loss_clip": 0.01401873, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.26894617, + "balance_loss_mlp": 1.02949786, + "epoch": 0.3763114384488201, + "flos": 23621941649520.0, + "grad_norm": 1.448322899816366, + "language_loss": 0.81081212, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8352741, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.14819336, + "step": 6259, + "time_per_iteration": 2.7858774662017822 + }, + { + "auxiliary_loss_clip": 0.01412231, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.27511775, + "balance_loss_mlp": 1.03361654, + "epoch": 0.37637156170148806, + "flos": 25411978855080.0, + "grad_norm": 2.1516873187392087, + "language_loss": 0.71800417, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.74263388, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.17102051, + "step": 6260, + "time_per_iteration": 2.818692207336426 + }, + { + "auxiliary_loss_clip": 0.01413549, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.27609944, + "balance_loss_mlp": 1.02279329, + "epoch": 0.376431684954156, + "flos": 23446192356720.0, + "grad_norm": 1.579477155494598, + "language_loss": 0.78728664, + "learning_rate": 2.867692286154594e-06, + "loss": 0.81181288, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.1628418, + "step": 6261, + "time_per_iteration": 2.9084811210632324 + }, + { + "auxiliary_loss_clip": 0.01418951, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.28007412, + "balance_loss_mlp": 1.02930927, + "epoch": 0.376491808206824, + "flos": 34211210316840.0, + "grad_norm": 1.925373341318723, + "language_loss": 0.8077839, + "learning_rate": 2.867341369804132e-06, + "loss": 0.83242077, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.15429688, + "step": 6262, + "time_per_iteration": 3.0402753353118896 + }, + { + "auxiliary_loss_clip": 0.0140346, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.27190101, + "balance_loss_mlp": 1.02644181, + "epoch": 0.37655193145949195, + "flos": 35192012235480.0, + "grad_norm": 1.8899097648656262, + "language_loss": 0.80978543, + "learning_rate": 2.866990420563998e-06, + "loss": 0.83422542, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.14099121, + "step": 6263, + "time_per_iteration": 2.927494764328003 + }, + { + "auxiliary_loss_clip": 0.01408663, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.27392435, + "balance_loss_mlp": 1.02814829, + "epoch": 0.3766120547121599, + "flos": 16765976027520.0, + "grad_norm": 1.8001102192734775, + "language_loss": 0.79427004, + "learning_rate": 2.866639438447501e-06, + "loss": 0.81879014, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.15179443, + "step": 6264, + "time_per_iteration": 2.8056693077087402 + }, + { + "auxiliary_loss_clip": 0.01401163, + "auxiliary_loss_mlp": 0.0104827, + "balance_loss_clip": 1.26782393, + "balance_loss_mlp": 1.03237927, + "epoch": 0.3766721779648279, + "flos": 23555662160040.0, + "grad_norm": 1.884688093521443, + "language_loss": 0.74074328, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.76523763, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.15893555, + "step": 6265, + "time_per_iteration": 2.8683080673217773 + }, + { + "auxiliary_loss_clip": 0.01402543, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.27253008, + "balance_loss_mlp": 1.02333868, + "epoch": 0.37673230121749585, + "flos": 29134764335160.0, + "grad_norm": 1.7272668348048013, + "language_loss": 0.6879636, + "learning_rate": 2.865937375638654e-06, + "loss": 0.71236753, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.14520264, + "step": 6266, + "time_per_iteration": 2.8259496688842773 + }, + { + "auxiliary_loss_clip": 0.01418615, + "auxiliary_loss_mlp": 0.01038713, + "balance_loss_clip": 1.27936125, + "balance_loss_mlp": 1.02264357, + "epoch": 0.3767924244701638, + "flos": 28152581732280.0, + "grad_norm": 7.4827976066441595, + "language_loss": 0.63162315, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65619648, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.1607666, + "step": 6267, + "time_per_iteration": 2.8625054359436035 + }, + { + "auxiliary_loss_clip": 0.01246721, + "auxiliary_loss_mlp": 0.01009483, + "balance_loss_clip": 1.18661714, + "balance_loss_mlp": 1.00576389, + "epoch": 0.37685254772283183, + "flos": 60811455512520.0, + "grad_norm": 0.7197211947155459, + "language_loss": 0.58955348, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.6121155, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.03710938, + "step": 6268, + "time_per_iteration": 3.398519992828369 + }, + { + "auxiliary_loss_clip": 0.01406284, + "auxiliary_loss_mlp": 0.01042393, + "balance_loss_clip": 1.2723527, + "balance_loss_mlp": 1.026371, + "epoch": 0.3769126709754998, + "flos": 26038317777840.0, + "grad_norm": 1.419886966909619, + "language_loss": 0.649252, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67373878, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.16015625, + "step": 6269, + "time_per_iteration": 2.8012094497680664 + }, + { + "auxiliary_loss_clip": 0.01408401, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.27803385, + "balance_loss_mlp": 1.02654219, + "epoch": 0.37697279422816776, + "flos": 23584274072640.0, + "grad_norm": 1.8045045466441276, + "language_loss": 0.71178311, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.73629224, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15991211, + "step": 6270, + "time_per_iteration": 2.883394956588745 + }, + { + "auxiliary_loss_clip": 0.01244397, + "auxiliary_loss_mlp": 0.01006766, + "balance_loss_clip": 1.18454432, + "balance_loss_mlp": 1.00302303, + "epoch": 0.3770329174808357, + "flos": 64761789571560.0, + "grad_norm": 0.7054673673168896, + "language_loss": 0.56179094, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58430254, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.03735352, + "step": 6271, + "time_per_iteration": 3.244710922241211 + }, + { + "auxiliary_loss_clip": 0.01403824, + "auxiliary_loss_mlp": 0.01037877, + "balance_loss_clip": 1.27143919, + "balance_loss_mlp": 1.02177238, + "epoch": 0.3770930407335037, + "flos": 21840350982840.0, + "grad_norm": 1.6109350269294331, + "language_loss": 0.79757631, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82199329, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.16101074, + "step": 6272, + "time_per_iteration": 2.822343587875366 + }, + { + "auxiliary_loss_clip": 0.01400997, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.27122116, + "balance_loss_mlp": 1.0234983, + "epoch": 0.37715316398617166, + "flos": 22753127252520.0, + "grad_norm": 1.5614110597442101, + "language_loss": 0.74508953, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76947904, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.14459229, + "step": 6273, + "time_per_iteration": 2.8236842155456543 + }, + { + "auxiliary_loss_clip": 0.01405393, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.27412808, + "balance_loss_mlp": 1.0289607, + "epoch": 0.3772132872388396, + "flos": 18919003984560.0, + "grad_norm": 1.5057588147186713, + "language_loss": 0.72151363, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74600345, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.14611816, + "step": 6274, + "time_per_iteration": 2.8376080989837646 + }, + { + "auxiliary_loss_clip": 0.01406769, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.27147341, + "balance_loss_mlp": 1.02624762, + "epoch": 0.3772734104915076, + "flos": 17350505320680.0, + "grad_norm": 3.591121918776682, + "language_loss": 0.84001052, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.86449409, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.15350342, + "step": 6275, + "time_per_iteration": 2.7630093097686768 + }, + { + "auxiliary_loss_clip": 0.01401461, + "auxiliary_loss_mlp": 0.01042116, + "balance_loss_clip": 1.27236915, + "balance_loss_mlp": 1.02839446, + "epoch": 0.37733353374417555, + "flos": 32348680542720.0, + "grad_norm": 2.2020922027025973, + "language_loss": 0.75830305, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.7827388, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.1373291, + "step": 6276, + "time_per_iteration": 2.8739800453186035 + }, + { + "auxiliary_loss_clip": 0.01416001, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.28138471, + "balance_loss_mlp": 1.02208567, + "epoch": 0.3773936569968435, + "flos": 23365009599120.0, + "grad_norm": 1.9025707149546545, + "language_loss": 0.85743225, + "learning_rate": 2.862073685241366e-06, + "loss": 0.88197023, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.15710449, + "step": 6277, + "time_per_iteration": 2.8986973762512207 + }, + { + "auxiliary_loss_clip": 0.01400036, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.27179503, + "balance_loss_mlp": 1.02360177, + "epoch": 0.3774537802495115, + "flos": 21471309585720.0, + "grad_norm": 1.7420156960933881, + "language_loss": 0.78307879, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80746078, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14562988, + "step": 6278, + "time_per_iteration": 2.7988290786743164 + }, + { + "auxiliary_loss_clip": 0.01418464, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.28131413, + "balance_loss_mlp": 1.02706838, + "epoch": 0.37751390350217945, + "flos": 24979256903520.0, + "grad_norm": 2.049708794637754, + "language_loss": 0.83453435, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.8591547, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.16503906, + "step": 6279, + "time_per_iteration": 2.815114974975586 + }, + { + "auxiliary_loss_clip": 0.01408896, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.27552271, + "balance_loss_mlp": 1.02480996, + "epoch": 0.3775740267548474, + "flos": 27824943881160.0, + "grad_norm": 2.120921244173311, + "language_loss": 0.75285482, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77733779, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.14587402, + "step": 6280, + "time_per_iteration": 2.873764991760254 + }, + { + "auxiliary_loss_clip": 0.01404011, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.27536833, + "balance_loss_mlp": 1.02930832, + "epoch": 0.3776341500075154, + "flos": 22570393321800.0, + "grad_norm": 1.3777157535047144, + "language_loss": 0.7622683, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78674942, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.14782715, + "step": 6281, + "time_per_iteration": 2.8443586826324463 + }, + { + "auxiliary_loss_clip": 0.01405687, + "auxiliary_loss_mlp": 0.01045448, + "balance_loss_clip": 1.27420092, + "balance_loss_mlp": 1.03082108, + "epoch": 0.3776942732601834, + "flos": 23082633088200.0, + "grad_norm": 1.4056197566917787, + "language_loss": 0.84562635, + "learning_rate": 2.860316153670974e-06, + "loss": 0.87013769, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.1463623, + "step": 6282, + "time_per_iteration": 2.8724141120910645 + }, + { + "auxiliary_loss_clip": 0.01398425, + "auxiliary_loss_mlp": 0.01038784, + "balance_loss_clip": 1.26887918, + "balance_loss_mlp": 1.02388334, + "epoch": 0.37775439651285136, + "flos": 21729256845120.0, + "grad_norm": 1.9282165821296444, + "language_loss": 0.69936275, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72373486, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.14892578, + "step": 6283, + "time_per_iteration": 2.800149917602539 + }, + { + "auxiliary_loss_clip": 0.01406851, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_clip": 1.27691698, + "balance_loss_mlp": 1.02788687, + "epoch": 0.37781451976551933, + "flos": 23993054073000.0, + "grad_norm": 1.8040417145235073, + "language_loss": 0.76508206, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78958321, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15393066, + "step": 6284, + "time_per_iteration": 2.8763580322265625 + }, + { + "auxiliary_loss_clip": 0.0141933, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.28164935, + "balance_loss_mlp": 1.0171113, + "epoch": 0.3778746430181873, + "flos": 13730976564840.0, + "grad_norm": 2.4585723148509797, + "language_loss": 0.85995042, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88448668, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.17175293, + "step": 6285, + "time_per_iteration": 2.769073009490967 + }, + { + "auxiliary_loss_clip": 0.014131, + "auxiliary_loss_mlp": 0.0104046, + "balance_loss_clip": 1.27825832, + "balance_loss_mlp": 1.02404475, + "epoch": 0.37793476627085526, + "flos": 19464809883480.0, + "grad_norm": 1.7726688193012003, + "language_loss": 0.84405231, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86858791, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.16418457, + "step": 6286, + "time_per_iteration": 4.209394931793213 + }, + { + "auxiliary_loss_clip": 0.01409794, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.27759778, + "balance_loss_mlp": 1.02447486, + "epoch": 0.3779948895235232, + "flos": 10710596111760.0, + "grad_norm": 6.171234159751318, + "language_loss": 0.82109022, + "learning_rate": 2.858557806518775e-06, + "loss": 0.84559518, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.16217041, + "step": 6287, + "time_per_iteration": 2.776930570602417 + }, + { + "auxiliary_loss_clip": 0.01413138, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.2810849, + "balance_loss_mlp": 1.02421057, + "epoch": 0.3780550127761912, + "flos": 22315166822520.0, + "grad_norm": 2.3395510106103274, + "language_loss": 0.73411572, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75864434, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.15533447, + "step": 6288, + "time_per_iteration": 2.7950098514556885 + }, + { + "auxiliary_loss_clip": 0.01406577, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.27598166, + "balance_loss_mlp": 1.02506566, + "epoch": 0.37811513602885916, + "flos": 28956700365840.0, + "grad_norm": 1.6839307306177551, + "language_loss": 0.7607218, + "learning_rate": 2.857854239668352e-06, + "loss": 0.78520375, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.16558838, + "step": 6289, + "time_per_iteration": 2.82804799079895 + }, + { + "auxiliary_loss_clip": 0.01405218, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.27501464, + "balance_loss_mlp": 1.01876438, + "epoch": 0.3781752592815271, + "flos": 23118229638720.0, + "grad_norm": 1.8809709025893513, + "language_loss": 0.73500258, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75939465, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15209961, + "step": 6290, + "time_per_iteration": 4.235899209976196 + }, + { + "auxiliary_loss_clip": 0.01419605, + "auxiliary_loss_mlp": 0.01040155, + "balance_loss_clip": 1.28064132, + "balance_loss_mlp": 1.02214241, + "epoch": 0.3782353825341951, + "flos": 19760749586640.0, + "grad_norm": 14.681112280612265, + "language_loss": 0.80407614, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.82867384, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.18017578, + "step": 6291, + "time_per_iteration": 2.7732534408569336 + }, + { + "auxiliary_loss_clip": 0.01415208, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.28051209, + "balance_loss_mlp": 1.0166378, + "epoch": 0.37829550578686305, + "flos": 22055473403640.0, + "grad_norm": 1.90015088613005, + "language_loss": 0.77013826, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.79462594, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16931152, + "step": 6292, + "time_per_iteration": 2.8425936698913574 + }, + { + "auxiliary_loss_clip": 0.01412654, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.28032732, + "balance_loss_mlp": 1.02056849, + "epoch": 0.378355629039531, + "flos": 16474503243960.0, + "grad_norm": 1.7919137343063796, + "language_loss": 0.69755954, + "learning_rate": 2.856446715715224e-06, + "loss": 0.72204876, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15692139, + "step": 6293, + "time_per_iteration": 2.772071599960327 + }, + { + "auxiliary_loss_clip": 0.01403251, + "auxiliary_loss_mlp": 0.01037081, + "balance_loss_clip": 1.27363765, + "balance_loss_mlp": 1.02066588, + "epoch": 0.378415752292199, + "flos": 19979689193280.0, + "grad_norm": 2.2457776314091964, + "language_loss": 0.71752405, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.74192739, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.16430664, + "step": 6294, + "time_per_iteration": 2.8141679763793945 + }, + { + "auxiliary_loss_clip": 0.0142193, + "auxiliary_loss_mlp": 0.01037025, + "balance_loss_clip": 1.28452754, + "balance_loss_mlp": 1.02109289, + "epoch": 0.378475875544867, + "flos": 14651468422920.0, + "grad_norm": 2.167730136452015, + "language_loss": 0.83420086, + "learning_rate": 2.855742758826011e-06, + "loss": 0.8587904, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.15930176, + "step": 6295, + "time_per_iteration": 5.967411518096924 + }, + { + "auxiliary_loss_clip": 0.01413773, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.28037572, + "balance_loss_mlp": 1.01833963, + "epoch": 0.37853599879753497, + "flos": 26656616245320.0, + "grad_norm": 1.9448002604951784, + "language_loss": 0.71633971, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.74081457, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.15380859, + "step": 6296, + "time_per_iteration": 2.833306312561035 + }, + { + "auxiliary_loss_clip": 0.0140357, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.27672327, + "balance_loss_mlp": 1.02195954, + "epoch": 0.37859612205020293, + "flos": 17316533104560.0, + "grad_norm": 1.7498949723101511, + "language_loss": 0.77076417, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79516655, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.1472168, + "step": 6297, + "time_per_iteration": 2.83054256439209 + }, + { + "auxiliary_loss_clip": 0.0141968, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.28684986, + "balance_loss_mlp": 1.02041364, + "epoch": 0.3786562453028709, + "flos": 18224598804480.0, + "grad_norm": 3.2908204667546164, + "language_loss": 0.79254091, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81708825, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.14624023, + "step": 6298, + "time_per_iteration": 2.822744369506836 + }, + { + "auxiliary_loss_clip": 0.01414501, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.28477085, + "balance_loss_mlp": 1.02078104, + "epoch": 0.37871636855553886, + "flos": 21219819055560.0, + "grad_norm": 1.9917529080503147, + "language_loss": 0.84064949, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86515105, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.14880371, + "step": 6299, + "time_per_iteration": 2.834179401397705 + }, + { + "auxiliary_loss_clip": 0.01417472, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.28578973, + "balance_loss_mlp": 1.01750088, + "epoch": 0.3787764918082068, + "flos": 20956755142800.0, + "grad_norm": 3.2519230292721306, + "language_loss": 0.76677144, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.79126924, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.14807129, + "step": 6300, + "time_per_iteration": 2.8161845207214355 + }, + { + "auxiliary_loss_clip": 0.01434357, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.29778993, + "balance_loss_mlp": 1.01667523, + "epoch": 0.3788366150608748, + "flos": 17312066184960.0, + "grad_norm": 3.4925674384748024, + "language_loss": 0.83067417, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.85535699, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.17236328, + "step": 6301, + "time_per_iteration": 2.8640811443328857 + }, + { + "auxiliary_loss_clip": 0.0142369, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.29088807, + "balance_loss_mlp": 1.01900339, + "epoch": 0.37889673831354276, + "flos": 24315778312560.0, + "grad_norm": 1.8087122677915917, + "language_loss": 0.68234342, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70692682, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.15661621, + "step": 6302, + "time_per_iteration": 2.842684745788574 + }, + { + "auxiliary_loss_clip": 0.01427734, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.29703701, + "balance_loss_mlp": 1.02040005, + "epoch": 0.3789568615662107, + "flos": 26688720476880.0, + "grad_norm": 1.930019367442733, + "language_loss": 0.68701172, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.71164453, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15148926, + "step": 6303, + "time_per_iteration": 2.923851728439331 + }, + { + "auxiliary_loss_clip": 0.01425768, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.29351187, + "balance_loss_mlp": 1.01696217, + "epoch": 0.3790169848188787, + "flos": 23590284109920.0, + "grad_norm": 1.6519766309554431, + "language_loss": 0.77812827, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.80270457, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.14892578, + "step": 6304, + "time_per_iteration": 2.78123140335083 + }, + { + "auxiliary_loss_clip": 0.01441284, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.30523527, + "balance_loss_mlp": 1.01948035, + "epoch": 0.37907710807154665, + "flos": 18441426776400.0, + "grad_norm": 2.0466349856808295, + "language_loss": 0.80892169, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.83369899, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16967773, + "step": 6305, + "time_per_iteration": 2.8710176944732666 + }, + { + "auxiliary_loss_clip": 0.0127652, + "auxiliary_loss_mlp": 0.01018187, + "balance_loss_clip": 1.21846032, + "balance_loss_mlp": 1.01534975, + "epoch": 0.3791372313242146, + "flos": 50120067155040.0, + "grad_norm": 1.0136256998088755, + "language_loss": 0.64569604, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66864312, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.02832031, + "step": 6306, + "time_per_iteration": 3.2091259956359863 + }, + { + "auxiliary_loss_clip": 0.01428043, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.29535639, + "balance_loss_mlp": 1.02437854, + "epoch": 0.3791973545768826, + "flos": 24321950783280.0, + "grad_norm": 1.5331780502872516, + "language_loss": 0.736323, + "learning_rate": 2.851516295441817e-06, + "loss": 0.76101524, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.16821289, + "step": 6307, + "time_per_iteration": 2.9235215187072754 + }, + { + "auxiliary_loss_clip": 0.01438464, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.30445862, + "balance_loss_mlp": 1.01911151, + "epoch": 0.3792574778295506, + "flos": 21584921441760.0, + "grad_norm": 1.6131785437051, + "language_loss": 0.78704381, + "learning_rate": 2.851163879959112e-06, + "loss": 0.81178343, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.16381836, + "step": 6308, + "time_per_iteration": 2.9064383506774902 + }, + { + "auxiliary_loss_clip": 0.01423163, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.29141557, + "balance_loss_mlp": 1.01861, + "epoch": 0.37931760108221857, + "flos": 22277783504160.0, + "grad_norm": 2.0138223961670882, + "language_loss": 0.72715253, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75172544, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.1550293, + "step": 6309, + "time_per_iteration": 2.7743263244628906 + }, + { + "auxiliary_loss_clip": 0.0142303, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.29346681, + "balance_loss_mlp": 1.01734042, + "epoch": 0.37937772433488653, + "flos": 19687810326120.0, + "grad_norm": 1.4315615020935375, + "language_loss": 0.78998595, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.81455016, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.16040039, + "step": 6310, + "time_per_iteration": 2.862445831298828 + }, + { + "auxiliary_loss_clip": 0.01423464, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.29276657, + "balance_loss_mlp": 1.01982665, + "epoch": 0.3794378475875545, + "flos": 19104133808520.0, + "grad_norm": 1.8953145157868747, + "language_loss": 0.7704407, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.79502642, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.1529541, + "step": 6311, + "time_per_iteration": 2.8069334030151367 + }, + { + "auxiliary_loss_clip": 0.01424286, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.29366684, + "balance_loss_mlp": 1.01899481, + "epoch": 0.37949797084022246, + "flos": 20344547929320.0, + "grad_norm": 1.5100537035083255, + "language_loss": 0.71420294, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73879099, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15515137, + "step": 6312, + "time_per_iteration": 2.7666783332824707 + }, + { + "auxiliary_loss_clip": 0.01268582, + "auxiliary_loss_mlp": 0.01004863, + "balance_loss_clip": 1.21182752, + "balance_loss_mlp": 1.00225186, + "epoch": 0.37955809409289043, + "flos": 63986973192720.0, + "grad_norm": 0.77528022362183, + "language_loss": 0.56153148, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58426595, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.02612305, + "step": 6313, + "time_per_iteration": 3.2312686443328857 + }, + { + "auxiliary_loss_clip": 0.01422276, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.29161227, + "balance_loss_mlp": 1.01999378, + "epoch": 0.3796182173455584, + "flos": 31546998410760.0, + "grad_norm": 1.8438042232699592, + "language_loss": 0.71821117, + "learning_rate": 2.849048709730083e-06, + "loss": 0.74278581, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.1519165, + "step": 6314, + "time_per_iteration": 2.85010027885437 + }, + { + "auxiliary_loss_clip": 0.01424728, + "auxiliary_loss_mlp": 0.01038993, + "balance_loss_clip": 1.29174066, + "balance_loss_mlp": 1.02249479, + "epoch": 0.37967834059822636, + "flos": 12134678155560.0, + "grad_norm": 1.7381385022965148, + "language_loss": 0.73312521, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75776243, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.16491699, + "step": 6315, + "time_per_iteration": 2.844193458557129 + }, + { + "auxiliary_loss_clip": 0.01419194, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.29111326, + "balance_loss_mlp": 1.01853001, + "epoch": 0.3797384638508943, + "flos": 39355925597640.0, + "grad_norm": 2.610997732690744, + "language_loss": 0.71027923, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73481005, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.15350342, + "step": 6316, + "time_per_iteration": 2.938166856765747 + }, + { + "auxiliary_loss_clip": 0.0142199, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.2919873, + "balance_loss_mlp": 1.02070713, + "epoch": 0.3797985871035623, + "flos": 34060133925720.0, + "grad_norm": 1.6573844771931672, + "language_loss": 0.65176046, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67633486, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.14746094, + "step": 6317, + "time_per_iteration": 2.885010004043579 + }, + { + "auxiliary_loss_clip": 0.01415447, + "auxiliary_loss_mlp": 0.0103146, + "balance_loss_clip": 1.2872411, + "balance_loss_mlp": 1.01734567, + "epoch": 0.37985871035623026, + "flos": 23227455791880.0, + "grad_norm": 3.093512216818989, + "language_loss": 0.86349839, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.88796747, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.14117432, + "step": 6318, + "time_per_iteration": 3.0342283248901367 + }, + { + "auxiliary_loss_clip": 0.01426861, + "auxiliary_loss_mlp": 0.01041528, + "balance_loss_clip": 1.29489136, + "balance_loss_mlp": 1.02492809, + "epoch": 0.3799188336088982, + "flos": 18119677137480.0, + "grad_norm": 2.2241792152502113, + "language_loss": 0.76027215, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.7849561, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.16589355, + "step": 6319, + "time_per_iteration": 2.824831008911133 + }, + { + "auxiliary_loss_clip": 0.01422286, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.29329014, + "balance_loss_mlp": 1.02043521, + "epoch": 0.3799789568615662, + "flos": 21876962742360.0, + "grad_norm": 1.6158459156523186, + "language_loss": 0.63758272, + "learning_rate": 2.846932380444744e-06, + "loss": 0.6621623, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15246582, + "step": 6320, + "time_per_iteration": 2.8440768718719482 + }, + { + "auxiliary_loss_clip": 0.01419991, + "auxiliary_loss_mlp": 0.01034075, + "balance_loss_clip": 1.2893827, + "balance_loss_mlp": 1.01867366, + "epoch": 0.3800390801142342, + "flos": 32969374903440.0, + "grad_norm": 1.8252578072642105, + "language_loss": 0.71536696, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73990762, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.15393066, + "step": 6321, + "time_per_iteration": 2.890695810317993 + }, + { + "auxiliary_loss_clip": 0.01422076, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.29042006, + "balance_loss_mlp": 1.01354337, + "epoch": 0.38009920336690217, + "flos": 26912817345240.0, + "grad_norm": 1.7777681210541378, + "language_loss": 0.75102746, + "learning_rate": 2.846226680280859e-06, + "loss": 0.7755366, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15283203, + "step": 6322, + "time_per_iteration": 2.9166102409362793 + }, + { + "auxiliary_loss_clip": 0.01418672, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.29008651, + "balance_loss_mlp": 1.01918519, + "epoch": 0.38015932661957014, + "flos": 22493636875440.0, + "grad_norm": 3.454156212853549, + "language_loss": 0.85350394, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87803733, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15478516, + "step": 6323, + "time_per_iteration": 2.8749918937683105 + }, + { + "auxiliary_loss_clip": 0.01417776, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.28778291, + "balance_loss_mlp": 1.01496255, + "epoch": 0.3802194498722381, + "flos": 21986026462080.0, + "grad_norm": 2.2052669840318093, + "language_loss": 0.73119128, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75568509, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.16650391, + "step": 6324, + "time_per_iteration": 4.182461500167847 + }, + { + "auxiliary_loss_clip": 0.01422299, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.28961635, + "balance_loss_mlp": 1.01552582, + "epoch": 0.38027957312490607, + "flos": 21329897984280.0, + "grad_norm": 1.8643616138540966, + "language_loss": 0.84570169, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.8702383, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.1583252, + "step": 6325, + "time_per_iteration": 2.833448886871338 + }, + { + "auxiliary_loss_clip": 0.0141896, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.28996122, + "balance_loss_mlp": 1.01820993, + "epoch": 0.38033969637757403, + "flos": 16695676310400.0, + "grad_norm": 1.6642431266661528, + "language_loss": 0.79318821, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81770813, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.14807129, + "step": 6326, + "time_per_iteration": 2.7377307415008545 + }, + { + "auxiliary_loss_clip": 0.01413127, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.28565836, + "balance_loss_mlp": 1.01892841, + "epoch": 0.380399819630242, + "flos": 36217791235800.0, + "grad_norm": 2.544376273263359, + "language_loss": 0.7320323, + "learning_rate": 2.844461868547842e-06, + "loss": 0.75649869, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.14599609, + "step": 6327, + "time_per_iteration": 2.901418685913086 + }, + { + "auxiliary_loss_clip": 0.01410563, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.28218675, + "balance_loss_mlp": 1.01661754, + "epoch": 0.38045994288290996, + "flos": 21293935958520.0, + "grad_norm": 1.744488589749687, + "language_loss": 0.83349413, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85791546, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.14941406, + "step": 6328, + "time_per_iteration": 2.8673083782196045 + }, + { + "auxiliary_loss_clip": 0.01406852, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.27784586, + "balance_loss_mlp": 1.0185287, + "epoch": 0.38052006613557793, + "flos": 20927777754960.0, + "grad_norm": 1.5045494283273035, + "language_loss": 0.6154418, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63983625, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14056396, + "step": 6329, + "time_per_iteration": 4.231289386749268 + }, + { + "auxiliary_loss_clip": 0.01413587, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.28484237, + "balance_loss_mlp": 1.02067208, + "epoch": 0.3805801893882459, + "flos": 20994950628360.0, + "grad_norm": 3.3439730025262118, + "language_loss": 0.56203961, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58652771, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14550781, + "step": 6330, + "time_per_iteration": 2.868180990219116 + }, + { + "auxiliary_loss_clip": 0.01403648, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.27985215, + "balance_loss_mlp": 1.01796412, + "epoch": 0.38064031264091386, + "flos": 25564354713720.0, + "grad_norm": 2.383252132868947, + "language_loss": 0.6570673, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68141264, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.12921143, + "step": 6331, + "time_per_iteration": 2.895545721054077 + }, + { + "auxiliary_loss_clip": 0.01417757, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.28881919, + "balance_loss_mlp": 1.02442694, + "epoch": 0.3807004358935818, + "flos": 15090078586680.0, + "grad_norm": 1.9581361196645093, + "language_loss": 0.75906169, + "learning_rate": 2.842696256262919e-06, + "loss": 0.78364134, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15777588, + "step": 6332, + "time_per_iteration": 2.7649753093719482 + }, + { + "auxiliary_loss_clip": 0.01414378, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.2833972, + "balance_loss_mlp": 1.01882041, + "epoch": 0.3807605591462498, + "flos": 16403797443240.0, + "grad_norm": 2.2858315093965076, + "language_loss": 0.81835669, + "learning_rate": 2.842343037886987e-06, + "loss": 0.842839, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15026855, + "step": 6333, + "time_per_iteration": 2.762664794921875 + }, + { + "auxiliary_loss_clip": 0.01413587, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.28429246, + "balance_loss_mlp": 1.01748085, + "epoch": 0.3808206823989178, + "flos": 29062759066920.0, + "grad_norm": 1.5376823717766124, + "language_loss": 0.86202151, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88648093, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.14868164, + "step": 6334, + "time_per_iteration": 4.338144063949585 + }, + { + "auxiliary_loss_clip": 0.01408708, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.27961612, + "balance_loss_mlp": 1.01474726, + "epoch": 0.3808808056515858, + "flos": 15710123213640.0, + "grad_norm": 1.865439992206791, + "language_loss": 0.79346555, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81784058, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.14044189, + "step": 6335, + "time_per_iteration": 2.7682390213012695 + }, + { + "auxiliary_loss_clip": 0.01419355, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.2884326, + "balance_loss_mlp": 1.01623452, + "epoch": 0.38094092890425374, + "flos": 20709528490440.0, + "grad_norm": 1.7664496143988295, + "language_loss": 0.72563899, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75014555, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15075684, + "step": 6336, + "time_per_iteration": 2.8502073287963867 + }, + { + "auxiliary_loss_clip": 0.01407409, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.2798059, + "balance_loss_mlp": 1.01586676, + "epoch": 0.3810010521569217, + "flos": 20672876122560.0, + "grad_norm": 1.802740396973564, + "language_loss": 0.69351429, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71788621, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.13922119, + "step": 6337, + "time_per_iteration": 2.789415121078491 + }, + { + "auxiliary_loss_clip": 0.01414622, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.28716469, + "balance_loss_mlp": 1.01372886, + "epoch": 0.38106117540958967, + "flos": 31833354540960.0, + "grad_norm": 11.497819252375916, + "language_loss": 0.63384587, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65827751, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.14813232, + "step": 6338, + "time_per_iteration": 2.983186960220337 + }, + { + "auxiliary_loss_clip": 0.01421335, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.29050446, + "balance_loss_mlp": 1.01994443, + "epoch": 0.38112129866225763, + "flos": 16906250594880.0, + "grad_norm": 1.883972302312579, + "language_loss": 0.69678169, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.72135258, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15808105, + "step": 6339, + "time_per_iteration": 2.7876152992248535 + }, + { + "auxiliary_loss_clip": 0.01415233, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.28678119, + "balance_loss_mlp": 1.02236962, + "epoch": 0.3811814219149256, + "flos": 20892221812800.0, + "grad_norm": 1.962703928100891, + "language_loss": 0.68368924, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70821029, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.14526367, + "step": 6340, + "time_per_iteration": 2.885934352874756 + }, + { + "auxiliary_loss_clip": 0.0141811, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.28632784, + "balance_loss_mlp": 1.01311374, + "epoch": 0.38124154516759357, + "flos": 16695189010080.0, + "grad_norm": 1.8830286827129872, + "language_loss": 0.90359092, + "learning_rate": 2.839516142102522e-06, + "loss": 0.92806494, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.16174316, + "step": 6341, + "time_per_iteration": 2.771885395050049 + }, + { + "auxiliary_loss_clip": 0.01419161, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.28659868, + "balance_loss_mlp": 1.02098036, + "epoch": 0.38130166842026153, + "flos": 19686632683680.0, + "grad_norm": 1.7314594410463457, + "language_loss": 0.75239217, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77695727, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16375732, + "step": 6342, + "time_per_iteration": 2.8354008197784424 + }, + { + "auxiliary_loss_clip": 0.0141187, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.28143191, + "balance_loss_mlp": 1.0168035, + "epoch": 0.3813617916729295, + "flos": 22203301125960.0, + "grad_norm": 1.6661138322632567, + "language_loss": 0.83566099, + "learning_rate": 2.838809099543007e-06, + "loss": 0.86009574, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.14794922, + "step": 6343, + "time_per_iteration": 2.821700096130371 + }, + { + "auxiliary_loss_clip": 0.01419224, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.2875998, + "balance_loss_mlp": 1.02283657, + "epoch": 0.38142191492559746, + "flos": 19101331831680.0, + "grad_norm": 1.8159905508308962, + "language_loss": 0.7734046, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79797542, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15014648, + "step": 6344, + "time_per_iteration": 2.758405923843384 + }, + { + "auxiliary_loss_clip": 0.01411397, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.28277016, + "balance_loss_mlp": 1.0179944, + "epoch": 0.3814820381782654, + "flos": 24103133001720.0, + "grad_norm": 2.876154306678106, + "language_loss": 0.73472637, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75917351, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15332031, + "step": 6345, + "time_per_iteration": 2.8586888313293457 + }, + { + "auxiliary_loss_clip": 0.01411411, + "auxiliary_loss_mlp": 0.01033239, + "balance_loss_clip": 1.28426266, + "balance_loss_mlp": 1.01902378, + "epoch": 0.3815421614309334, + "flos": 15782737607280.0, + "grad_norm": 1.9640990264999032, + "language_loss": 0.69667292, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.7211194, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.14221191, + "step": 6346, + "time_per_iteration": 2.714236259460449 + }, + { + "auxiliary_loss_clip": 0.01423633, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.2921319, + "balance_loss_mlp": 1.02072406, + "epoch": 0.38160228468360136, + "flos": 19904313431160.0, + "grad_norm": 1.7675737947515615, + "language_loss": 0.75705695, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78165042, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.14978027, + "step": 6347, + "time_per_iteration": 2.7677650451660156 + }, + { + "auxiliary_loss_clip": 0.01416853, + "auxiliary_loss_mlp": 0.01037322, + "balance_loss_clip": 1.28639662, + "balance_loss_mlp": 1.02313662, + "epoch": 0.3816624079362694, + "flos": 19285730705160.0, + "grad_norm": 1.6861737742658747, + "language_loss": 0.74952364, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.77406538, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.14196777, + "step": 6348, + "time_per_iteration": 2.725508451461792 + }, + { + "auxiliary_loss_clip": 0.01416897, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.28632498, + "balance_loss_mlp": 1.02020574, + "epoch": 0.38172253118893734, + "flos": 21182395128840.0, + "grad_norm": 1.9040240260983525, + "language_loss": 0.87706232, + "learning_rate": 2.836687208908142e-06, + "loss": 0.90158206, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.14862061, + "step": 6349, + "time_per_iteration": 2.863563299179077 + }, + { + "auxiliary_loss_clip": 0.01412457, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_clip": 1.28277111, + "balance_loss_mlp": 1.02700019, + "epoch": 0.3817826544416053, + "flos": 17533604726640.0, + "grad_norm": 1.9649198293273564, + "language_loss": 0.76737374, + "learning_rate": 2.836333449345341e-06, + "loss": 0.79192013, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15179443, + "step": 6350, + "time_per_iteration": 2.7499547004699707 + }, + { + "auxiliary_loss_clip": 0.01416934, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.28770041, + "balance_loss_mlp": 1.01507032, + "epoch": 0.38184277769427327, + "flos": 16330939399440.0, + "grad_norm": 2.336769662069946, + "language_loss": 0.76439273, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78887337, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.1607666, + "step": 6351, + "time_per_iteration": 2.8544812202453613 + }, + { + "auxiliary_loss_clip": 0.01421516, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.29124975, + "balance_loss_mlp": 1.02001238, + "epoch": 0.38190290094694124, + "flos": 30449295358920.0, + "grad_norm": 1.6384216089775072, + "language_loss": 0.74410599, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76868272, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.16143799, + "step": 6352, + "time_per_iteration": 2.837268114089966 + }, + { + "auxiliary_loss_clip": 0.01406226, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.27977622, + "balance_loss_mlp": 1.01587713, + "epoch": 0.3819630241996092, + "flos": 14213995293240.0, + "grad_norm": 2.144234419122147, + "language_loss": 0.64583647, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.67019469, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.13696289, + "step": 6353, + "time_per_iteration": 2.8025100231170654 + }, + { + "auxiliary_loss_clip": 0.01414042, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.28590119, + "balance_loss_mlp": 1.01902378, + "epoch": 0.38202314745227717, + "flos": 25015178320920.0, + "grad_norm": 1.9933295218943636, + "language_loss": 0.8314355, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85591316, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14691162, + "step": 6354, + "time_per_iteration": 2.902301073074341 + }, + { + "auxiliary_loss_clip": 0.01405517, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.28081286, + "balance_loss_mlp": 1.01861429, + "epoch": 0.38208327070494513, + "flos": 20819526202440.0, + "grad_norm": 1.846788148006089, + "language_loss": 0.80397081, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82834995, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.13787842, + "step": 6355, + "time_per_iteration": 2.8047451972961426 + }, + { + "auxiliary_loss_clip": 0.01413338, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.28552938, + "balance_loss_mlp": 1.01974607, + "epoch": 0.3821433939576131, + "flos": 22642845282000.0, + "grad_norm": 1.8947926094517207, + "language_loss": 0.75443256, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77890557, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.14208984, + "step": 6356, + "time_per_iteration": 2.8453896045684814 + }, + { + "auxiliary_loss_clip": 0.01413982, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.28517032, + "balance_loss_mlp": 1.02048802, + "epoch": 0.38220351721028106, + "flos": 26875515243600.0, + "grad_norm": 1.7686457347398186, + "language_loss": 0.81463969, + "learning_rate": 2.833856245169348e-06, + "loss": 0.8391307, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14630127, + "step": 6357, + "time_per_iteration": 2.8257555961608887 + }, + { + "auxiliary_loss_clip": 0.0141588, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.28611803, + "balance_loss_mlp": 1.02116323, + "epoch": 0.38226364046294903, + "flos": 23372684579160.0, + "grad_norm": 1.7027927018221225, + "language_loss": 0.78290707, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80744529, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.16784668, + "step": 6358, + "time_per_iteration": 2.7810521125793457 + }, + { + "auxiliary_loss_clip": 0.0141694, + "auxiliary_loss_mlp": 0.0103701, + "balance_loss_clip": 1.2852838, + "balance_loss_mlp": 1.02218056, + "epoch": 0.382323763715617, + "flos": 19650995524800.0, + "grad_norm": 2.2954915498062434, + "language_loss": 0.78994966, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.81448913, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.14825439, + "step": 6359, + "time_per_iteration": 2.7975308895111084 + }, + { + "auxiliary_loss_clip": 0.01406827, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.28083968, + "balance_loss_mlp": 1.02484572, + "epoch": 0.38238388696828496, + "flos": 54133942962240.0, + "grad_norm": 1.7784098515098852, + "language_loss": 0.69911742, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.72358274, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14855957, + "step": 6360, + "time_per_iteration": 3.0933732986450195 + }, + { + "auxiliary_loss_clip": 0.01408953, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.28183341, + "balance_loss_mlp": 1.02013159, + "epoch": 0.382444010220953, + "flos": 24941589326640.0, + "grad_norm": 1.5399535483142175, + "language_loss": 0.78544658, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80988854, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15106201, + "step": 6361, + "time_per_iteration": 2.834761381149292 + }, + { + "auxiliary_loss_clip": 0.01397461, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.27361572, + "balance_loss_mlp": 1.0189383, + "epoch": 0.38250413347362094, + "flos": 42344689119480.0, + "grad_norm": 1.532224166807336, + "language_loss": 0.65479141, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67909205, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.13653564, + "step": 6362, + "time_per_iteration": 2.9601590633392334 + }, + { + "auxiliary_loss_clip": 0.01412344, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.28395903, + "balance_loss_mlp": 1.01785779, + "epoch": 0.3825642567262889, + "flos": 16293515472720.0, + "grad_norm": 1.7273860868143545, + "language_loss": 0.82597345, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.85043168, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.15625, + "step": 6363, + "time_per_iteration": 4.172910928726196 + }, + { + "auxiliary_loss_clip": 0.01402149, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_clip": 1.27702546, + "balance_loss_mlp": 1.02769506, + "epoch": 0.3826243799789569, + "flos": 45662186918160.0, + "grad_norm": 1.7165973524732565, + "language_loss": 0.58973819, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.61419469, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.15789795, + "step": 6364, + "time_per_iteration": 2.9697046279907227 + }, + { + "auxiliary_loss_clip": 0.01413693, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.28312635, + "balance_loss_mlp": 1.02125788, + "epoch": 0.38268450323162484, + "flos": 25307097796440.0, + "grad_norm": 1.7300921882103026, + "language_loss": 0.69085222, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.7153523, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15057373, + "step": 6365, + "time_per_iteration": 2.763314723968506 + }, + { + "auxiliary_loss_clip": 0.01414292, + "auxiliary_loss_mlp": 0.01040479, + "balance_loss_clip": 1.2808069, + "balance_loss_mlp": 1.0247612, + "epoch": 0.3827446264842928, + "flos": 21841163150040.0, + "grad_norm": 2.9338536241243474, + "language_loss": 0.74096489, + "learning_rate": 2.830668992382758e-06, + "loss": 0.76551259, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15716553, + "step": 6366, + "time_per_iteration": 2.785646438598633 + }, + { + "auxiliary_loss_clip": 0.0140762, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.27891111, + "balance_loss_mlp": 1.02077603, + "epoch": 0.38280474973696077, + "flos": 25739535489480.0, + "grad_norm": 2.3383019383980526, + "language_loss": 0.68749899, + "learning_rate": 2.830314695509902e-06, + "loss": 0.71192837, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.14550781, + "step": 6367, + "time_per_iteration": 2.843113422393799 + }, + { + "auxiliary_loss_clip": 0.01396574, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.27327812, + "balance_loss_mlp": 1.01846468, + "epoch": 0.38286487298962874, + "flos": 24900835514400.0, + "grad_norm": 1.9862160588841893, + "language_loss": 0.64539236, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66968882, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.1461792, + "step": 6368, + "time_per_iteration": 4.204017877578735 + }, + { + "auxiliary_loss_clip": 0.01403547, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.27749109, + "balance_loss_mlp": 1.02484536, + "epoch": 0.3829249962422967, + "flos": 28548976182840.0, + "grad_norm": 1.6453697236146114, + "language_loss": 0.68294269, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70737129, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14465332, + "step": 6369, + "time_per_iteration": 2.810929298400879 + }, + { + "auxiliary_loss_clip": 0.01406737, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_clip": 1.28032458, + "balance_loss_mlp": 1.02953458, + "epoch": 0.38298511949496467, + "flos": 21476304414000.0, + "grad_norm": 2.11750169128907, + "language_loss": 0.78741491, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.81192577, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.14825439, + "step": 6370, + "time_per_iteration": 2.8470981121063232 + }, + { + "auxiliary_loss_clip": 0.01407903, + "auxiliary_loss_mlp": 0.01038147, + "balance_loss_clip": 1.28126001, + "balance_loss_mlp": 1.02356184, + "epoch": 0.38304524274763263, + "flos": 31685486210280.0, + "grad_norm": 2.456649824664342, + "language_loss": 0.64070517, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66516566, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14587402, + "step": 6371, + "time_per_iteration": 2.828538656234741 + }, + { + "auxiliary_loss_clip": 0.01416152, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.28284121, + "balance_loss_mlp": 1.02470863, + "epoch": 0.3831053660003006, + "flos": 25081539027120.0, + "grad_norm": 2.6150621630743203, + "language_loss": 0.73102403, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.755588, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.15527344, + "step": 6372, + "time_per_iteration": 5.817823648452759 + }, + { + "auxiliary_loss_clip": 0.01405554, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.27724528, + "balance_loss_mlp": 1.01727629, + "epoch": 0.38316548925296856, + "flos": 23264351809920.0, + "grad_norm": 1.8001537654488593, + "language_loss": 0.8544482, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.8788234, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.14691162, + "step": 6373, + "time_per_iteration": 2.8560643196105957 + }, + { + "auxiliary_loss_clip": 0.01408881, + "auxiliary_loss_mlp": 0.01042679, + "balance_loss_clip": 1.28070259, + "balance_loss_mlp": 1.02780175, + "epoch": 0.3832256125056366, + "flos": 34430231140200.0, + "grad_norm": 1.9997025299965827, + "language_loss": 0.74915451, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77367008, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.14880371, + "step": 6374, + "time_per_iteration": 2.885033130645752 + }, + { + "auxiliary_loss_clip": 0.01412292, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.28085852, + "balance_loss_mlp": 1.02629709, + "epoch": 0.38328573575830455, + "flos": 21767858414280.0, + "grad_norm": 2.3581391191126047, + "language_loss": 0.76616609, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.79071081, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15875244, + "step": 6375, + "time_per_iteration": 2.793088436126709 + }, + { + "auxiliary_loss_clip": 0.01410405, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.28062344, + "balance_loss_mlp": 1.01821303, + "epoch": 0.3833458590109725, + "flos": 17383949628120.0, + "grad_norm": 2.002478183680199, + "language_loss": 0.73233342, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.75676912, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1494751, + "step": 6376, + "time_per_iteration": 2.7492053508758545 + }, + { + "auxiliary_loss_clip": 0.01406373, + "auxiliary_loss_mlp": 0.01041766, + "balance_loss_clip": 1.28032315, + "balance_loss_mlp": 1.02610147, + "epoch": 0.3834059822636405, + "flos": 29430622821600.0, + "grad_norm": 1.6479085966388534, + "language_loss": 0.68420434, + "learning_rate": 2.826769997289796e-06, + "loss": 0.70868576, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.15686035, + "step": 6377, + "time_per_iteration": 2.890941619873047 + }, + { + "auxiliary_loss_clip": 0.01418229, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.28688991, + "balance_loss_mlp": 1.02452326, + "epoch": 0.38346610551630844, + "flos": 21475776505320.0, + "grad_norm": 2.101403892264937, + "language_loss": 0.73706186, + "learning_rate": 2.826415354814344e-06, + "loss": 0.76163858, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.14916992, + "step": 6378, + "time_per_iteration": 2.9270150661468506 + }, + { + "auxiliary_loss_clip": 0.01407786, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.27872133, + "balance_loss_mlp": 1.0181272, + "epoch": 0.3835262287689764, + "flos": 27566712363240.0, + "grad_norm": 1.650325353400538, + "language_loss": 0.69127703, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71568429, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.14831543, + "step": 6379, + "time_per_iteration": 2.9154202938079834 + }, + { + "auxiliary_loss_clip": 0.01404393, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.27928901, + "balance_loss_mlp": 1.02627635, + "epoch": 0.3835863520216444, + "flos": 15527998408320.0, + "grad_norm": 1.8958783910773553, + "language_loss": 0.83463126, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85907757, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.1395874, + "step": 6380, + "time_per_iteration": 2.8391244411468506 + }, + { + "auxiliary_loss_clip": 0.01402447, + "auxiliary_loss_mlp": 0.01034048, + "balance_loss_clip": 1.27755499, + "balance_loss_mlp": 1.01955819, + "epoch": 0.38364647527431234, + "flos": 21909554274240.0, + "grad_norm": 1.4344928181565262, + "language_loss": 0.81323671, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83760166, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.14483643, + "step": 6381, + "time_per_iteration": 2.8077738285064697 + }, + { + "auxiliary_loss_clip": 0.01279657, + "auxiliary_loss_mlp": 0.01007385, + "balance_loss_clip": 1.22395134, + "balance_loss_mlp": 1.00419044, + "epoch": 0.3837065985269803, + "flos": 65549096344080.0, + "grad_norm": 0.8125274723149174, + "language_loss": 0.60494387, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62781429, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.03198242, + "step": 6382, + "time_per_iteration": 3.288278579711914 + }, + { + "auxiliary_loss_clip": 0.01411077, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.28108048, + "balance_loss_mlp": 1.01608658, + "epoch": 0.38376672177964827, + "flos": 28262457619200.0, + "grad_norm": 2.278729283392601, + "language_loss": 0.67374992, + "learning_rate": 2.824641672639794e-06, + "loss": 0.6981777, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15612793, + "step": 6383, + "time_per_iteration": 2.88250994682312 + }, + { + "auxiliary_loss_clip": 0.01411349, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.2827785, + "balance_loss_mlp": 1.02071989, + "epoch": 0.38382684503231623, + "flos": 20636264363040.0, + "grad_norm": 2.3839776221307867, + "language_loss": 0.75368822, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77815568, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.14672852, + "step": 6384, + "time_per_iteration": 2.8433918952941895 + }, + { + "auxiliary_loss_clip": 0.01403052, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.27903295, + "balance_loss_mlp": 1.01921117, + "epoch": 0.3838869682849842, + "flos": 19610079279120.0, + "grad_norm": 1.6972293257141653, + "language_loss": 0.76605183, + "learning_rate": 2.823931980782341e-06, + "loss": 0.7904169, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14251709, + "step": 6385, + "time_per_iteration": 2.800875186920166 + }, + { + "auxiliary_loss_clip": 0.01272783, + "auxiliary_loss_mlp": 0.01001421, + "balance_loss_clip": 1.21681774, + "balance_loss_mlp": 0.99825048, + "epoch": 0.38394709153765216, + "flos": 56568674069280.0, + "grad_norm": 0.9228291598562725, + "language_loss": 0.67018443, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69292647, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.03173828, + "step": 6386, + "time_per_iteration": 3.0959341526031494 + }, + { + "auxiliary_loss_clip": 0.01399429, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.27469182, + "balance_loss_mlp": 1.0144732, + "epoch": 0.3840072147903202, + "flos": 15893100794520.0, + "grad_norm": 1.6907079024980922, + "language_loss": 0.72958261, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.75386339, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14172363, + "step": 6387, + "time_per_iteration": 2.7775192260742188 + }, + { + "auxiliary_loss_clip": 0.01390847, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.26926303, + "balance_loss_mlp": 1.01957107, + "epoch": 0.38406733804298815, + "flos": 28223612399880.0, + "grad_norm": 2.1478087383872597, + "language_loss": 0.81317163, + "learning_rate": 2.822867208702932e-06, + "loss": 0.8374145, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13879395, + "step": 6388, + "time_per_iteration": 2.8114566802978516 + }, + { + "auxiliary_loss_clip": 0.01392313, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.2677629, + "balance_loss_mlp": 1.02172434, + "epoch": 0.3841274612956561, + "flos": 18228212948520.0, + "grad_norm": 1.6111146257948472, + "language_loss": 0.76238382, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78666455, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.14025879, + "step": 6389, + "time_per_iteration": 2.8174550533294678 + }, + { + "auxiliary_loss_clip": 0.01411088, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.28116429, + "balance_loss_mlp": 1.02545214, + "epoch": 0.3841875845483241, + "flos": 19797767429760.0, + "grad_norm": 1.821064210713952, + "language_loss": 0.76496392, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78949785, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.1685791, + "step": 6390, + "time_per_iteration": 2.958916664123535 + }, + { + "auxiliary_loss_clip": 0.01408403, + "auxiliary_loss_mlp": 0.01040666, + "balance_loss_clip": 1.27753901, + "balance_loss_mlp": 1.02421486, + "epoch": 0.38424770780099204, + "flos": 29904951360960.0, + "grad_norm": 2.0040972977151905, + "language_loss": 0.70469886, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72918957, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16455078, + "step": 6391, + "time_per_iteration": 2.872438669204712 + }, + { + "auxiliary_loss_clip": 0.01400462, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.27238643, + "balance_loss_mlp": 1.01661563, + "epoch": 0.38430783105366, + "flos": 20818632818520.0, + "grad_norm": 1.709794788404202, + "language_loss": 0.84545082, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86977941, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15783691, + "step": 6392, + "time_per_iteration": 2.814491033554077 + }, + { + "auxiliary_loss_clip": 0.014025, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.27427733, + "balance_loss_mlp": 1.01881778, + "epoch": 0.384367954306328, + "flos": 11002150112040.0, + "grad_norm": 1.8844928920392188, + "language_loss": 0.61592358, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.64028561, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14892578, + "step": 6393, + "time_per_iteration": 2.8780832290649414 + }, + { + "auxiliary_loss_clip": 0.01419745, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.28563869, + "balance_loss_mlp": 1.01442695, + "epoch": 0.38442807755899594, + "flos": 25343547122520.0, + "grad_norm": 2.84980526586131, + "language_loss": 0.71338546, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73790628, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.17895508, + "step": 6394, + "time_per_iteration": 2.8558554649353027 + }, + { + "auxiliary_loss_clip": 0.01416464, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.28233981, + "balance_loss_mlp": 1.02195477, + "epoch": 0.3844882008116639, + "flos": 21074793310080.0, + "grad_norm": 2.3041950747667235, + "language_loss": 0.819309, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.84385318, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.16003418, + "step": 6395, + "time_per_iteration": 2.8655760288238525 + }, + { + "auxiliary_loss_clip": 0.01404388, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.27533174, + "balance_loss_mlp": 1.01997137, + "epoch": 0.38454832406433187, + "flos": 17967179453760.0, + "grad_norm": 2.099313588145785, + "language_loss": 0.71150517, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73589867, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.14978027, + "step": 6396, + "time_per_iteration": 2.788940191268921 + }, + { + "auxiliary_loss_clip": 0.01260904, + "auxiliary_loss_mlp": 0.00999447, + "balance_loss_clip": 1.20385659, + "balance_loss_mlp": 0.99657363, + "epoch": 0.38460844731699984, + "flos": 67941895996440.0, + "grad_norm": 0.8834303856519689, + "language_loss": 0.59726, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61986351, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.02868652, + "step": 6397, + "time_per_iteration": 3.3782808780670166 + }, + { + "auxiliary_loss_clip": 0.01396989, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.27211022, + "balance_loss_mlp": 1.01378417, + "epoch": 0.3846685705696678, + "flos": 25854690463200.0, + "grad_norm": 2.589495210223964, + "language_loss": 0.84920436, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87347245, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.16052246, + "step": 6398, + "time_per_iteration": 2.816347360610962 + }, + { + "auxiliary_loss_clip": 0.01399054, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.27229881, + "balance_loss_mlp": 1.01620209, + "epoch": 0.38472869382233577, + "flos": 16294611898440.0, + "grad_norm": 2.4166609577471805, + "language_loss": 0.80365837, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.8279596, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.14849854, + "step": 6399, + "time_per_iteration": 2.823310375213623 + }, + { + "auxiliary_loss_clip": 0.0140371, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.27440774, + "balance_loss_mlp": 1.01323891, + "epoch": 0.38478881707500373, + "flos": 19357695365040.0, + "grad_norm": 2.23996482124204, + "language_loss": 0.67986047, + "learning_rate": 2.818605315732038e-06, + "loss": 0.70419621, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16638184, + "step": 6400, + "time_per_iteration": 2.749361991882324 + }, + { + "auxiliary_loss_clip": 0.01411426, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.28079867, + "balance_loss_mlp": 1.02639389, + "epoch": 0.38484894032767175, + "flos": 24865929306000.0, + "grad_norm": 1.750153263168277, + "language_loss": 0.73673338, + "learning_rate": 2.81824995589303e-06, + "loss": 0.76126724, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15551758, + "step": 6401, + "time_per_iteration": 2.9378793239593506 + }, + { + "auxiliary_loss_clip": 0.01400376, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.27182388, + "balance_loss_mlp": 1.01563168, + "epoch": 0.3849090635803397, + "flos": 14505914768760.0, + "grad_norm": 2.4335530318431853, + "language_loss": 0.72578359, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.75010026, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.15661621, + "step": 6402, + "time_per_iteration": 4.173814296722412 + }, + { + "auxiliary_loss_clip": 0.01393129, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.26725173, + "balance_loss_mlp": 1.01490176, + "epoch": 0.3849691868330077, + "flos": 18520579116000.0, + "grad_norm": 1.8564238096099441, + "language_loss": 0.83040702, + "learning_rate": 2.817539143144128e-06, + "loss": 0.85463488, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14752197, + "step": 6403, + "time_per_iteration": 2.714515209197998 + }, + { + "auxiliary_loss_clip": 0.01398166, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.27206254, + "balance_loss_mlp": 1.02009642, + "epoch": 0.38502931008567565, + "flos": 21621411376200.0, + "grad_norm": 2.00262942285451, + "language_loss": 0.83428216, + "learning_rate": 2.817183690261189e-06, + "loss": 0.85861528, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15045166, + "step": 6404, + "time_per_iteration": 2.7641708850860596 + }, + { + "auxiliary_loss_clip": 0.01400827, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.27093482, + "balance_loss_mlp": 1.01937807, + "epoch": 0.3850894333383436, + "flos": 25421440602960.0, + "grad_norm": 1.6805780400464847, + "language_loss": 0.70038462, + "learning_rate": 2.816828206390563e-06, + "loss": 0.72473812, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.15148926, + "step": 6405, + "time_per_iteration": 4.314226150512695 + }, + { + "auxiliary_loss_clip": 0.01392467, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.26789951, + "balance_loss_mlp": 1.02177262, + "epoch": 0.3851495565910116, + "flos": 20232438582600.0, + "grad_norm": 1.947066134270895, + "language_loss": 0.78942293, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81370282, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.13745117, + "step": 6406, + "time_per_iteration": 2.728329658508301 + }, + { + "auxiliary_loss_clip": 0.0140577, + "auxiliary_loss_mlp": 0.01037636, + "balance_loss_clip": 1.27730024, + "balance_loss_mlp": 1.02230573, + "epoch": 0.38520967984367954, + "flos": 16512820554600.0, + "grad_norm": 2.7951274793925176, + "language_loss": 0.84831905, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.87275314, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15332031, + "step": 6407, + "time_per_iteration": 2.756279706954956 + }, + { + "auxiliary_loss_clip": 0.01255248, + "auxiliary_loss_mlp": 0.01009146, + "balance_loss_clip": 1.19939208, + "balance_loss_mlp": 1.00612974, + "epoch": 0.3852698030963475, + "flos": 61328852540640.0, + "grad_norm": 0.8415172392374133, + "language_loss": 0.64904916, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67169315, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.03015137, + "step": 6408, + "time_per_iteration": 3.3438894748687744 + }, + { + "auxiliary_loss_clip": 0.01400573, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.27204323, + "balance_loss_mlp": 1.02507699, + "epoch": 0.3853299263490155, + "flos": 22898152998000.0, + "grad_norm": 2.891772455491428, + "language_loss": 0.73571813, + "learning_rate": 2.8154059613008e-06, + "loss": 0.76014799, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.17346191, + "step": 6409, + "time_per_iteration": 2.7546160221099854 + }, + { + "auxiliary_loss_clip": 0.01418861, + "auxiliary_loss_mlp": 0.01046722, + "balance_loss_clip": 1.28361583, + "balance_loss_mlp": 1.03030634, + "epoch": 0.38539004960168344, + "flos": 20052303586920.0, + "grad_norm": 1.9581878685488925, + "language_loss": 0.70784086, + "learning_rate": 2.81505032269396e-06, + "loss": 0.73249668, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16418457, + "step": 6410, + "time_per_iteration": 2.8449172973632812 + }, + { + "auxiliary_loss_clip": 0.01254449, + "auxiliary_loss_mlp": 0.01007385, + "balance_loss_clip": 1.19856751, + "balance_loss_mlp": 1.00464344, + "epoch": 0.3854501728543514, + "flos": 68748085656360.0, + "grad_norm": 0.6756467951814207, + "language_loss": 0.60390687, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62652528, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.02746582, + "step": 6411, + "time_per_iteration": 4.8846423625946045 + }, + { + "auxiliary_loss_clip": 0.01400891, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.27265501, + "balance_loss_mlp": 1.01938868, + "epoch": 0.38551029610701937, + "flos": 20489857933320.0, + "grad_norm": 1.9959826317625142, + "language_loss": 0.77909279, + "learning_rate": 2.814338952773397e-06, + "loss": 0.80343288, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.13726807, + "step": 6412, + "time_per_iteration": 4.303630352020264 + }, + { + "auxiliary_loss_clip": 0.01409638, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.27769017, + "balance_loss_mlp": 1.02036214, + "epoch": 0.38557041935968733, + "flos": 23476347387000.0, + "grad_norm": 2.874111993372091, + "language_loss": 0.78283703, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80731082, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.17370605, + "step": 6413, + "time_per_iteration": 2.8506579399108887 + }, + { + "auxiliary_loss_clip": 0.0124895, + "auxiliary_loss_mlp": 0.00999114, + "balance_loss_clip": 1.19205332, + "balance_loss_mlp": 0.99655116, + "epoch": 0.38563054261235535, + "flos": 63980678896920.0, + "grad_norm": 0.8093856564261408, + "language_loss": 0.61325085, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63573146, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.02563477, + "step": 6414, + "time_per_iteration": 3.0996010303497314 + }, + { + "auxiliary_loss_clip": 0.01412236, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.28052592, + "balance_loss_mlp": 1.01971424, + "epoch": 0.3856906658650233, + "flos": 23993094681360.0, + "grad_norm": 2.20021053224217, + "language_loss": 0.77416545, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79863513, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15026855, + "step": 6415, + "time_per_iteration": 2.7724783420562744 + }, + { + "auxiliary_loss_clip": 0.01395465, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.27358711, + "balance_loss_mlp": 1.01985168, + "epoch": 0.3857507891176913, + "flos": 25012619994240.0, + "grad_norm": 1.6969758923134128, + "language_loss": 0.80225623, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82653916, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.12969971, + "step": 6416, + "time_per_iteration": 2.8906781673431396 + }, + { + "auxiliary_loss_clip": 0.01390185, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.26465058, + "balance_loss_mlp": 1.02131295, + "epoch": 0.38581091237035925, + "flos": 21540918960720.0, + "grad_norm": 2.427886837448295, + "language_loss": 0.79358554, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.8178342, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.13366699, + "step": 6417, + "time_per_iteration": 2.7799389362335205 + }, + { + "auxiliary_loss_clip": 0.01395678, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.26973796, + "balance_loss_mlp": 1.02319384, + "epoch": 0.3858710356230272, + "flos": 17388335331000.0, + "grad_norm": 2.029198113561073, + "language_loss": 0.80116415, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82549727, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14428711, + "step": 6418, + "time_per_iteration": 2.822533130645752 + }, + { + "auxiliary_loss_clip": 0.01392908, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.26819909, + "balance_loss_mlp": 1.01359701, + "epoch": 0.3859311588756952, + "flos": 20344466712600.0, + "grad_norm": 1.7910479622309616, + "language_loss": 0.8029424, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.82714641, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.13903809, + "step": 6419, + "time_per_iteration": 2.826728105545044 + }, + { + "auxiliary_loss_clip": 0.01394295, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.26899016, + "balance_loss_mlp": 1.02133965, + "epoch": 0.38599128212836314, + "flos": 26326541892600.0, + "grad_norm": 1.8827576236138066, + "language_loss": 0.68640846, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.71071643, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15161133, + "step": 6420, + "time_per_iteration": 2.8387467861175537 + }, + { + "auxiliary_loss_clip": 0.01393175, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.27058911, + "balance_loss_mlp": 1.0242002, + "epoch": 0.3860514053810311, + "flos": 13557907423800.0, + "grad_norm": 2.1727910806405966, + "language_loss": 0.81187505, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83618355, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.13464355, + "step": 6421, + "time_per_iteration": 2.7377355098724365 + }, + { + "auxiliary_loss_clip": 0.01397766, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.26944411, + "balance_loss_mlp": 1.01956606, + "epoch": 0.3861115286336991, + "flos": 20958501302280.0, + "grad_norm": 2.039981126921611, + "language_loss": 0.72163355, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74595451, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.14764404, + "step": 6422, + "time_per_iteration": 2.7272560596466064 + }, + { + "auxiliary_loss_clip": 0.01392286, + "auxiliary_loss_mlp": 0.01041212, + "balance_loss_clip": 1.27077127, + "balance_loss_mlp": 1.02817702, + "epoch": 0.38617165188636704, + "flos": 16367063858640.0, + "grad_norm": 1.661942309677126, + "language_loss": 0.66644585, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.69078088, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13037109, + "step": 6423, + "time_per_iteration": 2.7358951568603516 + }, + { + "auxiliary_loss_clip": 0.01401807, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.27465153, + "balance_loss_mlp": 1.02558863, + "epoch": 0.386231775139035, + "flos": 34793952842160.0, + "grad_norm": 4.133931090613648, + "language_loss": 0.6945188, + "learning_rate": 2.810068143123449e-06, + "loss": 0.71893531, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.14257812, + "step": 6424, + "time_per_iteration": 2.833561420440674 + }, + { + "auxiliary_loss_clip": 0.01395817, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.27157891, + "balance_loss_mlp": 1.02062535, + "epoch": 0.38629189839170297, + "flos": 21731246654760.0, + "grad_norm": 1.4311124090727492, + "language_loss": 0.72870982, + "learning_rate": 2.809712042331429e-06, + "loss": 0.75301588, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14160156, + "step": 6425, + "time_per_iteration": 2.8290889263153076 + }, + { + "auxiliary_loss_clip": 0.01402069, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.27273071, + "balance_loss_mlp": 1.02308798, + "epoch": 0.38635202164437094, + "flos": 27928850339160.0, + "grad_norm": 1.8615159602562164, + "language_loss": 0.80173373, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82612979, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.14434814, + "step": 6426, + "time_per_iteration": 2.80293345451355 + }, + { + "auxiliary_loss_clip": 0.01403133, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.27530599, + "balance_loss_mlp": 1.01879525, + "epoch": 0.38641214489703896, + "flos": 23591908444320.0, + "grad_norm": 1.728483978200212, + "language_loss": 0.75790608, + "learning_rate": 2.80899974864781e-06, + "loss": 0.78227639, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15112305, + "step": 6427, + "time_per_iteration": 2.858288288116455 + }, + { + "auxiliary_loss_clip": 0.01396931, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.27182603, + "balance_loss_mlp": 1.02182817, + "epoch": 0.3864722681497069, + "flos": 12645131154120.0, + "grad_norm": 1.864920159069114, + "language_loss": 0.71036249, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.73469102, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14105225, + "step": 6428, + "time_per_iteration": 2.717578172683716 + }, + { + "auxiliary_loss_clip": 0.01395246, + "auxiliary_loss_mlp": 0.01042682, + "balance_loss_clip": 1.26808083, + "balance_loss_mlp": 1.02903831, + "epoch": 0.3865323914023749, + "flos": 17602889234760.0, + "grad_norm": 2.5221509751671274, + "language_loss": 0.84401953, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86839885, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.13659668, + "step": 6429, + "time_per_iteration": 2.7083325386047363 + }, + { + "auxiliary_loss_clip": 0.01393801, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.26719856, + "balance_loss_mlp": 1.02692592, + "epoch": 0.38659251465504285, + "flos": 18483601881240.0, + "grad_norm": 1.940354749036998, + "language_loss": 0.81553996, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83988965, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14245605, + "step": 6430, + "time_per_iteration": 2.7362358570098877 + }, + { + "auxiliary_loss_clip": 0.01251028, + "auxiliary_loss_mlp": 0.01020622, + "balance_loss_clip": 1.19348526, + "balance_loss_mlp": 1.0180943, + "epoch": 0.3866526379077108, + "flos": 64181970848160.0, + "grad_norm": 0.721288829391042, + "language_loss": 0.58901572, + "learning_rate": 2.807574793260416e-06, + "loss": 0.61173224, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.02526855, + "step": 6431, + "time_per_iteration": 3.351954460144043 + }, + { + "auxiliary_loss_clip": 0.01402709, + "auxiliary_loss_mlp": 0.01039146, + "balance_loss_clip": 1.27295852, + "balance_loss_mlp": 1.02370811, + "epoch": 0.3867127611603788, + "flos": 14391856220760.0, + "grad_norm": 1.9452087347442, + "language_loss": 0.7939887, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81840724, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.15447998, + "step": 6432, + "time_per_iteration": 2.761186122894287 + }, + { + "auxiliary_loss_clip": 0.01404719, + "auxiliary_loss_mlp": 0.0104083, + "balance_loss_clip": 1.27272713, + "balance_loss_mlp": 1.02533317, + "epoch": 0.38677288441304675, + "flos": 20015529393960.0, + "grad_norm": 2.6177176473329666, + "language_loss": 0.80748999, + "learning_rate": 2.806862131772779e-06, + "loss": 0.83194542, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15478516, + "step": 6433, + "time_per_iteration": 2.7342214584350586 + }, + { + "auxiliary_loss_clip": 0.01400892, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.27316236, + "balance_loss_mlp": 1.02500689, + "epoch": 0.3868330076657147, + "flos": 22242024520200.0, + "grad_norm": 1.7289290322813493, + "language_loss": 0.70806742, + "learning_rate": 2.806505755127765e-06, + "loss": 0.73248655, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.16009521, + "step": 6434, + "time_per_iteration": 2.8099470138549805 + }, + { + "auxiliary_loss_clip": 0.01409506, + "auxiliary_loss_mlp": 0.01040655, + "balance_loss_clip": 1.27600682, + "balance_loss_mlp": 1.02545595, + "epoch": 0.3868931309183827, + "flos": 16731841377960.0, + "grad_norm": 1.835352421096591, + "language_loss": 0.77848935, + "learning_rate": 2.806149347899972e-06, + "loss": 0.80299103, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15197754, + "step": 6435, + "time_per_iteration": 2.7684743404388428 + }, + { + "auxiliary_loss_clip": 0.01394752, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.26970875, + "balance_loss_mlp": 1.02176583, + "epoch": 0.38695325417105064, + "flos": 22679903733480.0, + "grad_norm": 1.7407849194743295, + "language_loss": 0.79746693, + "learning_rate": 2.805792910102915e-06, + "loss": 0.82177705, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14489746, + "step": 6436, + "time_per_iteration": 2.792187213897705 + }, + { + "auxiliary_loss_clip": 0.01388586, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.26468337, + "balance_loss_mlp": 1.01533067, + "epoch": 0.3870133774237186, + "flos": 23117214429720.0, + "grad_norm": 2.735206914343777, + "language_loss": 0.7669245, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79110557, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.14196777, + "step": 6437, + "time_per_iteration": 2.871896266937256 + }, + { + "auxiliary_loss_clip": 0.0139351, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.26934218, + "balance_loss_mlp": 1.02058244, + "epoch": 0.3870735006763866, + "flos": 17680173589800.0, + "grad_norm": 2.1699298933502047, + "language_loss": 0.8251189, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84939146, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.1317749, + "step": 6438, + "time_per_iteration": 2.84306001663208 + }, + { + "auxiliary_loss_clip": 0.01397453, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.27017164, + "balance_loss_mlp": 1.01756299, + "epoch": 0.38713362392905454, + "flos": 23301166611240.0, + "grad_norm": 1.3774746870752557, + "language_loss": 0.75493747, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77924025, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.15264893, + "step": 6439, + "time_per_iteration": 2.8346118927001953 + }, + { + "auxiliary_loss_clip": 0.01385123, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.26293325, + "balance_loss_mlp": 1.01596951, + "epoch": 0.38719374718172256, + "flos": 21035948090760.0, + "grad_norm": 2.3016847880330507, + "language_loss": 0.73874253, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76289463, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.14129639, + "step": 6440, + "time_per_iteration": 4.216813802719116 + }, + { + "auxiliary_loss_clip": 0.01402091, + "auxiliary_loss_mlp": 0.01038951, + "balance_loss_clip": 1.27083325, + "balance_loss_mlp": 1.02297735, + "epoch": 0.3872538704343905, + "flos": 19614546198720.0, + "grad_norm": 1.9999249852902978, + "language_loss": 0.82668364, + "learning_rate": 2.804010263051774e-06, + "loss": 0.85109413, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15991211, + "step": 6441, + "time_per_iteration": 2.913752555847168 + }, + { + "auxiliary_loss_clip": 0.01393055, + "auxiliary_loss_mlp": 0.01037687, + "balance_loss_clip": 1.26769638, + "balance_loss_mlp": 1.02390027, + "epoch": 0.3873139936870585, + "flos": 17534538718920.0, + "grad_norm": 3.1284358039047175, + "language_loss": 0.81019092, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83449829, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.13775635, + "step": 6442, + "time_per_iteration": 2.7554171085357666 + }, + { + "auxiliary_loss_clip": 0.0139563, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.26875818, + "balance_loss_mlp": 1.01744998, + "epoch": 0.38737411693972645, + "flos": 17791227119160.0, + "grad_norm": 1.6422541508273105, + "language_loss": 0.84161043, + "learning_rate": 2.803296990719624e-06, + "loss": 0.86589754, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15637207, + "step": 6443, + "time_per_iteration": 2.692965507507324 + }, + { + "auxiliary_loss_clip": 0.01255485, + "auxiliary_loss_mlp": 0.01001432, + "balance_loss_clip": 1.19935691, + "balance_loss_mlp": 0.99811822, + "epoch": 0.3874342401923944, + "flos": 58317064078680.0, + "grad_norm": 0.7657937333972367, + "language_loss": 0.50312334, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52569252, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03320312, + "step": 6444, + "time_per_iteration": 4.781041860580444 + }, + { + "auxiliary_loss_clip": 0.01386322, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.26575387, + "balance_loss_mlp": 1.02182102, + "epoch": 0.3874943634450624, + "flos": 17716379265720.0, + "grad_norm": 1.4900691934510235, + "language_loss": 0.78953969, + "learning_rate": 2.802583596543065e-06, + "loss": 0.81375521, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.13415527, + "step": 6445, + "time_per_iteration": 2.7391202449798584 + }, + { + "auxiliary_loss_clip": 0.01391856, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.26864696, + "balance_loss_mlp": 1.01521528, + "epoch": 0.38755448669773035, + "flos": 19249565637600.0, + "grad_norm": 1.7681990724062784, + "language_loss": 0.81153429, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83574837, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.14367676, + "step": 6446, + "time_per_iteration": 2.7271337509155273 + }, + { + "auxiliary_loss_clip": 0.01391591, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.26750016, + "balance_loss_mlp": 1.02076077, + "epoch": 0.3876146099503983, + "flos": 20599084086480.0, + "grad_norm": 1.972868412058244, + "language_loss": 0.77232659, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79659581, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14562988, + "step": 6447, + "time_per_iteration": 2.7610042095184326 + }, + { + "auxiliary_loss_clip": 0.01392396, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.27072108, + "balance_loss_mlp": 1.02314353, + "epoch": 0.3876747332030663, + "flos": 19285811921880.0, + "grad_norm": 1.5119890259176485, + "language_loss": 0.76416588, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78846037, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13903809, + "step": 6448, + "time_per_iteration": 2.75175142288208 + }, + { + "auxiliary_loss_clip": 0.0139634, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.27250957, + "balance_loss_mlp": 1.02018237, + "epoch": 0.38773485645573424, + "flos": 18949646315160.0, + "grad_norm": 1.548237325216443, + "language_loss": 0.7605651, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78487402, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14373779, + "step": 6449, + "time_per_iteration": 2.854410409927368 + }, + { + "auxiliary_loss_clip": 0.01395903, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.2671926, + "balance_loss_mlp": 1.0202291, + "epoch": 0.3877949797084022, + "flos": 23075973317160.0, + "grad_norm": 1.684069412290471, + "language_loss": 0.78795528, + "learning_rate": 2.800799578742542e-06, + "loss": 0.81227106, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15454102, + "step": 6450, + "time_per_iteration": 4.471750020980835 + }, + { + "auxiliary_loss_clip": 0.01406317, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.27504659, + "balance_loss_mlp": 1.0206672, + "epoch": 0.3878551029610702, + "flos": 29101482461160.0, + "grad_norm": 2.7660211452220227, + "language_loss": 0.7812565, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.80567604, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.14978027, + "step": 6451, + "time_per_iteration": 4.244182825088501 + }, + { + "auxiliary_loss_clip": 0.013824, + "auxiliary_loss_mlp": 0.01024267, + "balance_loss_clip": 1.26185775, + "balance_loss_mlp": 1.01088536, + "epoch": 0.38791522621373814, + "flos": 21001366749240.0, + "grad_norm": 1.797352425182936, + "language_loss": 0.76685631, + "learning_rate": 2.800085758962812e-06, + "loss": 0.79092294, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.13378906, + "step": 6452, + "time_per_iteration": 2.7575643062591553 + }, + { + "auxiliary_loss_clip": 0.01389421, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.26595485, + "balance_loss_mlp": 1.01885211, + "epoch": 0.3879753494664061, + "flos": 15491061781920.0, + "grad_norm": 1.6855631792978132, + "language_loss": 0.79442394, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81864452, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.13781738, + "step": 6453, + "time_per_iteration": 2.7909834384918213 + }, + { + "auxiliary_loss_clip": 0.01405237, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.2750752, + "balance_loss_mlp": 1.02051425, + "epoch": 0.3880354727190741, + "flos": 22059006330960.0, + "grad_norm": 1.7414225906480938, + "language_loss": 0.72189271, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.74629855, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.14825439, + "step": 6454, + "time_per_iteration": 2.7675740718841553 + }, + { + "auxiliary_loss_clip": 0.01402521, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.27393818, + "balance_loss_mlp": 1.02157044, + "epoch": 0.3880955959717421, + "flos": 20345278879800.0, + "grad_norm": 2.100492495579009, + "language_loss": 0.7782318, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80262482, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15222168, + "step": 6455, + "time_per_iteration": 2.7856523990631104 + }, + { + "auxiliary_loss_clip": 0.01391493, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.26702893, + "balance_loss_mlp": 1.01546407, + "epoch": 0.38815571922441006, + "flos": 23080562061840.0, + "grad_norm": 1.6289979178598337, + "language_loss": 0.76410329, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78832114, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14807129, + "step": 6456, + "time_per_iteration": 2.812084674835205 + }, + { + "auxiliary_loss_clip": 0.01402113, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.27487826, + "balance_loss_mlp": 1.01484799, + "epoch": 0.388215842477078, + "flos": 20781818017200.0, + "grad_norm": 2.0344769759698407, + "language_loss": 0.60449374, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62880379, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.14050293, + "step": 6457, + "time_per_iteration": 2.787783145904541 + }, + { + "auxiliary_loss_clip": 0.0139616, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.26944184, + "balance_loss_mlp": 1.01648235, + "epoch": 0.388275965729746, + "flos": 20452961915280.0, + "grad_norm": 2.2733525232306224, + "language_loss": 0.79318506, + "learning_rate": 2.797943571912841e-06, + "loss": 0.81747049, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.15899658, + "step": 6458, + "time_per_iteration": 2.7780404090881348 + }, + { + "auxiliary_loss_clip": 0.01392312, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.26574016, + "balance_loss_mlp": 1.01845515, + "epoch": 0.38833608898241395, + "flos": 27898289225280.0, + "grad_norm": 1.8568377286514337, + "language_loss": 0.82008797, + "learning_rate": 2.797586434755509e-06, + "loss": 0.84433854, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14312744, + "step": 6459, + "time_per_iteration": 2.7971677780151367 + }, + { + "auxiliary_loss_clip": 0.01378497, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.25688171, + "balance_loss_mlp": 1.01600945, + "epoch": 0.3883962122350819, + "flos": 18080831918160.0, + "grad_norm": 1.9136698368096894, + "language_loss": 0.6139527, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.6380322, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.13452148, + "step": 6460, + "time_per_iteration": 2.7544515132904053 + }, + { + "auxiliary_loss_clip": 0.01386928, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.2650044, + "balance_loss_mlp": 1.01602829, + "epoch": 0.3884563354877499, + "flos": 23627423778120.0, + "grad_norm": 1.6617777089647685, + "language_loss": 0.86333334, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88749623, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.13336182, + "step": 6461, + "time_per_iteration": 2.924774169921875 + }, + { + "auxiliary_loss_clip": 0.01391786, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.26576447, + "balance_loss_mlp": 1.01980758, + "epoch": 0.38851645874041785, + "flos": 27459232369560.0, + "grad_norm": 2.0554948974653025, + "language_loss": 0.71170807, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73596931, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14532471, + "step": 6462, + "time_per_iteration": 2.8371849060058594 + }, + { + "auxiliary_loss_clip": 0.01388912, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.26412988, + "balance_loss_mlp": 1.02541375, + "epoch": 0.3885765819930858, + "flos": 25233508802160.0, + "grad_norm": 2.0925106477885715, + "language_loss": 0.76200336, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78629541, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14868164, + "step": 6463, + "time_per_iteration": 2.7878589630126953 + }, + { + "auxiliary_loss_clip": 0.01396672, + "auxiliary_loss_mlp": 0.01039467, + "balance_loss_clip": 1.2682848, + "balance_loss_mlp": 1.02391005, + "epoch": 0.3886367052457538, + "flos": 16951065243120.0, + "grad_norm": 2.707450932607152, + "language_loss": 0.70656872, + "learning_rate": 2.795800295571382e-06, + "loss": 0.73093009, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15563965, + "step": 6464, + "time_per_iteration": 2.7516989707946777 + }, + { + "auxiliary_loss_clip": 0.01383877, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.26155043, + "balance_loss_mlp": 1.01994956, + "epoch": 0.38869682849842174, + "flos": 27158825746800.0, + "grad_norm": 2.912543757722586, + "language_loss": 0.69597316, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.72016084, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14935303, + "step": 6465, + "time_per_iteration": 2.78009295463562 + }, + { + "auxiliary_loss_clip": 0.01389329, + "auxiliary_loss_mlp": 0.01040271, + "balance_loss_clip": 1.26352394, + "balance_loss_mlp": 1.02434468, + "epoch": 0.3887569517510897, + "flos": 21067849280520.0, + "grad_norm": 1.8229617761393095, + "language_loss": 0.77844524, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80274123, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.15917969, + "step": 6466, + "time_per_iteration": 2.738530397415161 + }, + { + "auxiliary_loss_clip": 0.01392063, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.26497793, + "balance_loss_mlp": 1.02524328, + "epoch": 0.38881707500375773, + "flos": 29503440257040.0, + "grad_norm": 1.6276924371894221, + "language_loss": 0.6915983, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71592873, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.1574707, + "step": 6467, + "time_per_iteration": 2.8398282527923584 + }, + { + "auxiliary_loss_clip": 0.01393996, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.26667285, + "balance_loss_mlp": 1.02895355, + "epoch": 0.3888771982564257, + "flos": 17492079355560.0, + "grad_norm": 2.8298939274870185, + "language_loss": 0.83442765, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85880768, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.15026855, + "step": 6468, + "time_per_iteration": 2.6837575435638428 + }, + { + "auxiliary_loss_clip": 0.01385422, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.26170218, + "balance_loss_mlp": 1.02071309, + "epoch": 0.38893732150909366, + "flos": 21947181242760.0, + "grad_norm": 1.8658319896114521, + "language_loss": 0.84860969, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.87280333, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.13232422, + "step": 6469, + "time_per_iteration": 2.789774179458618 + }, + { + "auxiliary_loss_clip": 0.01390557, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_clip": 1.26562715, + "balance_loss_mlp": 1.02588964, + "epoch": 0.3889974447617616, + "flos": 24281278187760.0, + "grad_norm": 1.7273087388804724, + "language_loss": 0.74670184, + "learning_rate": 2.793655932864273e-06, + "loss": 0.77102071, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.15435791, + "step": 6470, + "time_per_iteration": 2.7453482151031494 + }, + { + "auxiliary_loss_clip": 0.0139031, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.26518381, + "balance_loss_mlp": 1.02658021, + "epoch": 0.3890575680144296, + "flos": 25672768699680.0, + "grad_norm": 1.5737621393658885, + "language_loss": 0.75137293, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77569789, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15600586, + "step": 6471, + "time_per_iteration": 2.8657591342926025 + }, + { + "auxiliary_loss_clip": 0.01394561, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.26981211, + "balance_loss_mlp": 1.03038931, + "epoch": 0.38911769126709755, + "flos": 22860241770960.0, + "grad_norm": 1.6425795882763183, + "language_loss": 0.68226212, + "learning_rate": 2.792940904386562e-06, + "loss": 0.7066552, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14355469, + "step": 6472, + "time_per_iteration": 2.7969698905944824 + }, + { + "auxiliary_loss_clip": 0.01392936, + "auxiliary_loss_mlp": 0.01045951, + "balance_loss_clip": 1.26814759, + "balance_loss_mlp": 1.03111529, + "epoch": 0.3891778145197655, + "flos": 25453057534200.0, + "grad_norm": 3.948199442276076, + "language_loss": 0.77140146, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.79579031, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14849854, + "step": 6473, + "time_per_iteration": 2.954523801803589 + }, + { + "auxiliary_loss_clip": 0.01394454, + "auxiliary_loss_mlp": 0.01045173, + "balance_loss_clip": 1.26885653, + "balance_loss_mlp": 1.03006911, + "epoch": 0.3892379377724335, + "flos": 14032073529720.0, + "grad_norm": 2.0021641630499336, + "language_loss": 0.71422368, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73861992, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.15100098, + "step": 6474, + "time_per_iteration": 2.821859359741211 + }, + { + "auxiliary_loss_clip": 0.01392898, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_clip": 1.26841331, + "balance_loss_mlp": 1.02847242, + "epoch": 0.38929806102510145, + "flos": 20162341907280.0, + "grad_norm": 1.4622603313949165, + "language_loss": 0.68921608, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71357214, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.14239502, + "step": 6475, + "time_per_iteration": 2.747732162475586 + }, + { + "auxiliary_loss_clip": 0.01403929, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.27405477, + "balance_loss_mlp": 1.03018284, + "epoch": 0.3893581842777694, + "flos": 22169085259680.0, + "grad_norm": 1.9493596866962668, + "language_loss": 0.75932217, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78381753, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.15423584, + "step": 6476, + "time_per_iteration": 2.791595458984375 + }, + { + "auxiliary_loss_clip": 0.01257553, + "auxiliary_loss_mlp": 0.01005738, + "balance_loss_clip": 1.20112085, + "balance_loss_mlp": 1.00332987, + "epoch": 0.3894183075304374, + "flos": 67318318442160.0, + "grad_norm": 0.7831809589312632, + "language_loss": 0.58286083, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60549378, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.02404785, + "step": 6477, + "time_per_iteration": 3.2546098232269287 + }, + { + "auxiliary_loss_clip": 0.01397172, + "auxiliary_loss_mlp": 0.01044186, + "balance_loss_clip": 1.2692337, + "balance_loss_mlp": 1.02865875, + "epoch": 0.38947843078310534, + "flos": 18551586921840.0, + "grad_norm": 1.7977580591953854, + "language_loss": 0.78009331, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.8045069, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15533447, + "step": 6478, + "time_per_iteration": 2.7628910541534424 + }, + { + "auxiliary_loss_clip": 0.01387522, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_clip": 1.26400602, + "balance_loss_mlp": 1.02765727, + "epoch": 0.3895385540357733, + "flos": 14609293318080.0, + "grad_norm": 2.3355043869785117, + "language_loss": 0.82569063, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84998721, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.14483643, + "step": 6479, + "time_per_iteration": 4.185853719711304 + }, + { + "auxiliary_loss_clip": 0.01383889, + "auxiliary_loss_mlp": 0.01038765, + "balance_loss_clip": 1.26067722, + "balance_loss_mlp": 1.02377415, + "epoch": 0.38959867728844133, + "flos": 19980095276880.0, + "grad_norm": 1.7837836368181237, + "language_loss": 0.80613589, + "learning_rate": 2.790079588824617e-06, + "loss": 0.83036244, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14978027, + "step": 6480, + "time_per_iteration": 2.8470406532287598 + }, + { + "auxiliary_loss_clip": 0.01386497, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.26330578, + "balance_loss_mlp": 1.01948524, + "epoch": 0.3896588005411093, + "flos": 22676979931560.0, + "grad_norm": 2.008060114656292, + "language_loss": 0.83346379, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85766375, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14013672, + "step": 6481, + "time_per_iteration": 2.7694101333618164 + }, + { + "auxiliary_loss_clip": 0.01383585, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.26381302, + "balance_loss_mlp": 1.02277732, + "epoch": 0.38971892379377726, + "flos": 21000960665640.0, + "grad_norm": 1.6019967543501934, + "language_loss": 0.7580483, + "learning_rate": 2.789363960063863e-06, + "loss": 0.78224027, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.12854004, + "step": 6482, + "time_per_iteration": 2.7545523643493652 + }, + { + "auxiliary_loss_clip": 0.01388675, + "auxiliary_loss_mlp": 0.01042766, + "balance_loss_clip": 1.26337481, + "balance_loss_mlp": 1.02908099, + "epoch": 0.3897790470464452, + "flos": 22533619128840.0, + "grad_norm": 1.900175625914688, + "language_loss": 0.79562402, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81993842, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.13690186, + "step": 6483, + "time_per_iteration": 4.247572660446167 + }, + { + "auxiliary_loss_clip": 0.01385333, + "auxiliary_loss_mlp": 0.0103296, + "balance_loss_clip": 1.26088142, + "balance_loss_mlp": 1.01783276, + "epoch": 0.3898391702991132, + "flos": 26215041671280.0, + "grad_norm": 1.604657327053028, + "language_loss": 0.80336761, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82755053, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.15124512, + "step": 6484, + "time_per_iteration": 2.8202924728393555 + }, + { + "auxiliary_loss_clip": 0.01389448, + "auxiliary_loss_mlp": 0.01045203, + "balance_loss_clip": 1.26620328, + "balance_loss_mlp": 1.02936006, + "epoch": 0.38989929355178116, + "flos": 21070042131960.0, + "grad_norm": 6.825846510675634, + "language_loss": 0.77811009, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80245662, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.1583252, + "step": 6485, + "time_per_iteration": 2.8323123455047607 + }, + { + "auxiliary_loss_clip": 0.01388691, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.2608819, + "balance_loss_mlp": 1.01914692, + "epoch": 0.3899594168044491, + "flos": 25489669293720.0, + "grad_norm": 2.6930214248793183, + "language_loss": 0.85893399, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.88316661, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15423584, + "step": 6486, + "time_per_iteration": 2.846710443496704 + }, + { + "auxiliary_loss_clip": 0.01393278, + "auxiliary_loss_mlp": 0.01034635, + "balance_loss_clip": 1.26364779, + "balance_loss_mlp": 1.01993704, + "epoch": 0.3900195400571171, + "flos": 31145649740280.0, + "grad_norm": 2.0439886056311733, + "language_loss": 0.85728997, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.88156915, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.14697266, + "step": 6487, + "time_per_iteration": 2.851212978363037 + }, + { + "auxiliary_loss_clip": 0.01385307, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.2612251, + "balance_loss_mlp": 1.01517022, + "epoch": 0.39007966330978505, + "flos": 20234753259120.0, + "grad_norm": 1.4879156511682976, + "language_loss": 0.73625058, + "learning_rate": 2.787216355829633e-06, + "loss": 0.76040697, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.15148926, + "step": 6488, + "time_per_iteration": 2.753403902053833 + }, + { + "auxiliary_loss_clip": 0.01390671, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.26308382, + "balance_loss_mlp": 1.01809216, + "epoch": 0.390139786562453, + "flos": 22533903387360.0, + "grad_norm": 1.6984399541036543, + "language_loss": 0.6867786, + "learning_rate": 2.786858317231779e-06, + "loss": 0.71101969, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15344238, + "step": 6489, + "time_per_iteration": 4.177607297897339 + }, + { + "auxiliary_loss_clip": 0.01376779, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.25436926, + "balance_loss_mlp": 1.0145123, + "epoch": 0.390199909815121, + "flos": 26438610630960.0, + "grad_norm": 1.5940704910363699, + "language_loss": 0.81249225, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.8365373, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.13214111, + "step": 6490, + "time_per_iteration": 4.364814281463623 + }, + { + "auxiliary_loss_clip": 0.0138777, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.26227164, + "balance_loss_mlp": 1.01756883, + "epoch": 0.39026003306778895, + "flos": 17279027961120.0, + "grad_norm": 2.0729632656124886, + "language_loss": 0.89806992, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.92227054, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.14703369, + "step": 6491, + "time_per_iteration": 2.7276368141174316 + }, + { + "auxiliary_loss_clip": 0.01387067, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.26033235, + "balance_loss_mlp": 1.02033222, + "epoch": 0.3903201563204569, + "flos": 24537925979640.0, + "grad_norm": 1.7185319908660648, + "language_loss": 0.79240596, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.81662726, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14746094, + "step": 6492, + "time_per_iteration": 2.8937156200408936 + }, + { + "auxiliary_loss_clip": 0.01385336, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.26063526, + "balance_loss_mlp": 1.01630831, + "epoch": 0.39038027957312493, + "flos": 23773099257360.0, + "grad_norm": 1.7624639391631463, + "language_loss": 0.74378431, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76794147, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.14074707, + "step": 6493, + "time_per_iteration": 2.8712306022644043 + }, + { + "auxiliary_loss_clip": 0.01398516, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.26697004, + "balance_loss_mlp": 1.01353574, + "epoch": 0.3904404028257929, + "flos": 14104687923360.0, + "grad_norm": 1.8670297957692086, + "language_loss": 0.76376987, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78804147, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15118408, + "step": 6494, + "time_per_iteration": 2.84053897857666 + }, + { + "auxiliary_loss_clip": 0.0140402, + "auxiliary_loss_mlp": 0.01041301, + "balance_loss_clip": 1.26967931, + "balance_loss_mlp": 1.02519608, + "epoch": 0.39050052607846086, + "flos": 16914494091960.0, + "grad_norm": 2.2058571923960466, + "language_loss": 0.74517554, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.7696287, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.16101074, + "step": 6495, + "time_per_iteration": 2.784228563308716 + }, + { + "auxiliary_loss_clip": 0.01379076, + "auxiliary_loss_mlp": 0.01039494, + "balance_loss_clip": 1.25496078, + "balance_loss_mlp": 1.02346051, + "epoch": 0.39056064933112883, + "flos": 25920563869080.0, + "grad_norm": 1.6708502840642159, + "language_loss": 0.68093705, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70512271, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.16040039, + "step": 6496, + "time_per_iteration": 2.8202908039093018 + }, + { + "auxiliary_loss_clip": 0.01232504, + "auxiliary_loss_mlp": 0.01008839, + "balance_loss_clip": 1.17520058, + "balance_loss_mlp": 1.0055964, + "epoch": 0.3906207725837968, + "flos": 60042243087360.0, + "grad_norm": 0.6960420080268409, + "language_loss": 0.53993452, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56234801, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.0324707, + "step": 6497, + "time_per_iteration": 3.4065287113189697 + }, + { + "auxiliary_loss_clip": 0.01385997, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.26034284, + "balance_loss_mlp": 1.01609814, + "epoch": 0.39068089583646476, + "flos": 21073575059280.0, + "grad_norm": 1.9409023713164217, + "language_loss": 0.69104123, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71520257, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14056396, + "step": 6498, + "time_per_iteration": 2.7952933311462402 + }, + { + "auxiliary_loss_clip": 0.01230843, + "auxiliary_loss_mlp": 0.01007921, + "balance_loss_clip": 1.17421412, + "balance_loss_mlp": 1.00523841, + "epoch": 0.3907410190891327, + "flos": 70463315616840.0, + "grad_norm": 0.7343136039668224, + "language_loss": 0.51737952, + "learning_rate": 2.783276292417936e-06, + "loss": 0.53976715, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.02685547, + "step": 6499, + "time_per_iteration": 3.3046669960021973 + }, + { + "auxiliary_loss_clip": 0.01386401, + "auxiliary_loss_mlp": 0.01038777, + "balance_loss_clip": 1.25850201, + "balance_loss_mlp": 1.02254033, + "epoch": 0.3908011423418007, + "flos": 27967979817000.0, + "grad_norm": 1.5709721919995858, + "language_loss": 0.74229944, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76655126, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.16235352, + "step": 6500, + "time_per_iteration": 2.886137008666992 + }, + { + "auxiliary_loss_clip": 0.01397233, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.268484, + "balance_loss_mlp": 1.02428794, + "epoch": 0.39086126559446865, + "flos": 24467463829080.0, + "grad_norm": 1.8412049472440346, + "language_loss": 0.69176716, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.71613216, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.14990234, + "step": 6501, + "time_per_iteration": 2.8168742656707764 + }, + { + "auxiliary_loss_clip": 0.01378908, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.25380707, + "balance_loss_mlp": 1.02038527, + "epoch": 0.3909213888471366, + "flos": 16945339464360.0, + "grad_norm": 2.820127935792977, + "language_loss": 0.79227036, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81640011, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.13696289, + "step": 6502, + "time_per_iteration": 2.820596218109131 + }, + { + "auxiliary_loss_clip": 0.01373283, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.25160861, + "balance_loss_mlp": 1.02178812, + "epoch": 0.3909815120998046, + "flos": 29284419433680.0, + "grad_norm": 2.1796739371006826, + "language_loss": 0.80834746, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.83243549, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13726807, + "step": 6503, + "time_per_iteration": 2.8209872245788574 + }, + { + "auxiliary_loss_clip": 0.01377413, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.25429964, + "balance_loss_mlp": 1.01576352, + "epoch": 0.39104163535247255, + "flos": 18955696960800.0, + "grad_norm": 1.6985329337685606, + "language_loss": 0.71341527, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73748797, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14099121, + "step": 6504, + "time_per_iteration": 2.771749496459961 + }, + { + "auxiliary_loss_clip": 0.01381974, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.25619113, + "balance_loss_mlp": 1.01889193, + "epoch": 0.3911017586051405, + "flos": 26329019002560.0, + "grad_norm": 1.5141029109206268, + "language_loss": 0.83677799, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.8609271, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14056396, + "step": 6505, + "time_per_iteration": 2.9567391872406006 + }, + { + "auxiliary_loss_clip": 0.01382107, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.25872278, + "balance_loss_mlp": 1.02075732, + "epoch": 0.3911618818578085, + "flos": 21840878891520.0, + "grad_norm": 1.7956788649709885, + "language_loss": 0.7166723, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.74086338, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.16241455, + "step": 6506, + "time_per_iteration": 2.7777135372161865 + }, + { + "auxiliary_loss_clip": 0.01377212, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.25594604, + "balance_loss_mlp": 1.0234375, + "epoch": 0.3912220051104765, + "flos": 16363652756400.0, + "grad_norm": 1.939211951338194, + "language_loss": 0.75631225, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.78046292, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14428711, + "step": 6507, + "time_per_iteration": 2.7740285396575928 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.0101355, + "balance_loss_clip": 1.17365813, + "balance_loss_mlp": 1.00978267, + "epoch": 0.39128212836314447, + "flos": 71066751424560.0, + "grad_norm": 0.7665165048636586, + "language_loss": 0.56609416, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58854187, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.03759766, + "step": 6508, + "time_per_iteration": 3.4439425468444824 + }, + { + "auxiliary_loss_clip": 0.01382248, + "auxiliary_loss_mlp": 0.01041329, + "balance_loss_clip": 1.25755835, + "balance_loss_mlp": 1.02729857, + "epoch": 0.39134225161581243, + "flos": 20335857740280.0, + "grad_norm": 2.015197007930183, + "language_loss": 0.76314253, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78737831, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.14038086, + "step": 6509, + "time_per_iteration": 2.8767971992492676 + }, + { + "auxiliary_loss_clip": 0.0138503, + "auxiliary_loss_mlp": 0.01046581, + "balance_loss_clip": 1.2592895, + "balance_loss_mlp": 1.02940273, + "epoch": 0.3914023748684804, + "flos": 17022705036120.0, + "grad_norm": 3.422514094977178, + "language_loss": 0.82902032, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85333645, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.17175293, + "step": 6510, + "time_per_iteration": 2.7820332050323486 + }, + { + "auxiliary_loss_clip": 0.01387754, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.2602092, + "balance_loss_mlp": 1.0182755, + "epoch": 0.39146249812114836, + "flos": 18409769236800.0, + "grad_norm": 2.6114385236450506, + "language_loss": 0.76899076, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.79320723, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15637207, + "step": 6511, + "time_per_iteration": 2.8807523250579834 + }, + { + "auxiliary_loss_clip": 0.0122628, + "auxiliary_loss_mlp": 0.01011991, + "balance_loss_clip": 1.16851139, + "balance_loss_mlp": 1.00867653, + "epoch": 0.3915226213738163, + "flos": 67654768307400.0, + "grad_norm": 0.7802501307929272, + "language_loss": 0.57758516, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.5999679, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.03320312, + "step": 6512, + "time_per_iteration": 3.345392942428589 + }, + { + "auxiliary_loss_clip": 0.0138509, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.25862539, + "balance_loss_mlp": 1.01733756, + "epoch": 0.3915827446264843, + "flos": 26364940419960.0, + "grad_norm": 1.5610267938386655, + "language_loss": 0.70117271, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72535682, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.15979004, + "step": 6513, + "time_per_iteration": 2.8919150829315186 + }, + { + "auxiliary_loss_clip": 0.0139649, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.26542604, + "balance_loss_mlp": 1.0196023, + "epoch": 0.39164286787915226, + "flos": 21948846185520.0, + "grad_norm": 2.5678469556250496, + "language_loss": 0.76496637, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.78928185, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15447998, + "step": 6514, + "time_per_iteration": 2.8068158626556396 + }, + { + "auxiliary_loss_clip": 0.01380991, + "auxiliary_loss_mlp": 0.01037991, + "balance_loss_clip": 1.25504613, + "balance_loss_mlp": 1.02367949, + "epoch": 0.3917029911318202, + "flos": 16404284743560.0, + "grad_norm": 1.7592162840364305, + "language_loss": 0.78087014, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.80505991, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14324951, + "step": 6515, + "time_per_iteration": 2.7724270820617676 + }, + { + "auxiliary_loss_clip": 0.01375656, + "auxiliary_loss_mlp": 0.01038774, + "balance_loss_clip": 1.25199413, + "balance_loss_mlp": 1.02535093, + "epoch": 0.3917631143844882, + "flos": 26217072089280.0, + "grad_norm": 1.5569895884227714, + "language_loss": 0.79962492, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82376921, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.1340332, + "step": 6516, + "time_per_iteration": 2.8065125942230225 + }, + { + "auxiliary_loss_clip": 0.01378467, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.25183678, + "balance_loss_mlp": 1.02051258, + "epoch": 0.39182323763715615, + "flos": 18552561522480.0, + "grad_norm": 3.314454881484782, + "language_loss": 0.70956296, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.73370582, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.15307617, + "step": 6517, + "time_per_iteration": 2.8190431594848633 + }, + { + "auxiliary_loss_clip": 0.01386438, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.25977612, + "balance_loss_mlp": 1.02134478, + "epoch": 0.3918833608898241, + "flos": 34320233428200.0, + "grad_norm": 1.5476771187431417, + "language_loss": 0.72564018, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74987006, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.15197754, + "step": 6518, + "time_per_iteration": 4.302093744277954 + }, + { + "auxiliary_loss_clip": 0.01388691, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.26123214, + "balance_loss_mlp": 1.02057981, + "epoch": 0.3919434841424921, + "flos": 36946493498880.0, + "grad_norm": 1.489104107522022, + "language_loss": 0.61828685, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.64253318, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.15362549, + "step": 6519, + "time_per_iteration": 2.9113686084747314 + }, + { + "auxiliary_loss_clip": 0.01404458, + "auxiliary_loss_mlp": 0.01041699, + "balance_loss_clip": 1.27120328, + "balance_loss_mlp": 1.02489042, + "epoch": 0.3920036073951601, + "flos": 23513608880280.0, + "grad_norm": 1.705449088511784, + "language_loss": 0.6709609, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69542253, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.16821289, + "step": 6520, + "time_per_iteration": 2.8862569332122803 + }, + { + "auxiliary_loss_clip": 0.01381837, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.25566363, + "balance_loss_mlp": 1.01664829, + "epoch": 0.39206373064782807, + "flos": 18410743837440.0, + "grad_norm": 1.8494519026831309, + "language_loss": 0.79293609, + "learning_rate": 2.775385401898104e-06, + "loss": 0.81707132, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15026855, + "step": 6521, + "time_per_iteration": 2.754551410675049 + }, + { + "auxiliary_loss_clip": 0.0139853, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.26619077, + "balance_loss_mlp": 1.01681089, + "epoch": 0.39212385390049603, + "flos": 12316924785960.0, + "grad_norm": 2.266893299205693, + "language_loss": 0.70644933, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7307781, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.17541504, + "step": 6522, + "time_per_iteration": 4.182288408279419 + }, + { + "auxiliary_loss_clip": 0.01393552, + "auxiliary_loss_mlp": 0.01035059, + "balance_loss_clip": 1.26366878, + "balance_loss_mlp": 1.01930583, + "epoch": 0.392183977153164, + "flos": 19723934785320.0, + "grad_norm": 1.9876734514172945, + "language_loss": 0.77344799, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79773414, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1574707, + "step": 6523, + "time_per_iteration": 2.8793532848358154 + }, + { + "auxiliary_loss_clip": 0.01391116, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_clip": 1.26163578, + "balance_loss_mlp": 1.02948415, + "epoch": 0.39224410040583196, + "flos": 33151296666960.0, + "grad_norm": 3.294014199448641, + "language_loss": 0.62475401, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64910579, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.14581299, + "step": 6524, + "time_per_iteration": 2.8789703845977783 + }, + { + "auxiliary_loss_clip": 0.01383594, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.25476789, + "balance_loss_mlp": 1.02134287, + "epoch": 0.39230422365849993, + "flos": 27788250904920.0, + "grad_norm": 1.6598762793817852, + "language_loss": 0.74470949, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76892686, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.16796875, + "step": 6525, + "time_per_iteration": 2.8807849884033203 + }, + { + "auxiliary_loss_clip": 0.01389095, + "auxiliary_loss_mlp": 0.01037432, + "balance_loss_clip": 1.25928164, + "balance_loss_mlp": 1.02192235, + "epoch": 0.3923643469111679, + "flos": 17936212256280.0, + "grad_norm": 2.19582598781396, + "language_loss": 0.81715453, + "learning_rate": 2.773590027802719e-06, + "loss": 0.84141976, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.15515137, + "step": 6526, + "time_per_iteration": 2.7743144035339355 + }, + { + "auxiliary_loss_clip": 0.01393439, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.26403797, + "balance_loss_mlp": 1.02295101, + "epoch": 0.39242447016383586, + "flos": 24064693866000.0, + "grad_norm": 1.7120541894200445, + "language_loss": 0.70425606, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72857344, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.15344238, + "step": 6527, + "time_per_iteration": 2.8576464653015137 + }, + { + "auxiliary_loss_clip": 0.01390682, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.26449323, + "balance_loss_mlp": 1.01964688, + "epoch": 0.3924845934165038, + "flos": 10666796672520.0, + "grad_norm": 2.6126925390414875, + "language_loss": 0.83239591, + "learning_rate": 2.772871672726965e-06, + "loss": 0.85665679, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.15759277, + "step": 6528, + "time_per_iteration": 4.350889444351196 + }, + { + "auxiliary_loss_clip": 0.01383755, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.26012683, + "balance_loss_mlp": 1.01902342, + "epoch": 0.3925447166691718, + "flos": 31251830266440.0, + "grad_norm": 1.6740192359230048, + "language_loss": 0.69442356, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.71860194, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.1505127, + "step": 6529, + "time_per_iteration": 4.5252814292907715 + }, + { + "auxiliary_loss_clip": 0.01392586, + "auxiliary_loss_mlp": 0.01032678, + "balance_loss_clip": 1.26346338, + "balance_loss_mlp": 1.01627517, + "epoch": 0.39260483992183975, + "flos": 29419861606200.0, + "grad_norm": 2.5351279222404015, + "language_loss": 0.80539131, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82964396, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.1640625, + "step": 6530, + "time_per_iteration": 2.8766608238220215 + }, + { + "auxiliary_loss_clip": 0.01384328, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.2583617, + "balance_loss_mlp": 1.01888227, + "epoch": 0.3926649631745077, + "flos": 22863084356160.0, + "grad_norm": 3.8957442919273277, + "language_loss": 0.75927103, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.78345716, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15374756, + "step": 6531, + "time_per_iteration": 2.9033453464508057 + }, + { + "auxiliary_loss_clip": 0.01228809, + "auxiliary_loss_mlp": 0.01010129, + "balance_loss_clip": 1.17246556, + "balance_loss_mlp": 1.00631452, + "epoch": 0.3927250864271757, + "flos": 63907942678200.0, + "grad_norm": 0.8242372561830612, + "language_loss": 0.6035639, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62595332, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03808594, + "step": 6532, + "time_per_iteration": 3.207340955734253 + }, + { + "auxiliary_loss_clip": 0.01231014, + "auxiliary_loss_mlp": 0.0101461, + "balance_loss_clip": 1.17481852, + "balance_loss_mlp": 1.01067591, + "epoch": 0.3927852096798437, + "flos": 68926718142720.0, + "grad_norm": 0.8014577702579312, + "language_loss": 0.5559206, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57837689, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03930664, + "step": 6533, + "time_per_iteration": 3.360241174697876 + }, + { + "auxiliary_loss_clip": 0.01403396, + "auxiliary_loss_mlp": 0.01039331, + "balance_loss_clip": 1.27256823, + "balance_loss_mlp": 1.02293944, + "epoch": 0.39284533293251167, + "flos": 29722014388440.0, + "grad_norm": 1.7351392421550418, + "language_loss": 0.76187944, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78630674, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16394043, + "step": 6534, + "time_per_iteration": 2.893611431121826 + }, + { + "auxiliary_loss_clip": 0.01403622, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.2709105, + "balance_loss_mlp": 1.0248059, + "epoch": 0.39290545618517964, + "flos": 18556703575200.0, + "grad_norm": 2.1743330232911564, + "language_loss": 0.78415322, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80860162, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.16430664, + "step": 6535, + "time_per_iteration": 2.7448673248291016 + }, + { + "auxiliary_loss_clip": 0.01389808, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.26465154, + "balance_loss_mlp": 1.01820683, + "epoch": 0.3929655794378476, + "flos": 26255023924680.0, + "grad_norm": 1.8443806571453627, + "language_loss": 0.69264758, + "learning_rate": 2.769997081218978e-06, + "loss": 0.71687448, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14672852, + "step": 6536, + "time_per_iteration": 2.9055705070495605 + }, + { + "auxiliary_loss_clip": 0.01382392, + "auxiliary_loss_mlp": 0.01039215, + "balance_loss_clip": 1.25934672, + "balance_loss_mlp": 1.02455187, + "epoch": 0.39302570269051557, + "flos": 29283891525000.0, + "grad_norm": 1.637263067179429, + "language_loss": 0.68911934, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71333539, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14660645, + "step": 6537, + "time_per_iteration": 2.8602261543273926 + }, + { + "auxiliary_loss_clip": 0.01399251, + "auxiliary_loss_mlp": 0.01041615, + "balance_loss_clip": 1.27028871, + "balance_loss_mlp": 1.02569509, + "epoch": 0.39308582594318353, + "flos": 17351886004920.0, + "grad_norm": 1.618023664494972, + "language_loss": 0.79035217, + "learning_rate": 2.769278141085763e-06, + "loss": 0.8147608, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.15924072, + "step": 6538, + "time_per_iteration": 2.776730537414551 + }, + { + "auxiliary_loss_clip": 0.01231419, + "auxiliary_loss_mlp": 0.01015616, + "balance_loss_clip": 1.1759479, + "balance_loss_mlp": 1.01239729, + "epoch": 0.3931459491958515, + "flos": 61020202420800.0, + "grad_norm": 0.8049685153852145, + "language_loss": 0.61942077, + "learning_rate": 2.768918627255683e-06, + "loss": 0.64189112, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.03222656, + "step": 6539, + "time_per_iteration": 3.0933878421783447 + }, + { + "auxiliary_loss_clip": 0.01396888, + "auxiliary_loss_mlp": 0.01037791, + "balance_loss_clip": 1.26798797, + "balance_loss_mlp": 1.02166176, + "epoch": 0.39320607244851946, + "flos": 39022765009560.0, + "grad_norm": 2.8404857114028896, + "language_loss": 0.68990254, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.71424937, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.16125488, + "step": 6540, + "time_per_iteration": 2.9158170223236084 + }, + { + "auxiliary_loss_clip": 0.01392798, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.26624703, + "balance_loss_mlp": 1.02647948, + "epoch": 0.3932661957011874, + "flos": 24684982143120.0, + "grad_norm": 1.7598124387679914, + "language_loss": 0.72486997, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74922323, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.16027832, + "step": 6541, + "time_per_iteration": 2.819255828857422 + }, + { + "auxiliary_loss_clip": 0.01235461, + "auxiliary_loss_mlp": 0.01008749, + "balance_loss_clip": 1.18002594, + "balance_loss_mlp": 1.00464869, + "epoch": 0.3933263189538554, + "flos": 70111434574800.0, + "grad_norm": 0.988661767533469, + "language_loss": 0.60412264, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62656474, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.04101562, + "step": 6542, + "time_per_iteration": 3.0885069370269775 + }, + { + "auxiliary_loss_clip": 0.0139718, + "auxiliary_loss_mlp": 0.01040779, + "balance_loss_clip": 1.26864767, + "balance_loss_mlp": 1.02563953, + "epoch": 0.39338644220652336, + "flos": 22934155632120.0, + "grad_norm": 1.5058564274595136, + "language_loss": 0.82545924, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84983885, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.15148926, + "step": 6543, + "time_per_iteration": 2.9526853561401367 + }, + { + "auxiliary_loss_clip": 0.01390504, + "auxiliary_loss_mlp": 0.01041061, + "balance_loss_clip": 1.26256895, + "balance_loss_mlp": 1.02476501, + "epoch": 0.3934465654591913, + "flos": 30854501823600.0, + "grad_norm": 1.5052279687122663, + "language_loss": 0.69395202, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71826768, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.1628418, + "step": 6544, + "time_per_iteration": 2.9131414890289307 + }, + { + "auxiliary_loss_clip": 0.01404994, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.27299082, + "balance_loss_mlp": 1.02426505, + "epoch": 0.3935066887118593, + "flos": 29241838245240.0, + "grad_norm": 2.049862138955683, + "language_loss": 0.75190341, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77636135, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.16540527, + "step": 6545, + "time_per_iteration": 2.841045618057251 + }, + { + "auxiliary_loss_clip": 0.01379173, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.25798368, + "balance_loss_mlp": 1.01732159, + "epoch": 0.3935668119645273, + "flos": 19139974009200.0, + "grad_norm": 1.3707469299499695, + "language_loss": 0.74627382, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.77037954, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14080811, + "step": 6546, + "time_per_iteration": 2.8303399085998535 + }, + { + "auxiliary_loss_clip": 0.01413442, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.27807701, + "balance_loss_mlp": 1.01970124, + "epoch": 0.3936269352171953, + "flos": 18520863374520.0, + "grad_norm": 1.6951199490665363, + "language_loss": 0.81642002, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.84091985, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.16845703, + "step": 6547, + "time_per_iteration": 2.7698557376861572 + }, + { + "auxiliary_loss_clip": 0.0139709, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.2678659, + "balance_loss_mlp": 1.02337158, + "epoch": 0.39368705846986324, + "flos": 15637062128040.0, + "grad_norm": 3.94252718127835, + "language_loss": 0.84322119, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86758167, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.15588379, + "step": 6548, + "time_per_iteration": 2.8415863513946533 + }, + { + "auxiliary_loss_clip": 0.0138918, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.26401639, + "balance_loss_mlp": 1.01355481, + "epoch": 0.3937471817225312, + "flos": 21330994410000.0, + "grad_norm": 1.8350209590626914, + "language_loss": 0.73121804, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.75538957, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14428711, + "step": 6549, + "time_per_iteration": 2.806126356124878 + }, + { + "auxiliary_loss_clip": 0.01398854, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.27086914, + "balance_loss_mlp": 1.02253318, + "epoch": 0.39380730497519917, + "flos": 20781533758680.0, + "grad_norm": 1.600496608783028, + "language_loss": 0.77925497, + "learning_rate": 2.764962053731699e-06, + "loss": 0.8036288, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.16003418, + "step": 6550, + "time_per_iteration": 2.8468070030212402 + }, + { + "auxiliary_loss_clip": 0.01397658, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.26925015, + "balance_loss_mlp": 1.01352787, + "epoch": 0.39386742822786713, + "flos": 21613939437960.0, + "grad_norm": 1.746237450846867, + "language_loss": 0.81336468, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83763248, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15600586, + "step": 6551, + "time_per_iteration": 2.825805425643921 + }, + { + "auxiliary_loss_clip": 0.01390139, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.26185799, + "balance_loss_mlp": 1.01906478, + "epoch": 0.3939275514805351, + "flos": 12417501358440.0, + "grad_norm": 2.4991795098754412, + "language_loss": 0.80663645, + "learning_rate": 2.764242299098596e-06, + "loss": 0.83088458, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.15600586, + "step": 6552, + "time_per_iteration": 2.8240818977355957 + }, + { + "auxiliary_loss_clip": 0.01401451, + "auxiliary_loss_mlp": 0.01043718, + "balance_loss_clip": 1.27119493, + "balance_loss_mlp": 1.0270648, + "epoch": 0.39398767473320306, + "flos": 18556703575200.0, + "grad_norm": 2.0274495105017842, + "language_loss": 0.71543789, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73988962, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.16638184, + "step": 6553, + "time_per_iteration": 2.89748477935791 + }, + { + "auxiliary_loss_clip": 0.01392454, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.26546443, + "balance_loss_mlp": 1.02416754, + "epoch": 0.39404779798587103, + "flos": 29314127772000.0, + "grad_norm": 1.615194984902649, + "language_loss": 0.64642489, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.67075455, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.16351318, + "step": 6554, + "time_per_iteration": 2.9139013290405273 + }, + { + "auxiliary_loss_clip": 0.01389904, + "auxiliary_loss_mlp": 0.01040118, + "balance_loss_clip": 1.26384711, + "balance_loss_mlp": 1.02562177, + "epoch": 0.394107921238539, + "flos": 34903788120720.0, + "grad_norm": 2.1512485930683383, + "language_loss": 0.79677379, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.82107401, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.14477539, + "step": 6555, + "time_per_iteration": 3.0046825408935547 + }, + { + "auxiliary_loss_clip": 0.01401502, + "auxiliary_loss_mlp": 0.01043987, + "balance_loss_clip": 1.27281857, + "balance_loss_mlp": 1.02752447, + "epoch": 0.39416804449120696, + "flos": 25086736897200.0, + "grad_norm": 2.975017232104094, + "language_loss": 0.72216916, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.74662399, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.16455078, + "step": 6556, + "time_per_iteration": 2.8368067741394043 + }, + { + "auxiliary_loss_clip": 0.01392634, + "auxiliary_loss_mlp": 0.01033497, + "balance_loss_clip": 1.26497364, + "balance_loss_mlp": 1.01796961, + "epoch": 0.3942281677438749, + "flos": 32313043383840.0, + "grad_norm": 8.571681365346738, + "language_loss": 0.83707929, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86134052, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.1552124, + "step": 6557, + "time_per_iteration": 4.393505573272705 + }, + { + "auxiliary_loss_clip": 0.01393776, + "auxiliary_loss_mlp": 0.01043455, + "balance_loss_clip": 1.26562524, + "balance_loss_mlp": 1.02741003, + "epoch": 0.3942882909965429, + "flos": 24942076626960.0, + "grad_norm": 2.0717643804180765, + "language_loss": 0.80861413, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.83298647, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.16040039, + "step": 6558, + "time_per_iteration": 2.889895439147949 + }, + { + "auxiliary_loss_clip": 0.01395279, + "auxiliary_loss_mlp": 0.01041257, + "balance_loss_clip": 1.26984084, + "balance_loss_mlp": 1.02624857, + "epoch": 0.39434841424921085, + "flos": 11878558272360.0, + "grad_norm": 1.806448684425161, + "language_loss": 0.71715105, + "learning_rate": 2.761722245724792e-06, + "loss": 0.74151647, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15020752, + "step": 6559, + "time_per_iteration": 2.797628879547119 + }, + { + "auxiliary_loss_clip": 0.01410777, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.27659178, + "balance_loss_mlp": 1.02549672, + "epoch": 0.3944085375018789, + "flos": 16365845607840.0, + "grad_norm": 1.9606710359866921, + "language_loss": 0.80438042, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82891262, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.16967773, + "step": 6560, + "time_per_iteration": 2.7893810272216797 + }, + { + "auxiliary_loss_clip": 0.01403381, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.27466559, + "balance_loss_mlp": 1.02830815, + "epoch": 0.39446866075454684, + "flos": 10636844684040.0, + "grad_norm": 2.054320994938075, + "language_loss": 0.83452541, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85900903, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.16668701, + "step": 6561, + "time_per_iteration": 2.7415106296539307 + }, + { + "auxiliary_loss_clip": 0.01395594, + "auxiliary_loss_mlp": 0.01047254, + "balance_loss_clip": 1.26853776, + "balance_loss_mlp": 1.03250766, + "epoch": 0.3945287840072148, + "flos": 18192169706040.0, + "grad_norm": 2.4652637495539937, + "language_loss": 0.80120885, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.82563734, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.14758301, + "step": 6562, + "time_per_iteration": 4.179507255554199 + }, + { + "auxiliary_loss_clip": 0.013931, + "auxiliary_loss_mlp": 0.01043344, + "balance_loss_clip": 1.26860762, + "balance_loss_mlp": 1.0277698, + "epoch": 0.39458890725988277, + "flos": 23045168553120.0, + "grad_norm": 1.5478058906878247, + "language_loss": 0.81429696, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83866143, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.15570068, + "step": 6563, + "time_per_iteration": 2.818965196609497 + }, + { + "auxiliary_loss_clip": 0.01397988, + "auxiliary_loss_mlp": 0.01046262, + "balance_loss_clip": 1.27019286, + "balance_loss_mlp": 1.03053832, + "epoch": 0.39464903051255074, + "flos": 17162695344960.0, + "grad_norm": 2.0063317117217556, + "language_loss": 0.70378518, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72822762, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15698242, + "step": 6564, + "time_per_iteration": 2.74788236618042 + }, + { + "auxiliary_loss_clip": 0.01403001, + "auxiliary_loss_mlp": 0.01048195, + "balance_loss_clip": 1.27435803, + "balance_loss_mlp": 1.03248322, + "epoch": 0.3947091537652187, + "flos": 15893385053040.0, + "grad_norm": 2.1070388539353138, + "language_loss": 0.84177542, + "learning_rate": 2.759561073299676e-06, + "loss": 0.86628735, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.1572876, + "step": 6565, + "time_per_iteration": 2.7425618171691895 + }, + { + "auxiliary_loss_clip": 0.01398432, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.27160311, + "balance_loss_mlp": 1.03493953, + "epoch": 0.39476927701788667, + "flos": 18549312853680.0, + "grad_norm": 1.951118074141771, + "language_loss": 0.83783114, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.86231756, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.15270996, + "step": 6566, + "time_per_iteration": 2.753858804702759 + }, + { + "auxiliary_loss_clip": 0.01414308, + "auxiliary_loss_mlp": 0.01040147, + "balance_loss_clip": 1.27939939, + "balance_loss_mlp": 1.02394688, + "epoch": 0.39482940027055463, + "flos": 22281153998040.0, + "grad_norm": 1.834219669874415, + "language_loss": 0.78271818, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.80726272, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.16174316, + "step": 6567, + "time_per_iteration": 4.429594993591309 + }, + { + "auxiliary_loss_clip": 0.01383232, + "auxiliary_loss_mlp": 0.01049088, + "balance_loss_clip": 1.26082027, + "balance_loss_mlp": 1.03472888, + "epoch": 0.3948895235232226, + "flos": 14761831610160.0, + "grad_norm": 1.6781716126005126, + "language_loss": 0.80520165, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82952482, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.14367676, + "step": 6568, + "time_per_iteration": 2.804842233657837 + }, + { + "auxiliary_loss_clip": 0.01393295, + "auxiliary_loss_mlp": 0.0104495, + "balance_loss_clip": 1.26644969, + "balance_loss_mlp": 1.02991772, + "epoch": 0.39494964677589056, + "flos": 22571002447200.0, + "grad_norm": 1.9907463777098713, + "language_loss": 0.8520838, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.87646627, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15039062, + "step": 6569, + "time_per_iteration": 2.8221077919006348 + }, + { + "auxiliary_loss_clip": 0.01394094, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.26897085, + "balance_loss_mlp": 1.02932441, + "epoch": 0.3950097700285585, + "flos": 22967965414800.0, + "grad_norm": 2.1640127516638508, + "language_loss": 0.75202858, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77641177, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14904785, + "step": 6570, + "time_per_iteration": 2.7875185012817383 + }, + { + "auxiliary_loss_clip": 0.01398871, + "auxiliary_loss_mlp": 0.01041307, + "balance_loss_clip": 1.27070117, + "balance_loss_mlp": 1.02657866, + "epoch": 0.3950698932812265, + "flos": 20600180512200.0, + "grad_norm": 1.5798072789192854, + "language_loss": 0.80044603, + "learning_rate": 2.757398863979922e-06, + "loss": 0.82484782, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14727783, + "step": 6571, + "time_per_iteration": 2.8037545680999756 + }, + { + "auxiliary_loss_clip": 0.01399168, + "auxiliary_loss_mlp": 0.01042582, + "balance_loss_clip": 1.27255499, + "balance_loss_mlp": 1.02813399, + "epoch": 0.39513001653389446, + "flos": 20380631780160.0, + "grad_norm": 2.0609336687719253, + "language_loss": 0.78250414, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8069216, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14440918, + "step": 6572, + "time_per_iteration": 2.900111675262451 + }, + { + "auxiliary_loss_clip": 0.01396629, + "auxiliary_loss_mlp": 0.01042224, + "balance_loss_clip": 1.26703644, + "balance_loss_mlp": 1.02697659, + "epoch": 0.3951901397865625, + "flos": 26468603227800.0, + "grad_norm": 1.8396273570797803, + "language_loss": 0.74400318, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.76839173, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15246582, + "step": 6573, + "time_per_iteration": 2.834272623062134 + }, + { + "auxiliary_loss_clip": 0.01392324, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.26759386, + "balance_loss_mlp": 1.021034, + "epoch": 0.39525026303923044, + "flos": 43846542818640.0, + "grad_norm": 1.59494030890567, + "language_loss": 0.67805529, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70232427, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.13519287, + "step": 6574, + "time_per_iteration": 2.974055290222168 + }, + { + "auxiliary_loss_clip": 0.01398043, + "auxiliary_loss_mlp": 0.01037984, + "balance_loss_clip": 1.26862264, + "balance_loss_mlp": 1.02263618, + "epoch": 0.3953103862918984, + "flos": 18045357192720.0, + "grad_norm": 2.407819114435543, + "language_loss": 0.72144526, + "learning_rate": 2.755956816505072e-06, + "loss": 0.74580556, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.15344238, + "step": 6575, + "time_per_iteration": 2.868678092956543 + }, + { + "auxiliary_loss_clip": 0.01405451, + "auxiliary_loss_mlp": 0.01041983, + "balance_loss_clip": 1.27504396, + "balance_loss_mlp": 1.02602065, + "epoch": 0.3953705095445664, + "flos": 16979555330640.0, + "grad_norm": 1.9121756002191614, + "language_loss": 0.73862505, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.76309943, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15966797, + "step": 6576, + "time_per_iteration": 2.80601167678833 + }, + { + "auxiliary_loss_clip": 0.01396593, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.26862621, + "balance_loss_mlp": 1.02867115, + "epoch": 0.39543063279723434, + "flos": 17414388916920.0, + "grad_norm": 2.2253578506874034, + "language_loss": 0.83606082, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.86045945, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14587402, + "step": 6577, + "time_per_iteration": 2.7509922981262207 + }, + { + "auxiliary_loss_clip": 0.01393752, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.26830649, + "balance_loss_mlp": 1.02167952, + "epoch": 0.3954907560499023, + "flos": 22789414145160.0, + "grad_norm": 2.701196249640354, + "language_loss": 0.90543795, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92974079, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.1484375, + "step": 6578, + "time_per_iteration": 2.8328468799591064 + }, + { + "auxiliary_loss_clip": 0.0139709, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.26750255, + "balance_loss_mlp": 1.01628757, + "epoch": 0.39555087930257027, + "flos": 21949414702560.0, + "grad_norm": 2.236226158690075, + "language_loss": 0.78360927, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80790561, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.16247559, + "step": 6579, + "time_per_iteration": 2.8131840229034424 + }, + { + "auxiliary_loss_clip": 0.01397405, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.26710486, + "balance_loss_mlp": 1.01193881, + "epoch": 0.39561100255523823, + "flos": 20408472133920.0, + "grad_norm": 2.0151375493921626, + "language_loss": 0.69355643, + "learning_rate": 2.754153612280037e-06, + "loss": 0.71781039, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.1605835, + "step": 6580, + "time_per_iteration": 2.722677230834961 + }, + { + "auxiliary_loss_clip": 0.01389251, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.2641319, + "balance_loss_mlp": 1.01638865, + "epoch": 0.3956711258079062, + "flos": 27970375710240.0, + "grad_norm": 1.7758775979465364, + "language_loss": 0.58994883, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.61414987, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14471436, + "step": 6581, + "time_per_iteration": 2.7743654251098633 + }, + { + "auxiliary_loss_clip": 0.01392874, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.26635122, + "balance_loss_mlp": 1.02000701, + "epoch": 0.39573124906057416, + "flos": 14432325774480.0, + "grad_norm": 2.19873168346993, + "language_loss": 0.69727063, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.7215566, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.15722656, + "step": 6582, + "time_per_iteration": 2.7250795364379883 + }, + { + "auxiliary_loss_clip": 0.01399155, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.27120495, + "balance_loss_mlp": 1.015692, + "epoch": 0.39579137231324213, + "flos": 18738300471840.0, + "grad_norm": 2.775990569756829, + "language_loss": 0.76487625, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78917503, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15045166, + "step": 6583, + "time_per_iteration": 2.6959431171417236 + }, + { + "auxiliary_loss_clip": 0.01397026, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.26920009, + "balance_loss_mlp": 1.02324319, + "epoch": 0.3958514955659101, + "flos": 17680985757000.0, + "grad_norm": 2.3717769910600874, + "language_loss": 0.66154319, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68589079, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.1449585, + "step": 6584, + "time_per_iteration": 2.7552716732025146 + }, + { + "auxiliary_loss_clip": 0.01404304, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.27414823, + "balance_loss_mlp": 1.02149284, + "epoch": 0.39591161881857806, + "flos": 29314371422160.0, + "grad_norm": 2.2442736219512547, + "language_loss": 0.72841203, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.75283051, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.16052246, + "step": 6585, + "time_per_iteration": 2.838010311126709 + }, + { + "auxiliary_loss_clip": 0.01396695, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.26847696, + "balance_loss_mlp": 1.01824474, + "epoch": 0.3959717420712461, + "flos": 25776878199480.0, + "grad_norm": 2.238345459396569, + "language_loss": 0.73902386, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.76332676, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.15356445, + "step": 6586, + "time_per_iteration": 2.7421462535858154 + }, + { + "auxiliary_loss_clip": 0.01400105, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.27255237, + "balance_loss_mlp": 1.01846409, + "epoch": 0.39603186532391405, + "flos": 20928792963960.0, + "grad_norm": 1.6616504834843855, + "language_loss": 0.71357965, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.7379247, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.15917969, + "step": 6587, + "time_per_iteration": 2.7460741996765137 + }, + { + "auxiliary_loss_clip": 0.01242818, + "auxiliary_loss_mlp": 0.01008133, + "balance_loss_clip": 1.18713748, + "balance_loss_mlp": 1.00524831, + "epoch": 0.396091988576582, + "flos": 54893751229080.0, + "grad_norm": 0.9449453674119658, + "language_loss": 0.61270946, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63521898, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.02880859, + "step": 6588, + "time_per_iteration": 3.0846924781799316 + }, + { + "auxiliary_loss_clip": 0.0140087, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.27239442, + "balance_loss_mlp": 1.02419138, + "epoch": 0.39615211182925, + "flos": 20707619897520.0, + "grad_norm": 2.163024535025351, + "language_loss": 0.81666535, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.84106779, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15185547, + "step": 6589, + "time_per_iteration": 2.8576066493988037 + }, + { + "auxiliary_loss_clip": 0.01397417, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.2691052, + "balance_loss_mlp": 1.01873791, + "epoch": 0.39621223508191794, + "flos": 20999011464360.0, + "grad_norm": 2.058598368941209, + "language_loss": 0.7075814, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.73190081, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15789795, + "step": 6590, + "time_per_iteration": 2.7865355014801025 + }, + { + "auxiliary_loss_clip": 0.01395516, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.26931286, + "balance_loss_mlp": 1.02107668, + "epoch": 0.3962723583345859, + "flos": 23374227696840.0, + "grad_norm": 1.9727544446705936, + "language_loss": 0.7560159, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78032738, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.14544678, + "step": 6591, + "time_per_iteration": 2.869818687438965 + }, + { + "auxiliary_loss_clip": 0.0139694, + "auxiliary_loss_mlp": 0.01039607, + "balance_loss_clip": 1.27045131, + "balance_loss_mlp": 1.02389574, + "epoch": 0.39633248158725387, + "flos": 25120627896600.0, + "grad_norm": 2.736961117194787, + "language_loss": 0.78984618, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81421161, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.15698242, + "step": 6592, + "time_per_iteration": 2.8960392475128174 + }, + { + "auxiliary_loss_clip": 0.01386605, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.26438618, + "balance_loss_mlp": 1.01703978, + "epoch": 0.39639260483992184, + "flos": 39795185495160.0, + "grad_norm": 1.885118694522544, + "language_loss": 0.69508636, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71927023, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.1473999, + "step": 6593, + "time_per_iteration": 2.9523377418518066 + }, + { + "auxiliary_loss_clip": 0.01405011, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.27482069, + "balance_loss_mlp": 1.02080774, + "epoch": 0.3964527280925898, + "flos": 17351439312960.0, + "grad_norm": 1.5328424165296806, + "language_loss": 0.77489161, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.79931056, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.1607666, + "step": 6594, + "time_per_iteration": 2.8019275665283203 + }, + { + "auxiliary_loss_clip": 0.01247956, + "auxiliary_loss_mlp": 0.01010529, + "balance_loss_clip": 1.1928947, + "balance_loss_mlp": 1.00781059, + "epoch": 0.39651285134525777, + "flos": 71735021802000.0, + "grad_norm": 0.9553737402224887, + "language_loss": 0.63084406, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65342885, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.02722168, + "step": 6595, + "time_per_iteration": 3.3344738483428955 + }, + { + "auxiliary_loss_clip": 0.0140731, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.27673888, + "balance_loss_mlp": 1.02084255, + "epoch": 0.39657297459792573, + "flos": 25781263902360.0, + "grad_norm": 2.366756588443914, + "language_loss": 0.63628697, + "learning_rate": 2.748378562795223e-06, + "loss": 0.66073358, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.16503906, + "step": 6596, + "time_per_iteration": 4.196780681610107 + }, + { + "auxiliary_loss_clip": 0.01390095, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.26638889, + "balance_loss_mlp": 1.01469612, + "epoch": 0.3966330978505937, + "flos": 20270918326680.0, + "grad_norm": 2.592850327022216, + "language_loss": 0.78716862, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.8113625, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14593506, + "step": 6597, + "time_per_iteration": 2.8259856700897217 + }, + { + "auxiliary_loss_clip": 0.01410934, + "auxiliary_loss_mlp": 0.01036896, + "balance_loss_clip": 1.2805748, + "balance_loss_mlp": 1.01967037, + "epoch": 0.39669322110326166, + "flos": 20636061321240.0, + "grad_norm": 2.801852013375337, + "language_loss": 0.68210268, + "learning_rate": 2.747656169644941e-06, + "loss": 0.706581, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.17236328, + "step": 6598, + "time_per_iteration": 2.8008053302764893 + }, + { + "auxiliary_loss_clip": 0.01398317, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.27073026, + "balance_loss_mlp": 1.02078772, + "epoch": 0.3967533443559297, + "flos": 21731124829680.0, + "grad_norm": 2.0545752341183823, + "language_loss": 0.79136974, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81570399, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.14337158, + "step": 6599, + "time_per_iteration": 2.768209457397461 + }, + { + "auxiliary_loss_clip": 0.01403206, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.27493119, + "balance_loss_mlp": 1.01712406, + "epoch": 0.39681346760859765, + "flos": 25489709902080.0, + "grad_norm": 1.7217245483353782, + "language_loss": 0.73329926, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75767362, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.17114258, + "step": 6600, + "time_per_iteration": 2.9404184818267822 + }, + { + "auxiliary_loss_clip": 0.01396419, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.26946139, + "balance_loss_mlp": 1.01545072, + "epoch": 0.3968735908612656, + "flos": 20964511339560.0, + "grad_norm": 2.228774208016086, + "language_loss": 0.86370909, + "learning_rate": 2.746572367319791e-06, + "loss": 0.88797575, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.14794922, + "step": 6601, + "time_per_iteration": 4.127038240432739 + }, + { + "auxiliary_loss_clip": 0.01413913, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.28109276, + "balance_loss_mlp": 1.01701915, + "epoch": 0.3969337141139336, + "flos": 10710677328480.0, + "grad_norm": 6.1541181452252065, + "language_loss": 0.70532393, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.7298131, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.1796875, + "step": 6602, + "time_per_iteration": 2.7817771434783936 + }, + { + "auxiliary_loss_clip": 0.01398621, + "auxiliary_loss_mlp": 0.01037291, + "balance_loss_clip": 1.2701149, + "balance_loss_mlp": 1.02210963, + "epoch": 0.39699383736660154, + "flos": 17597001022560.0, + "grad_norm": 2.3612900191409123, + "language_loss": 0.83483559, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85919476, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15179443, + "step": 6603, + "time_per_iteration": 2.709991693496704 + }, + { + "auxiliary_loss_clip": 0.01397317, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.27115703, + "balance_loss_mlp": 1.01627707, + "epoch": 0.3970539606192695, + "flos": 17790536777040.0, + "grad_norm": 2.1019321863748472, + "language_loss": 0.7358374, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.76011878, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.14550781, + "step": 6604, + "time_per_iteration": 2.7819135189056396 + }, + { + "auxiliary_loss_clip": 0.01389652, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.26745749, + "balance_loss_mlp": 1.02002954, + "epoch": 0.3971140838719375, + "flos": 24794776813320.0, + "grad_norm": 1.8854174637005567, + "language_loss": 0.82378834, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84803605, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.15112305, + "step": 6605, + "time_per_iteration": 2.859508752822876 + }, + { + "auxiliary_loss_clip": 0.01395035, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.27025378, + "balance_loss_mlp": 1.01584113, + "epoch": 0.39717420712460544, + "flos": 24248970914400.0, + "grad_norm": 1.494257417610795, + "language_loss": 0.74304301, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.76728457, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.1328125, + "step": 6606, + "time_per_iteration": 5.776857852935791 + }, + { + "auxiliary_loss_clip": 0.01408186, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.27890015, + "balance_loss_mlp": 1.02269292, + "epoch": 0.3972343303772734, + "flos": 25890205797000.0, + "grad_norm": 2.1094572497565554, + "language_loss": 0.74519229, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76965296, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.1517334, + "step": 6607, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.01411737, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.28275728, + "balance_loss_mlp": 1.02083361, + "epoch": 0.39729445362994137, + "flos": 45631828846080.0, + "grad_norm": 1.551056816233433, + "language_loss": 0.68185031, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70633066, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.15441895, + "step": 6608, + "time_per_iteration": 3.000558376312256 + }, + { + "auxiliary_loss_clip": 0.01409413, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.27927554, + "balance_loss_mlp": 1.02018821, + "epoch": 0.39735457688260933, + "flos": 20198872450080.0, + "grad_norm": 1.9509085041809715, + "language_loss": 0.74550802, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76998162, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.17755127, + "step": 6609, + "time_per_iteration": 2.7380199432373047 + }, + { + "auxiliary_loss_clip": 0.01403613, + "auxiliary_loss_mlp": 0.01034913, + "balance_loss_clip": 1.27585912, + "balance_loss_mlp": 1.01997066, + "epoch": 0.3974147001352773, + "flos": 23336397686520.0, + "grad_norm": 1.7574162924157217, + "language_loss": 0.71684742, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.74123269, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.1494751, + "step": 6610, + "time_per_iteration": 2.8164095878601074 + }, + { + "auxiliary_loss_clip": 0.01391953, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.26955676, + "balance_loss_mlp": 1.02000058, + "epoch": 0.39747482338794526, + "flos": 21693579077880.0, + "grad_norm": 1.5967423299783017, + "language_loss": 0.78781748, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.81207889, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14196777, + "step": 6611, + "time_per_iteration": 2.7756969928741455 + }, + { + "auxiliary_loss_clip": 0.01401341, + "auxiliary_loss_mlp": 0.01041289, + "balance_loss_clip": 1.27523768, + "balance_loss_mlp": 1.02575636, + "epoch": 0.3975349466406133, + "flos": 30994370307360.0, + "grad_norm": 2.8155766533086566, + "language_loss": 0.8000896, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.82451594, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15527344, + "step": 6612, + "time_per_iteration": 2.8339614868164062 + }, + { + "auxiliary_loss_clip": 0.01253959, + "auxiliary_loss_mlp": 0.010212, + "balance_loss_clip": 1.19949412, + "balance_loss_mlp": 1.01824367, + "epoch": 0.39759506989328125, + "flos": 63699155161560.0, + "grad_norm": 0.8696130063965529, + "language_loss": 0.65111983, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6738714, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.02954102, + "step": 6613, + "time_per_iteration": 3.174762487411499 + }, + { + "auxiliary_loss_clip": 0.01402712, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.27634406, + "balance_loss_mlp": 1.02088857, + "epoch": 0.3976551931459492, + "flos": 23701093989120.0, + "grad_norm": 4.7804491946563665, + "language_loss": 0.71715212, + "learning_rate": 2.741872951078109e-06, + "loss": 0.74154252, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15435791, + "step": 6614, + "time_per_iteration": 2.7868692874908447 + }, + { + "auxiliary_loss_clip": 0.01401216, + "auxiliary_loss_mlp": 0.01039916, + "balance_loss_clip": 1.27547383, + "balance_loss_mlp": 1.02495551, + "epoch": 0.3977153163986172, + "flos": 15673958146080.0, + "grad_norm": 1.583355733861587, + "language_loss": 0.81575763, + "learning_rate": 2.741511260213862e-06, + "loss": 0.84016895, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.1496582, + "step": 6615, + "time_per_iteration": 2.8399980068206787 + }, + { + "auxiliary_loss_clip": 0.01407894, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.28065825, + "balance_loss_mlp": 1.02392721, + "epoch": 0.39777543965128515, + "flos": 14068847722680.0, + "grad_norm": 1.8487397673172585, + "language_loss": 0.68022758, + "learning_rate": 2.741149541231434e-06, + "loss": 0.70469213, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.1461792, + "step": 6616, + "time_per_iteration": 2.924696445465088 + }, + { + "auxiliary_loss_clip": 0.01413707, + "auxiliary_loss_mlp": 0.01042583, + "balance_loss_clip": 1.28296685, + "balance_loss_mlp": 1.02787256, + "epoch": 0.3978355629039531, + "flos": 23372684579160.0, + "grad_norm": 2.6515612699356303, + "language_loss": 0.83919519, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86375809, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.14703369, + "step": 6617, + "time_per_iteration": 2.8400609493255615 + }, + { + "auxiliary_loss_clip": 0.01397898, + "auxiliary_loss_mlp": 0.01041241, + "balance_loss_clip": 1.27631533, + "balance_loss_mlp": 1.02691245, + "epoch": 0.3978956861566211, + "flos": 19067522049000.0, + "grad_norm": 1.578320606923424, + "language_loss": 0.72311878, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74751019, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14318848, + "step": 6618, + "time_per_iteration": 2.8051114082336426 + }, + { + "auxiliary_loss_clip": 0.01410637, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.28345942, + "balance_loss_mlp": 1.02207232, + "epoch": 0.39795580940928904, + "flos": 30233929287960.0, + "grad_norm": 3.359207059547013, + "language_loss": 0.66039526, + "learning_rate": 2.740064215712231e-06, + "loss": 0.68489635, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.17382812, + "step": 6619, + "time_per_iteration": 2.8115689754486084 + }, + { + "auxiliary_loss_clip": 0.01252198, + "auxiliary_loss_mlp": 0.01022617, + "balance_loss_clip": 1.19861174, + "balance_loss_mlp": 1.01969647, + "epoch": 0.398015932661957, + "flos": 69862867846560.0, + "grad_norm": 0.7722111549817744, + "language_loss": 0.58234048, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60508859, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.0291748, + "step": 6620, + "time_per_iteration": 3.227151393890381 + }, + { + "auxiliary_loss_clip": 0.01404661, + "auxiliary_loss_mlp": 0.01044817, + "balance_loss_clip": 1.27897239, + "balance_loss_mlp": 1.03061306, + "epoch": 0.39807605591462497, + "flos": 20162707382520.0, + "grad_norm": 1.5385243279099818, + "language_loss": 0.79253298, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81702775, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.14202881, + "step": 6621, + "time_per_iteration": 2.743835926055908 + }, + { + "auxiliary_loss_clip": 0.01407192, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.28251767, + "balance_loss_mlp": 1.02626133, + "epoch": 0.39813617916729294, + "flos": 21146676753240.0, + "grad_norm": 1.8102819805409325, + "language_loss": 0.78258258, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.15008545, + "step": 6622, + "time_per_iteration": 2.772517204284668 + }, + { + "auxiliary_loss_clip": 0.01408503, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.28224921, + "balance_loss_mlp": 1.02484262, + "epoch": 0.3981963024199609, + "flos": 18993039670800.0, + "grad_norm": 1.4127487232687579, + "language_loss": 0.75199038, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77647746, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.15368652, + "step": 6623, + "time_per_iteration": 2.7567296028137207 + }, + { + "auxiliary_loss_clip": 0.01411409, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_clip": 1.28461111, + "balance_loss_mlp": 1.02836967, + "epoch": 0.39825642567262887, + "flos": 16578653352120.0, + "grad_norm": 1.9304015004405142, + "language_loss": 0.79624909, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.82080317, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.15612793, + "step": 6624, + "time_per_iteration": 2.7379868030548096 + }, + { + "auxiliary_loss_clip": 0.01430281, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.29714537, + "balance_loss_mlp": 1.02812481, + "epoch": 0.39831654892529683, + "flos": 22204884852000.0, + "grad_norm": 1.8720035007666147, + "language_loss": 0.83585489, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.86061203, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.17297363, + "step": 6625, + "time_per_iteration": 2.775538206100464 + }, + { + "auxiliary_loss_clip": 0.0141147, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.28521717, + "balance_loss_mlp": 1.03173935, + "epoch": 0.39837667217796485, + "flos": 10491494071680.0, + "grad_norm": 2.111550891744043, + "language_loss": 0.87442952, + "learning_rate": 2.737530807925321e-06, + "loss": 0.89902174, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.16027832, + "step": 6626, + "time_per_iteration": 2.764326810836792 + }, + { + "auxiliary_loss_clip": 0.01410126, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.28316164, + "balance_loss_mlp": 1.02698541, + "epoch": 0.3984367954306328, + "flos": 17969494130280.0, + "grad_norm": 2.328313689732967, + "language_loss": 0.84343624, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86796415, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15686035, + "step": 6627, + "time_per_iteration": 2.7658398151397705 + }, + { + "auxiliary_loss_clip": 0.01408179, + "auxiliary_loss_mlp": 0.0103566, + "balance_loss_clip": 1.2816627, + "balance_loss_mlp": 1.02148628, + "epoch": 0.3984969186833008, + "flos": 22716231234480.0, + "grad_norm": 2.081658601580935, + "language_loss": 0.82895899, + "learning_rate": 2.736806725217998e-06, + "loss": 0.85339743, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.1416626, + "step": 6628, + "time_per_iteration": 2.7798452377319336 + }, + { + "auxiliary_loss_clip": 0.01419955, + "auxiliary_loss_mlp": 0.0104043, + "balance_loss_clip": 1.29052663, + "balance_loss_mlp": 1.02479613, + "epoch": 0.39855704193596875, + "flos": 23411245539960.0, + "grad_norm": 3.2867315418458505, + "language_loss": 0.71600962, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.74061346, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15618896, + "step": 6629, + "time_per_iteration": 2.8132739067077637 + }, + { + "auxiliary_loss_clip": 0.01411683, + "auxiliary_loss_mlp": 0.01040011, + "balance_loss_clip": 1.28936827, + "balance_loss_mlp": 1.02513361, + "epoch": 0.3986171651886367, + "flos": 21256958723760.0, + "grad_norm": 3.5333148211072536, + "language_loss": 0.80305868, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82757556, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.14880371, + "step": 6630, + "time_per_iteration": 2.785945415496826 + }, + { + "auxiliary_loss_clip": 0.01420231, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.29065156, + "balance_loss_mlp": 1.01593971, + "epoch": 0.3986772884413047, + "flos": 12462559656840.0, + "grad_norm": 1.8707655996353543, + "language_loss": 0.74838829, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77291512, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16516113, + "step": 6631, + "time_per_iteration": 2.8008527755737305 + }, + { + "auxiliary_loss_clip": 0.01419167, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.28939438, + "balance_loss_mlp": 1.01911569, + "epoch": 0.39873741169397264, + "flos": 19650873699720.0, + "grad_norm": 1.7318430464035695, + "language_loss": 0.71751362, + "learning_rate": 2.735358224635783e-06, + "loss": 0.74205846, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.1619873, + "step": 6632, + "time_per_iteration": 2.7995285987854004 + }, + { + "auxiliary_loss_clip": 0.01408449, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.28312147, + "balance_loss_mlp": 1.01833248, + "epoch": 0.3987975349466406, + "flos": 21689193375000.0, + "grad_norm": 1.8206709430260999, + "language_loss": 0.74580783, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.77021742, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.14172363, + "step": 6633, + "time_per_iteration": 2.818692445755005 + }, + { + "auxiliary_loss_clip": 0.01410418, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.28310585, + "balance_loss_mlp": 1.01939178, + "epoch": 0.3988576581993086, + "flos": 23919262036920.0, + "grad_norm": 2.1249924688518504, + "language_loss": 0.812783, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83723402, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15307617, + "step": 6634, + "time_per_iteration": 4.197982549667358 + }, + { + "auxiliary_loss_clip": 0.01418639, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.29064107, + "balance_loss_mlp": 1.01546836, + "epoch": 0.39891778145197654, + "flos": 18154420912440.0, + "grad_norm": 2.125277567235429, + "language_loss": 0.7470907, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.77159089, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15905762, + "step": 6635, + "time_per_iteration": 2.8575077056884766 + }, + { + "auxiliary_loss_clip": 0.01430901, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.29630899, + "balance_loss_mlp": 1.02549171, + "epoch": 0.3989779047046445, + "flos": 22599614359800.0, + "grad_norm": 2.5880782260458, + "language_loss": 0.66478246, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68952477, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.17822266, + "step": 6636, + "time_per_iteration": 2.856316566467285 + }, + { + "auxiliary_loss_clip": 0.01406722, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.28136647, + "balance_loss_mlp": 1.01818371, + "epoch": 0.39903802795731247, + "flos": 18081765910440.0, + "grad_norm": 1.8445288016170236, + "language_loss": 0.82084709, + "learning_rate": 2.733546971601763e-06, + "loss": 0.84524632, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.15026855, + "step": 6637, + "time_per_iteration": 2.8228542804718018 + }, + { + "auxiliary_loss_clip": 0.01247859, + "auxiliary_loss_mlp": 0.01004988, + "balance_loss_clip": 1.19475818, + "balance_loss_mlp": 1.00174546, + "epoch": 0.39909815120998043, + "flos": 70458442613640.0, + "grad_norm": 0.7171910021345498, + "language_loss": 0.5322206, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55474913, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.0324707, + "step": 6638, + "time_per_iteration": 3.3140692710876465 + }, + { + "auxiliary_loss_clip": 0.01415985, + "auxiliary_loss_mlp": 0.01040204, + "balance_loss_clip": 1.28773546, + "balance_loss_mlp": 1.02467704, + "epoch": 0.39915827446264845, + "flos": 18553576731480.0, + "grad_norm": 1.4909627651402844, + "language_loss": 0.75355136, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77811325, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15527344, + "step": 6639, + "time_per_iteration": 2.8326385021209717 + }, + { + "auxiliary_loss_clip": 0.01406973, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.28280544, + "balance_loss_mlp": 1.0154624, + "epoch": 0.3992183977153164, + "flos": 29903164593120.0, + "grad_norm": 1.644092024256405, + "language_loss": 0.76216811, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78654385, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.15118408, + "step": 6640, + "time_per_iteration": 4.301151752471924 + }, + { + "auxiliary_loss_clip": 0.01408418, + "auxiliary_loss_mlp": 0.01037043, + "balance_loss_clip": 1.28108454, + "balance_loss_mlp": 1.02193904, + "epoch": 0.3992785209679844, + "flos": 22570230888360.0, + "grad_norm": 2.315335810631896, + "language_loss": 0.81976014, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.8442148, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15112305, + "step": 6641, + "time_per_iteration": 2.7776694297790527 + }, + { + "auxiliary_loss_clip": 0.01418697, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.29093587, + "balance_loss_mlp": 1.01617384, + "epoch": 0.39933864422065235, + "flos": 19687444850880.0, + "grad_norm": 2.2873846394882515, + "language_loss": 0.76901579, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.79352391, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.15930176, + "step": 6642, + "time_per_iteration": 2.77095103263855 + }, + { + "auxiliary_loss_clip": 0.01412541, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.28424716, + "balance_loss_mlp": 1.01949573, + "epoch": 0.3993987674733203, + "flos": 23043584827080.0, + "grad_norm": 2.27492917803787, + "language_loss": 0.72703481, + "learning_rate": 2.731372550178393e-06, + "loss": 0.75150549, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.15032959, + "step": 6643, + "time_per_iteration": 2.737455129623413 + }, + { + "auxiliary_loss_clip": 0.01414907, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.2869978, + "balance_loss_mlp": 1.01735353, + "epoch": 0.3994588907259883, + "flos": 19395565983720.0, + "grad_norm": 1.7787264459345498, + "language_loss": 0.66987062, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.69434237, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14916992, + "step": 6644, + "time_per_iteration": 2.766638994216919 + }, + { + "auxiliary_loss_clip": 0.01409039, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.28149962, + "balance_loss_mlp": 1.02222896, + "epoch": 0.39951901397865625, + "flos": 13738245461280.0, + "grad_norm": 2.6232903459673715, + "language_loss": 0.782583, + "learning_rate": 2.730647521020907e-06, + "loss": 0.8070507, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.15515137, + "step": 6645, + "time_per_iteration": 5.725949048995972 + }, + { + "auxiliary_loss_clip": 0.01419872, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.29101372, + "balance_loss_mlp": 1.01588702, + "epoch": 0.3995791372313242, + "flos": 23591705402520.0, + "grad_norm": 1.7312920762795536, + "language_loss": 0.69969738, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72421604, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.16101074, + "step": 6646, + "time_per_iteration": 2.8751022815704346 + }, + { + "auxiliary_loss_clip": 0.01415899, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.28780031, + "balance_loss_mlp": 1.01762545, + "epoch": 0.3996392604839922, + "flos": 21360215448000.0, + "grad_norm": 3.0502511300252237, + "language_loss": 0.71970171, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74418682, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14990234, + "step": 6647, + "time_per_iteration": 2.753558874130249 + }, + { + "auxiliary_loss_clip": 0.01401775, + "auxiliary_loss_mlp": 0.01035876, + "balance_loss_clip": 1.2794807, + "balance_loss_mlp": 1.02197659, + "epoch": 0.39969938373666014, + "flos": 26037992910960.0, + "grad_norm": 2.5289389317628013, + "language_loss": 0.74653691, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.77091342, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.13891602, + "step": 6648, + "time_per_iteration": 2.839050531387329 + }, + { + "auxiliary_loss_clip": 0.01409393, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.28176236, + "balance_loss_mlp": 1.01585472, + "epoch": 0.3997595069893281, + "flos": 20120654102760.0, + "grad_norm": 1.9450435934895587, + "language_loss": 0.66677648, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.69118953, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.16064453, + "step": 6649, + "time_per_iteration": 2.73374342918396 + }, + { + "auxiliary_loss_clip": 0.01410207, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.28300428, + "balance_loss_mlp": 1.01826835, + "epoch": 0.39981963024199607, + "flos": 27789347330640.0, + "grad_norm": 1.553981231695469, + "language_loss": 0.75242597, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77686548, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15466309, + "step": 6650, + "time_per_iteration": 2.9193813800811768 + }, + { + "auxiliary_loss_clip": 0.01408288, + "auxiliary_loss_mlp": 0.01040783, + "balance_loss_clip": 1.28020978, + "balance_loss_mlp": 1.02484524, + "epoch": 0.39987975349466404, + "flos": 21949455310920.0, + "grad_norm": 1.5647147242182822, + "language_loss": 0.71857196, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74306267, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15942383, + "step": 6651, + "time_per_iteration": 2.8189520835876465 + }, + { + "auxiliary_loss_clip": 0.01409082, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.28018892, + "balance_loss_mlp": 1.02075577, + "epoch": 0.39993987674733206, + "flos": 20709325448640.0, + "grad_norm": 3.395266365557582, + "language_loss": 0.73513627, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75958693, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.15222168, + "step": 6652, + "time_per_iteration": 2.816039800643921 + }, + { + "auxiliary_loss_clip": 0.01244488, + "auxiliary_loss_mlp": 0.01007475, + "balance_loss_clip": 1.19311726, + "balance_loss_mlp": 1.00408959, + "epoch": 0.4, + "flos": 61539589258560.0, + "grad_norm": 0.8491235594607753, + "language_loss": 0.6064052, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62892491, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.03393555, + "step": 6653, + "time_per_iteration": 3.1655993461608887 + }, + { + "auxiliary_loss_clip": 0.01399319, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.27779818, + "balance_loss_mlp": 1.02007031, + "epoch": 0.400060123252668, + "flos": 14506483285800.0, + "grad_norm": 1.9678352721127743, + "language_loss": 0.66719812, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69153911, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14703369, + "step": 6654, + "time_per_iteration": 2.763991355895996 + }, + { + "auxiliary_loss_clip": 0.01403024, + "auxiliary_loss_mlp": 0.01041182, + "balance_loss_clip": 1.27637148, + "balance_loss_mlp": 1.02673948, + "epoch": 0.40012024650533595, + "flos": 19097270995680.0, + "grad_norm": 2.29675012455583, + "language_loss": 0.89810145, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92254347, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14434814, + "step": 6655, + "time_per_iteration": 2.820384979248047 + }, + { + "auxiliary_loss_clip": 0.01394785, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.27423704, + "balance_loss_mlp": 1.01861143, + "epoch": 0.4001803697580039, + "flos": 29356993218960.0, + "grad_norm": 1.843360189428183, + "language_loss": 0.73641133, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.7606765, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13104248, + "step": 6656, + "time_per_iteration": 2.9242537021636963 + }, + { + "auxiliary_loss_clip": 0.01407439, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.28008246, + "balance_loss_mlp": 1.02371895, + "epoch": 0.4002404930106719, + "flos": 20924569694520.0, + "grad_norm": 1.7191825209474303, + "language_loss": 0.73645198, + "learning_rate": 2.726295022603144e-06, + "loss": 0.76091695, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.15332031, + "step": 6657, + "time_per_iteration": 2.8929972648620605 + }, + { + "auxiliary_loss_clip": 0.01414242, + "auxiliary_loss_mlp": 0.01039095, + "balance_loss_clip": 1.28678632, + "balance_loss_mlp": 1.02300227, + "epoch": 0.40030061626333985, + "flos": 28412031501000.0, + "grad_norm": 1.4520938461444401, + "language_loss": 0.79888797, + "learning_rate": 2.725932135056117e-06, + "loss": 0.82342136, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.16101074, + "step": 6658, + "time_per_iteration": 2.889894723892212 + }, + { + "auxiliary_loss_clip": 0.01403214, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.27572608, + "balance_loss_mlp": 1.02252448, + "epoch": 0.4003607395160078, + "flos": 25927183031760.0, + "grad_norm": 1.8030820466894528, + "language_loss": 0.78148586, + "learning_rate": 2.72556921998167e-06, + "loss": 0.80589437, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15100098, + "step": 6659, + "time_per_iteration": 2.8026769161224365 + }, + { + "auxiliary_loss_clip": 0.01382792, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.26426661, + "balance_loss_mlp": 1.01576841, + "epoch": 0.4004208627686758, + "flos": 20772437486040.0, + "grad_norm": 1.8050333904015936, + "language_loss": 0.73018473, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.75429738, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.1270752, + "step": 6660, + "time_per_iteration": 2.7938358783721924 + }, + { + "auxiliary_loss_clip": 0.0140266, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.27714038, + "balance_loss_mlp": 1.02457142, + "epoch": 0.40048098602134374, + "flos": 24686647085880.0, + "grad_norm": 2.1062555386227992, + "language_loss": 0.71573448, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.74014592, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.13916016, + "step": 6661, + "time_per_iteration": 2.8014187812805176 + }, + { + "auxiliary_loss_clip": 0.01406479, + "auxiliary_loss_mlp": 0.01041027, + "balance_loss_clip": 1.27903628, + "balance_loss_mlp": 1.02544665, + "epoch": 0.4005411092740117, + "flos": 23190965857440.0, + "grad_norm": 1.9478645931335334, + "language_loss": 0.75665766, + "learning_rate": 2.724480309731437e-06, + "loss": 0.78113276, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.15582275, + "step": 6662, + "time_per_iteration": 2.7927770614624023 + }, + { + "auxiliary_loss_clip": 0.01409243, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.27902651, + "balance_loss_mlp": 1.01769519, + "epoch": 0.4006012325266797, + "flos": 17525807921520.0, + "grad_norm": 1.9125647485014101, + "language_loss": 0.66593707, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.69035679, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.15032959, + "step": 6663, + "time_per_iteration": 2.780301332473755 + }, + { + "auxiliary_loss_clip": 0.01405885, + "auxiliary_loss_mlp": 0.0103767, + "balance_loss_clip": 1.27868533, + "balance_loss_mlp": 1.02350831, + "epoch": 0.40066135577934764, + "flos": 19860960683880.0, + "grad_norm": 2.0363682258809845, + "language_loss": 0.86672568, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.8911612, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14172363, + "step": 6664, + "time_per_iteration": 2.739262819290161 + }, + { + "auxiliary_loss_clip": 0.01401803, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.27382195, + "balance_loss_mlp": 1.02019501, + "epoch": 0.40072147903201566, + "flos": 18154542737520.0, + "grad_norm": 2.1839843199645905, + "language_loss": 0.84599686, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87036514, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14819336, + "step": 6665, + "time_per_iteration": 2.7635581493377686 + }, + { + "auxiliary_loss_clip": 0.01405969, + "auxiliary_loss_mlp": 0.01035367, + "balance_loss_clip": 1.27822912, + "balance_loss_mlp": 1.01995277, + "epoch": 0.4007816022846836, + "flos": 18665929728360.0, + "grad_norm": 2.5859812631578643, + "language_loss": 0.78789103, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.81230438, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15429688, + "step": 6666, + "time_per_iteration": 2.7551536560058594 + }, + { + "auxiliary_loss_clip": 0.0140834, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.2818743, + "balance_loss_mlp": 1.01768661, + "epoch": 0.4008417255373516, + "flos": 25708771333800.0, + "grad_norm": 1.7690808419809365, + "language_loss": 0.74052966, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.76494479, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.15490723, + "step": 6667, + "time_per_iteration": 2.8548386096954346 + }, + { + "auxiliary_loss_clip": 0.01407192, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.27855229, + "balance_loss_mlp": 1.02369547, + "epoch": 0.40090184879001955, + "flos": 22864018348440.0, + "grad_norm": 1.5893688187077706, + "language_loss": 0.76124328, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78570849, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15649414, + "step": 6668, + "time_per_iteration": 2.850982904434204 + }, + { + "auxiliary_loss_clip": 0.01399054, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.27556109, + "balance_loss_mlp": 1.02135611, + "epoch": 0.4009619720426875, + "flos": 29065885910640.0, + "grad_norm": 2.182519171904486, + "language_loss": 0.8226198, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84697008, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.14611816, + "step": 6669, + "time_per_iteration": 2.8876585960388184 + }, + { + "auxiliary_loss_clip": 0.01221496, + "auxiliary_loss_mlp": 0.0099825, + "balance_loss_clip": 1.1693635, + "balance_loss_mlp": 0.99493617, + "epoch": 0.4010220952953555, + "flos": 66075833295000.0, + "grad_norm": 0.7006943264540753, + "language_loss": 0.53376997, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55596745, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.03320312, + "step": 6670, + "time_per_iteration": 3.436882495880127 + }, + { + "auxiliary_loss_clip": 0.01401821, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.27718246, + "balance_loss_mlp": 1.02061677, + "epoch": 0.40108221854802345, + "flos": 29648831477760.0, + "grad_norm": 1.5519197427403102, + "language_loss": 0.88402259, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.90840006, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.1529541, + "step": 6671, + "time_per_iteration": 2.8852813243865967 + }, + { + "auxiliary_loss_clip": 0.01405646, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.27803433, + "balance_loss_mlp": 1.01919818, + "epoch": 0.4011423418006914, + "flos": 19933250210640.0, + "grad_norm": 1.7101791797749268, + "language_loss": 0.78499067, + "learning_rate": 2.720848825281736e-06, + "loss": 0.80940044, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.16131592, + "step": 6672, + "time_per_iteration": 2.777031660079956 + }, + { + "auxiliary_loss_clip": 0.01401601, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.27637339, + "balance_loss_mlp": 1.01733398, + "epoch": 0.4012024650533594, + "flos": 20089158996600.0, + "grad_norm": 6.08593573537318, + "language_loss": 0.64156801, + "learning_rate": 2.72048552626888e-06, + "loss": 0.66589814, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14093018, + "step": 6673, + "time_per_iteration": 4.162125825881958 + }, + { + "auxiliary_loss_clip": 0.01405596, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.27849913, + "balance_loss_mlp": 1.02008867, + "epoch": 0.40126258830602735, + "flos": 21701578924800.0, + "grad_norm": 1.571999716205767, + "language_loss": 0.80507898, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82949036, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15435791, + "step": 6674, + "time_per_iteration": 2.808894634246826 + }, + { + "auxiliary_loss_clip": 0.01412811, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.28171062, + "balance_loss_mlp": 1.01938546, + "epoch": 0.4013227115586953, + "flos": 12024111926520.0, + "grad_norm": 3.0339429722022535, + "language_loss": 0.8268761, + "learning_rate": 2.719758846294294e-06, + "loss": 0.85134745, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.14953613, + "step": 6675, + "time_per_iteration": 2.745785713195801 + }, + { + "auxiliary_loss_clip": 0.01410447, + "auxiliary_loss_mlp": 0.01036842, + "balance_loss_clip": 1.28383601, + "balance_loss_mlp": 1.02136874, + "epoch": 0.4013828348113633, + "flos": 25453057534200.0, + "grad_norm": 1.800298309133767, + "language_loss": 0.93564343, + "learning_rate": 2.71939546536012e-06, + "loss": 0.96011633, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.15472412, + "step": 6676, + "time_per_iteration": 2.771641254425049 + }, + { + "auxiliary_loss_clip": 0.0141725, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.28284192, + "balance_loss_mlp": 1.02367377, + "epoch": 0.40144295806403124, + "flos": 18586899213840.0, + "grad_norm": 2.4903862891468163, + "language_loss": 0.8006587, + "learning_rate": 2.719032057146399e-06, + "loss": 0.82524753, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.17962646, + "step": 6677, + "time_per_iteration": 2.757397174835205 + }, + { + "auxiliary_loss_clip": 0.0141066, + "auxiliary_loss_mlp": 0.01040672, + "balance_loss_clip": 1.28292954, + "balance_loss_mlp": 1.02532959, + "epoch": 0.4015030813166992, + "flos": 22935455099640.0, + "grad_norm": 3.0966095907692277, + "language_loss": 0.83963674, + "learning_rate": 2.71866862166691e-06, + "loss": 0.86415005, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15344238, + "step": 6678, + "time_per_iteration": 2.757901430130005 + }, + { + "auxiliary_loss_clip": 0.01399204, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.27477956, + "balance_loss_mlp": 1.02052057, + "epoch": 0.4015632045693672, + "flos": 20599977470400.0, + "grad_norm": 3.0012909376040176, + "language_loss": 0.64235383, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66669452, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.14337158, + "step": 6679, + "time_per_iteration": 4.251043081283569 + }, + { + "auxiliary_loss_clip": 0.01399266, + "auxiliary_loss_mlp": 0.01027628, + "balance_loss_clip": 1.27541018, + "balance_loss_mlp": 1.01355505, + "epoch": 0.4016233278220352, + "flos": 23443958896920.0, + "grad_norm": 1.4320394931239127, + "language_loss": 0.78772736, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81199634, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14080811, + "step": 6680, + "time_per_iteration": 2.8290555477142334 + }, + { + "auxiliary_loss_clip": 0.01418754, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.28638172, + "balance_loss_mlp": 1.03147471, + "epoch": 0.40168345107470316, + "flos": 21435672426840.0, + "grad_norm": 2.022678462792023, + "language_loss": 0.75722611, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.7818867, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15838623, + "step": 6681, + "time_per_iteration": 2.82893443107605 + }, + { + "auxiliary_loss_clip": 0.01411968, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.28339505, + "balance_loss_mlp": 1.02088237, + "epoch": 0.4017435743273711, + "flos": 22862434622400.0, + "grad_norm": 7.79628907325912, + "language_loss": 0.64580184, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.67027879, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.14855957, + "step": 6682, + "time_per_iteration": 2.839308023452759 + }, + { + "auxiliary_loss_clip": 0.01406283, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.27721429, + "balance_loss_mlp": 1.02353442, + "epoch": 0.4018036975800391, + "flos": 28628331564240.0, + "grad_norm": 1.8914743720620415, + "language_loss": 0.73108512, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75553399, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15039062, + "step": 6683, + "time_per_iteration": 4.470202207565308 + }, + { + "auxiliary_loss_clip": 0.01403193, + "auxiliary_loss_mlp": 0.01041904, + "balance_loss_clip": 1.27559829, + "balance_loss_mlp": 1.02742052, + "epoch": 0.40186382083270705, + "flos": 26656859895480.0, + "grad_norm": 2.1109882256900825, + "language_loss": 0.73704374, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.7614947, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.14483643, + "step": 6684, + "time_per_iteration": 2.7712032794952393 + }, + { + "auxiliary_loss_clip": 0.01219551, + "auxiliary_loss_mlp": 0.0101798, + "balance_loss_clip": 1.16764379, + "balance_loss_mlp": 1.01493979, + "epoch": 0.401923944085375, + "flos": 59273396137440.0, + "grad_norm": 0.808849318483071, + "language_loss": 0.60409397, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62646931, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.03039551, + "step": 6685, + "time_per_iteration": 3.409499406814575 + }, + { + "auxiliary_loss_clip": 0.01413599, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.28098536, + "balance_loss_mlp": 1.02020955, + "epoch": 0.401984067338043, + "flos": 16987230310680.0, + "grad_norm": 2.6156319110238004, + "language_loss": 0.70487612, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72937143, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.15734863, + "step": 6686, + "time_per_iteration": 2.8049004077911377 + }, + { + "auxiliary_loss_clip": 0.01406189, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.27940893, + "balance_loss_mlp": 1.02273798, + "epoch": 0.40204419059071095, + "flos": 24977876219280.0, + "grad_norm": 1.438128308926408, + "language_loss": 0.74930048, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77373606, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.1461792, + "step": 6687, + "time_per_iteration": 2.808345079421997 + }, + { + "auxiliary_loss_clip": 0.01412418, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.28476548, + "balance_loss_mlp": 1.02458477, + "epoch": 0.4021043138433789, + "flos": 23482763507880.0, + "grad_norm": 2.3470706229969007, + "language_loss": 0.7106666, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73518097, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.14447021, + "step": 6688, + "time_per_iteration": 2.883965015411377 + }, + { + "auxiliary_loss_clip": 0.01420581, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_clip": 1.28804636, + "balance_loss_mlp": 1.03113413, + "epoch": 0.4021644370960469, + "flos": 26001543584880.0, + "grad_norm": 1.8167853142269634, + "language_loss": 0.6457063, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.67038381, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.16040039, + "step": 6689, + "time_per_iteration": 2.8543272018432617 + }, + { + "auxiliary_loss_clip": 0.01410702, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.28015804, + "balance_loss_mlp": 1.02446771, + "epoch": 0.40222456034871484, + "flos": 13591757814840.0, + "grad_norm": 2.0538526611364993, + "language_loss": 0.74023777, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.76474029, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15075684, + "step": 6690, + "time_per_iteration": 2.7997684478759766 + }, + { + "auxiliary_loss_clip": 0.01403808, + "auxiliary_loss_mlp": 0.01039178, + "balance_loss_clip": 1.27729678, + "balance_loss_mlp": 1.02453899, + "epoch": 0.4022846836013828, + "flos": 24283146172320.0, + "grad_norm": 1.5111957680232517, + "language_loss": 0.74978459, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.77421451, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14648438, + "step": 6691, + "time_per_iteration": 2.8593533039093018 + }, + { + "auxiliary_loss_clip": 0.01416294, + "auxiliary_loss_mlp": 0.01042841, + "balance_loss_clip": 1.28682303, + "balance_loss_mlp": 1.02780259, + "epoch": 0.40234480685405083, + "flos": 20155722744600.0, + "grad_norm": 1.5129741249236897, + "language_loss": 0.72537827, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.7499696, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.15045166, + "step": 6692, + "time_per_iteration": 2.818741798400879 + }, + { + "auxiliary_loss_clip": 0.01405918, + "auxiliary_loss_mlp": 0.01037732, + "balance_loss_clip": 1.27986825, + "balance_loss_mlp": 1.02289069, + "epoch": 0.4024049301067188, + "flos": 22935455099640.0, + "grad_norm": 2.2788199736063444, + "language_loss": 0.84402156, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86845803, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14837646, + "step": 6693, + "time_per_iteration": 2.8106741905212402 + }, + { + "auxiliary_loss_clip": 0.01406139, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.27852249, + "balance_loss_mlp": 1.02454221, + "epoch": 0.40246505335938676, + "flos": 36035341563600.0, + "grad_norm": 2.278861578219443, + "language_loss": 0.71286136, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73731035, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.14221191, + "step": 6694, + "time_per_iteration": 2.8474156856536865 + }, + { + "auxiliary_loss_clip": 0.01403978, + "auxiliary_loss_mlp": 0.01042953, + "balance_loss_clip": 1.27755046, + "balance_loss_mlp": 1.02864742, + "epoch": 0.4025251766120547, + "flos": 20599124694840.0, + "grad_norm": 2.018310689597685, + "language_loss": 0.68121576, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.70568508, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14318848, + "step": 6695, + "time_per_iteration": 2.7634730339050293 + }, + { + "auxiliary_loss_clip": 0.0140885, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.28292739, + "balance_loss_mlp": 1.02312279, + "epoch": 0.4025852998647227, + "flos": 64534589486640.0, + "grad_norm": 2.0302825347316733, + "language_loss": 0.79875171, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.82322145, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14990234, + "step": 6696, + "time_per_iteration": 3.1232128143310547 + }, + { + "auxiliary_loss_clip": 0.01416645, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.28769529, + "balance_loss_mlp": 1.0309211, + "epoch": 0.40264542311739066, + "flos": 20891165995440.0, + "grad_norm": 2.610940165973265, + "language_loss": 0.71614909, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.74078214, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.15734863, + "step": 6697, + "time_per_iteration": 2.7432949542999268 + }, + { + "auxiliary_loss_clip": 0.01399626, + "auxiliary_loss_mlp": 0.01046947, + "balance_loss_clip": 1.27560043, + "balance_loss_mlp": 1.03241587, + "epoch": 0.4027055463700586, + "flos": 26255836091880.0, + "grad_norm": 2.1379146902476007, + "language_loss": 0.61529636, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63976204, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14526367, + "step": 6698, + "time_per_iteration": 2.7872166633605957 + }, + { + "auxiliary_loss_clip": 0.01405983, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.27751064, + "balance_loss_mlp": 1.02161074, + "epoch": 0.4027656696227266, + "flos": 20636304971400.0, + "grad_norm": 2.0922326922117094, + "language_loss": 0.7679143, + "learning_rate": 2.711030202621491e-06, + "loss": 0.79234779, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15759277, + "step": 6699, + "time_per_iteration": 2.781669855117798 + }, + { + "auxiliary_loss_clip": 0.01399532, + "auxiliary_loss_mlp": 0.01037972, + "balance_loss_clip": 1.27539253, + "balance_loss_mlp": 1.02366638, + "epoch": 0.40282579287539455, + "flos": 22351413106800.0, + "grad_norm": 1.6208553656961846, + "language_loss": 0.80257082, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.8269459, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14306641, + "step": 6700, + "time_per_iteration": 2.901533842086792 + }, + { + "auxiliary_loss_clip": 0.01415602, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.28360367, + "balance_loss_mlp": 1.02226722, + "epoch": 0.4028859161280625, + "flos": 29280561639480.0, + "grad_norm": 2.5615947878840184, + "language_loss": 0.74686742, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77141273, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.16674805, + "step": 6701, + "time_per_iteration": 2.815566301345825 + }, + { + "auxiliary_loss_clip": 0.01406045, + "auxiliary_loss_mlp": 0.01040522, + "balance_loss_clip": 1.27960038, + "balance_loss_mlp": 1.02658653, + "epoch": 0.4029460393807305, + "flos": 28628331564240.0, + "grad_norm": 1.8160635936615865, + "language_loss": 0.66146016, + "learning_rate": 2.709938026276208e-06, + "loss": 0.6859259, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.13952637, + "step": 6702, + "time_per_iteration": 2.844395160675049 + }, + { + "auxiliary_loss_clip": 0.01413954, + "auxiliary_loss_mlp": 0.01040584, + "balance_loss_clip": 1.28424275, + "balance_loss_mlp": 1.02494407, + "epoch": 0.40300616263339845, + "flos": 22607126906400.0, + "grad_norm": 1.521505992025774, + "language_loss": 0.66161001, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68615544, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15631104, + "step": 6703, + "time_per_iteration": 2.7666988372802734 + }, + { + "auxiliary_loss_clip": 0.01418284, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.28929496, + "balance_loss_mlp": 1.0223763, + "epoch": 0.4030662858860664, + "flos": 25525834361280.0, + "grad_norm": 2.248363258733837, + "language_loss": 0.8189491, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84351563, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.16003418, + "step": 6704, + "time_per_iteration": 2.8539211750030518 + }, + { + "auxiliary_loss_clip": 0.01420246, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.29018235, + "balance_loss_mlp": 1.02419865, + "epoch": 0.40312640913873443, + "flos": 23591989661040.0, + "grad_norm": 1.634098348494236, + "language_loss": 0.73397869, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75858355, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.16033936, + "step": 6705, + "time_per_iteration": 2.8481433391571045 + }, + { + "auxiliary_loss_clip": 0.0140292, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.2786572, + "balance_loss_mlp": 1.02228022, + "epoch": 0.4031865323914024, + "flos": 20015854260840.0, + "grad_norm": 1.6820508731628734, + "language_loss": 0.66003621, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68443334, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.1449585, + "step": 6706, + "time_per_iteration": 2.8626351356506348 + }, + { + "auxiliary_loss_clip": 0.01409066, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.28213859, + "balance_loss_mlp": 1.02191353, + "epoch": 0.40324665564407036, + "flos": 21876434833680.0, + "grad_norm": 1.4432564559752044, + "language_loss": 0.71449196, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.7389515, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.14978027, + "step": 6707, + "time_per_iteration": 2.7845425605773926 + }, + { + "auxiliary_loss_clip": 0.01393086, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.27162421, + "balance_loss_mlp": 1.01429009, + "epoch": 0.4033067788967383, + "flos": 23884071570000.0, + "grad_norm": 1.612358772008975, + "language_loss": 0.80060297, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82483208, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.15539551, + "step": 6708, + "time_per_iteration": 2.785059690475464 + }, + { + "auxiliary_loss_clip": 0.01419424, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.28638887, + "balance_loss_mlp": 1.02403641, + "epoch": 0.4033669021494063, + "flos": 17424297356760.0, + "grad_norm": 1.8388220162004338, + "language_loss": 0.82871187, + "learning_rate": 2.70738867321606e-06, + "loss": 0.85330641, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15966797, + "step": 6709, + "time_per_iteration": 2.7480804920196533 + }, + { + "auxiliary_loss_clip": 0.01419464, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.28839755, + "balance_loss_mlp": 1.0227114, + "epoch": 0.40342702540207426, + "flos": 29605803597360.0, + "grad_norm": 1.4457653311268366, + "language_loss": 0.71444213, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73901987, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15576172, + "step": 6710, + "time_per_iteration": 2.8809807300567627 + }, + { + "auxiliary_loss_clip": 0.0140758, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_clip": 1.28172994, + "balance_loss_mlp": 1.02463937, + "epoch": 0.4034871486547422, + "flos": 11287450424880.0, + "grad_norm": 2.5608382745115446, + "language_loss": 0.85074472, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87522495, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15795898, + "step": 6711, + "time_per_iteration": 4.115978479385376 + }, + { + "auxiliary_loss_clip": 0.01410095, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.28291941, + "balance_loss_mlp": 1.0223732, + "epoch": 0.4035472719074102, + "flos": 15556163628960.0, + "grad_norm": 3.203985907182919, + "language_loss": 0.77172428, + "learning_rate": 2.706295690693168e-06, + "loss": 0.79619664, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.14770508, + "step": 6712, + "time_per_iteration": 2.715654134750366 + }, + { + "auxiliary_loss_clip": 0.01411492, + "auxiliary_loss_mlp": 0.01042351, + "balance_loss_clip": 1.28432155, + "balance_loss_mlp": 1.02740765, + "epoch": 0.40360739516007815, + "flos": 24678890889120.0, + "grad_norm": 2.1528126495424917, + "language_loss": 0.79232693, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81686532, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14935303, + "step": 6713, + "time_per_iteration": 2.7561190128326416 + }, + { + "auxiliary_loss_clip": 0.01410239, + "auxiliary_loss_mlp": 0.0104322, + "balance_loss_clip": 1.27995229, + "balance_loss_mlp": 1.0277462, + "epoch": 0.4036675184127461, + "flos": 17307639873720.0, + "grad_norm": 2.5800109085847307, + "language_loss": 0.87936819, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90390277, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15466309, + "step": 6714, + "time_per_iteration": 2.7303354740142822 + }, + { + "auxiliary_loss_clip": 0.01408638, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.28187275, + "balance_loss_mlp": 1.02857852, + "epoch": 0.4037276416654141, + "flos": 19868676272280.0, + "grad_norm": 1.8138782711450485, + "language_loss": 0.69027364, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71479797, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15222168, + "step": 6715, + "time_per_iteration": 2.735703945159912 + }, + { + "auxiliary_loss_clip": 0.01418817, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.28790903, + "balance_loss_mlp": 1.02516007, + "epoch": 0.40378776491808205, + "flos": 18300867950520.0, + "grad_norm": 2.4377938106476034, + "language_loss": 0.77553141, + "learning_rate": 2.704838005767892e-06, + "loss": 0.80012059, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.14941406, + "step": 6716, + "time_per_iteration": 4.2761149406433105 + }, + { + "auxiliary_loss_clip": 0.01395863, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.27239799, + "balance_loss_mlp": 1.01938701, + "epoch": 0.40384788817075, + "flos": 15053588652240.0, + "grad_norm": 2.2099602019884736, + "language_loss": 0.76453507, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78882754, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.13995361, + "step": 6717, + "time_per_iteration": 2.740431785583496 + }, + { + "auxiliary_loss_clip": 0.01232769, + "auxiliary_loss_mlp": 0.01020824, + "balance_loss_clip": 1.18056715, + "balance_loss_mlp": 1.01826119, + "epoch": 0.40390801142341803, + "flos": 61944552073080.0, + "grad_norm": 0.9432959212632814, + "language_loss": 0.60719657, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62973249, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02563477, + "step": 6718, + "time_per_iteration": 3.094653606414795 + }, + { + "auxiliary_loss_clip": 0.0141791, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.28594065, + "balance_loss_mlp": 1.02858806, + "epoch": 0.403968134676086, + "flos": 22742812729080.0, + "grad_norm": 1.900147534216865, + "language_loss": 0.74887949, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.77350843, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.1640625, + "step": 6719, + "time_per_iteration": 2.7223167419433594 + }, + { + "auxiliary_loss_clip": 0.01408324, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_clip": 1.28012037, + "balance_loss_mlp": 1.03146899, + "epoch": 0.40402825792875396, + "flos": 19788183856800.0, + "grad_norm": 2.015501607495915, + "language_loss": 0.81712687, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.84168124, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15649414, + "step": 6720, + "time_per_iteration": 2.8025007247924805 + }, + { + "auxiliary_loss_clip": 0.01412378, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.28353703, + "balance_loss_mlp": 1.01877677, + "epoch": 0.40408838118142193, + "flos": 19613977681680.0, + "grad_norm": 7.000563306266403, + "language_loss": 0.76776218, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79222381, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.15002441, + "step": 6721, + "time_per_iteration": 4.177354574203491 + }, + { + "auxiliary_loss_clip": 0.01403163, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.28138912, + "balance_loss_mlp": 1.02036619, + "epoch": 0.4041485044340899, + "flos": 24431461194960.0, + "grad_norm": 2.164486583024818, + "language_loss": 0.7232812, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74765569, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.13916016, + "step": 6722, + "time_per_iteration": 4.328278303146362 + }, + { + "auxiliary_loss_clip": 0.01408012, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.28289115, + "balance_loss_mlp": 1.02264965, + "epoch": 0.40420862768675786, + "flos": 16764108042960.0, + "grad_norm": 2.9990140182102083, + "language_loss": 0.66243708, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68688703, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14331055, + "step": 6723, + "time_per_iteration": 2.871238946914673 + }, + { + "auxiliary_loss_clip": 0.0142011, + "auxiliary_loss_mlp": 0.01040966, + "balance_loss_clip": 1.28919673, + "balance_loss_mlp": 1.02506316, + "epoch": 0.4042687509394258, + "flos": 22496641894080.0, + "grad_norm": 1.4794122716176386, + "language_loss": 0.73631978, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76093054, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15917969, + "step": 6724, + "time_per_iteration": 2.758850336074829 + }, + { + "auxiliary_loss_clip": 0.01394659, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.27384043, + "balance_loss_mlp": 1.01798892, + "epoch": 0.4043288741920938, + "flos": 30342262057200.0, + "grad_norm": 1.7768755742998805, + "language_loss": 0.75311339, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.77738595, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.14599609, + "step": 6725, + "time_per_iteration": 2.8189945220947266 + }, + { + "auxiliary_loss_clip": 0.01407831, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.28232169, + "balance_loss_mlp": 1.01618791, + "epoch": 0.40438899744476176, + "flos": 46355861147760.0, + "grad_norm": 1.5944324727973016, + "language_loss": 0.77063465, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79502559, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.15081787, + "step": 6726, + "time_per_iteration": 2.9491913318634033 + }, + { + "auxiliary_loss_clip": 0.01411734, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.28328311, + "balance_loss_mlp": 1.01793218, + "epoch": 0.4044491206974297, + "flos": 13337384091120.0, + "grad_norm": 6.261930656914677, + "language_loss": 0.82031715, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.84476924, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15539551, + "step": 6727, + "time_per_iteration": 2.768775463104248 + }, + { + "auxiliary_loss_clip": 0.01409104, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.28119791, + "balance_loss_mlp": 1.01595902, + "epoch": 0.4045092439500977, + "flos": 12097538487360.0, + "grad_norm": 2.4415433909951454, + "language_loss": 0.85835004, + "learning_rate": 2.700462388688447e-06, + "loss": 0.88274562, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.1449585, + "step": 6728, + "time_per_iteration": 2.7802376747131348 + }, + { + "auxiliary_loss_clip": 0.01406014, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.28098631, + "balance_loss_mlp": 1.02090585, + "epoch": 0.40456936720276565, + "flos": 21184791022080.0, + "grad_norm": 5.080997573203699, + "language_loss": 0.82175195, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84616947, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.1484375, + "step": 6729, + "time_per_iteration": 2.85074782371521 + }, + { + "auxiliary_loss_clip": 0.01410364, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.28497636, + "balance_loss_mlp": 1.02301836, + "epoch": 0.4046294904554336, + "flos": 23920642721160.0, + "grad_norm": 2.314932056618655, + "language_loss": 0.73642254, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.76090336, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.14715576, + "step": 6730, + "time_per_iteration": 2.8992812633514404 + }, + { + "auxiliary_loss_clip": 0.01407499, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.28192616, + "balance_loss_mlp": 1.02194405, + "epoch": 0.4046896137081016, + "flos": 38078371808640.0, + "grad_norm": 1.6560150191041236, + "language_loss": 0.67982316, + "learning_rate": 2.699367885848985e-06, + "loss": 0.70426714, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14929199, + "step": 6731, + "time_per_iteration": 2.886521577835083 + }, + { + "auxiliary_loss_clip": 0.01403171, + "auxiliary_loss_mlp": 0.01031766, + "balance_loss_clip": 1.27736807, + "balance_loss_mlp": 1.01766956, + "epoch": 0.4047497369607696, + "flos": 23621779216080.0, + "grad_norm": 1.566836038322816, + "language_loss": 0.74067914, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76502848, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.14105225, + "step": 6732, + "time_per_iteration": 2.83073353767395 + }, + { + "auxiliary_loss_clip": 0.01398139, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.27398527, + "balance_loss_mlp": 1.01314139, + "epoch": 0.40480986021343757, + "flos": 12827499609600.0, + "grad_norm": 1.7682239704877687, + "language_loss": 0.77588093, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.80012435, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.13043213, + "step": 6733, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.01410605, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.28017473, + "balance_loss_mlp": 1.01815891, + "epoch": 0.40486998346610553, + "flos": 23774033249640.0, + "grad_norm": 1.968334538885391, + "language_loss": 0.77279788, + "learning_rate": 2.698273144328627e-06, + "loss": 0.79724252, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.15698242, + "step": 6734, + "time_per_iteration": 2.7798011302948 + }, + { + "auxiliary_loss_clip": 0.01408002, + "auxiliary_loss_mlp": 0.01031103, + "balance_loss_clip": 1.27734184, + "balance_loss_mlp": 1.01673865, + "epoch": 0.4049301067187735, + "flos": 22861987930440.0, + "grad_norm": 2.7815500600924494, + "language_loss": 0.65377182, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67816287, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.14367676, + "step": 6735, + "time_per_iteration": 2.761176347732544 + }, + { + "auxiliary_loss_clip": 0.01394568, + "auxiliary_loss_mlp": 0.01033912, + "balance_loss_clip": 1.26963985, + "balance_loss_mlp": 1.0207094, + "epoch": 0.40499022997144146, + "flos": 22789170495000.0, + "grad_norm": 1.7366257981932463, + "language_loss": 0.83361238, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85789716, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.13214111, + "step": 6736, + "time_per_iteration": 2.813514232635498 + }, + { + "auxiliary_loss_clip": 0.01409233, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.28077769, + "balance_loss_mlp": 1.0261693, + "epoch": 0.4050503532241094, + "flos": 23044559427720.0, + "grad_norm": 1.600430297866127, + "language_loss": 0.7535857, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77809048, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.1506958, + "step": 6737, + "time_per_iteration": 2.793062448501587 + }, + { + "auxiliary_loss_clip": 0.01396026, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.27152705, + "balance_loss_mlp": 1.02128184, + "epoch": 0.4051104764767774, + "flos": 16651267745760.0, + "grad_norm": 2.352148908605981, + "language_loss": 0.72688663, + "learning_rate": 2.696813118332519e-06, + "loss": 0.75120807, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14825439, + "step": 6738, + "time_per_iteration": 2.7326459884643555 + }, + { + "auxiliary_loss_clip": 0.01397924, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.27283967, + "balance_loss_mlp": 1.01721346, + "epoch": 0.40517059972944536, + "flos": 16362840589200.0, + "grad_norm": 1.7785876014579605, + "language_loss": 0.75083834, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77512574, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.13592529, + "step": 6739, + "time_per_iteration": 2.7960376739501953 + }, + { + "auxiliary_loss_clip": 0.01404455, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.27695274, + "balance_loss_mlp": 1.01947594, + "epoch": 0.4052307229821133, + "flos": 28809238118760.0, + "grad_norm": 2.01316028331772, + "language_loss": 0.74154294, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76593089, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.14868164, + "step": 6740, + "time_per_iteration": 2.8078079223632812 + }, + { + "auxiliary_loss_clip": 0.01398195, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.2759037, + "balance_loss_mlp": 1.01860344, + "epoch": 0.4052908462347813, + "flos": 21402918461520.0, + "grad_norm": 1.4690084356147268, + "language_loss": 0.77193475, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79624468, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14202881, + "step": 6741, + "time_per_iteration": 2.920156478881836 + }, + { + "auxiliary_loss_clip": 0.01397649, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.2712698, + "balance_loss_mlp": 1.01973307, + "epoch": 0.40535096948744925, + "flos": 22424189933880.0, + "grad_norm": 1.729927781170376, + "language_loss": 0.71651167, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.74084079, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.1552124, + "step": 6742, + "time_per_iteration": 2.822211265563965 + }, + { + "auxiliary_loss_clip": 0.01401993, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.27528906, + "balance_loss_mlp": 1.01569414, + "epoch": 0.4054110927401172, + "flos": 17014096063800.0, + "grad_norm": 2.8075982458781445, + "language_loss": 0.72441888, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74874973, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.1539917, + "step": 6743, + "time_per_iteration": 2.774061441421509 + }, + { + "auxiliary_loss_clip": 0.01408087, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.27809072, + "balance_loss_mlp": 1.01916647, + "epoch": 0.4054712159927852, + "flos": 21619665216720.0, + "grad_norm": 2.40641131674677, + "language_loss": 0.71267116, + "learning_rate": 2.694622286918588e-06, + "loss": 0.73709208, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.14831543, + "step": 6744, + "time_per_iteration": 2.8052847385406494 + }, + { + "auxiliary_loss_clip": 0.01399821, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.27425575, + "balance_loss_mlp": 1.02398252, + "epoch": 0.4055313392454532, + "flos": 25818159920400.0, + "grad_norm": 2.1356210023981204, + "language_loss": 0.79962194, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82400513, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.1451416, + "step": 6745, + "time_per_iteration": 2.825695276260376 + }, + { + "auxiliary_loss_clip": 0.01405017, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.27956522, + "balance_loss_mlp": 1.02396417, + "epoch": 0.40559146249812117, + "flos": 14141177857800.0, + "grad_norm": 1.8480472411140403, + "language_loss": 0.66796249, + "learning_rate": 2.693891798911731e-06, + "loss": 0.69240403, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15179443, + "step": 6746, + "time_per_iteration": 2.744793176651001 + }, + { + "auxiliary_loss_clip": 0.0140067, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.27504992, + "balance_loss_mlp": 1.01644444, + "epoch": 0.40565158575078913, + "flos": 41363115642000.0, + "grad_norm": 1.4555622209623005, + "language_loss": 0.57250988, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59682286, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.14190674, + "step": 6747, + "time_per_iteration": 2.90045166015625 + }, + { + "auxiliary_loss_clip": 0.01402363, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.27524257, + "balance_loss_mlp": 1.02615678, + "epoch": 0.4057117090034571, + "flos": 28549910175120.0, + "grad_norm": 4.643488265786255, + "language_loss": 0.84471941, + "learning_rate": 2.693161205655089e-06, + "loss": 0.8691498, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.1451416, + "step": 6748, + "time_per_iteration": 2.8269145488739014 + }, + { + "auxiliary_loss_clip": 0.01409835, + "auxiliary_loss_mlp": 0.01040461, + "balance_loss_clip": 1.28078389, + "balance_loss_mlp": 1.02460575, + "epoch": 0.40577183225612506, + "flos": 18008582999760.0, + "grad_norm": 2.115960352710205, + "language_loss": 0.8202492, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.84475213, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15856934, + "step": 6749, + "time_per_iteration": 2.7159712314605713 + }, + { + "auxiliary_loss_clip": 0.01403538, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.27807188, + "balance_loss_mlp": 1.02535319, + "epoch": 0.40583195550879303, + "flos": 19541485113120.0, + "grad_norm": 1.5680463510355809, + "language_loss": 0.74986076, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77429312, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14349365, + "step": 6750, + "time_per_iteration": 4.195159435272217 + }, + { + "auxiliary_loss_clip": 0.01414595, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_clip": 1.28158259, + "balance_loss_mlp": 1.02806473, + "epoch": 0.405892078761461, + "flos": 22314557697120.0, + "grad_norm": 2.048057362934114, + "language_loss": 0.73685193, + "learning_rate": 2.692065118669195e-06, + "loss": 0.76143092, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15240479, + "step": 6751, + "time_per_iteration": 2.7785584926605225 + }, + { + "auxiliary_loss_clip": 0.01412082, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.28297901, + "balance_loss_mlp": 1.02037334, + "epoch": 0.40595220201412896, + "flos": 25489994160600.0, + "grad_norm": 1.5723071411058318, + "language_loss": 0.67051136, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.69499171, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15576172, + "step": 6752, + "time_per_iteration": 2.787395715713501 + }, + { + "auxiliary_loss_clip": 0.01410972, + "auxiliary_loss_mlp": 0.01040171, + "balance_loss_clip": 1.2803874, + "balance_loss_mlp": 1.0234102, + "epoch": 0.4060123252667969, + "flos": 49864661241120.0, + "grad_norm": 1.6100052654448442, + "language_loss": 0.71090627, + "learning_rate": 2.691334262772948e-06, + "loss": 0.73541772, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.16760254, + "step": 6753, + "time_per_iteration": 3.0135209560394287 + }, + { + "auxiliary_loss_clip": 0.01410652, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.28078818, + "balance_loss_mlp": 1.02096641, + "epoch": 0.4060724485194649, + "flos": 21139570290240.0, + "grad_norm": 1.945977157151112, + "language_loss": 0.71857727, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74305582, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.16235352, + "step": 6754, + "time_per_iteration": 2.7445528507232666 + }, + { + "auxiliary_loss_clip": 0.01410388, + "auxiliary_loss_mlp": 0.01042491, + "balance_loss_clip": 1.2810576, + "balance_loss_mlp": 1.02655852, + "epoch": 0.40613257177213286, + "flos": 21762538719120.0, + "grad_norm": 3.2196866830598365, + "language_loss": 0.83084691, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85537571, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.15924072, + "step": 6755, + "time_per_iteration": 4.271881103515625 + }, + { + "auxiliary_loss_clip": 0.01421015, + "auxiliary_loss_mlp": 0.01035876, + "balance_loss_clip": 1.28850126, + "balance_loss_mlp": 1.02022409, + "epoch": 0.4061926950248008, + "flos": 25560375094440.0, + "grad_norm": 1.6081464081110477, + "language_loss": 0.71576858, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.74033749, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15661621, + "step": 6756, + "time_per_iteration": 2.802187442779541 + }, + { + "auxiliary_loss_clip": 0.01417753, + "auxiliary_loss_mlp": 0.01043752, + "balance_loss_clip": 1.28575778, + "balance_loss_mlp": 1.02844524, + "epoch": 0.4062528182774688, + "flos": 23701053380760.0, + "grad_norm": 1.7532677998463908, + "language_loss": 0.79103369, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81564867, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15307617, + "step": 6757, + "time_per_iteration": 2.787214756011963 + }, + { + "auxiliary_loss_clip": 0.0141487, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.28553236, + "balance_loss_mlp": 1.0173589, + "epoch": 0.4063129415301368, + "flos": 21731084221320.0, + "grad_norm": 1.9501530677967998, + "language_loss": 0.7891261, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.81359673, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.14825439, + "step": 6758, + "time_per_iteration": 2.833596706390381 + }, + { + "auxiliary_loss_clip": 0.01407729, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.28171301, + "balance_loss_mlp": 1.01542449, + "epoch": 0.40637306478280477, + "flos": 12791537583840.0, + "grad_norm": 2.256755693933125, + "language_loss": 0.89681703, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.92120081, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15234375, + "step": 6759, + "time_per_iteration": 2.7832932472229004 + }, + { + "auxiliary_loss_clip": 0.0141498, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.28479171, + "balance_loss_mlp": 1.02074265, + "epoch": 0.40643318803547274, + "flos": 24029828265960.0, + "grad_norm": 2.0618679708888332, + "language_loss": 0.64704841, + "learning_rate": 2.688775442076598e-06, + "loss": 0.67155212, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.14630127, + "step": 6760, + "time_per_iteration": 4.265821218490601 + }, + { + "auxiliary_loss_clip": 0.01424076, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.29338932, + "balance_loss_mlp": 1.02068996, + "epoch": 0.4064933112881407, + "flos": 25597839629520.0, + "grad_norm": 1.4512198650124097, + "language_loss": 0.75182426, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77642941, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15753174, + "step": 6761, + "time_per_iteration": 4.412666320800781 + }, + { + "auxiliary_loss_clip": 0.01404178, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.28062403, + "balance_loss_mlp": 1.02219558, + "epoch": 0.40655343454080867, + "flos": 22059412414560.0, + "grad_norm": 1.4189704172031579, + "language_loss": 0.70210761, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72651428, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.14294434, + "step": 6762, + "time_per_iteration": 2.830430030822754 + }, + { + "auxiliary_loss_clip": 0.01419903, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.29132259, + "balance_loss_mlp": 1.02459812, + "epoch": 0.40661355779347663, + "flos": 26474532048360.0, + "grad_norm": 1.8630506750969487, + "language_loss": 0.73257893, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75717163, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.14752197, + "step": 6763, + "time_per_iteration": 2.8309555053710938 + }, + { + "auxiliary_loss_clip": 0.01418938, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.2879467, + "balance_loss_mlp": 1.02041459, + "epoch": 0.4066736810461446, + "flos": 13265053956000.0, + "grad_norm": 3.0735815041976364, + "language_loss": 0.69687963, + "learning_rate": 2.687312683911033e-06, + "loss": 0.72143376, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.16064453, + "step": 6764, + "time_per_iteration": 2.8099167346954346 + }, + { + "auxiliary_loss_clip": 0.01435019, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_clip": 1.30252922, + "balance_loss_mlp": 1.0323925, + "epoch": 0.40673380429881256, + "flos": 28809522377280.0, + "grad_norm": 2.038443595198526, + "language_loss": 0.91461045, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93945563, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.17102051, + "step": 6765, + "time_per_iteration": 2.8680145740509033 + }, + { + "auxiliary_loss_clip": 0.01436996, + "auxiliary_loss_mlp": 0.0104353, + "balance_loss_clip": 1.3033514, + "balance_loss_mlp": 1.02732933, + "epoch": 0.4067939275514805, + "flos": 12499212024720.0, + "grad_norm": 2.320687712089166, + "language_loss": 0.78227228, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80707753, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16192627, + "step": 6766, + "time_per_iteration": 2.7708802223205566 + }, + { + "auxiliary_loss_clip": 0.01424009, + "auxiliary_loss_mlp": 0.01045038, + "balance_loss_clip": 1.29150701, + "balance_loss_mlp": 1.02873015, + "epoch": 0.4068540508041485, + "flos": 18775155881520.0, + "grad_norm": 1.9684488033446828, + "language_loss": 0.77398086, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.79867131, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.16308594, + "step": 6767, + "time_per_iteration": 2.7945046424865723 + }, + { + "auxiliary_loss_clip": 0.01423642, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.29400921, + "balance_loss_mlp": 1.02310443, + "epoch": 0.40691417405681646, + "flos": 28518821152560.0, + "grad_norm": 1.8816249091546493, + "language_loss": 0.77850223, + "learning_rate": 2.685849508738034e-06, + "loss": 0.80311888, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.14916992, + "step": 6768, + "time_per_iteration": 2.926029682159424 + }, + { + "auxiliary_loss_clip": 0.01419892, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.29105341, + "balance_loss_mlp": 1.02499723, + "epoch": 0.4069742973094844, + "flos": 20818998293760.0, + "grad_norm": 1.9593178001089875, + "language_loss": 0.87560117, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.90019906, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14904785, + "step": 6769, + "time_per_iteration": 2.771343946456909 + }, + { + "auxiliary_loss_clip": 0.0141617, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.29116535, + "balance_loss_mlp": 1.02413225, + "epoch": 0.4070344205621524, + "flos": 21475248596640.0, + "grad_norm": 2.1551570009622223, + "language_loss": 0.80767989, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83222908, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14624023, + "step": 6770, + "time_per_iteration": 2.7480523586273193 + }, + { + "auxiliary_loss_clip": 0.01427963, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.29448462, + "balance_loss_mlp": 1.02189982, + "epoch": 0.4070945438148204, + "flos": 26835248731680.0, + "grad_norm": 1.951195667646934, + "language_loss": 0.8024596, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.8271156, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.15734863, + "step": 6771, + "time_per_iteration": 2.8657121658325195 + }, + { + "auxiliary_loss_clip": 0.01420688, + "auxiliary_loss_mlp": 0.01040703, + "balance_loss_clip": 1.29139733, + "balance_loss_mlp": 1.0245502, + "epoch": 0.4071546670674884, + "flos": 26359092816120.0, + "grad_norm": 1.6275367400957208, + "language_loss": 0.76462275, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78923666, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16137695, + "step": 6772, + "time_per_iteration": 2.823605537414551 + }, + { + "auxiliary_loss_clip": 0.01420133, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.28959072, + "balance_loss_mlp": 1.02441335, + "epoch": 0.40721479032015634, + "flos": 17900006580360.0, + "grad_norm": 1.610724958587066, + "language_loss": 0.81428903, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83889723, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.1628418, + "step": 6773, + "time_per_iteration": 2.736400604248047 + }, + { + "auxiliary_loss_clip": 0.012437, + "auxiliary_loss_mlp": 0.01003818, + "balance_loss_clip": 1.19168711, + "balance_loss_mlp": 1.00119495, + "epoch": 0.4072749135728243, + "flos": 49867805066040.0, + "grad_norm": 0.839472708661569, + "language_loss": 0.64459509, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66707027, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.02624512, + "step": 6774, + "time_per_iteration": 3.1756017208099365 + }, + { + "auxiliary_loss_clip": 0.01422859, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.289536, + "balance_loss_mlp": 1.0183388, + "epoch": 0.40733503682549227, + "flos": 27569433123360.0, + "grad_norm": 1.6182707188977887, + "language_loss": 0.72818387, + "learning_rate": 2.683287951431446e-06, + "loss": 0.75275171, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.15588379, + "step": 6775, + "time_per_iteration": 2.8041534423828125 + }, + { + "auxiliary_loss_clip": 0.01419722, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.28959715, + "balance_loss_mlp": 1.03085089, + "epoch": 0.40739516007816023, + "flos": 22132026808200.0, + "grad_norm": 1.396678142502359, + "language_loss": 0.78036833, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80503213, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.15789795, + "step": 6776, + "time_per_iteration": 2.7991182804107666 + }, + { + "auxiliary_loss_clip": 0.01426582, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.29243696, + "balance_loss_mlp": 1.02713263, + "epoch": 0.4074552833308282, + "flos": 23847581635560.0, + "grad_norm": 2.1689204862323876, + "language_loss": 0.79437089, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81907177, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.16381836, + "step": 6777, + "time_per_iteration": 2.8021974563598633 + }, + { + "auxiliary_loss_clip": 0.01244768, + "auxiliary_loss_mlp": 0.01001358, + "balance_loss_clip": 1.19221532, + "balance_loss_mlp": 0.99853325, + "epoch": 0.40751540658349616, + "flos": 58015496794680.0, + "grad_norm": 0.7363753851804353, + "language_loss": 0.53226215, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55472344, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02819824, + "step": 6778, + "time_per_iteration": 3.2634048461914062 + }, + { + "auxiliary_loss_clip": 0.01417588, + "auxiliary_loss_mlp": 0.01043589, + "balance_loss_clip": 1.28831124, + "balance_loss_mlp": 1.02749562, + "epoch": 0.40757552983616413, + "flos": 21219575405400.0, + "grad_norm": 2.0515190741574245, + "language_loss": 0.82797134, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.85258305, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16101074, + "step": 6779, + "time_per_iteration": 2.815025806427002 + }, + { + "auxiliary_loss_clip": 0.01419329, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.29007757, + "balance_loss_mlp": 1.01938283, + "epoch": 0.4076356530888321, + "flos": 26839187742600.0, + "grad_norm": 1.5224882587718958, + "language_loss": 0.76430261, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78884685, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.15722656, + "step": 6780, + "time_per_iteration": 2.8036186695098877 + }, + { + "auxiliary_loss_clip": 0.01404784, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.28061008, + "balance_loss_mlp": 1.02080131, + "epoch": 0.40769577634150006, + "flos": 12206805248880.0, + "grad_norm": 6.575025433605859, + "language_loss": 0.66480768, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68920386, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14038086, + "step": 6781, + "time_per_iteration": 2.765477180480957 + }, + { + "auxiliary_loss_clip": 0.01409152, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.27984691, + "balance_loss_mlp": 1.02195048, + "epoch": 0.407755899594168, + "flos": 33661668448800.0, + "grad_norm": 1.6736603540314985, + "language_loss": 0.71540201, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73987627, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16326904, + "step": 6782, + "time_per_iteration": 2.918189287185669 + }, + { + "auxiliary_loss_clip": 0.01414488, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.28448486, + "balance_loss_mlp": 1.01944733, + "epoch": 0.407816022846836, + "flos": 20162220082200.0, + "grad_norm": 2.2523050493722763, + "language_loss": 0.81907523, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84356391, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.1494751, + "step": 6783, + "time_per_iteration": 2.802743673324585 + }, + { + "auxiliary_loss_clip": 0.01412959, + "auxiliary_loss_mlp": 0.01039852, + "balance_loss_clip": 1.28489113, + "balance_loss_mlp": 1.02392542, + "epoch": 0.40787614609950396, + "flos": 21183816421440.0, + "grad_norm": 1.4524544376134492, + "language_loss": 0.81004953, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83457768, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.15905762, + "step": 6784, + "time_per_iteration": 2.8210043907165527 + }, + { + "auxiliary_loss_clip": 0.01425662, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.29061019, + "balance_loss_mlp": 1.02389312, + "epoch": 0.407936269352172, + "flos": 20525373267120.0, + "grad_norm": 1.6394021633338802, + "language_loss": 0.66066241, + "learning_rate": 2.679626382651386e-06, + "loss": 0.68533051, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.17248535, + "step": 6785, + "time_per_iteration": 2.802138328552246 + }, + { + "auxiliary_loss_clip": 0.01409074, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.2801168, + "balance_loss_mlp": 1.02263546, + "epoch": 0.40799639260483994, + "flos": 20123415471240.0, + "grad_norm": 1.9582466895256505, + "language_loss": 0.80068517, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82516158, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.15942383, + "step": 6786, + "time_per_iteration": 2.7961983680725098 + }, + { + "auxiliary_loss_clip": 0.01415865, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.28645873, + "balance_loss_mlp": 1.02899122, + "epoch": 0.4080565158575079, + "flos": 21002422566600.0, + "grad_norm": 2.0940879199749682, + "language_loss": 0.81756318, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84215981, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.14807129, + "step": 6787, + "time_per_iteration": 2.7900967597961426 + }, + { + "auxiliary_loss_clip": 0.01409472, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.28130937, + "balance_loss_mlp": 1.01808667, + "epoch": 0.40811663911017587, + "flos": 19322423681400.0, + "grad_norm": 1.8118959821322795, + "language_loss": 0.6800136, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70444018, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.15100098, + "step": 6788, + "time_per_iteration": 2.7992947101593018 + }, + { + "auxiliary_loss_clip": 0.01409915, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_clip": 1.28087759, + "balance_loss_mlp": 1.02560592, + "epoch": 0.40817676236284384, + "flos": 40632870261240.0, + "grad_norm": 1.8002031438861616, + "language_loss": 0.66857815, + "learning_rate": 2.678161032759701e-06, + "loss": 0.69309437, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16107178, + "step": 6789, + "time_per_iteration": 4.364344835281372 + }, + { + "auxiliary_loss_clip": 0.01407529, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.2786355, + "balance_loss_mlp": 1.02098465, + "epoch": 0.4082368856155118, + "flos": 20526997601520.0, + "grad_norm": 1.8640635327726633, + "language_loss": 0.60868645, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.63313091, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15930176, + "step": 6790, + "time_per_iteration": 2.7470929622650146 + }, + { + "auxiliary_loss_clip": 0.01413481, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.28639483, + "balance_loss_mlp": 1.02392924, + "epoch": 0.40829700886817977, + "flos": 11430242710560.0, + "grad_norm": 3.447561863459045, + "language_loss": 0.69537139, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71990478, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15930176, + "step": 6791, + "time_per_iteration": 2.7217514514923096 + }, + { + "auxiliary_loss_clip": 0.01240108, + "auxiliary_loss_mlp": 0.01009051, + "balance_loss_clip": 1.18721557, + "balance_loss_mlp": 1.00573659, + "epoch": 0.40835713212084773, + "flos": 67346118187560.0, + "grad_norm": 0.7487191629117992, + "language_loss": 0.59691215, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61940384, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.03320312, + "step": 6792, + "time_per_iteration": 3.248692274093628 + }, + { + "auxiliary_loss_clip": 0.01418588, + "auxiliary_loss_mlp": 0.01042598, + "balance_loss_clip": 1.28811908, + "balance_loss_mlp": 1.02589667, + "epoch": 0.4084172553735157, + "flos": 21767005638720.0, + "grad_norm": 1.7218623860713895, + "language_loss": 0.80219281, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82680464, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.16711426, + "step": 6793, + "time_per_iteration": 4.223266363143921 + }, + { + "auxiliary_loss_clip": 0.01416537, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.28713751, + "balance_loss_mlp": 1.01831698, + "epoch": 0.40847737862618366, + "flos": 27422986085280.0, + "grad_norm": 1.8234109244622905, + "language_loss": 0.84897125, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87349045, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.17077637, + "step": 6794, + "time_per_iteration": 2.853304624557495 + }, + { + "auxiliary_loss_clip": 0.01413892, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.28630471, + "balance_loss_mlp": 1.02665544, + "epoch": 0.4085375018788516, + "flos": 18591772217040.0, + "grad_norm": 1.9674866244319733, + "language_loss": 0.80023324, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.8248018, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.16308594, + "step": 6795, + "time_per_iteration": 2.7607152462005615 + }, + { + "auxiliary_loss_clip": 0.01425086, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.29056191, + "balance_loss_mlp": 1.01845622, + "epoch": 0.4085976251315196, + "flos": 15415726628160.0, + "grad_norm": 7.626862138483358, + "language_loss": 0.71258026, + "learning_rate": 2.675595680920792e-06, + "loss": 0.73718548, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.1697998, + "step": 6796, + "time_per_iteration": 2.74800968170166 + }, + { + "auxiliary_loss_clip": 0.01410164, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.28158784, + "balance_loss_mlp": 1.01894367, + "epoch": 0.40865774838418756, + "flos": 21257283590640.0, + "grad_norm": 6.74242089946688, + "language_loss": 0.78620589, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.8106451, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.14794922, + "step": 6797, + "time_per_iteration": 2.806528329849243 + }, + { + "auxiliary_loss_clip": 0.01410971, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_clip": 1.28058958, + "balance_loss_mlp": 1.03136086, + "epoch": 0.4087178716368556, + "flos": 13776278513400.0, + "grad_norm": 2.0631279839734296, + "language_loss": 0.86063433, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.88521147, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.15393066, + "step": 6798, + "time_per_iteration": 2.741760015487671 + }, + { + "auxiliary_loss_clip": 0.01408328, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.28250134, + "balance_loss_mlp": 1.0183115, + "epoch": 0.40877799488952354, + "flos": 23626855261080.0, + "grad_norm": 1.465604379133191, + "language_loss": 0.84151858, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86593008, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14501953, + "step": 6799, + "time_per_iteration": 5.805353164672852 + }, + { + "auxiliary_loss_clip": 0.01417203, + "auxiliary_loss_mlp": 0.01039931, + "balance_loss_clip": 1.28988254, + "balance_loss_mlp": 1.0234921, + "epoch": 0.4088381181421915, + "flos": 20923148401920.0, + "grad_norm": 1.9152181022775272, + "language_loss": 0.8363356, + "learning_rate": 2.6741292016681e-06, + "loss": 0.86090696, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.16424561, + "step": 6800, + "time_per_iteration": 2.8023297786712646 + }, + { + "auxiliary_loss_clip": 0.01417151, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.28685069, + "balance_loss_mlp": 1.01774466, + "epoch": 0.4088982413948595, + "flos": 13301503282080.0, + "grad_norm": 2.137634955832881, + "language_loss": 0.74536014, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76986694, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15783691, + "step": 6801, + "time_per_iteration": 2.9024531841278076 + }, + { + "auxiliary_loss_clip": 0.01416887, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.2869401, + "balance_loss_mlp": 1.01797271, + "epoch": 0.40895836464752744, + "flos": 15271837916760.0, + "grad_norm": 2.0340620456683127, + "language_loss": 0.80378962, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82829332, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.1550293, + "step": 6802, + "time_per_iteration": 2.7647266387939453 + }, + { + "auxiliary_loss_clip": 0.01427075, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.295174, + "balance_loss_mlp": 1.02325392, + "epoch": 0.4090184879001954, + "flos": 14505468076800.0, + "grad_norm": 2.639926515849646, + "language_loss": 0.75995088, + "learning_rate": 2.673029073767934e-06, + "loss": 0.78462803, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.17382812, + "step": 6803, + "time_per_iteration": 2.7327473163604736 + }, + { + "auxiliary_loss_clip": 0.01413367, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.28503871, + "balance_loss_mlp": 1.01858401, + "epoch": 0.40907861115286337, + "flos": 13885626491640.0, + "grad_norm": 2.2724091094724903, + "language_loss": 0.79001617, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.81449521, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.1595459, + "step": 6804, + "time_per_iteration": 2.752889633178711 + }, + { + "auxiliary_loss_clip": 0.01419046, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.28512073, + "balance_loss_mlp": 1.02527106, + "epoch": 0.40913873440553133, + "flos": 28043436795840.0, + "grad_norm": 1.9649943231717073, + "language_loss": 0.75831497, + "learning_rate": 2.672295527537998e-06, + "loss": 0.78291202, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.15380859, + "step": 6805, + "time_per_iteration": 2.805335283279419 + }, + { + "auxiliary_loss_clip": 0.01425417, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.29329956, + "balance_loss_mlp": 1.02746236, + "epoch": 0.4091988576581993, + "flos": 21623441794200.0, + "grad_norm": 1.6605557698556332, + "language_loss": 0.79907548, + "learning_rate": 2.671928716175804e-06, + "loss": 0.82375568, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.15148926, + "step": 6806, + "time_per_iteration": 2.813037872314453 + }, + { + "auxiliary_loss_clip": 0.01422872, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.29116189, + "balance_loss_mlp": 1.01784277, + "epoch": 0.40925898091086726, + "flos": 25229001274200.0, + "grad_norm": 2.3753890713834025, + "language_loss": 0.72458971, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74915409, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15722656, + "step": 6807, + "time_per_iteration": 2.8222341537475586 + }, + { + "auxiliary_loss_clip": 0.0125443, + "auxiliary_loss_mlp": 0.01006311, + "balance_loss_clip": 1.19908285, + "balance_loss_mlp": 1.0032953, + "epoch": 0.40931910416353523, + "flos": 68945502832200.0, + "grad_norm": 0.828355690993386, + "language_loss": 0.58835691, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.6109643, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.03015137, + "step": 6808, + "time_per_iteration": 3.3618321418762207 + }, + { + "auxiliary_loss_clip": 0.01421641, + "auxiliary_loss_mlp": 0.01039602, + "balance_loss_clip": 1.29359901, + "balance_loss_mlp": 1.02491522, + "epoch": 0.4093792274162032, + "flos": 20193877621800.0, + "grad_norm": 1.509046660468007, + "language_loss": 0.54821104, + "learning_rate": 2.670828129267242e-06, + "loss": 0.57282352, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14697266, + "step": 6809, + "time_per_iteration": 2.774578809738159 + }, + { + "auxiliary_loss_clip": 0.01419651, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.29104054, + "balance_loss_mlp": 1.01562011, + "epoch": 0.40943935066887116, + "flos": 25234280361000.0, + "grad_norm": 1.930506670881405, + "language_loss": 0.83399397, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85849565, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.14910889, + "step": 6810, + "time_per_iteration": 2.801866054534912 + }, + { + "auxiliary_loss_clip": 0.01425271, + "auxiliary_loss_mlp": 0.01039969, + "balance_loss_clip": 1.29366422, + "balance_loss_mlp": 1.02306461, + "epoch": 0.4094994739215392, + "flos": 23260169148840.0, + "grad_norm": 2.218281199506604, + "language_loss": 0.77938437, + "learning_rate": 2.670094277448999e-06, + "loss": 0.80403674, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.16906738, + "step": 6811, + "time_per_iteration": 2.8243215084075928 + }, + { + "auxiliary_loss_clip": 0.01421667, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.29097462, + "balance_loss_mlp": 1.01708484, + "epoch": 0.40955959717420715, + "flos": 17386386129720.0, + "grad_norm": 4.049877180515986, + "language_loss": 0.70277548, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72733718, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.17401123, + "step": 6812, + "time_per_iteration": 2.897341012954712 + }, + { + "auxiliary_loss_clip": 0.01425832, + "auxiliary_loss_mlp": 0.01038635, + "balance_loss_clip": 1.29539323, + "balance_loss_mlp": 1.02220809, + "epoch": 0.4096197204268751, + "flos": 25087873931280.0, + "grad_norm": 1.5510237225544108, + "language_loss": 0.66658103, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.69122571, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.16418457, + "step": 6813, + "time_per_iteration": 2.8750193119049072 + }, + { + "auxiliary_loss_clip": 0.01418562, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.28983712, + "balance_loss_mlp": 1.01743448, + "epoch": 0.4096798436795431, + "flos": 30592290686400.0, + "grad_norm": 1.9248419552820974, + "language_loss": 0.7414397, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.765953, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15319824, + "step": 6814, + "time_per_iteration": 2.870647668838501 + }, + { + "auxiliary_loss_clip": 0.01430629, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.29663825, + "balance_loss_mlp": 1.0216347, + "epoch": 0.40973996693221104, + "flos": 24138932594040.0, + "grad_norm": 2.0951020201990813, + "language_loss": 0.66516644, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68985361, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.16455078, + "step": 6815, + "time_per_iteration": 2.7906711101531982 + }, + { + "auxiliary_loss_clip": 0.01422045, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.29662919, + "balance_loss_mlp": 1.02786076, + "epoch": 0.409800090184879, + "flos": 23994434757240.0, + "grad_norm": 1.6418688381671307, + "language_loss": 0.76940227, + "learning_rate": 2.668259203471188e-06, + "loss": 0.79405564, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.15441895, + "step": 6816, + "time_per_iteration": 2.8071022033691406 + }, + { + "auxiliary_loss_clip": 0.01422562, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.29266632, + "balance_loss_mlp": 1.02260804, + "epoch": 0.40986021343754697, + "flos": 16148002426920.0, + "grad_norm": 2.070328758410857, + "language_loss": 0.81444472, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.83905339, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.15686035, + "step": 6817, + "time_per_iteration": 2.7159438133239746 + }, + { + "auxiliary_loss_clip": 0.01439681, + "auxiliary_loss_mlp": 0.01041743, + "balance_loss_clip": 1.30354202, + "balance_loss_mlp": 1.0235393, + "epoch": 0.40992033669021494, + "flos": 24796279322640.0, + "grad_norm": 7.541612805564069, + "language_loss": 0.80497587, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82979012, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.18212891, + "step": 6818, + "time_per_iteration": 2.828432083129883 + }, + { + "auxiliary_loss_clip": 0.01424503, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.29706144, + "balance_loss_mlp": 1.02334034, + "epoch": 0.4099804599428829, + "flos": 29647775660400.0, + "grad_norm": 1.920524693847151, + "language_loss": 0.66526824, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68989551, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.14886475, + "step": 6819, + "time_per_iteration": 2.838916301727295 + }, + { + "auxiliary_loss_clip": 0.0144215, + "auxiliary_loss_mlp": 0.01044138, + "balance_loss_clip": 1.30478311, + "balance_loss_mlp": 1.02674508, + "epoch": 0.41004058319555087, + "flos": 24831957089880.0, + "grad_norm": 1.567913674483539, + "language_loss": 0.85346079, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87832367, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.17382812, + "step": 6820, + "time_per_iteration": 2.873318910598755 + }, + { + "auxiliary_loss_clip": 0.01426461, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.29934347, + "balance_loss_mlp": 1.01657653, + "epoch": 0.41010070644821883, + "flos": 25742784158280.0, + "grad_norm": 1.8881050715853764, + "language_loss": 0.7169168, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.74149221, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.14526367, + "step": 6821, + "time_per_iteration": 2.8209991455078125 + }, + { + "auxiliary_loss_clip": 0.0142728, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.2965337, + "balance_loss_mlp": 1.01916432, + "epoch": 0.4101608297008868, + "flos": 22351169456640.0, + "grad_norm": 2.1278450969893195, + "language_loss": 0.74783599, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.77245331, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15264893, + "step": 6822, + "time_per_iteration": 2.7930092811584473 + }, + { + "auxiliary_loss_clip": 0.01427847, + "auxiliary_loss_mlp": 0.01035783, + "balance_loss_clip": 1.29672921, + "balance_loss_mlp": 1.02024984, + "epoch": 0.41022095295355476, + "flos": 21950186261400.0, + "grad_norm": 1.9186515943484053, + "language_loss": 0.76071036, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.78534663, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15515137, + "step": 6823, + "time_per_iteration": 2.858322858810425 + }, + { + "auxiliary_loss_clip": 0.01441502, + "auxiliary_loss_mlp": 0.01040091, + "balance_loss_clip": 1.30549741, + "balance_loss_mlp": 1.02260256, + "epoch": 0.4102810762062228, + "flos": 27455821267320.0, + "grad_norm": 1.6155251940354982, + "language_loss": 0.73719621, + "learning_rate": 2.665321768127001e-06, + "loss": 0.76201218, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.17504883, + "step": 6824, + "time_per_iteration": 2.8233771324157715 + }, + { + "auxiliary_loss_clip": 0.01435486, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.29956114, + "balance_loss_mlp": 1.01870871, + "epoch": 0.41034119945889075, + "flos": 24504725322360.0, + "grad_norm": 1.8544430027238057, + "language_loss": 0.72596252, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.75067133, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.16687012, + "step": 6825, + "time_per_iteration": 2.8534107208251953 + }, + { + "auxiliary_loss_clip": 0.01428395, + "auxiliary_loss_mlp": 0.0104018, + "balance_loss_clip": 1.29805088, + "balance_loss_mlp": 1.02513003, + "epoch": 0.4104013227115587, + "flos": 24357994025760.0, + "grad_norm": 2.2583932190264058, + "language_loss": 0.84778953, + "learning_rate": 2.664587156721768e-06, + "loss": 0.87247527, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15057373, + "step": 6826, + "time_per_iteration": 2.800581455230713 + }, + { + "auxiliary_loss_clip": 0.01421577, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.29557312, + "balance_loss_mlp": 1.01949215, + "epoch": 0.4104614459642267, + "flos": 23734213429680.0, + "grad_norm": 1.9057912039843665, + "language_loss": 0.66141927, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68599832, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.16821289, + "step": 6827, + "time_per_iteration": 4.252679347991943 + }, + { + "auxiliary_loss_clip": 0.01417152, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.28986597, + "balance_loss_mlp": 1.01775074, + "epoch": 0.41052156921689464, + "flos": 22133163842280.0, + "grad_norm": 1.4321145510446411, + "language_loss": 0.72084951, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74535048, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15197754, + "step": 6828, + "time_per_iteration": 2.8997397422790527 + }, + { + "auxiliary_loss_clip": 0.0143258, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.29733825, + "balance_loss_mlp": 1.02191424, + "epoch": 0.4105816924695626, + "flos": 20089240213320.0, + "grad_norm": 1.7677590608583937, + "language_loss": 0.83978581, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.86449897, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16821289, + "step": 6829, + "time_per_iteration": 2.751330614089966 + }, + { + "auxiliary_loss_clip": 0.01410408, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.28115249, + "balance_loss_mlp": 1.0163455, + "epoch": 0.4106418157222306, + "flos": 18081278610120.0, + "grad_norm": 1.4227779516819485, + "language_loss": 0.90160847, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92603242, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.15637207, + "step": 6830, + "time_per_iteration": 2.754145622253418 + }, + { + "auxiliary_loss_clip": 0.01421091, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.29129958, + "balance_loss_mlp": 1.01694298, + "epoch": 0.41070193897489854, + "flos": 21652216140240.0, + "grad_norm": 1.8528226720344503, + "language_loss": 0.65562207, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68015689, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15454102, + "step": 6831, + "time_per_iteration": 4.185600757598877 + }, + { + "auxiliary_loss_clip": 0.01421154, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.29161751, + "balance_loss_mlp": 1.02051282, + "epoch": 0.4107620622275665, + "flos": 26653448793240.0, + "grad_norm": 1.901512648194879, + "language_loss": 0.69726962, + "learning_rate": 2.662382718122776e-06, + "loss": 0.72184467, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.1583252, + "step": 6832, + "time_per_iteration": 2.7978053092956543 + }, + { + "auxiliary_loss_clip": 0.01412904, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.28389525, + "balance_loss_mlp": 1.02037656, + "epoch": 0.41082218548023447, + "flos": 18738990813960.0, + "grad_norm": 2.1533380232839714, + "language_loss": 0.73749423, + "learning_rate": 2.662015223696666e-06, + "loss": 0.7619797, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15264893, + "step": 6833, + "time_per_iteration": 2.7722792625427246 + }, + { + "auxiliary_loss_clip": 0.01424914, + "auxiliary_loss_mlp": 0.01042649, + "balance_loss_clip": 1.2918787, + "balance_loss_mlp": 1.02564955, + "epoch": 0.41088230873290243, + "flos": 22899005773560.0, + "grad_norm": 1.8461908067262562, + "language_loss": 0.7280215, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.75269711, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.17004395, + "step": 6834, + "time_per_iteration": 2.763371229171753 + }, + { + "auxiliary_loss_clip": 0.01421302, + "auxiliary_loss_mlp": 0.0104423, + "balance_loss_clip": 1.28739989, + "balance_loss_mlp": 1.02760029, + "epoch": 0.4109424319855704, + "flos": 24281968529880.0, + "grad_norm": 2.283160496361263, + "language_loss": 0.71170259, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73635793, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.16625977, + "step": 6835, + "time_per_iteration": 2.784885883331299 + }, + { + "auxiliary_loss_clip": 0.01417859, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.28608179, + "balance_loss_mlp": 1.02395904, + "epoch": 0.41100255523823837, + "flos": 12973012655400.0, + "grad_norm": 1.972426095466552, + "language_loss": 0.87293494, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89752328, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.17028809, + "step": 6836, + "time_per_iteration": 4.2465925216674805 + }, + { + "auxiliary_loss_clip": 0.01405662, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.27943707, + "balance_loss_mlp": 1.02404034, + "epoch": 0.4110626784909064, + "flos": 23150455695360.0, + "grad_norm": 2.028613534055066, + "language_loss": 0.69292533, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71737814, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.15576172, + "step": 6837, + "time_per_iteration": 2.8356099128723145 + }, + { + "auxiliary_loss_clip": 0.01415969, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.2853353, + "balance_loss_mlp": 1.02547681, + "epoch": 0.41112280174357435, + "flos": 22752558735480.0, + "grad_norm": 3.8887371354232747, + "language_loss": 0.75196809, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77654123, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15869141, + "step": 6838, + "time_per_iteration": 4.333611011505127 + }, + { + "auxiliary_loss_clip": 0.01415673, + "auxiliary_loss_mlp": 0.01044552, + "balance_loss_clip": 1.286062, + "balance_loss_mlp": 1.02798152, + "epoch": 0.4111829249962423, + "flos": 21106938150000.0, + "grad_norm": 2.378854610470042, + "language_loss": 0.82511133, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84971356, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.16564941, + "step": 6839, + "time_per_iteration": 2.824469566345215 + }, + { + "auxiliary_loss_clip": 0.0141245, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.28306413, + "balance_loss_mlp": 1.01982927, + "epoch": 0.4112430482489103, + "flos": 21510642105360.0, + "grad_norm": 2.800067371103799, + "language_loss": 0.80386943, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82834172, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.14941406, + "step": 6840, + "time_per_iteration": 2.7810401916503906 + }, + { + "auxiliary_loss_clip": 0.01406362, + "auxiliary_loss_mlp": 0.01039797, + "balance_loss_clip": 1.27860141, + "balance_loss_mlp": 1.02413321, + "epoch": 0.41130317150157825, + "flos": 19574360903520.0, + "grad_norm": 1.8621807251190752, + "language_loss": 0.6782459, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.70270753, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15673828, + "step": 6841, + "time_per_iteration": 2.782134532928467 + }, + { + "auxiliary_loss_clip": 0.01247314, + "auxiliary_loss_mlp": 0.01013396, + "balance_loss_clip": 1.19031334, + "balance_loss_mlp": 1.00967717, + "epoch": 0.4113632947542462, + "flos": 62399852272800.0, + "grad_norm": 0.7724138295069243, + "language_loss": 0.59713036, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61973751, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03710938, + "step": 6842, + "time_per_iteration": 3.341702938079834 + }, + { + "auxiliary_loss_clip": 0.01408868, + "auxiliary_loss_mlp": 0.01037612, + "balance_loss_clip": 1.28349996, + "balance_loss_mlp": 1.02222252, + "epoch": 0.4114234180069142, + "flos": 13922725551480.0, + "grad_norm": 1.8556324055504982, + "language_loss": 0.69946551, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72393036, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.15368652, + "step": 6843, + "time_per_iteration": 2.803487777709961 + }, + { + "auxiliary_loss_clip": 0.01254002, + "auxiliary_loss_mlp": 0.01014912, + "balance_loss_clip": 1.1966567, + "balance_loss_mlp": 1.01143074, + "epoch": 0.41148354125958214, + "flos": 64944401677200.0, + "grad_norm": 0.7571342828597771, + "language_loss": 0.53716946, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55985856, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03491211, + "step": 6844, + "time_per_iteration": 3.2821342945098877 + }, + { + "auxiliary_loss_clip": 0.01421579, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.29336643, + "balance_loss_mlp": 1.02002096, + "epoch": 0.4115436645122501, + "flos": 18732737126520.0, + "grad_norm": 1.761906701121968, + "language_loss": 0.66012895, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68469739, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.15246582, + "step": 6845, + "time_per_iteration": 2.863912582397461 + }, + { + "auxiliary_loss_clip": 0.0141598, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.28897464, + "balance_loss_mlp": 1.02552414, + "epoch": 0.41160378776491807, + "flos": 16257228580080.0, + "grad_norm": 1.805429530314165, + "language_loss": 0.70292056, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72748923, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15356445, + "step": 6846, + "time_per_iteration": 2.8170206546783447 + }, + { + "auxiliary_loss_clip": 0.01419398, + "auxiliary_loss_mlp": 0.01038582, + "balance_loss_clip": 1.29166067, + "balance_loss_mlp": 1.02316785, + "epoch": 0.41166391101758604, + "flos": 27976548180960.0, + "grad_norm": 1.441936704355574, + "language_loss": 0.65748417, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.682064, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15405273, + "step": 6847, + "time_per_iteration": 2.8756697177886963 + }, + { + "auxiliary_loss_clip": 0.01411495, + "auxiliary_loss_mlp": 0.0104475, + "balance_loss_clip": 1.28248107, + "balance_loss_mlp": 1.0289011, + "epoch": 0.411724034270254, + "flos": 34137986797800.0, + "grad_norm": 1.3920046703510631, + "language_loss": 0.70953441, + "learning_rate": 2.656499802669069e-06, + "loss": 0.73409688, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15863037, + "step": 6848, + "time_per_iteration": 2.900447368621826 + }, + { + "auxiliary_loss_clip": 0.01263174, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.2054671, + "balance_loss_mlp": 1.02549446, + "epoch": 0.41178415752292197, + "flos": 67940393487120.0, + "grad_norm": 0.9024664979957876, + "language_loss": 0.56363511, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58655441, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.03271484, + "step": 6849, + "time_per_iteration": 3.3870198726654053 + }, + { + "auxiliary_loss_clip": 0.0141865, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.29250598, + "balance_loss_mlp": 1.02758002, + "epoch": 0.41184428077558993, + "flos": 34320680120160.0, + "grad_norm": 1.660566860292497, + "language_loss": 0.75891697, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78353906, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.1595459, + "step": 6850, + "time_per_iteration": 2.873075485229492 + }, + { + "auxiliary_loss_clip": 0.01413417, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.28631473, + "balance_loss_mlp": 1.02504349, + "epoch": 0.41190440402825795, + "flos": 35450893487160.0, + "grad_norm": 4.607302024818086, + "language_loss": 0.67697662, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70151657, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.15515137, + "step": 6851, + "time_per_iteration": 2.888415575027466 + }, + { + "auxiliary_loss_clip": 0.01427111, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.29435277, + "balance_loss_mlp": 1.02789211, + "epoch": 0.4119645272809259, + "flos": 20854757277720.0, + "grad_norm": 2.287449258137865, + "language_loss": 0.79976112, + "learning_rate": 2.655028075792743e-06, + "loss": 0.82449603, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.18493652, + "step": 6852, + "time_per_iteration": 2.7865450382232666 + }, + { + "auxiliary_loss_clip": 0.01429676, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.29717946, + "balance_loss_mlp": 1.02425361, + "epoch": 0.4120246505335939, + "flos": 27567646355520.0, + "grad_norm": 2.300266887247165, + "language_loss": 0.78003407, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.8047415, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16784668, + "step": 6853, + "time_per_iteration": 2.8835043907165527 + }, + { + "auxiliary_loss_clip": 0.01430968, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.296085, + "balance_loss_mlp": 1.02566946, + "epoch": 0.41208477378626185, + "flos": 37822495575600.0, + "grad_norm": 1.7609153792735264, + "language_loss": 0.66375136, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.6884796, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.1618042, + "step": 6854, + "time_per_iteration": 2.9157490730285645 + }, + { + "auxiliary_loss_clip": 0.01419913, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.29152751, + "balance_loss_mlp": 1.0198226, + "epoch": 0.4121448970389298, + "flos": 23446029923280.0, + "grad_norm": 1.6898340027203294, + "language_loss": 0.83955312, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.86410427, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15393066, + "step": 6855, + "time_per_iteration": 2.844146966934204 + }, + { + "auxiliary_loss_clip": 0.01411505, + "auxiliary_loss_mlp": 0.01047264, + "balance_loss_clip": 1.28525639, + "balance_loss_mlp": 1.03226113, + "epoch": 0.4122050202915978, + "flos": 21330385284600.0, + "grad_norm": 1.8502833665413898, + "language_loss": 0.79469013, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81927788, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.14996338, + "step": 6856, + "time_per_iteration": 2.8895130157470703 + }, + { + "auxiliary_loss_clip": 0.0141925, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.29050088, + "balance_loss_mlp": 1.02497625, + "epoch": 0.41226514354426574, + "flos": 17310116983680.0, + "grad_norm": 2.5877433691303806, + "language_loss": 0.80383688, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.82843381, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15460205, + "step": 6857, + "time_per_iteration": 2.708289384841919 + }, + { + "auxiliary_loss_clip": 0.01422339, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.29108775, + "balance_loss_mlp": 1.02466738, + "epoch": 0.4123252667969337, + "flos": 17643277571760.0, + "grad_norm": 1.754483487054143, + "language_loss": 0.71444839, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.73907733, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.15893555, + "step": 6858, + "time_per_iteration": 2.8359265327453613 + }, + { + "auxiliary_loss_clip": 0.0141709, + "auxiliary_loss_mlp": 0.01044584, + "balance_loss_clip": 1.28921032, + "balance_loss_mlp": 1.02813327, + "epoch": 0.4123853900496017, + "flos": 46432820635920.0, + "grad_norm": 1.5082245538617756, + "language_loss": 0.59647405, + "learning_rate": 2.652451598005391e-06, + "loss": 0.62109083, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.16442871, + "step": 6859, + "time_per_iteration": 2.9922335147857666 + }, + { + "auxiliary_loss_clip": 0.01426223, + "auxiliary_loss_mlp": 0.01044112, + "balance_loss_clip": 1.29382598, + "balance_loss_mlp": 1.02884698, + "epoch": 0.41244551330226964, + "flos": 17679564464400.0, + "grad_norm": 2.2398127031628032, + "language_loss": 0.73203129, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75673461, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.15264893, + "step": 6860, + "time_per_iteration": 2.8095362186431885 + }, + { + "auxiliary_loss_clip": 0.01411974, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.28389859, + "balance_loss_mlp": 1.02437413, + "epoch": 0.4125056365549376, + "flos": 18697871526480.0, + "grad_norm": 1.948333066135173, + "language_loss": 0.74433434, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76884329, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14562988, + "step": 6861, + "time_per_iteration": 2.8277974128723145 + }, + { + "auxiliary_loss_clip": 0.01415567, + "auxiliary_loss_mlp": 0.01039371, + "balance_loss_clip": 1.28985858, + "balance_loss_mlp": 1.02536416, + "epoch": 0.41256575980760557, + "flos": 17899884755280.0, + "grad_norm": 2.3121173206730186, + "language_loss": 0.80166447, + "learning_rate": 2.651347021844765e-06, + "loss": 0.82621384, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14013672, + "step": 6862, + "time_per_iteration": 2.793618679046631 + }, + { + "auxiliary_loss_clip": 0.01422753, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.29374242, + "balance_loss_mlp": 1.02155781, + "epoch": 0.41262588306027354, + "flos": 21986391937320.0, + "grad_norm": 1.6579734935736843, + "language_loss": 0.76248467, + "learning_rate": 2.650978780374318e-06, + "loss": 0.7870822, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.15441895, + "step": 6863, + "time_per_iteration": 2.8014631271362305 + }, + { + "auxiliary_loss_clip": 0.01273538, + "auxiliary_loss_mlp": 0.01019074, + "balance_loss_clip": 1.21727204, + "balance_loss_mlp": 1.01580799, + "epoch": 0.41268600631294156, + "flos": 53362757708640.0, + "grad_norm": 0.6992811712926968, + "language_loss": 0.52757281, + "learning_rate": 2.650610514218691e-06, + "loss": 0.55049896, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03271484, + "step": 6864, + "time_per_iteration": 3.3097593784332275 + }, + { + "auxiliary_loss_clip": 0.01431154, + "auxiliary_loss_mlp": 0.01038901, + "balance_loss_clip": 1.29768121, + "balance_loss_mlp": 1.02251005, + "epoch": 0.4127461295656095, + "flos": 24390220082400.0, + "grad_norm": 1.7396481002980784, + "language_loss": 0.72806722, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.7527678, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.1640625, + "step": 6865, + "time_per_iteration": 2.825894594192505 + }, + { + "auxiliary_loss_clip": 0.01273281, + "auxiliary_loss_mlp": 0.01021761, + "balance_loss_clip": 1.21645999, + "balance_loss_mlp": 1.01866138, + "epoch": 0.4128062528182775, + "flos": 71720768267640.0, + "grad_norm": 0.9152676019325121, + "language_loss": 0.66570342, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68865383, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03100586, + "step": 6866, + "time_per_iteration": 4.521556615829468 + }, + { + "auxiliary_loss_clip": 0.014163, + "auxiliary_loss_mlp": 0.01038088, + "balance_loss_clip": 1.28726685, + "balance_loss_mlp": 1.02316952, + "epoch": 0.41286637607094545, + "flos": 17852430563640.0, + "grad_norm": 2.0898176598040252, + "language_loss": 0.81596792, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8405118, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.14923096, + "step": 6867, + "time_per_iteration": 2.7118520736694336 + }, + { + "auxiliary_loss_clip": 0.01424537, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.29308021, + "balance_loss_mlp": 1.02120817, + "epoch": 0.4129264993236134, + "flos": 25554121407000.0, + "grad_norm": 2.1465319924054773, + "language_loss": 0.77461112, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.7992245, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15582275, + "step": 6868, + "time_per_iteration": 2.8516151905059814 + }, + { + "auxiliary_loss_clip": 0.01270917, + "auxiliary_loss_mlp": 0.01006294, + "balance_loss_clip": 1.21440458, + "balance_loss_mlp": 1.00295639, + "epoch": 0.4129866225762814, + "flos": 65427159774240.0, + "grad_norm": 0.836743428807661, + "language_loss": 0.57889408, + "learning_rate": 2.64876881365164e-06, + "loss": 0.60166621, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03344727, + "step": 6869, + "time_per_iteration": 2.9935760498046875 + }, + { + "auxiliary_loss_clip": 0.01412789, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.28578007, + "balance_loss_mlp": 1.01860106, + "epoch": 0.41304674582894935, + "flos": 28882948938120.0, + "grad_norm": 1.6619559875624015, + "language_loss": 0.75238693, + "learning_rate": 2.64840039967822e-06, + "loss": 0.7768535, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.15264893, + "step": 6870, + "time_per_iteration": 2.8169643878936768 + }, + { + "auxiliary_loss_clip": 0.0141917, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.28885627, + "balance_loss_mlp": 1.02863955, + "epoch": 0.4131068690816173, + "flos": 22896975355560.0, + "grad_norm": 1.5630843748018142, + "language_loss": 0.83522081, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.8598572, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15844727, + "step": 6871, + "time_per_iteration": 4.165442228317261 + }, + { + "auxiliary_loss_clip": 0.01423405, + "auxiliary_loss_mlp": 0.01042062, + "balance_loss_clip": 1.29220128, + "balance_loss_mlp": 1.02483666, + "epoch": 0.4131669923342853, + "flos": 26070462617760.0, + "grad_norm": 4.309206479354558, + "language_loss": 0.68789697, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71255165, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.17236328, + "step": 6872, + "time_per_iteration": 2.8431153297424316 + }, + { + "auxiliary_loss_clip": 0.01420961, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.2899487, + "balance_loss_mlp": 1.02003968, + "epoch": 0.41322711558695324, + "flos": 19249240770720.0, + "grad_norm": 16.258745994575634, + "language_loss": 0.76079768, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78536141, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15368652, + "step": 6873, + "time_per_iteration": 2.722163677215576 + }, + { + "auxiliary_loss_clip": 0.01430975, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.29896283, + "balance_loss_mlp": 1.01937425, + "epoch": 0.4132872388396212, + "flos": 22679619474960.0, + "grad_norm": 1.9833070382675093, + "language_loss": 0.83336288, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85802376, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.15740967, + "step": 6874, + "time_per_iteration": 2.780057668685913 + }, + { + "auxiliary_loss_clip": 0.01427409, + "auxiliary_loss_mlp": 0.01034513, + "balance_loss_clip": 1.29603207, + "balance_loss_mlp": 1.01781178, + "epoch": 0.4133473620922892, + "flos": 20154139018560.0, + "grad_norm": 2.127892002274518, + "language_loss": 0.72196811, + "learning_rate": 2.646557961279436e-06, + "loss": 0.7465874, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.16687012, + "step": 6875, + "time_per_iteration": 4.196726083755493 + }, + { + "auxiliary_loss_clip": 0.01407001, + "auxiliary_loss_mlp": 0.01039657, + "balance_loss_clip": 1.28453422, + "balance_loss_mlp": 1.02479172, + "epoch": 0.41340748534495714, + "flos": 24248036922120.0, + "grad_norm": 1.5004122857241164, + "language_loss": 0.82504147, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84950805, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.14868164, + "step": 6876, + "time_per_iteration": 2.7953073978424072 + }, + { + "auxiliary_loss_clip": 0.01428269, + "auxiliary_loss_mlp": 0.01043452, + "balance_loss_clip": 1.2947458, + "balance_loss_mlp": 1.025702, + "epoch": 0.41346760859762516, + "flos": 14396241923640.0, + "grad_norm": 2.6362027143109326, + "language_loss": 0.65669107, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.68140829, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.17749023, + "step": 6877, + "time_per_iteration": 4.180009126663208 + }, + { + "auxiliary_loss_clip": 0.01422066, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.29273009, + "balance_loss_mlp": 1.01783478, + "epoch": 0.4135277318502931, + "flos": 22497129194400.0, + "grad_norm": 1.6827070992390902, + "language_loss": 0.76879621, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.79335761, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16210938, + "step": 6878, + "time_per_iteration": 2.7451021671295166 + }, + { + "auxiliary_loss_clip": 0.01418948, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.28913617, + "balance_loss_mlp": 1.02034688, + "epoch": 0.4135878551029611, + "flos": 22423824458640.0, + "grad_norm": 1.8578494262642082, + "language_loss": 0.80749619, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.83204484, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15588379, + "step": 6879, + "time_per_iteration": 2.7473719120025635 + }, + { + "auxiliary_loss_clip": 0.01422019, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.29404211, + "balance_loss_mlp": 1.01826882, + "epoch": 0.41364797835562905, + "flos": 27059020733160.0, + "grad_norm": 1.7704557834384198, + "language_loss": 0.84811378, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.87267965, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.16296387, + "step": 6880, + "time_per_iteration": 2.823103427886963 + }, + { + "auxiliary_loss_clip": 0.01427666, + "auxiliary_loss_mlp": 0.01027576, + "balance_loss_clip": 1.29548776, + "balance_loss_mlp": 1.01234102, + "epoch": 0.413708101608297, + "flos": 22972960243080.0, + "grad_norm": 1.5724596413873346, + "language_loss": 0.70706284, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.73161525, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15222168, + "step": 6881, + "time_per_iteration": 2.798560619354248 + }, + { + "auxiliary_loss_clip": 0.01415294, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.28934872, + "balance_loss_mlp": 1.02279019, + "epoch": 0.413768224860965, + "flos": 13337911999800.0, + "grad_norm": 1.9893402846067845, + "language_loss": 0.81802022, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.84255272, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.1517334, + "step": 6882, + "time_per_iteration": 2.7237894535064697 + }, + { + "auxiliary_loss_clip": 0.01429835, + "auxiliary_loss_mlp": 0.01043752, + "balance_loss_clip": 1.29743242, + "balance_loss_mlp": 1.02545381, + "epoch": 0.41382834811363295, + "flos": 20818917077040.0, + "grad_norm": 2.168410388331322, + "language_loss": 0.703637, + "learning_rate": 2.643608785656077e-06, + "loss": 0.72837281, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.18310547, + "step": 6883, + "time_per_iteration": 2.7949907779693604 + }, + { + "auxiliary_loss_clip": 0.01423714, + "auxiliary_loss_mlp": 0.01042072, + "balance_loss_clip": 1.29374313, + "balance_loss_mlp": 1.02650332, + "epoch": 0.4138884713663009, + "flos": 20671942130280.0, + "grad_norm": 1.7661373812826513, + "language_loss": 0.75879884, + "learning_rate": 2.643240028730663e-06, + "loss": 0.78345668, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15576172, + "step": 6884, + "time_per_iteration": 2.8665926456451416 + }, + { + "auxiliary_loss_clip": 0.01425213, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.29245305, + "balance_loss_mlp": 1.02643454, + "epoch": 0.4139485946189689, + "flos": 29062231158240.0, + "grad_norm": 1.4426694386721852, + "language_loss": 0.75929976, + "learning_rate": 2.642871247413523e-06, + "loss": 0.78398418, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16796875, + "step": 6885, + "time_per_iteration": 2.8022353649139404 + }, + { + "auxiliary_loss_clip": 0.01427926, + "auxiliary_loss_mlp": 0.01042088, + "balance_loss_clip": 1.29725933, + "balance_loss_mlp": 1.02562511, + "epoch": 0.41400871787163684, + "flos": 24431095719720.0, + "grad_norm": 2.1379041216362062, + "language_loss": 0.69712913, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.7218293, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.16467285, + "step": 6886, + "time_per_iteration": 2.7735278606414795 + }, + { + "auxiliary_loss_clip": 0.01433958, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.30205119, + "balance_loss_mlp": 1.02142096, + "epoch": 0.4140688411243048, + "flos": 19468830111120.0, + "grad_norm": 1.659235936577309, + "language_loss": 0.75532305, + "learning_rate": 2.642133611660002e-06, + "loss": 0.78004134, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.16467285, + "step": 6887, + "time_per_iteration": 2.7107226848602295 + }, + { + "auxiliary_loss_clip": 0.01422855, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.29299295, + "balance_loss_mlp": 1.01755691, + "epoch": 0.4141289643769728, + "flos": 19317834936720.0, + "grad_norm": 2.091573092285849, + "language_loss": 0.71152639, + "learning_rate": 2.641764757251592e-06, + "loss": 0.73609257, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.16210938, + "step": 6888, + "time_per_iteration": 2.7120344638824463 + }, + { + "auxiliary_loss_clip": 0.01418486, + "auxiliary_loss_mlp": 0.01039384, + "balance_loss_clip": 1.289325, + "balance_loss_mlp": 1.02426863, + "epoch": 0.41418908762964074, + "flos": 16731232252560.0, + "grad_norm": 2.0428084558094466, + "language_loss": 0.76424754, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78882623, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.15100098, + "step": 6889, + "time_per_iteration": 2.7090985774993896 + }, + { + "auxiliary_loss_clip": 0.01419883, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.29189777, + "balance_loss_mlp": 1.02519441, + "epoch": 0.41424921088230876, + "flos": 25301737492920.0, + "grad_norm": 1.6094586248748382, + "language_loss": 0.80124545, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.8258462, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15002441, + "step": 6890, + "time_per_iteration": 2.748718500137329 + }, + { + "auxiliary_loss_clip": 0.01418678, + "auxiliary_loss_mlp": 0.01042869, + "balance_loss_clip": 1.29225445, + "balance_loss_mlp": 1.02640593, + "epoch": 0.4143093341349767, + "flos": 20965567156920.0, + "grad_norm": 1.5375946465740782, + "language_loss": 0.74457347, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.769189, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.16442871, + "step": 6891, + "time_per_iteration": 2.72072434425354 + }, + { + "auxiliary_loss_clip": 0.01431649, + "auxiliary_loss_mlp": 0.01049091, + "balance_loss_clip": 1.29790819, + "balance_loss_mlp": 1.03193653, + "epoch": 0.4143694573876447, + "flos": 22022760046680.0, + "grad_norm": 1.72635879978408, + "language_loss": 0.84757066, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.87237799, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.17163086, + "step": 6892, + "time_per_iteration": 2.72298264503479 + }, + { + "auxiliary_loss_clip": 0.01417415, + "auxiliary_loss_mlp": 0.01043167, + "balance_loss_clip": 1.29120636, + "balance_loss_mlp": 1.02819395, + "epoch": 0.41442958064031266, + "flos": 35703927135000.0, + "grad_norm": 1.610391575122795, + "language_loss": 0.70525801, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72986382, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.14978027, + "step": 6893, + "time_per_iteration": 2.8623156547546387 + }, + { + "auxiliary_loss_clip": 0.01423408, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.29426837, + "balance_loss_mlp": 1.02966094, + "epoch": 0.4144897038929806, + "flos": 28299719112480.0, + "grad_norm": 2.6840344596581747, + "language_loss": 0.73117316, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75586081, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15686035, + "step": 6894, + "time_per_iteration": 2.77590012550354 + }, + { + "auxiliary_loss_clip": 0.01419626, + "auxiliary_loss_mlp": 0.0104113, + "balance_loss_clip": 1.28969669, + "balance_loss_mlp": 1.02615142, + "epoch": 0.4145498271456486, + "flos": 11650806651600.0, + "grad_norm": 2.3280916964598752, + "language_loss": 0.62412047, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64872801, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14996338, + "step": 6895, + "time_per_iteration": 2.7440528869628906 + }, + { + "auxiliary_loss_clip": 0.0141749, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.28911078, + "balance_loss_mlp": 1.02864432, + "epoch": 0.41460995039831655, + "flos": 27241429797000.0, + "grad_norm": 2.2080900660296963, + "language_loss": 0.71005636, + "learning_rate": 2.638813047071192e-06, + "loss": 0.7346707, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.15307617, + "step": 6896, + "time_per_iteration": 2.8221664428710938 + }, + { + "auxiliary_loss_clip": 0.01426083, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.29587054, + "balance_loss_mlp": 1.03552377, + "epoch": 0.4146700736509845, + "flos": 25928035807320.0, + "grad_norm": 1.9270362301377102, + "language_loss": 0.73591673, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.76069719, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.16442871, + "step": 6897, + "time_per_iteration": 2.8494391441345215 + }, + { + "auxiliary_loss_clip": 0.01423978, + "auxiliary_loss_mlp": 0.0104871, + "balance_loss_clip": 1.29421306, + "balance_loss_mlp": 1.03314698, + "epoch": 0.4147301969036525, + "flos": 26838862875720.0, + "grad_norm": 1.9115655310537398, + "language_loss": 0.84670663, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.8714335, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15570068, + "step": 6898, + "time_per_iteration": 2.8400826454162598 + }, + { + "auxiliary_loss_clip": 0.014225, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_clip": 1.29131734, + "balance_loss_mlp": 1.02866232, + "epoch": 0.41479032015632045, + "flos": 20302535257920.0, + "grad_norm": 1.7315657181335966, + "language_loss": 0.74553967, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.77021301, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.16149902, + "step": 6899, + "time_per_iteration": 2.745342969894409 + }, + { + "auxiliary_loss_clip": 0.01433619, + "auxiliary_loss_mlp": 0.01041579, + "balance_loss_clip": 1.29919219, + "balance_loss_mlp": 1.02465141, + "epoch": 0.4148504434089884, + "flos": 25270486036920.0, + "grad_norm": 1.9764868856071216, + "language_loss": 0.76107597, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.78582788, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.16931152, + "step": 6900, + "time_per_iteration": 2.8545949459075928 + }, + { + "auxiliary_loss_clip": 0.01419008, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_clip": 1.29051721, + "balance_loss_mlp": 1.02481854, + "epoch": 0.4149105666616564, + "flos": 12826037708640.0, + "grad_norm": 2.0640124630641346, + "language_loss": 0.80331945, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82792437, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.16674805, + "step": 6901, + "time_per_iteration": 2.7025814056396484 + }, + { + "auxiliary_loss_clip": 0.01415103, + "auxiliary_loss_mlp": 0.0103842, + "balance_loss_clip": 1.28687024, + "balance_loss_mlp": 1.02269661, + "epoch": 0.41497068991432434, + "flos": 16768290704040.0, + "grad_norm": 1.602372886149794, + "language_loss": 0.69545686, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71999216, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.1572876, + "step": 6902, + "time_per_iteration": 2.759368658065796 + }, + { + "auxiliary_loss_clip": 0.01410338, + "auxiliary_loss_mlp": 0.0105158, + "balance_loss_clip": 1.28404772, + "balance_loss_mlp": 1.03720367, + "epoch": 0.4150308131669923, + "flos": 18005009464080.0, + "grad_norm": 1.581865292201541, + "language_loss": 0.83953148, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.86415064, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.1439209, + "step": 6903, + "time_per_iteration": 2.7087581157684326 + }, + { + "auxiliary_loss_clip": 0.01435414, + "auxiliary_loss_mlp": 0.01043281, + "balance_loss_clip": 1.30093169, + "balance_loss_mlp": 1.02531636, + "epoch": 0.41509093641966033, + "flos": 30050911098720.0, + "grad_norm": 1.9341274965883803, + "language_loss": 0.6807974, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70558435, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.17980957, + "step": 6904, + "time_per_iteration": 2.802478313446045 + }, + { + "auxiliary_loss_clip": 0.01427248, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.29433334, + "balance_loss_mlp": 1.0224638, + "epoch": 0.4151510596723283, + "flos": 24285379632120.0, + "grad_norm": 2.268962018424243, + "language_loss": 0.77464771, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79930532, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.16033936, + "step": 6905, + "time_per_iteration": 2.783074140548706 + }, + { + "auxiliary_loss_clip": 0.0142435, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.29297209, + "balance_loss_mlp": 1.01687598, + "epoch": 0.41521118292499626, + "flos": 23481342215280.0, + "grad_norm": 1.967914272768014, + "language_loss": 0.68462247, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70919472, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.16003418, + "step": 6906, + "time_per_iteration": 4.253796577453613 + }, + { + "auxiliary_loss_clip": 0.01423141, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.29459977, + "balance_loss_mlp": 1.02007031, + "epoch": 0.4152713061776642, + "flos": 22130361865440.0, + "grad_norm": 17.917376610911756, + "language_loss": 0.67456973, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69915485, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.15307617, + "step": 6907, + "time_per_iteration": 2.7317869663238525 + }, + { + "auxiliary_loss_clip": 0.01425042, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.29401338, + "balance_loss_mlp": 1.02268445, + "epoch": 0.4153314294303322, + "flos": 21256349598360.0, + "grad_norm": 1.8362461065569557, + "language_loss": 0.76995361, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79458272, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15179443, + "step": 6908, + "time_per_iteration": 4.162097454071045 + }, + { + "auxiliary_loss_clip": 0.01260136, + "auxiliary_loss_mlp": 0.01015518, + "balance_loss_clip": 1.20019209, + "balance_loss_mlp": 1.01108301, + "epoch": 0.41539155268300015, + "flos": 57935329246080.0, + "grad_norm": 0.8398640365898848, + "language_loss": 0.64891285, + "learning_rate": 2.634013214657026e-06, + "loss": 0.67166942, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.04443359, + "step": 6909, + "time_per_iteration": 3.2103543281555176 + }, + { + "auxiliary_loss_clip": 0.01420532, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.29203057, + "balance_loss_mlp": 1.02225316, + "epoch": 0.4154516759356681, + "flos": 21908132981640.0, + "grad_norm": 1.5928777161289198, + "language_loss": 0.87267274, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89725864, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15795898, + "step": 6910, + "time_per_iteration": 2.815633535385132 + }, + { + "auxiliary_loss_clip": 0.01257563, + "auxiliary_loss_mlp": 0.01013062, + "balance_loss_clip": 1.1983614, + "balance_loss_mlp": 1.00922322, + "epoch": 0.4155117991883361, + "flos": 67848165255600.0, + "grad_norm": 0.8101683546047665, + "language_loss": 0.62197149, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64467776, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.03833008, + "step": 6911, + "time_per_iteration": 3.1937594413757324 + }, + { + "auxiliary_loss_clip": 0.01433202, + "auxiliary_loss_mlp": 0.010472, + "balance_loss_clip": 1.29723477, + "balance_loss_mlp": 1.03073764, + "epoch": 0.41557192244100405, + "flos": 14286731511960.0, + "grad_norm": 2.384296699409751, + "language_loss": 0.87768614, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90249014, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.16467285, + "step": 6912, + "time_per_iteration": 2.7101504802703857 + }, + { + "auxiliary_loss_clip": 0.01418045, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.28696108, + "balance_loss_mlp": 1.0252676, + "epoch": 0.415632045693672, + "flos": 24467301395640.0, + "grad_norm": 1.9741144486559623, + "language_loss": 0.62913954, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65372324, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15057373, + "step": 6913, + "time_per_iteration": 4.208149671554565 + }, + { + "auxiliary_loss_clip": 0.01414989, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.28822374, + "balance_loss_mlp": 1.02282786, + "epoch": 0.41569216894634, + "flos": 20119435851960.0, + "grad_norm": 1.8741303838932328, + "language_loss": 0.75868255, + "learning_rate": 2.632166041703586e-06, + "loss": 0.78320765, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14697266, + "step": 6914, + "time_per_iteration": 2.7175350189208984 + }, + { + "auxiliary_loss_clip": 0.01424997, + "auxiliary_loss_mlp": 0.01049315, + "balance_loss_clip": 1.29289174, + "balance_loss_mlp": 1.03253031, + "epoch": 0.41575229219900794, + "flos": 23803254287640.0, + "grad_norm": 1.9989335294092914, + "language_loss": 0.8800832, + "learning_rate": 2.631796535141458e-06, + "loss": 0.90482628, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.16784668, + "step": 6915, + "time_per_iteration": 2.7964906692504883 + }, + { + "auxiliary_loss_clip": 0.01422505, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.29182887, + "balance_loss_mlp": 1.02805829, + "epoch": 0.4158124154516759, + "flos": 23112909943560.0, + "grad_norm": 2.0134497263542226, + "language_loss": 0.71359563, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73825788, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15661621, + "step": 6916, + "time_per_iteration": 4.226213693618774 + }, + { + "auxiliary_loss_clip": 0.01429513, + "auxiliary_loss_mlp": 0.01040091, + "balance_loss_clip": 1.29626274, + "balance_loss_mlp": 1.02356815, + "epoch": 0.41587253870434393, + "flos": 24248280572280.0, + "grad_norm": 1.5377653764406087, + "language_loss": 0.72051263, + "learning_rate": 2.631057450157852e-06, + "loss": 0.74520868, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.16516113, + "step": 6917, + "time_per_iteration": 2.747722625732422 + }, + { + "auxiliary_loss_clip": 0.01416148, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.28586984, + "balance_loss_mlp": 1.02156925, + "epoch": 0.4159326619570119, + "flos": 23887320238800.0, + "grad_norm": 1.493323782615699, + "language_loss": 0.81047916, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.8350116, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.15527344, + "step": 6918, + "time_per_iteration": 2.791313886642456 + }, + { + "auxiliary_loss_clip": 0.01428381, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_clip": 1.29503763, + "balance_loss_mlp": 1.02749801, + "epoch": 0.41599278520967986, + "flos": 40634169728760.0, + "grad_norm": 1.381559311483852, + "language_loss": 0.7044487, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72916609, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.1585083, + "step": 6919, + "time_per_iteration": 2.975820302963257 + }, + { + "auxiliary_loss_clip": 0.01422434, + "auxiliary_loss_mlp": 0.01047755, + "balance_loss_clip": 1.29000294, + "balance_loss_mlp": 1.03050506, + "epoch": 0.4160529084623478, + "flos": 18227481998040.0, + "grad_norm": 3.7804975411126653, + "language_loss": 0.8178091, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.842511, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.17248535, + "step": 6920, + "time_per_iteration": 2.722676992416382 + }, + { + "auxiliary_loss_clip": 0.01430874, + "auxiliary_loss_mlp": 0.01045228, + "balance_loss_clip": 1.29632926, + "balance_loss_mlp": 1.02734637, + "epoch": 0.4161130317150158, + "flos": 13665874717800.0, + "grad_norm": 2.450874544987717, + "language_loss": 0.65114343, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67590445, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.17871094, + "step": 6921, + "time_per_iteration": 2.72684907913208 + }, + { + "auxiliary_loss_clip": 0.01421556, + "auxiliary_loss_mlp": 0.01045908, + "balance_loss_clip": 1.28888655, + "balance_loss_mlp": 1.02932644, + "epoch": 0.41617315496768376, + "flos": 16182624376800.0, + "grad_norm": 2.045505030963713, + "language_loss": 0.80821031, + "learning_rate": 2.629209319173274e-06, + "loss": 0.83288491, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.16577148, + "step": 6922, + "time_per_iteration": 2.7579445838928223 + }, + { + "auxiliary_loss_clip": 0.0142919, + "auxiliary_loss_mlp": 0.010399, + "balance_loss_clip": 1.2954514, + "balance_loss_mlp": 1.02427185, + "epoch": 0.4162332782203517, + "flos": 26218899465480.0, + "grad_norm": 2.445081407104034, + "language_loss": 0.67763138, + "learning_rate": 2.628839621341247e-06, + "loss": 0.7023223, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.15625, + "step": 6923, + "time_per_iteration": 2.78840708732605 + }, + { + "auxiliary_loss_clip": 0.01423107, + "auxiliary_loss_mlp": 0.0105421, + "balance_loss_clip": 1.29052281, + "balance_loss_mlp": 1.03682923, + "epoch": 0.4162934014730197, + "flos": 28189883833920.0, + "grad_norm": 1.910705554947294, + "language_loss": 0.76174724, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78652036, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.17382812, + "step": 6924, + "time_per_iteration": 2.77756667137146 + }, + { + "auxiliary_loss_clip": 0.01419463, + "auxiliary_loss_mlp": 0.01043263, + "balance_loss_clip": 1.28622997, + "balance_loss_mlp": 1.02749181, + "epoch": 0.41635352472568765, + "flos": 19870138173240.0, + "grad_norm": 2.0515798285537934, + "language_loss": 0.72696924, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75159657, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.15777588, + "step": 6925, + "time_per_iteration": 2.7474937438964844 + }, + { + "auxiliary_loss_clip": 0.01412178, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.28067148, + "balance_loss_mlp": 1.02459407, + "epoch": 0.4164136479783556, + "flos": 14938961587200.0, + "grad_norm": 2.0842644840089273, + "language_loss": 0.84064651, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86516863, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.15466309, + "step": 6926, + "time_per_iteration": 2.711658239364624 + }, + { + "auxiliary_loss_clip": 0.01409084, + "auxiliary_loss_mlp": 0.0104425, + "balance_loss_clip": 1.28012967, + "balance_loss_mlp": 1.03005171, + "epoch": 0.4164737712310236, + "flos": 21762010810440.0, + "grad_norm": 1.6410084820631294, + "language_loss": 0.86702001, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.8915534, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.14208984, + "step": 6927, + "time_per_iteration": 2.7509853839874268 + }, + { + "auxiliary_loss_clip": 0.01411477, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.27969837, + "balance_loss_mlp": 1.02484632, + "epoch": 0.41653389448369155, + "flos": 20744962607520.0, + "grad_norm": 2.3245910788708253, + "language_loss": 0.72502565, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74955428, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.1651001, + "step": 6928, + "time_per_iteration": 2.725023031234741 + }, + { + "auxiliary_loss_clip": 0.01412434, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.28128195, + "balance_loss_mlp": 1.02598262, + "epoch": 0.4165940177363595, + "flos": 24978322911240.0, + "grad_norm": 1.8138511446976193, + "language_loss": 0.7799753, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80451298, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15344238, + "step": 6929, + "time_per_iteration": 2.746901273727417 + }, + { + "auxiliary_loss_clip": 0.01409465, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.27830088, + "balance_loss_mlp": 1.0309664, + "epoch": 0.41665414098902753, + "flos": 20526591517920.0, + "grad_norm": 3.3109834493647488, + "language_loss": 0.70815462, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73271608, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15722656, + "step": 6930, + "time_per_iteration": 2.7419962882995605 + }, + { + "auxiliary_loss_clip": 0.01415753, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.28379345, + "balance_loss_mlp": 1.02459526, + "epoch": 0.4167142642416955, + "flos": 19687729109400.0, + "grad_norm": 1.7088658662264506, + "language_loss": 0.81219923, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83675528, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.15270996, + "step": 6931, + "time_per_iteration": 2.74493408203125 + }, + { + "auxiliary_loss_clip": 0.01403293, + "auxiliary_loss_mlp": 0.01046237, + "balance_loss_clip": 1.27342415, + "balance_loss_mlp": 1.02991116, + "epoch": 0.41677438749436346, + "flos": 23768348079240.0, + "grad_norm": 4.000605726130005, + "language_loss": 0.79781628, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.82231152, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.16326904, + "step": 6932, + "time_per_iteration": 2.8208887577056885 + }, + { + "auxiliary_loss_clip": 0.01413581, + "auxiliary_loss_mlp": 0.01048031, + "balance_loss_clip": 1.28116035, + "balance_loss_mlp": 1.03201485, + "epoch": 0.41683451074703143, + "flos": 30416297743440.0, + "grad_norm": 4.715799692521602, + "language_loss": 0.82048917, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.84510529, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.16009521, + "step": 6933, + "time_per_iteration": 2.799813747406006 + }, + { + "auxiliary_loss_clip": 0.01421581, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.28604209, + "balance_loss_mlp": 1.02197433, + "epoch": 0.4168946339996994, + "flos": 21511738531080.0, + "grad_norm": 1.7130316194243398, + "language_loss": 0.77212948, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79673207, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.16699219, + "step": 6934, + "time_per_iteration": 2.7211368083953857 + }, + { + "auxiliary_loss_clip": 0.01415386, + "auxiliary_loss_mlp": 0.01039961, + "balance_loss_clip": 1.28344703, + "balance_loss_mlp": 1.02422559, + "epoch": 0.41695475725236736, + "flos": 17643074529960.0, + "grad_norm": 2.006795899919877, + "language_loss": 0.67616683, + "learning_rate": 2.624401391405668e-06, + "loss": 0.70072031, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.1574707, + "step": 6935, + "time_per_iteration": 2.690756320953369 + }, + { + "auxiliary_loss_clip": 0.01406224, + "auxiliary_loss_mlp": 0.01047285, + "balance_loss_clip": 1.27507234, + "balance_loss_mlp": 1.03063178, + "epoch": 0.4170148805050353, + "flos": 15673105370520.0, + "grad_norm": 2.2995874169723307, + "language_loss": 0.73198009, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.7565152, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.16650391, + "step": 6936, + "time_per_iteration": 2.7413806915283203 + }, + { + "auxiliary_loss_clip": 0.0140459, + "auxiliary_loss_mlp": 0.01039678, + "balance_loss_clip": 1.27576494, + "balance_loss_mlp": 1.02408552, + "epoch": 0.4170750037577033, + "flos": 15163423930800.0, + "grad_norm": 2.0423031777572773, + "language_loss": 0.73992968, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76437235, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15588379, + "step": 6937, + "time_per_iteration": 2.7693963050842285 + }, + { + "auxiliary_loss_clip": 0.01403965, + "auxiliary_loss_mlp": 0.01039146, + "balance_loss_clip": 1.27436471, + "balance_loss_mlp": 1.02325583, + "epoch": 0.41713512701037125, + "flos": 28774331910360.0, + "grad_norm": 1.4244465113469498, + "language_loss": 0.84440553, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86883664, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15893555, + "step": 6938, + "time_per_iteration": 2.7976086139678955 + }, + { + "auxiliary_loss_clip": 0.01416354, + "auxiliary_loss_mlp": 0.01041212, + "balance_loss_clip": 1.28204823, + "balance_loss_mlp": 1.02405763, + "epoch": 0.4171952502630392, + "flos": 28262904311160.0, + "grad_norm": 1.7622491193853782, + "language_loss": 0.74772346, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.77229917, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.17126465, + "step": 6939, + "time_per_iteration": 2.81453800201416 + }, + { + "auxiliary_loss_clip": 0.01408071, + "auxiliary_loss_mlp": 0.01039293, + "balance_loss_clip": 1.27751088, + "balance_loss_mlp": 1.02275884, + "epoch": 0.4172553735157072, + "flos": 24577096065840.0, + "grad_norm": 1.5132027632586953, + "language_loss": 0.75538099, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77985466, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.1652832, + "step": 6940, + "time_per_iteration": 2.784451723098755 + }, + { + "auxiliary_loss_clip": 0.01410216, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.27842951, + "balance_loss_mlp": 1.01903057, + "epoch": 0.41731549676837515, + "flos": 27050614802640.0, + "grad_norm": 1.7116267894588792, + "language_loss": 0.71197116, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73642051, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.15686035, + "step": 6941, + "time_per_iteration": 2.8201560974121094 + }, + { + "auxiliary_loss_clip": 0.01416252, + "auxiliary_loss_mlp": 0.01042517, + "balance_loss_clip": 1.28267765, + "balance_loss_mlp": 1.02546978, + "epoch": 0.4173756200210431, + "flos": 28398833784000.0, + "grad_norm": 2.2360229935030884, + "language_loss": 0.73780453, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76239228, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.17053223, + "step": 6942, + "time_per_iteration": 2.7798731327056885 + }, + { + "auxiliary_loss_clip": 0.01420792, + "auxiliary_loss_mlp": 0.01045254, + "balance_loss_clip": 1.28521061, + "balance_loss_mlp": 1.02811217, + "epoch": 0.41743574327371114, + "flos": 22525781715360.0, + "grad_norm": 1.964932593446472, + "language_loss": 0.72612834, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.75078881, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.17150879, + "step": 6943, + "time_per_iteration": 2.755573272705078 + }, + { + "auxiliary_loss_clip": 0.01414845, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.28163183, + "balance_loss_mlp": 1.02001071, + "epoch": 0.4174958665263791, + "flos": 30119017964400.0, + "grad_norm": 1.7027069378793163, + "language_loss": 0.64004338, + "learning_rate": 2.621070480118111e-06, + "loss": 0.66456324, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.17114258, + "step": 6944, + "time_per_iteration": 2.852102756500244 + }, + { + "auxiliary_loss_clip": 0.0140632, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.27532554, + "balance_loss_mlp": 1.01983738, + "epoch": 0.41755598977904707, + "flos": 25268739877440.0, + "grad_norm": 1.5530115907889979, + "language_loss": 0.70427918, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72869337, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.15264893, + "step": 6945, + "time_per_iteration": 4.373206377029419 + }, + { + "auxiliary_loss_clip": 0.01413088, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.2802453, + "balance_loss_mlp": 1.02951217, + "epoch": 0.41761611303171503, + "flos": 19833363980280.0, + "grad_norm": 1.604663667960015, + "language_loss": 0.81054133, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83514547, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.17810059, + "step": 6946, + "time_per_iteration": 2.7141103744506836 + }, + { + "auxiliary_loss_clip": 0.01410208, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.27945924, + "balance_loss_mlp": 1.02565217, + "epoch": 0.417676236284383, + "flos": 15527470499640.0, + "grad_norm": 2.362386261973256, + "language_loss": 0.78493881, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.80945826, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16088867, + "step": 6947, + "time_per_iteration": 4.204957008361816 + }, + { + "auxiliary_loss_clip": 0.014133, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.28175318, + "balance_loss_mlp": 1.02272296, + "epoch": 0.41773635953705096, + "flos": 32530764739680.0, + "grad_norm": 1.663290864403485, + "language_loss": 0.71859038, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.74312478, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.17419434, + "step": 6948, + "time_per_iteration": 2.7828891277313232 + }, + { + "auxiliary_loss_clip": 0.01403626, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.27402449, + "balance_loss_mlp": 1.01748717, + "epoch": 0.4177964827897189, + "flos": 23446232965080.0, + "grad_norm": 1.529919104416907, + "language_loss": 0.77131158, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79569, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.16723633, + "step": 6949, + "time_per_iteration": 2.744319200515747 + }, + { + "auxiliary_loss_clip": 0.01424061, + "auxiliary_loss_mlp": 0.01044136, + "balance_loss_clip": 1.28819728, + "balance_loss_mlp": 1.02713728, + "epoch": 0.4178566060423869, + "flos": 22754264286600.0, + "grad_norm": 1.566277029282225, + "language_loss": 0.81497395, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.83965588, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.17004395, + "step": 6950, + "time_per_iteration": 2.7304863929748535 + }, + { + "auxiliary_loss_clip": 0.01411052, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.28334904, + "balance_loss_mlp": 1.02219629, + "epoch": 0.41791672929505486, + "flos": 26038602036360.0, + "grad_norm": 1.2512863984926486, + "language_loss": 0.76298422, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78747511, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15844727, + "step": 6951, + "time_per_iteration": 4.199250221252441 + }, + { + "auxiliary_loss_clip": 0.01423695, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.28785276, + "balance_loss_mlp": 1.02343655, + "epoch": 0.4179768525477228, + "flos": 19572858394200.0, + "grad_norm": 2.3147019061598186, + "language_loss": 0.72526491, + "learning_rate": 2.61810806829516e-06, + "loss": 0.74990326, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16699219, + "step": 6952, + "time_per_iteration": 2.716721534729004 + }, + { + "auxiliary_loss_clip": 0.01420351, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_clip": 1.28818917, + "balance_loss_mlp": 1.02627945, + "epoch": 0.4180369758003908, + "flos": 17788140883800.0, + "grad_norm": 10.835478445788096, + "language_loss": 0.71755379, + "learning_rate": 2.617737661195593e-06, + "loss": 0.74217451, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.15429688, + "step": 6953, + "time_per_iteration": 2.6910178661346436 + }, + { + "auxiliary_loss_clip": 0.01408388, + "auxiliary_loss_mlp": 0.01044134, + "balance_loss_clip": 1.27993798, + "balance_loss_mlp": 1.02614546, + "epoch": 0.41809709905305875, + "flos": 20965729590360.0, + "grad_norm": 1.8802405423007205, + "language_loss": 0.76833194, + "learning_rate": 2.617367230671353e-06, + "loss": 0.79285717, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.17993164, + "step": 6954, + "time_per_iteration": 2.8704206943511963 + }, + { + "auxiliary_loss_clip": 0.014173, + "auxiliary_loss_mlp": 0.01048997, + "balance_loss_clip": 1.28505063, + "balance_loss_mlp": 1.03133011, + "epoch": 0.4181572223057267, + "flos": 22022678829960.0, + "grad_norm": 2.1576701318577447, + "language_loss": 0.8540234, + "learning_rate": 2.616996776736485e-06, + "loss": 0.87868637, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.17663574, + "step": 6955, + "time_per_iteration": 4.291091442108154 + }, + { + "auxiliary_loss_clip": 0.01415478, + "auxiliary_loss_mlp": 0.01044285, + "balance_loss_clip": 1.28594804, + "balance_loss_mlp": 1.02907419, + "epoch": 0.4182173455583947, + "flos": 26250557005080.0, + "grad_norm": 1.5670350512422382, + "language_loss": 0.83640885, + "learning_rate": 2.616626299405037e-06, + "loss": 0.86100644, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15209961, + "step": 6956, + "time_per_iteration": 2.7603280544281006 + }, + { + "auxiliary_loss_clip": 0.01426235, + "auxiliary_loss_mlp": 0.01051606, + "balance_loss_clip": 1.29098213, + "balance_loss_mlp": 1.03398728, + "epoch": 0.4182774688110627, + "flos": 14795478959400.0, + "grad_norm": 2.1829304106800294, + "language_loss": 0.71875262, + "learning_rate": 2.616255798691059e-06, + "loss": 0.74353105, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.17614746, + "step": 6957, + "time_per_iteration": 2.7978081703186035 + }, + { + "auxiliary_loss_clip": 0.01411538, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.28057432, + "balance_loss_mlp": 1.02824152, + "epoch": 0.41833759206373067, + "flos": 20416837456080.0, + "grad_norm": 1.761287260856311, + "language_loss": 0.75957775, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.78413016, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.15466309, + "step": 6958, + "time_per_iteration": 2.703458309173584 + }, + { + "auxiliary_loss_clip": 0.0141139, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.28147209, + "balance_loss_mlp": 1.02327621, + "epoch": 0.41839771531639863, + "flos": 23661192952440.0, + "grad_norm": 1.6309241083328585, + "language_loss": 0.77557445, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.80009592, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.17486572, + "step": 6959, + "time_per_iteration": 2.73292875289917 + }, + { + "auxiliary_loss_clip": 0.01415756, + "auxiliary_loss_mlp": 0.01044344, + "balance_loss_clip": 1.28445482, + "balance_loss_mlp": 1.02591419, + "epoch": 0.4184578385690666, + "flos": 19758719168640.0, + "grad_norm": 1.6831725186859599, + "language_loss": 0.77278006, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.79738104, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.18432617, + "step": 6960, + "time_per_iteration": 2.73160719871521 + }, + { + "auxiliary_loss_clip": 0.01404712, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.27954662, + "balance_loss_mlp": 1.02292132, + "epoch": 0.41851796182173456, + "flos": 20198222716320.0, + "grad_norm": 1.901910077082338, + "language_loss": 0.75458193, + "learning_rate": 2.614773562290835e-06, + "loss": 0.77900708, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14886475, + "step": 6961, + "time_per_iteration": 2.7824251651763916 + }, + { + "auxiliary_loss_clip": 0.01262684, + "auxiliary_loss_mlp": 0.01013603, + "balance_loss_clip": 1.19313288, + "balance_loss_mlp": 1.00842881, + "epoch": 0.41857808507440253, + "flos": 59034047506920.0, + "grad_norm": 0.777515439576893, + "language_loss": 0.54743481, + "learning_rate": 2.61440294487496e-06, + "loss": 0.57019764, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.05175781, + "step": 6962, + "time_per_iteration": 3.170600414276123 + }, + { + "auxiliary_loss_clip": 0.01425987, + "auxiliary_loss_mlp": 0.01043737, + "balance_loss_clip": 1.29252267, + "balance_loss_mlp": 1.02723813, + "epoch": 0.4186382083270705, + "flos": 18483439447800.0, + "grad_norm": 1.762578786021541, + "language_loss": 0.85730422, + "learning_rate": 2.614032304160864e-06, + "loss": 0.88200146, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.16491699, + "step": 6963, + "time_per_iteration": 2.765151023864746 + }, + { + "auxiliary_loss_clip": 0.01418317, + "auxiliary_loss_mlp": 0.0104053, + "balance_loss_clip": 1.28724408, + "balance_loss_mlp": 1.0235548, + "epoch": 0.41869833157973846, + "flos": 21583621974240.0, + "grad_norm": 1.5876095391141605, + "language_loss": 0.70460755, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72919595, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.16955566, + "step": 6964, + "time_per_iteration": 2.834670066833496 + }, + { + "auxiliary_loss_clip": 0.0141038, + "auxiliary_loss_mlp": 0.01043455, + "balance_loss_clip": 1.28129315, + "balance_loss_mlp": 1.02762437, + "epoch": 0.4187584548324064, + "flos": 35524238831280.0, + "grad_norm": 1.5641697307704137, + "language_loss": 0.71591413, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.74045247, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.1583252, + "step": 6965, + "time_per_iteration": 2.858874797821045 + }, + { + "auxiliary_loss_clip": 0.01412024, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.28453255, + "balance_loss_mlp": 1.02158332, + "epoch": 0.4188185780850744, + "flos": 18659838474360.0, + "grad_norm": 1.5320013888741624, + "language_loss": 0.720586, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74507433, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15228271, + "step": 6966, + "time_per_iteration": 2.7502217292785645 + }, + { + "auxiliary_loss_clip": 0.01425993, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.2908895, + "balance_loss_mlp": 1.02206373, + "epoch": 0.41887870133774235, + "flos": 40340869569000.0, + "grad_norm": 2.0113280934737316, + "language_loss": 0.71560377, + "learning_rate": 2.612549508603375e-06, + "loss": 0.74025029, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.16577148, + "step": 6967, + "time_per_iteration": 2.880234479904175 + }, + { + "auxiliary_loss_clip": 0.01260346, + "auxiliary_loss_mlp": 0.01007219, + "balance_loss_clip": 1.19018507, + "balance_loss_mlp": 1.00195014, + "epoch": 0.4189388245904103, + "flos": 61384411423080.0, + "grad_norm": 0.7219314036933572, + "language_loss": 0.46320486, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48588055, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.05273438, + "step": 6968, + "time_per_iteration": 3.291780471801758 + }, + { + "auxiliary_loss_clip": 0.01417491, + "auxiliary_loss_mlp": 0.01038132, + "balance_loss_clip": 1.28314078, + "balance_loss_mlp": 1.02084696, + "epoch": 0.4189989478430783, + "flos": 28220891639760.0, + "grad_norm": 1.6770547359590244, + "language_loss": 0.74719203, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.7717483, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.17297363, + "step": 6969, + "time_per_iteration": 2.825498104095459 + }, + { + "auxiliary_loss_clip": 0.01406786, + "auxiliary_loss_mlp": 0.01035915, + "balance_loss_clip": 1.2762506, + "balance_loss_mlp": 1.02103782, + "epoch": 0.4190590710957463, + "flos": 24570517511520.0, + "grad_norm": 1.7099964080830723, + "language_loss": 0.80818343, + "learning_rate": 2.611437167992705e-06, + "loss": 0.83261049, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.14868164, + "step": 6970, + "time_per_iteration": 2.748552083969116 + }, + { + "auxiliary_loss_clip": 0.01408063, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.27966833, + "balance_loss_mlp": 1.02374196, + "epoch": 0.41911919434841427, + "flos": 21731084221320.0, + "grad_norm": 1.7130578000584766, + "language_loss": 0.8304683, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.854949, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.16271973, + "step": 6971, + "time_per_iteration": 2.7552812099456787 + }, + { + "auxiliary_loss_clip": 0.01407379, + "auxiliary_loss_mlp": 0.01041585, + "balance_loss_clip": 1.28130877, + "balance_loss_mlp": 1.02443039, + "epoch": 0.41917931760108224, + "flos": 17605853645040.0, + "grad_norm": 1.8056940805224213, + "language_loss": 0.75067383, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.77516341, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.17175293, + "step": 6972, + "time_per_iteration": 2.6987345218658447 + }, + { + "auxiliary_loss_clip": 0.01406521, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.27680051, + "balance_loss_mlp": 1.01937413, + "epoch": 0.4192394408537502, + "flos": 37823916868200.0, + "grad_norm": 1.6773332171613062, + "language_loss": 0.72725439, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75167525, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.1619873, + "step": 6973, + "time_per_iteration": 2.884721517562866 + }, + { + "auxiliary_loss_clip": 0.01426183, + "auxiliary_loss_mlp": 0.01046489, + "balance_loss_clip": 1.29012752, + "balance_loss_mlp": 1.0295372, + "epoch": 0.41929956410641817, + "flos": 23112341426520.0, + "grad_norm": 2.155828862289234, + "language_loss": 0.74835229, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77307904, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.16955566, + "step": 6974, + "time_per_iteration": 2.7120566368103027 + }, + { + "auxiliary_loss_clip": 0.01410028, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.27931857, + "balance_loss_mlp": 1.01786804, + "epoch": 0.41935968735908613, + "flos": 22529152209240.0, + "grad_norm": 5.977510925275606, + "language_loss": 0.73304534, + "learning_rate": 2.609582803447259e-06, + "loss": 0.75749123, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.16711426, + "step": 6975, + "time_per_iteration": 2.7544705867767334 + }, + { + "auxiliary_loss_clip": 0.01412883, + "auxiliary_loss_mlp": 0.01039451, + "balance_loss_clip": 1.28538942, + "balance_loss_mlp": 1.02309513, + "epoch": 0.4194198106117541, + "flos": 26876164977360.0, + "grad_norm": 1.5740875394870404, + "language_loss": 0.81105781, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83558112, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.16333008, + "step": 6976, + "time_per_iteration": 2.8674674034118652 + }, + { + "auxiliary_loss_clip": 0.01411215, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.28009832, + "balance_loss_mlp": 1.01824892, + "epoch": 0.41947993386442206, + "flos": 19907318449800.0, + "grad_norm": 2.099465698130553, + "language_loss": 0.68233132, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.70678669, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.16064453, + "step": 6977, + "time_per_iteration": 2.7228453159332275 + }, + { + "auxiliary_loss_clip": 0.0141624, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.28422344, + "balance_loss_mlp": 1.02256405, + "epoch": 0.41954005711709, + "flos": 17388335331000.0, + "grad_norm": 2.9968796832797304, + "language_loss": 0.80215096, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.82670021, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.16113281, + "step": 6978, + "time_per_iteration": 2.769927740097046 + }, + { + "auxiliary_loss_clip": 0.01418437, + "auxiliary_loss_mlp": 0.01041994, + "balance_loss_clip": 1.28439105, + "balance_loss_mlp": 1.02466106, + "epoch": 0.419600180369758, + "flos": 25007950032840.0, + "grad_norm": 1.695274715545134, + "language_loss": 0.82777071, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.85237491, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.17346191, + "step": 6979, + "time_per_iteration": 2.7628700733184814 + }, + { + "auxiliary_loss_clip": 0.01408602, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.2793597, + "balance_loss_mlp": 1.02019882, + "epoch": 0.41966030362242596, + "flos": 17388213505920.0, + "grad_norm": 2.4484743938475466, + "language_loss": 0.83494383, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.8593924, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16052246, + "step": 6980, + "time_per_iteration": 2.7477641105651855 + }, + { + "auxiliary_loss_clip": 0.01413404, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.27901983, + "balance_loss_mlp": 1.0185672, + "epoch": 0.4197204268750939, + "flos": 22160273245560.0, + "grad_norm": 2.7684935613052843, + "language_loss": 0.79363132, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81810999, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.15917969, + "step": 6981, + "time_per_iteration": 2.7489564418792725 + }, + { + "auxiliary_loss_clip": 0.01398399, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.27184689, + "balance_loss_mlp": 1.02479529, + "epoch": 0.4197805501277619, + "flos": 22088349194040.0, + "grad_norm": 1.660359981983069, + "language_loss": 0.84527886, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86966932, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.15856934, + "step": 6982, + "time_per_iteration": 2.8180127143859863 + }, + { + "auxiliary_loss_clip": 0.01417536, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.28346479, + "balance_loss_mlp": 1.0243026, + "epoch": 0.4198406733804299, + "flos": 26437798463760.0, + "grad_norm": 1.9785475150375296, + "language_loss": 0.57091868, + "learning_rate": 2.606614618903214e-06, + "loss": 0.59550428, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.1673584, + "step": 6983, + "time_per_iteration": 4.168780565261841 + }, + { + "auxiliary_loss_clip": 0.01404604, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.27702737, + "balance_loss_mlp": 1.02373528, + "epoch": 0.4199007966330979, + "flos": 12534808575240.0, + "grad_norm": 1.7671893432573544, + "language_loss": 0.81980056, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84424156, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15759277, + "step": 6984, + "time_per_iteration": 2.818502426147461 + }, + { + "auxiliary_loss_clip": 0.01408898, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.27943635, + "balance_loss_mlp": 1.01888001, + "epoch": 0.41996091988576584, + "flos": 21768061456080.0, + "grad_norm": 1.62959320740322, + "language_loss": 0.79780328, + "learning_rate": 2.605872342456914e-06, + "loss": 0.82224077, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.15979004, + "step": 6985, + "time_per_iteration": 2.7444393634796143 + }, + { + "auxiliary_loss_clip": 0.01416663, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.28068733, + "balance_loss_mlp": 1.02263784, + "epoch": 0.4200210431384338, + "flos": 26547593133960.0, + "grad_norm": 1.6416496294586247, + "language_loss": 0.78298062, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.807549, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.17529297, + "step": 6986, + "time_per_iteration": 4.228528022766113 + }, + { + "auxiliary_loss_clip": 0.01398204, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.272416, + "balance_loss_mlp": 1.02266502, + "epoch": 0.42008116639110177, + "flos": 26801317123920.0, + "grad_norm": 1.487630550362649, + "language_loss": 0.72853339, + "learning_rate": 2.605129974111655e-06, + "loss": 0.75289494, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.15283203, + "step": 6987, + "time_per_iteration": 2.7357277870178223 + }, + { + "auxiliary_loss_clip": 0.01414702, + "auxiliary_loss_mlp": 0.01042585, + "balance_loss_clip": 1.28406024, + "balance_loss_mlp": 1.02676654, + "epoch": 0.42014128964376973, + "flos": 32093657085240.0, + "grad_norm": 1.4159960182476845, + "language_loss": 0.75210398, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77667683, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.15820312, + "step": 6988, + "time_per_iteration": 2.919224739074707 + }, + { + "auxiliary_loss_clip": 0.01409481, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.27686012, + "balance_loss_mlp": 1.02256727, + "epoch": 0.4202014128964377, + "flos": 26472542238720.0, + "grad_norm": 1.6605602064551328, + "language_loss": 0.7458182, + "learning_rate": 2.60438751398004e-06, + "loss": 0.77031124, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.17248535, + "step": 6989, + "time_per_iteration": 2.811525344848633 + }, + { + "auxiliary_loss_clip": 0.01409487, + "auxiliary_loss_mlp": 0.01037308, + "balance_loss_clip": 1.27635229, + "balance_loss_mlp": 1.02016556, + "epoch": 0.42026153614910566, + "flos": 13404394531080.0, + "grad_norm": 4.345595439481536, + "language_loss": 0.71484798, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73931593, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.17138672, + "step": 6990, + "time_per_iteration": 4.236135959625244 + }, + { + "auxiliary_loss_clip": 0.01247327, + "auxiliary_loss_mlp": 0.01007669, + "balance_loss_clip": 1.18113995, + "balance_loss_mlp": 1.0026387, + "epoch": 0.42032165940177363, + "flos": 60264715621320.0, + "grad_norm": 0.8228639267275328, + "language_loss": 0.60421193, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62676191, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.05029297, + "step": 6991, + "time_per_iteration": 3.1480531692504883 + }, + { + "auxiliary_loss_clip": 0.01414209, + "auxiliary_loss_mlp": 0.01047734, + "balance_loss_clip": 1.28241682, + "balance_loss_mlp": 1.03119946, + "epoch": 0.4203817826544416, + "flos": 24540606131400.0, + "grad_norm": 1.6538332825799003, + "language_loss": 0.83273053, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85734993, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.16540527, + "step": 6992, + "time_per_iteration": 2.7669973373413086 + }, + { + "auxiliary_loss_clip": 0.0124302, + "auxiliary_loss_mlp": 0.01004494, + "balance_loss_clip": 1.17790961, + "balance_loss_mlp": 0.99963057, + "epoch": 0.42044190590710956, + "flos": 58833973806480.0, + "grad_norm": 1.338737128753788, + "language_loss": 0.65524429, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67771941, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.04858398, + "step": 6993, + "time_per_iteration": 4.620036840438843 + }, + { + "auxiliary_loss_clip": 0.01418677, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_clip": 1.2821244, + "balance_loss_mlp": 1.02653885, + "epoch": 0.4205020291597775, + "flos": 16440490419480.0, + "grad_norm": 1.7650389258135162, + "language_loss": 0.83608603, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.86071491, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.17663574, + "step": 6994, + "time_per_iteration": 2.7105538845062256 + }, + { + "auxiliary_loss_clip": 0.01401456, + "auxiliary_loss_mlp": 0.0104229, + "balance_loss_clip": 1.27329826, + "balance_loss_mlp": 1.02676845, + "epoch": 0.4205621524124455, + "flos": 18410256537120.0, + "grad_norm": 1.9348593494625261, + "language_loss": 0.77913654, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80357397, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.1550293, + "step": 6995, + "time_per_iteration": 2.6956229209899902 + }, + { + "auxiliary_loss_clip": 0.01394209, + "auxiliary_loss_mlp": 0.01039931, + "balance_loss_clip": 1.26880944, + "balance_loss_mlp": 1.02443385, + "epoch": 0.4206222756651135, + "flos": 25525590711120.0, + "grad_norm": 1.4360900031861739, + "language_loss": 0.79834497, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82268643, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.1550293, + "step": 6996, + "time_per_iteration": 2.828824043273926 + }, + { + "auxiliary_loss_clip": 0.01406983, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.27632928, + "balance_loss_mlp": 1.02023077, + "epoch": 0.4206823989177815, + "flos": 15308530893000.0, + "grad_norm": 1.7148153914260669, + "language_loss": 0.75633246, + "learning_rate": 2.601416757842559e-06, + "loss": 0.78076786, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.16314697, + "step": 6997, + "time_per_iteration": 2.7470314502716064 + }, + { + "auxiliary_loss_clip": 0.01408191, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_clip": 1.27565432, + "balance_loss_mlp": 1.02956605, + "epoch": 0.42074252217044944, + "flos": 15557584921560.0, + "grad_norm": 1.731957658373136, + "language_loss": 0.75828159, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78282952, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.17041016, + "step": 6998, + "time_per_iteration": 2.899986743927002 + }, + { + "auxiliary_loss_clip": 0.01413448, + "auxiliary_loss_mlp": 0.01046365, + "balance_loss_clip": 1.27893996, + "balance_loss_mlp": 1.02915096, + "epoch": 0.4208026454231174, + "flos": 26152173284040.0, + "grad_norm": 1.7987296092494331, + "language_loss": 0.76298946, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78758752, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.17211914, + "step": 6999, + "time_per_iteration": 2.806117534637451 + }, + { + "auxiliary_loss_clip": 0.01404952, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.27520275, + "balance_loss_mlp": 1.02936912, + "epoch": 0.42086276867578537, + "flos": 23555377901520.0, + "grad_norm": 1.7303073982107526, + "language_loss": 0.64152181, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66603661, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.17150879, + "step": 7000, + "time_per_iteration": 2.7563014030456543 + }, + { + "auxiliary_loss_clip": 0.01412046, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.27985716, + "balance_loss_mlp": 1.03389645, + "epoch": 0.42092289192845334, + "flos": 18117930978000.0, + "grad_norm": 1.4368889190132494, + "language_loss": 0.76924706, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.79386675, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.16015625, + "step": 7001, + "time_per_iteration": 2.756190061569214 + }, + { + "auxiliary_loss_clip": 0.01405194, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.27652872, + "balance_loss_mlp": 1.03100109, + "epoch": 0.4209830151811213, + "flos": 20010818824200.0, + "grad_norm": 1.437081170369356, + "language_loss": 0.8674159, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.89193201, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15405273, + "step": 7002, + "time_per_iteration": 2.7382588386535645 + }, + { + "auxiliary_loss_clip": 0.01406369, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_clip": 1.27665663, + "balance_loss_mlp": 1.03252888, + "epoch": 0.42104313843378927, + "flos": 21983711785560.0, + "grad_norm": 2.818472218060605, + "language_loss": 0.68002701, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.70457137, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.15551758, + "step": 7003, + "time_per_iteration": 2.816880941390991 + }, + { + "auxiliary_loss_clip": 0.01410306, + "auxiliary_loss_mlp": 0.01046197, + "balance_loss_clip": 1.27739954, + "balance_loss_mlp": 1.02913785, + "epoch": 0.42110326168645723, + "flos": 25449037306560.0, + "grad_norm": 2.896044810931973, + "language_loss": 0.77189398, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79645896, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.1706543, + "step": 7004, + "time_per_iteration": 2.848191499710083 + }, + { + "auxiliary_loss_clip": 0.01396854, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_clip": 1.27061224, + "balance_loss_mlp": 1.03217769, + "epoch": 0.4211633849391252, + "flos": 17827189144920.0, + "grad_norm": 1.6143005989388906, + "language_loss": 0.68418562, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70863867, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.16271973, + "step": 7005, + "time_per_iteration": 2.7555720806121826 + }, + { + "auxiliary_loss_clip": 0.01412353, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_clip": 1.28017247, + "balance_loss_mlp": 1.03421652, + "epoch": 0.42122350819179316, + "flos": 16285840492680.0, + "grad_norm": 1.900284152492379, + "language_loss": 0.72876108, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.75339663, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.1697998, + "step": 7006, + "time_per_iteration": 2.691387176513672 + }, + { + "auxiliary_loss_clip": 0.0141128, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_clip": 1.27854812, + "balance_loss_mlp": 1.0321281, + "epoch": 0.4212836314444611, + "flos": 19650589441200.0, + "grad_norm": 1.6409713127477434, + "language_loss": 0.709638, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.73423284, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.16088867, + "step": 7007, + "time_per_iteration": 2.6876626014709473 + }, + { + "auxiliary_loss_clip": 0.01412998, + "auxiliary_loss_mlp": 0.01043366, + "balance_loss_clip": 1.28109634, + "balance_loss_mlp": 1.02718902, + "epoch": 0.4213437546971291, + "flos": 18373604169240.0, + "grad_norm": 1.7304635748762798, + "language_loss": 0.82597899, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.85054266, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.16174316, + "step": 7008, + "time_per_iteration": 2.687072277069092 + }, + { + "auxiliary_loss_clip": 0.01413943, + "auxiliary_loss_mlp": 0.01049859, + "balance_loss_clip": 1.28221917, + "balance_loss_mlp": 1.03368258, + "epoch": 0.42140387794979706, + "flos": 27709667082360.0, + "grad_norm": 1.686781390716673, + "language_loss": 0.72318584, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74782389, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.16174316, + "step": 7009, + "time_per_iteration": 2.8365514278411865 + }, + { + "auxiliary_loss_clip": 0.01417006, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.28367949, + "balance_loss_mlp": 1.02297175, + "epoch": 0.4214640012024651, + "flos": 28152906599160.0, + "grad_norm": 2.4565893945826223, + "language_loss": 0.6604259, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68499482, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.16900635, + "step": 7010, + "time_per_iteration": 2.797274112701416 + }, + { + "auxiliary_loss_clip": 0.01405429, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.27549267, + "balance_loss_mlp": 1.02359426, + "epoch": 0.42152412445513304, + "flos": 23001734589120.0, + "grad_norm": 1.6732346560161138, + "language_loss": 0.72592843, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.7503798, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.16101074, + "step": 7011, + "time_per_iteration": 2.7107372283935547 + }, + { + "auxiliary_loss_clip": 0.01234705, + "auxiliary_loss_mlp": 0.0101688, + "balance_loss_clip": 1.17230654, + "balance_loss_mlp": 1.01235032, + "epoch": 0.421584247707801, + "flos": 63763850925000.0, + "grad_norm": 0.7911227877256751, + "language_loss": 0.54429924, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56681514, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.04541016, + "step": 7012, + "time_per_iteration": 3.1202731132507324 + }, + { + "auxiliary_loss_clip": 0.01415927, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.28421259, + "balance_loss_mlp": 1.01880169, + "epoch": 0.421644370960469, + "flos": 24319757931840.0, + "grad_norm": 1.2747837417965726, + "language_loss": 0.78874314, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.81326443, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.17407227, + "step": 7013, + "time_per_iteration": 2.7462871074676514 + }, + { + "auxiliary_loss_clip": 0.01413272, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.28001237, + "balance_loss_mlp": 1.02405787, + "epoch": 0.42170449421313694, + "flos": 23445948706560.0, + "grad_norm": 1.9690793991079178, + "language_loss": 0.81190985, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83645141, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.16821289, + "step": 7014, + "time_per_iteration": 2.7414016723632812 + }, + { + "auxiliary_loss_clip": 0.01412399, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.28217232, + "balance_loss_mlp": 1.0206821, + "epoch": 0.4217646174658049, + "flos": 23700687905520.0, + "grad_norm": 1.5650268994814165, + "language_loss": 0.78154469, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.80603975, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.16430664, + "step": 7015, + "time_per_iteration": 2.8021202087402344 + }, + { + "auxiliary_loss_clip": 0.01418623, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.28630495, + "balance_loss_mlp": 1.02764845, + "epoch": 0.42182474071847287, + "flos": 24976820401920.0, + "grad_norm": 1.5260032699474055, + "language_loss": 0.82657814, + "learning_rate": 2.594355375584368e-06, + "loss": 0.85121179, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.17077637, + "step": 7016, + "time_per_iteration": 2.784822463989258 + }, + { + "auxiliary_loss_clip": 0.01407749, + "auxiliary_loss_mlp": 0.01039777, + "balance_loss_clip": 1.27519369, + "balance_loss_mlp": 1.02243233, + "epoch": 0.42188486397114083, + "flos": 22861784888640.0, + "grad_norm": 1.8154294767275034, + "language_loss": 0.67836952, + "learning_rate": 2.593983497660586e-06, + "loss": 0.7028448, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.17346191, + "step": 7017, + "time_per_iteration": 2.8220114707946777 + }, + { + "auxiliary_loss_clip": 0.01218604, + "auxiliary_loss_mlp": 0.01004951, + "balance_loss_clip": 1.15888715, + "balance_loss_mlp": 1.00039697, + "epoch": 0.4219449872238088, + "flos": 66992386142160.0, + "grad_norm": 0.6872904143404922, + "language_loss": 0.59479874, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61703432, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.0456543, + "step": 7018, + "time_per_iteration": 3.316014528274536 + }, + { + "auxiliary_loss_clip": 0.01417452, + "auxiliary_loss_mlp": 0.01040171, + "balance_loss_clip": 1.28423476, + "balance_loss_mlp": 1.02352929, + "epoch": 0.42200511047647676, + "flos": 13119297260040.0, + "grad_norm": 1.8644887102234504, + "language_loss": 0.75603068, + "learning_rate": 2.593239674255382e-06, + "loss": 0.78060687, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.16625977, + "step": 7019, + "time_per_iteration": 2.725503444671631 + }, + { + "auxiliary_loss_clip": 0.01412531, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_clip": 1.28134942, + "balance_loss_mlp": 1.02446473, + "epoch": 0.42206523372914473, + "flos": 13995258728400.0, + "grad_norm": 2.371713977564443, + "language_loss": 0.69275641, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71730071, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.17431641, + "step": 7020, + "time_per_iteration": 2.774770498275757 + }, + { + "auxiliary_loss_clip": 0.01397905, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.2722795, + "balance_loss_mlp": 1.02407634, + "epoch": 0.4221253569818127, + "flos": 21947059417680.0, + "grad_norm": 1.9282003651245077, + "language_loss": 0.81175363, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83612478, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.15136719, + "step": 7021, + "time_per_iteration": 2.756424903869629 + }, + { + "auxiliary_loss_clip": 0.01408517, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.27713728, + "balance_loss_mlp": 1.02108216, + "epoch": 0.42218548023448066, + "flos": 32198375710440.0, + "grad_norm": 1.5605251841958034, + "language_loss": 0.69762468, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72208238, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.16174316, + "step": 7022, + "time_per_iteration": 4.26214075088501 + }, + { + "auxiliary_loss_clip": 0.01397169, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.27371097, + "balance_loss_mlp": 1.01856911, + "epoch": 0.4222456034871487, + "flos": 30125271651840.0, + "grad_norm": 2.2401524157964543, + "language_loss": 0.67489457, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69920248, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.15063477, + "step": 7023, + "time_per_iteration": 2.787181854248047 + }, + { + "auxiliary_loss_clip": 0.01397943, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.2722187, + "balance_loss_mlp": 1.03100908, + "epoch": 0.42230572673981664, + "flos": 22133082625560.0, + "grad_norm": 1.5212489028199458, + "language_loss": 0.69538248, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71984035, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.16821289, + "step": 7024, + "time_per_iteration": 2.7468879222869873 + }, + { + "auxiliary_loss_clip": 0.01407868, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.27784812, + "balance_loss_mlp": 1.02367902, + "epoch": 0.4223658499924846, + "flos": 22060265190120.0, + "grad_norm": 1.652671092203909, + "language_loss": 0.77040005, + "learning_rate": 2.591007664594147e-06, + "loss": 0.79487479, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.15917969, + "step": 7025, + "time_per_iteration": 4.226133108139038 + }, + { + "auxiliary_loss_clip": 0.01397206, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.26977062, + "balance_loss_mlp": 1.02459371, + "epoch": 0.4224259732451526, + "flos": 20415375555120.0, + "grad_norm": 1.572616393463301, + "language_loss": 0.79735172, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.82172573, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15588379, + "step": 7026, + "time_per_iteration": 2.7251837253570557 + }, + { + "auxiliary_loss_clip": 0.01210817, + "auxiliary_loss_mlp": 0.01002343, + "balance_loss_clip": 1.15183663, + "balance_loss_mlp": 0.99850416, + "epoch": 0.42248609649782054, + "flos": 62861998304160.0, + "grad_norm": 0.7379635609686884, + "language_loss": 0.62027705, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64240861, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.03833008, + "step": 7027, + "time_per_iteration": 3.3118550777435303 + }, + { + "auxiliary_loss_clip": 0.01394664, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.26758111, + "balance_loss_mlp": 1.02314329, + "epoch": 0.4225462197504885, + "flos": 26255551833360.0, + "grad_norm": 1.993458538840501, + "language_loss": 0.71428752, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73862535, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15966797, + "step": 7028, + "time_per_iteration": 4.256757020950317 + }, + { + "auxiliary_loss_clip": 0.01408554, + "auxiliary_loss_mlp": 0.01040376, + "balance_loss_clip": 1.2782526, + "balance_loss_mlp": 1.02329326, + "epoch": 0.42260634300315647, + "flos": 20526835168080.0, + "grad_norm": 1.8176380364472218, + "language_loss": 0.82511955, + "learning_rate": 2.589519209743846e-06, + "loss": 0.8496089, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.1706543, + "step": 7029, + "time_per_iteration": 2.7402937412261963 + }, + { + "auxiliary_loss_clip": 0.01411633, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.27803612, + "balance_loss_mlp": 1.02678275, + "epoch": 0.42266646625582444, + "flos": 24322153825080.0, + "grad_norm": 1.867105154769707, + "language_loss": 0.75369889, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77825296, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.17004395, + "step": 7030, + "time_per_iteration": 2.7427163124084473 + }, + { + "auxiliary_loss_clip": 0.01402522, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.27297759, + "balance_loss_mlp": 1.02093065, + "epoch": 0.4227265895084924, + "flos": 24209151094440.0, + "grad_norm": 1.904884184119981, + "language_loss": 0.86897755, + "learning_rate": 2.588774848134486e-06, + "loss": 0.89338613, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.17407227, + "step": 7031, + "time_per_iteration": 2.732412099838257 + }, + { + "auxiliary_loss_clip": 0.01408027, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.27830791, + "balance_loss_mlp": 1.02291703, + "epoch": 0.42278671276116037, + "flos": 16914169225080.0, + "grad_norm": 1.8726033112065128, + "language_loss": 0.73594934, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.76042974, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.17102051, + "step": 7032, + "time_per_iteration": 4.2562150955200195 + }, + { + "auxiliary_loss_clip": 0.01406779, + "auxiliary_loss_mlp": 0.01043372, + "balance_loss_clip": 1.27497315, + "balance_loss_mlp": 1.02766013, + "epoch": 0.42284683601382833, + "flos": 25416608208120.0, + "grad_norm": 1.627326103362318, + "language_loss": 0.70419472, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72869623, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 1.31738281, + "router_z_loss_mlp": 0.15710449, + "step": 7033, + "time_per_iteration": 2.905740261077881 + }, + { + "auxiliary_loss_clip": 0.01405916, + "auxiliary_loss_mlp": 0.01042192, + "balance_loss_clip": 1.27495742, + "balance_loss_mlp": 1.02534807, + "epoch": 0.4229069592664963, + "flos": 23045655853440.0, + "grad_norm": 1.6796646581124157, + "language_loss": 0.90844232, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.93292344, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16845703, + "step": 7034, + "time_per_iteration": 2.7505853176116943 + }, + { + "auxiliary_loss_clip": 0.01395848, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.26787698, + "balance_loss_mlp": 1.02752256, + "epoch": 0.42296708251916426, + "flos": 26073020944440.0, + "grad_norm": 1.7683314513830062, + "language_loss": 0.7735039, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79789013, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15246582, + "step": 7035, + "time_per_iteration": 2.75508451461792 + }, + { + "auxiliary_loss_clip": 0.01409396, + "auxiliary_loss_mlp": 0.01046214, + "balance_loss_clip": 1.27869606, + "balance_loss_mlp": 1.02995431, + "epoch": 0.4230272057718323, + "flos": 19462413990240.0, + "grad_norm": 1.7300874842410412, + "language_loss": 0.8290742, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.8536303, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.16259766, + "step": 7036, + "time_per_iteration": 2.741447687149048 + }, + { + "auxiliary_loss_clip": 0.01401115, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.27483416, + "balance_loss_mlp": 1.02545965, + "epoch": 0.42308732902450025, + "flos": 22388512166640.0, + "grad_norm": 1.7910715365500338, + "language_loss": 0.70089334, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72531718, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.15808105, + "step": 7037, + "time_per_iteration": 2.7231757640838623 + }, + { + "auxiliary_loss_clip": 0.01402481, + "auxiliary_loss_mlp": 0.01038345, + "balance_loss_clip": 1.27329898, + "balance_loss_mlp": 1.02226341, + "epoch": 0.4231474522771682, + "flos": 21000270323520.0, + "grad_norm": 1.556500535471191, + "language_loss": 0.78014857, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80455685, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.1607666, + "step": 7038, + "time_per_iteration": 2.7775697708129883 + }, + { + "auxiliary_loss_clip": 0.01413454, + "auxiliary_loss_mlp": 0.01045517, + "balance_loss_clip": 1.2795825, + "balance_loss_mlp": 1.02818453, + "epoch": 0.4232075755298362, + "flos": 14979999657960.0, + "grad_norm": 2.118355862876419, + "language_loss": 0.66383684, + "learning_rate": 2.585796509770259e-06, + "loss": 0.68842655, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.17321777, + "step": 7039, + "time_per_iteration": 2.7507457733154297 + }, + { + "auxiliary_loss_clip": 0.01420959, + "auxiliary_loss_mlp": 0.01040645, + "balance_loss_clip": 1.28588367, + "balance_loss_mlp": 1.02397966, + "epoch": 0.42326769878250414, + "flos": 24537722937840.0, + "grad_norm": 1.7607914471068926, + "language_loss": 0.75860888, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78322494, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.16662598, + "step": 7040, + "time_per_iteration": 2.891173839569092 + }, + { + "auxiliary_loss_clip": 0.01407677, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.27598739, + "balance_loss_mlp": 1.02070272, + "epoch": 0.4233278220351721, + "flos": 26876002543920.0, + "grad_norm": 1.819522448803563, + "language_loss": 0.65841985, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.68286967, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.16625977, + "step": 7041, + "time_per_iteration": 2.805903434753418 + }, + { + "auxiliary_loss_clip": 0.01414831, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.28274202, + "balance_loss_mlp": 1.01928663, + "epoch": 0.4233879452878401, + "flos": 42822753627960.0, + "grad_norm": 1.8389824259679166, + "language_loss": 0.73540473, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75991386, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.16796875, + "step": 7042, + "time_per_iteration": 2.9599530696868896 + }, + { + "auxiliary_loss_clip": 0.01403134, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.2756083, + "balance_loss_mlp": 1.02010322, + "epoch": 0.42344806854050804, + "flos": 25234564619520.0, + "grad_norm": 1.2485965776659054, + "language_loss": 0.82349211, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84787774, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15319824, + "step": 7043, + "time_per_iteration": 2.804316759109497 + }, + { + "auxiliary_loss_clip": 0.01408253, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.27969384, + "balance_loss_mlp": 1.03027558, + "epoch": 0.423508191793176, + "flos": 22783322891160.0, + "grad_norm": 2.1692057782077487, + "language_loss": 0.65566778, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.68024385, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.1907959, + "step": 7044, + "time_per_iteration": 2.6993067264556885 + }, + { + "auxiliary_loss_clip": 0.01416604, + "auxiliary_loss_mlp": 0.01047471, + "balance_loss_clip": 1.28300285, + "balance_loss_mlp": 1.03012621, + "epoch": 0.42356831504584397, + "flos": 34643648009880.0, + "grad_norm": 1.6234666457918105, + "language_loss": 0.7554723, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.78011304, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.17346191, + "step": 7045, + "time_per_iteration": 2.868743419647217 + }, + { + "auxiliary_loss_clip": 0.01401184, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.27357078, + "balance_loss_mlp": 1.02580309, + "epoch": 0.42362843829851193, + "flos": 17600452733160.0, + "grad_norm": 2.4284992541573045, + "language_loss": 0.80908728, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.83351934, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.16223145, + "step": 7046, + "time_per_iteration": 2.7046549320220947 + }, + { + "auxiliary_loss_clip": 0.01409249, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.27707982, + "balance_loss_mlp": 1.02472019, + "epoch": 0.4236885615511799, + "flos": 22570921230480.0, + "grad_norm": 1.5513734601493399, + "language_loss": 0.76970661, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.79421455, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.16833496, + "step": 7047, + "time_per_iteration": 2.7514219284057617 + }, + { + "auxiliary_loss_clip": 0.01398599, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.27231765, + "balance_loss_mlp": 1.02558804, + "epoch": 0.42374868480384786, + "flos": 26474897523600.0, + "grad_norm": 1.6605579507894392, + "language_loss": 0.68516147, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70956796, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.16467285, + "step": 7048, + "time_per_iteration": 2.7731008529663086 + }, + { + "auxiliary_loss_clip": 0.01407329, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_clip": 1.27673447, + "balance_loss_mlp": 1.02803636, + "epoch": 0.4238088080565159, + "flos": 20374337484360.0, + "grad_norm": 1.7172897840700065, + "language_loss": 0.78279525, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80732751, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.1784668, + "step": 7049, + "time_per_iteration": 2.722862720489502 + }, + { + "auxiliary_loss_clip": 0.01414424, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.28188825, + "balance_loss_mlp": 1.03102636, + "epoch": 0.42386893130918385, + "flos": 21176222658120.0, + "grad_norm": 1.984858385757617, + "language_loss": 0.83024514, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.85487062, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.17102051, + "step": 7050, + "time_per_iteration": 2.695549249649048 + }, + { + "auxiliary_loss_clip": 0.01410661, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.27991796, + "balance_loss_mlp": 1.02352881, + "epoch": 0.4239290545618518, + "flos": 17680092373080.0, + "grad_norm": 2.058566247160523, + "language_loss": 0.73465598, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75915635, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15856934, + "step": 7051, + "time_per_iteration": 2.73146915435791 + }, + { + "auxiliary_loss_clip": 0.0141004, + "auxiliary_loss_mlp": 0.01041179, + "balance_loss_clip": 1.28111362, + "balance_loss_mlp": 1.02476358, + "epoch": 0.4239891778145198, + "flos": 24319636106760.0, + "grad_norm": 1.4099238636862381, + "language_loss": 0.86447442, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88898659, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.16418457, + "step": 7052, + "time_per_iteration": 2.7731211185455322 + }, + { + "auxiliary_loss_clip": 0.01407874, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.27688491, + "balance_loss_mlp": 1.03108215, + "epoch": 0.42404930106718774, + "flos": 20563325102520.0, + "grad_norm": 1.4456260368092002, + "language_loss": 0.72712612, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.75168085, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.16516113, + "step": 7053, + "time_per_iteration": 2.815948963165283 + }, + { + "auxiliary_loss_clip": 0.01411813, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.28222954, + "balance_loss_mlp": 1.01976502, + "epoch": 0.4241094243198557, + "flos": 22312973971080.0, + "grad_norm": 1.7645267548596382, + "language_loss": 0.82588816, + "learning_rate": 2.580208299200704e-06, + "loss": 0.85037339, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.16943359, + "step": 7054, + "time_per_iteration": 2.771620035171509 + }, + { + "auxiliary_loss_clip": 0.01210396, + "auxiliary_loss_mlp": 0.01013797, + "balance_loss_clip": 1.15412498, + "balance_loss_mlp": 1.0101012, + "epoch": 0.4241695475725237, + "flos": 70628060044080.0, + "grad_norm": 0.7830318437229995, + "language_loss": 0.60436159, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62660354, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.03686523, + "step": 7055, + "time_per_iteration": 3.194326877593994 + }, + { + "auxiliary_loss_clip": 0.01414706, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.28240347, + "balance_loss_mlp": 1.02603078, + "epoch": 0.42422967082519164, + "flos": 14031505012680.0, + "grad_norm": 2.3385694298831865, + "language_loss": 0.77214682, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.7967304, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.17626953, + "step": 7056, + "time_per_iteration": 2.7316231727600098 + }, + { + "auxiliary_loss_clip": 0.01414532, + "auxiliary_loss_mlp": 0.01046126, + "balance_loss_clip": 1.27970862, + "balance_loss_mlp": 1.02764893, + "epoch": 0.4242897940778596, + "flos": 22350560331240.0, + "grad_norm": 2.2470515897761802, + "language_loss": 0.84565967, + "learning_rate": 2.579090061518714e-06, + "loss": 0.87026632, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.18469238, + "step": 7057, + "time_per_iteration": 2.7220218181610107 + }, + { + "auxiliary_loss_clip": 0.01410936, + "auxiliary_loss_mlp": 0.01043825, + "balance_loss_clip": 1.27783096, + "balance_loss_mlp": 1.0261941, + "epoch": 0.42434991733052757, + "flos": 22600264093560.0, + "grad_norm": 2.8083845830058687, + "language_loss": 0.82698166, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.8515293, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.17626953, + "step": 7058, + "time_per_iteration": 2.732222080230713 + }, + { + "auxiliary_loss_clip": 0.01397832, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.27216887, + "balance_loss_mlp": 1.01506674, + "epoch": 0.42441004058319554, + "flos": 20016300952800.0, + "grad_norm": 1.6600823083676575, + "language_loss": 0.80549657, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82977998, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15429688, + "step": 7059, + "time_per_iteration": 2.7227907180786133 + }, + { + "auxiliary_loss_clip": 0.01412457, + "auxiliary_loss_mlp": 0.01043569, + "balance_loss_clip": 1.27899587, + "balance_loss_mlp": 1.02553248, + "epoch": 0.4244701638358635, + "flos": 11148637758480.0, + "grad_norm": 2.0057213550056128, + "language_loss": 0.70424104, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72880125, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.18041992, + "step": 7060, + "time_per_iteration": 4.121862888336182 + }, + { + "auxiliary_loss_clip": 0.01407057, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.27639174, + "balance_loss_mlp": 1.01914632, + "epoch": 0.42453028708853147, + "flos": 23993054073000.0, + "grad_norm": 1.5873329692662617, + "language_loss": 0.76166689, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78609288, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.16381836, + "step": 7061, + "time_per_iteration": 2.82008957862854 + }, + { + "auxiliary_loss_clip": 0.01407043, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.27371657, + "balance_loss_mlp": 1.01973379, + "epoch": 0.42459041034119943, + "flos": 18411028095960.0, + "grad_norm": 2.12697841167694, + "language_loss": 0.72975528, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75420368, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 1.33203125, + "router_z_loss_mlp": 0.18054199, + "step": 7062, + "time_per_iteration": 2.7141077518463135 + }, + { + "auxiliary_loss_clip": 0.01402911, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_clip": 1.27122378, + "balance_loss_mlp": 1.02549338, + "epoch": 0.42465053359386745, + "flos": 20962724571720.0, + "grad_norm": 1.7261070132687382, + "language_loss": 0.66475236, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68921131, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.17468262, + "step": 7063, + "time_per_iteration": 4.177645206451416 + }, + { + "auxiliary_loss_clip": 0.01390869, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.2638278, + "balance_loss_mlp": 1.01899838, + "epoch": 0.4247106568465354, + "flos": 33112289014200.0, + "grad_norm": 1.4847428684410415, + "language_loss": 0.78828466, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.81253815, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15478516, + "step": 7064, + "time_per_iteration": 2.830671787261963 + }, + { + "auxiliary_loss_clip": 0.01409627, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.27726007, + "balance_loss_mlp": 1.02109194, + "epoch": 0.4247707800992034, + "flos": 20051653853160.0, + "grad_norm": 2.4682674131018123, + "language_loss": 0.75753146, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.78200781, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.16906738, + "step": 7065, + "time_per_iteration": 2.717289447784424 + }, + { + "auxiliary_loss_clip": 0.01402682, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.27331853, + "balance_loss_mlp": 1.02277803, + "epoch": 0.42483090335187135, + "flos": 22390583193000.0, + "grad_norm": 1.360062664250897, + "language_loss": 0.72377241, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74819577, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16870117, + "step": 7066, + "time_per_iteration": 2.8637115955352783 + }, + { + "auxiliary_loss_clip": 0.01406957, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.27365625, + "balance_loss_mlp": 1.02075183, + "epoch": 0.4248910266045393, + "flos": 21361271265360.0, + "grad_norm": 4.549480769801183, + "language_loss": 0.80744648, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.83190727, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.18359375, + "step": 7067, + "time_per_iteration": 4.225301265716553 + }, + { + "auxiliary_loss_clip": 0.01208576, + "auxiliary_loss_mlp": 0.01002542, + "balance_loss_clip": 1.15255034, + "balance_loss_mlp": 0.9987753, + "epoch": 0.4249511498572073, + "flos": 64022813393400.0, + "grad_norm": 0.9150959357215889, + "language_loss": 0.63509184, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65720302, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.03759766, + "step": 7068, + "time_per_iteration": 3.1427712440490723 + }, + { + "auxiliary_loss_clip": 0.01408174, + "auxiliary_loss_mlp": 0.01043407, + "balance_loss_clip": 1.27540588, + "balance_loss_mlp": 1.02495313, + "epoch": 0.42501127310987524, + "flos": 19611459963360.0, + "grad_norm": 1.6303188933319201, + "language_loss": 0.72642297, + "learning_rate": 2.574615138284361e-06, + "loss": 0.75093877, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.18444824, + "step": 7069, + "time_per_iteration": 2.7113053798675537 + }, + { + "auxiliary_loss_clip": 0.01406934, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.27438068, + "balance_loss_mlp": 1.01791024, + "epoch": 0.4250713963625432, + "flos": 19466880909840.0, + "grad_norm": 1.8004768931224449, + "language_loss": 0.79462886, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81905222, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.17480469, + "step": 7070, + "time_per_iteration": 2.68817400932312 + }, + { + "auxiliary_loss_clip": 0.0140835, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.27650249, + "balance_loss_mlp": 1.01957023, + "epoch": 0.4251315196152112, + "flos": 25343181647280.0, + "grad_norm": 1.673017888311881, + "language_loss": 0.70524251, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72970086, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.17907715, + "step": 7071, + "time_per_iteration": 2.783601760864258 + }, + { + "auxiliary_loss_clip": 0.01398434, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.26780677, + "balance_loss_mlp": 1.01605225, + "epoch": 0.42519164286787914, + "flos": 26364777986520.0, + "grad_norm": 3.6892095261934386, + "language_loss": 0.71419573, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73851162, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.17114258, + "step": 7072, + "time_per_iteration": 4.258885383605957 + }, + { + "auxiliary_loss_clip": 0.01410991, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.27632689, + "balance_loss_mlp": 1.02017355, + "epoch": 0.4252517661205471, + "flos": 26036652835080.0, + "grad_norm": 1.8536734017421745, + "language_loss": 0.81783462, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.84230942, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.16308594, + "step": 7073, + "time_per_iteration": 2.777663230895996 + }, + { + "auxiliary_loss_clip": 0.01401573, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.27201366, + "balance_loss_mlp": 1.01775622, + "epoch": 0.42531188937321507, + "flos": 12717461289240.0, + "grad_norm": 7.0045750085852285, + "language_loss": 0.91530907, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.93966013, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15783691, + "step": 7074, + "time_per_iteration": 2.7212107181549072 + }, + { + "auxiliary_loss_clip": 0.01411695, + "auxiliary_loss_mlp": 0.01039923, + "balance_loss_clip": 1.27689004, + "balance_loss_mlp": 1.022614, + "epoch": 0.42537201262588303, + "flos": 22096876949640.0, + "grad_norm": 2.2064108783568046, + "language_loss": 0.64688766, + "learning_rate": 2.572376498508805e-06, + "loss": 0.67140377, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.17297363, + "step": 7075, + "time_per_iteration": 2.714318037033081 + }, + { + "auxiliary_loss_clip": 0.0139785, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.27073598, + "balance_loss_mlp": 1.01864219, + "epoch": 0.42543213587855105, + "flos": 23008353751800.0, + "grad_norm": 1.645341846572921, + "language_loss": 0.74439746, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.76871687, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15441895, + "step": 7076, + "time_per_iteration": 2.899495840072632 + }, + { + "auxiliary_loss_clip": 0.01414093, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.28041339, + "balance_loss_mlp": 1.02684951, + "epoch": 0.425492259131219, + "flos": 25087914539640.0, + "grad_norm": 1.9345557256933985, + "language_loss": 0.78648221, + "learning_rate": 2.571630111462766e-06, + "loss": 0.81105793, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.16638184, + "step": 7077, + "time_per_iteration": 2.7816009521484375 + }, + { + "auxiliary_loss_clip": 0.01391446, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.26613557, + "balance_loss_mlp": 1.02619505, + "epoch": 0.425552382383887, + "flos": 22821558985080.0, + "grad_norm": 1.557946764306925, + "language_loss": 0.73500502, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75933123, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.14984131, + "step": 7078, + "time_per_iteration": 2.8443684577941895 + }, + { + "auxiliary_loss_clip": 0.01397579, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.2705456, + "balance_loss_mlp": 1.02998543, + "epoch": 0.42561250563655495, + "flos": 13557623165280.0, + "grad_norm": 2.188671629041453, + "language_loss": 0.80214155, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.82657039, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15325928, + "step": 7079, + "time_per_iteration": 2.720534324645996 + }, + { + "auxiliary_loss_clip": 0.01402483, + "auxiliary_loss_mlp": 0.01042149, + "balance_loss_clip": 1.27488756, + "balance_loss_mlp": 1.02743268, + "epoch": 0.4256726288892229, + "flos": 46987194898800.0, + "grad_norm": 1.5604097300533397, + "language_loss": 0.72206181, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74650812, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.14715576, + "step": 7080, + "time_per_iteration": 2.994908571243286 + }, + { + "auxiliary_loss_clip": 0.0139824, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.26865518, + "balance_loss_mlp": 1.02453923, + "epoch": 0.4257327521418909, + "flos": 23591705402520.0, + "grad_norm": 2.1135987834855317, + "language_loss": 0.8019743, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.8263582, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.15588379, + "step": 7081, + "time_per_iteration": 2.7444052696228027 + }, + { + "auxiliary_loss_clip": 0.0139436, + "auxiliary_loss_mlp": 0.01042087, + "balance_loss_clip": 1.2697407, + "balance_loss_mlp": 1.02756751, + "epoch": 0.42579287539455885, + "flos": 18994745221920.0, + "grad_norm": 1.6034333444021998, + "language_loss": 0.82026428, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.84462875, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.14526367, + "step": 7082, + "time_per_iteration": 2.7775352001190186 + }, + { + "auxiliary_loss_clip": 0.01406554, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.2775712, + "balance_loss_mlp": 1.02968347, + "epoch": 0.4258529986472268, + "flos": 25197384342960.0, + "grad_norm": 1.8532132459258157, + "language_loss": 0.69845724, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72296888, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.14916992, + "step": 7083, + "time_per_iteration": 2.7842721939086914 + }, + { + "auxiliary_loss_clip": 0.01219844, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.16434193, + "balance_loss_mlp": 1.03475654, + "epoch": 0.4259131218998948, + "flos": 69985454150160.0, + "grad_norm": 0.8813167825960524, + "language_loss": 0.67153966, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69411683, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.03125, + "step": 7084, + "time_per_iteration": 3.329749584197998 + }, + { + "auxiliary_loss_clip": 0.01399327, + "auxiliary_loss_mlp": 0.01050628, + "balance_loss_clip": 1.27034688, + "balance_loss_mlp": 1.03410602, + "epoch": 0.42597324515256274, + "flos": 18009598208760.0, + "grad_norm": 2.1116180029513365, + "language_loss": 0.78440285, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80890244, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.1652832, + "step": 7085, + "time_per_iteration": 2.7227790355682373 + }, + { + "auxiliary_loss_clip": 0.01418929, + "auxiliary_loss_mlp": 0.01060351, + "balance_loss_clip": 1.28357768, + "balance_loss_mlp": 1.04258919, + "epoch": 0.4260333684052307, + "flos": 15163017847200.0, + "grad_norm": 1.9496372483565299, + "language_loss": 0.76150942, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78630221, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.1776123, + "step": 7086, + "time_per_iteration": 2.7406787872314453 + }, + { + "auxiliary_loss_clip": 0.01412863, + "auxiliary_loss_mlp": 0.01047472, + "balance_loss_clip": 1.28350198, + "balance_loss_mlp": 1.03133094, + "epoch": 0.42609349165789867, + "flos": 14943469115160.0, + "grad_norm": 1.9847215215410623, + "language_loss": 0.80240917, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82701254, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16131592, + "step": 7087, + "time_per_iteration": 2.715651750564575 + }, + { + "auxiliary_loss_clip": 0.0141185, + "auxiliary_loss_mlp": 0.01040573, + "balance_loss_clip": 1.28173184, + "balance_loss_mlp": 1.02421737, + "epoch": 0.42615361491056664, + "flos": 23737177839960.0, + "grad_norm": 1.615363844820761, + "language_loss": 0.66083074, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68535495, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.16357422, + "step": 7088, + "time_per_iteration": 2.780463695526123 + }, + { + "auxiliary_loss_clip": 0.01412105, + "auxiliary_loss_mlp": 0.01049878, + "balance_loss_clip": 1.28081429, + "balance_loss_mlp": 1.03460717, + "epoch": 0.42621373816323466, + "flos": 24941873585160.0, + "grad_norm": 2.0950531799387426, + "language_loss": 0.68630028, + "learning_rate": 2.56714997234313e-06, + "loss": 0.7109201, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.15270996, + "step": 7089, + "time_per_iteration": 2.747157573699951 + }, + { + "auxiliary_loss_clip": 0.01409648, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_clip": 1.27700305, + "balance_loss_mlp": 1.03011894, + "epoch": 0.4262738614159026, + "flos": 13556932823160.0, + "grad_norm": 2.2969098571859097, + "language_loss": 0.74668008, + "learning_rate": 2.566776487287525e-06, + "loss": 0.77123547, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.15771484, + "step": 7090, + "time_per_iteration": 2.7174124717712402 + }, + { + "auxiliary_loss_clip": 0.01419146, + "auxiliary_loss_mlp": 0.01056643, + "balance_loss_clip": 1.28536761, + "balance_loss_mlp": 1.04107404, + "epoch": 0.4263339846685706, + "flos": 29754362270160.0, + "grad_norm": 1.9941881301038105, + "language_loss": 0.74995065, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77470857, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.15563965, + "step": 7091, + "time_per_iteration": 2.8119029998779297 + }, + { + "auxiliary_loss_clip": 0.01401565, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.27715194, + "balance_loss_mlp": 1.02528167, + "epoch": 0.42639410792123855, + "flos": 16838265554280.0, + "grad_norm": 2.2071062125204093, + "language_loss": 0.82419908, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84860522, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.13763428, + "step": 7092, + "time_per_iteration": 2.754430055618286 + }, + { + "auxiliary_loss_clip": 0.01429056, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.2931881, + "balance_loss_mlp": 1.03492486, + "epoch": 0.4264542311739065, + "flos": 28768484306520.0, + "grad_norm": 1.4883472878650175, + "language_loss": 0.74162793, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76643163, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.16381836, + "step": 7093, + "time_per_iteration": 2.8856260776519775 + }, + { + "auxiliary_loss_clip": 0.01410918, + "auxiliary_loss_mlp": 0.01041897, + "balance_loss_clip": 1.2802887, + "balance_loss_mlp": 1.0257082, + "epoch": 0.4265143544265745, + "flos": 24718507667280.0, + "grad_norm": 2.261595506642364, + "language_loss": 0.70553482, + "learning_rate": 2.565282332284532e-06, + "loss": 0.73006296, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.16174316, + "step": 7094, + "time_per_iteration": 2.7614595890045166 + }, + { + "auxiliary_loss_clip": 0.01417995, + "auxiliary_loss_mlp": 0.01043545, + "balance_loss_clip": 1.28559327, + "balance_loss_mlp": 1.02746344, + "epoch": 0.42657447767924245, + "flos": 21870262362960.0, + "grad_norm": 1.5308577859628683, + "language_loss": 0.81664819, + "learning_rate": 2.564908739909464e-06, + "loss": 0.84126359, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.16088867, + "step": 7095, + "time_per_iteration": 2.7906901836395264 + }, + { + "auxiliary_loss_clip": 0.014129, + "auxiliary_loss_mlp": 0.0105044, + "balance_loss_clip": 1.28069019, + "balance_loss_mlp": 1.03422785, + "epoch": 0.4266346009319104, + "flos": 21475207988280.0, + "grad_norm": 1.8020908973934593, + "language_loss": 0.80795169, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.8325851, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.16223145, + "step": 7096, + "time_per_iteration": 2.708834648132324 + }, + { + "auxiliary_loss_clip": 0.01420163, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.284338, + "balance_loss_mlp": 1.02504277, + "epoch": 0.4266947241845784, + "flos": 25524778543920.0, + "grad_norm": 2.089311753948364, + "language_loss": 0.66445661, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.68907046, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.16174316, + "step": 7097, + "time_per_iteration": 2.7668564319610596 + }, + { + "auxiliary_loss_clip": 0.0140604, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.27843451, + "balance_loss_mlp": 1.02049041, + "epoch": 0.42675484743724634, + "flos": 26546780966760.0, + "grad_norm": 2.0470783931816277, + "language_loss": 0.7467227, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.77114201, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.15405273, + "step": 7098, + "time_per_iteration": 2.7778899669647217 + }, + { + "auxiliary_loss_clip": 0.01403783, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.27579641, + "balance_loss_mlp": 1.01941526, + "epoch": 0.4268149706899143, + "flos": 23117985988560.0, + "grad_norm": 1.5963101256754506, + "language_loss": 0.75227571, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77665949, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.15155029, + "step": 7099, + "time_per_iteration": 4.176356792449951 + }, + { + "auxiliary_loss_clip": 0.01414021, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.28123534, + "balance_loss_mlp": 1.0320183, + "epoch": 0.4268750939425823, + "flos": 22711114581120.0, + "grad_norm": 2.1733286976073978, + "language_loss": 0.82615155, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85077322, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.16125488, + "step": 7100, + "time_per_iteration": 2.774322986602783 + }, + { + "auxiliary_loss_clip": 0.01420051, + "auxiliary_loss_mlp": 0.01040801, + "balance_loss_clip": 1.28656876, + "balance_loss_mlp": 1.02510095, + "epoch": 0.42693521719525024, + "flos": 25380443140560.0, + "grad_norm": 1.3256344479066065, + "language_loss": 0.82313669, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84774518, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 1.33398438, + "router_z_loss_mlp": 0.15710449, + "step": 7101, + "time_per_iteration": 2.7801454067230225 + }, + { + "auxiliary_loss_clip": 0.01417905, + "auxiliary_loss_mlp": 0.01037565, + "balance_loss_clip": 1.28348541, + "balance_loss_mlp": 1.02130473, + "epoch": 0.42699534044791826, + "flos": 18155314296360.0, + "grad_norm": 2.1583942129068086, + "language_loss": 0.73045278, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.75500745, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.1628418, + "step": 7102, + "time_per_iteration": 4.183936357498169 + }, + { + "auxiliary_loss_clip": 0.01404353, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.27721596, + "balance_loss_mlp": 1.02431273, + "epoch": 0.4270554637005862, + "flos": 13702324043880.0, + "grad_norm": 1.6665448895370059, + "language_loss": 0.83003783, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85448301, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15856934, + "step": 7103, + "time_per_iteration": 2.807478904724121 + }, + { + "auxiliary_loss_clip": 0.01415647, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.28321004, + "balance_loss_mlp": 1.02035332, + "epoch": 0.4271155869532542, + "flos": 17498089392840.0, + "grad_norm": 1.9673840241070069, + "language_loss": 0.7408036, + "learning_rate": 2.561545446271294e-06, + "loss": 0.7653259, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.16223145, + "step": 7104, + "time_per_iteration": 2.702364921569824 + }, + { + "auxiliary_loss_clip": 0.01405902, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.27580988, + "balance_loss_mlp": 1.01777911, + "epoch": 0.42717571020592215, + "flos": 32458678254720.0, + "grad_norm": 2.213156362229883, + "language_loss": 0.75454843, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77893335, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.14819336, + "step": 7105, + "time_per_iteration": 2.8180062770843506 + }, + { + "auxiliary_loss_clip": 0.01412821, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.28146005, + "balance_loss_mlp": 1.02507985, + "epoch": 0.4272358334585901, + "flos": 16257391013520.0, + "grad_norm": 2.06515280987244, + "language_loss": 0.76928014, + "learning_rate": 2.560797813088819e-06, + "loss": 0.79381627, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.15710449, + "step": 7106, + "time_per_iteration": 4.226214408874512 + }, + { + "auxiliary_loss_clip": 0.01406956, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.27699399, + "balance_loss_mlp": 1.02448702, + "epoch": 0.4272959567112581, + "flos": 24204562349760.0, + "grad_norm": 2.2144428905549582, + "language_loss": 0.80687726, + "learning_rate": 2.560423964592229e-06, + "loss": 0.83134758, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15582275, + "step": 7107, + "time_per_iteration": 2.7375690937042236 + }, + { + "auxiliary_loss_clip": 0.01410322, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.28223586, + "balance_loss_mlp": 1.02512336, + "epoch": 0.42735607996392605, + "flos": 27969157459440.0, + "grad_norm": 1.3226527537627717, + "language_loss": 0.6825465, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.7070576, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15673828, + "step": 7108, + "time_per_iteration": 2.805840015411377 + }, + { + "auxiliary_loss_clip": 0.01407705, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.27726603, + "balance_loss_mlp": 1.02039158, + "epoch": 0.427416203216594, + "flos": 20299489630920.0, + "grad_norm": 1.6599903301158707, + "language_loss": 0.71315724, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73758698, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.14868164, + "step": 7109, + "time_per_iteration": 2.6962132453918457 + }, + { + "auxiliary_loss_clip": 0.01415258, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_clip": 1.28346479, + "balance_loss_mlp": 1.02385378, + "epoch": 0.427476326469262, + "flos": 26949753971640.0, + "grad_norm": 2.0025311822783203, + "language_loss": 0.64593202, + "learning_rate": 2.559302291651174e-06, + "loss": 0.67050087, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.17785645, + "step": 7110, + "time_per_iteration": 4.263514757156372 + }, + { + "auxiliary_loss_clip": 0.0141226, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.28225708, + "balance_loss_mlp": 1.02181196, + "epoch": 0.42753644972192995, + "flos": 25708405858560.0, + "grad_norm": 1.643873787228383, + "language_loss": 0.76685923, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.79136044, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.16015625, + "step": 7111, + "time_per_iteration": 2.796349048614502 + }, + { + "auxiliary_loss_clip": 0.0141556, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.28400338, + "balance_loss_mlp": 1.02039397, + "epoch": 0.4275965729745979, + "flos": 18771663562560.0, + "grad_norm": 1.8234612215762525, + "language_loss": 0.73361981, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75812805, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.14862061, + "step": 7112, + "time_per_iteration": 2.768310308456421 + }, + { + "auxiliary_loss_clip": 0.01408006, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.28077769, + "balance_loss_mlp": 1.02215588, + "epoch": 0.4276566962272659, + "flos": 23769119638080.0, + "grad_norm": 1.5599441726772396, + "language_loss": 0.71887666, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.74332929, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15106201, + "step": 7113, + "time_per_iteration": 2.8350095748901367 + }, + { + "auxiliary_loss_clip": 0.01419395, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.28759897, + "balance_loss_mlp": 1.02695417, + "epoch": 0.42771681947993384, + "flos": 22497697711440.0, + "grad_norm": 1.5685114971489835, + "language_loss": 0.61477822, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.63939768, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.15612793, + "step": 7114, + "time_per_iteration": 2.77508544921875 + }, + { + "auxiliary_loss_clip": 0.01426186, + "auxiliary_loss_mlp": 0.01045749, + "balance_loss_clip": 1.29005337, + "balance_loss_mlp": 1.02764082, + "epoch": 0.42777694273260186, + "flos": 25050003312600.0, + "grad_norm": 2.4751576523926757, + "language_loss": 0.64741337, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.67213273, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.18127441, + "step": 7115, + "time_per_iteration": 2.7779436111450195 + }, + { + "auxiliary_loss_clip": 0.01418111, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.28804207, + "balance_loss_mlp": 1.01733363, + "epoch": 0.4278370659852698, + "flos": 18666213986880.0, + "grad_norm": 1.4989810140127955, + "language_loss": 0.73819351, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76270604, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.15795898, + "step": 7116, + "time_per_iteration": 2.7721011638641357 + }, + { + "auxiliary_loss_clip": 0.0139768, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.27200389, + "balance_loss_mlp": 1.02302504, + "epoch": 0.4278971892379378, + "flos": 27314044190640.0, + "grad_norm": 2.051487064754626, + "language_loss": 0.69225967, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71661294, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14611816, + "step": 7117, + "time_per_iteration": 2.779353618621826 + }, + { + "auxiliary_loss_clip": 0.01411363, + "auxiliary_loss_mlp": 0.01039767, + "balance_loss_clip": 1.28330588, + "balance_loss_mlp": 1.02358997, + "epoch": 0.42795731249060576, + "flos": 12891586247640.0, + "grad_norm": 2.4612288930213833, + "language_loss": 0.70043766, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72494894, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.16186523, + "step": 7118, + "time_per_iteration": 2.7158544063568115 + }, + { + "auxiliary_loss_clip": 0.01417658, + "auxiliary_loss_mlp": 0.01043516, + "balance_loss_clip": 1.28882766, + "balance_loss_mlp": 1.02700567, + "epoch": 0.4280174357432737, + "flos": 33408025675560.0, + "grad_norm": 1.789615288534264, + "language_loss": 0.74639416, + "learning_rate": 2.55593612908444e-06, + "loss": 0.77100587, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.16503906, + "step": 7119, + "time_per_iteration": 2.8266096115112305 + }, + { + "auxiliary_loss_clip": 0.01410325, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.28290069, + "balance_loss_mlp": 1.01855814, + "epoch": 0.4280775589959417, + "flos": 18263606457240.0, + "grad_norm": 1.9135659610492521, + "language_loss": 0.75332594, + "learning_rate": 2.555562005426573e-06, + "loss": 0.77777094, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15625, + "step": 7120, + "time_per_iteration": 2.7405316829681396 + }, + { + "auxiliary_loss_clip": 0.01414859, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.28686154, + "balance_loss_mlp": 1.02587056, + "epoch": 0.42813768224860965, + "flos": 21476385630720.0, + "grad_norm": 2.0307002298410874, + "language_loss": 0.77626628, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.80082279, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.14910889, + "step": 7121, + "time_per_iteration": 2.72149395942688 + }, + { + "auxiliary_loss_clip": 0.01407682, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.28149843, + "balance_loss_mlp": 1.02214956, + "epoch": 0.4281978055012776, + "flos": 15673511454120.0, + "grad_norm": 1.8012029550856474, + "language_loss": 0.86304641, + "learning_rate": 2.554813694924126e-06, + "loss": 0.88748962, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.14465332, + "step": 7122, + "time_per_iteration": 2.7472481727600098 + }, + { + "auxiliary_loss_clip": 0.01405758, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.27890635, + "balance_loss_mlp": 1.02239704, + "epoch": 0.4282579287539456, + "flos": 17716460482440.0, + "grad_norm": 1.7073527420883043, + "language_loss": 0.81136513, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83579797, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15124512, + "step": 7123, + "time_per_iteration": 2.787222146987915 + }, + { + "auxiliary_loss_clip": 0.01407415, + "auxiliary_loss_mlp": 0.01037135, + "balance_loss_clip": 1.28229547, + "balance_loss_mlp": 1.02179289, + "epoch": 0.42831805200661355, + "flos": 19285852530240.0, + "grad_norm": 1.6593231282581897, + "language_loss": 0.81031352, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.834759, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.15368652, + "step": 7124, + "time_per_iteration": 2.820889949798584 + }, + { + "auxiliary_loss_clip": 0.01412391, + "auxiliary_loss_mlp": 0.01042315, + "balance_loss_clip": 1.28430641, + "balance_loss_mlp": 1.02646613, + "epoch": 0.4283781752592815, + "flos": 19797483171240.0, + "grad_norm": 1.6395413881735659, + "language_loss": 0.80535233, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82989943, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15838623, + "step": 7125, + "time_per_iteration": 2.800356388092041 + }, + { + "auxiliary_loss_clip": 0.01404242, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.27954328, + "balance_loss_mlp": 1.01895738, + "epoch": 0.4284382985119495, + "flos": 16512373862640.0, + "grad_norm": 1.8969880087138296, + "language_loss": 0.75273275, + "learning_rate": 2.553316821569659e-06, + "loss": 0.77710879, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.144104, + "step": 7126, + "time_per_iteration": 2.744270086288452 + }, + { + "auxiliary_loss_clip": 0.0141512, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.28661585, + "balance_loss_mlp": 1.01565099, + "epoch": 0.42849842176461744, + "flos": 23335950994560.0, + "grad_norm": 1.5318744231237889, + "language_loss": 0.81726456, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.8417275, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15527344, + "step": 7127, + "time_per_iteration": 2.742551565170288 + }, + { + "auxiliary_loss_clip": 0.01415019, + "auxiliary_loss_mlp": 0.01040307, + "balance_loss_clip": 1.28715014, + "balance_loss_mlp": 1.0248816, + "epoch": 0.4285585450172854, + "flos": 17278865527680.0, + "grad_norm": 1.820875604714028, + "language_loss": 0.76211059, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78666389, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15411377, + "step": 7128, + "time_per_iteration": 2.6938066482543945 + }, + { + "auxiliary_loss_clip": 0.01414831, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.28485084, + "balance_loss_mlp": 1.02476168, + "epoch": 0.42861866826995343, + "flos": 24284932940160.0, + "grad_norm": 2.134653731952511, + "language_loss": 0.7452718, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76982474, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.15698242, + "step": 7129, + "time_per_iteration": 2.789898157119751 + }, + { + "auxiliary_loss_clip": 0.01416384, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.28838885, + "balance_loss_mlp": 1.01674592, + "epoch": 0.4286787915226214, + "flos": 24358521934440.0, + "grad_norm": 1.643070345936005, + "language_loss": 0.78344083, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.80792272, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.1505127, + "step": 7130, + "time_per_iteration": 2.799593925476074 + }, + { + "auxiliary_loss_clip": 0.0141901, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.28940058, + "balance_loss_mlp": 1.02201009, + "epoch": 0.42873891477528936, + "flos": 15454165763880.0, + "grad_norm": 2.4944067342023843, + "language_loss": 0.73327935, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75784647, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.15698242, + "step": 7131, + "time_per_iteration": 2.7605152130126953 + }, + { + "auxiliary_loss_clip": 0.01418113, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.28880084, + "balance_loss_mlp": 1.02274132, + "epoch": 0.4287990380279573, + "flos": 17644089738960.0, + "grad_norm": 2.1258573140008314, + "language_loss": 0.77705956, + "learning_rate": 2.551070882366973e-06, + "loss": 0.80163383, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16589355, + "step": 7132, + "time_per_iteration": 2.7085111141204834 + }, + { + "auxiliary_loss_clip": 0.01416538, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.28702116, + "balance_loss_mlp": 1.02580285, + "epoch": 0.4288591612806253, + "flos": 27168084452880.0, + "grad_norm": 1.5932912376653292, + "language_loss": 0.78359264, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80817848, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.16259766, + "step": 7133, + "time_per_iteration": 2.902769088745117 + }, + { + "auxiliary_loss_clip": 0.01413352, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.2837255, + "balance_loss_mlp": 1.02452135, + "epoch": 0.42891928453329325, + "flos": 17167365306360.0, + "grad_norm": 1.7798639272469146, + "language_loss": 0.74947, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77399611, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14733887, + "step": 7134, + "time_per_iteration": 2.74222469329834 + }, + { + "auxiliary_loss_clip": 0.01401566, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.27559233, + "balance_loss_mlp": 1.01898432, + "epoch": 0.4289794077859612, + "flos": 18191641797360.0, + "grad_norm": 2.316363730330602, + "language_loss": 0.84055519, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86490721, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.1463623, + "step": 7135, + "time_per_iteration": 2.7677111625671387 + }, + { + "auxiliary_loss_clip": 0.01406165, + "auxiliary_loss_mlp": 0.01039552, + "balance_loss_clip": 1.28081989, + "balance_loss_mlp": 1.0241617, + "epoch": 0.4290395310386292, + "flos": 28262782486080.0, + "grad_norm": 1.8764753950538249, + "language_loss": 0.75476861, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77922577, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15374756, + "step": 7136, + "time_per_iteration": 2.8229243755340576 + }, + { + "auxiliary_loss_clip": 0.0141513, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.28407979, + "balance_loss_mlp": 1.02739239, + "epoch": 0.42909965429129715, + "flos": 16220332562040.0, + "grad_norm": 1.9019567450574129, + "language_loss": 0.79336727, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81794965, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15710449, + "step": 7137, + "time_per_iteration": 4.338674306869507 + }, + { + "auxiliary_loss_clip": 0.01419578, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.28943217, + "balance_loss_mlp": 1.020962, + "epoch": 0.4291597775439651, + "flos": 23118148422000.0, + "grad_norm": 2.0867206799601377, + "language_loss": 0.76984131, + "learning_rate": 2.548824190884499e-06, + "loss": 0.79440439, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.15771484, + "step": 7138, + "time_per_iteration": 2.79545259475708 + }, + { + "auxiliary_loss_clip": 0.01225512, + "auxiliary_loss_mlp": 0.0100606, + "balance_loss_clip": 1.17113471, + "balance_loss_mlp": 1.00286508, + "epoch": 0.4292199007966331, + "flos": 67561240608360.0, + "grad_norm": 0.9740740325669696, + "language_loss": 0.56194526, + "learning_rate": 2.548449669381113e-06, + "loss": 0.584261, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.03198242, + "step": 7139, + "time_per_iteration": 3.143354654312134 + }, + { + "auxiliary_loss_clip": 0.01397834, + "auxiliary_loss_mlp": 0.01035424, + "balance_loss_clip": 1.27531922, + "balance_loss_mlp": 1.02163768, + "epoch": 0.42928002404930105, + "flos": 23004658391040.0, + "grad_norm": 1.5830132842264095, + "language_loss": 0.81306118, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83739376, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13793945, + "step": 7140, + "time_per_iteration": 2.740588426589966 + }, + { + "auxiliary_loss_clip": 0.01412074, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.28218913, + "balance_loss_mlp": 1.02345061, + "epoch": 0.429340147301969, + "flos": 11548443311280.0, + "grad_norm": 2.169200012241525, + "language_loss": 0.82107735, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84559131, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.15869141, + "step": 7141, + "time_per_iteration": 4.333849668502808 + }, + { + "auxiliary_loss_clip": 0.01418344, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.28650808, + "balance_loss_mlp": 1.02659774, + "epoch": 0.42940027055463703, + "flos": 25270486036920.0, + "grad_norm": 1.8274737392348155, + "language_loss": 0.86362302, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88823634, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.16369629, + "step": 7142, + "time_per_iteration": 2.857320785522461 + }, + { + "auxiliary_loss_clip": 0.01401611, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.27685642, + "balance_loss_mlp": 1.02560973, + "epoch": 0.429460393807305, + "flos": 23810238925560.0, + "grad_norm": 1.9245760111621528, + "language_loss": 0.78510052, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.8095246, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.15197754, + "step": 7143, + "time_per_iteration": 2.737293004989624 + }, + { + "auxiliary_loss_clip": 0.01410081, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.28254437, + "balance_loss_mlp": 1.02480912, + "epoch": 0.42952051705997296, + "flos": 13922360076240.0, + "grad_norm": 2.177476787136955, + "language_loss": 0.77178228, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79627979, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.14855957, + "step": 7144, + "time_per_iteration": 4.139500617980957 + }, + { + "auxiliary_loss_clip": 0.01407693, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.27938962, + "balance_loss_mlp": 1.01833928, + "epoch": 0.4295806403126409, + "flos": 26766004831920.0, + "grad_norm": 1.5934223268841456, + "language_loss": 0.73722845, + "learning_rate": 2.54620210411532e-06, + "loss": 0.76164055, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.1517334, + "step": 7145, + "time_per_iteration": 2.7925662994384766 + }, + { + "auxiliary_loss_clip": 0.01413276, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.28271055, + "balance_loss_mlp": 1.01822758, + "epoch": 0.4296407635653089, + "flos": 20956633317720.0, + "grad_norm": 2.617996381229241, + "language_loss": 0.7941308, + "learning_rate": 2.545827437329352e-06, + "loss": 0.8185975, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15155029, + "step": 7146, + "time_per_iteration": 2.7371201515197754 + }, + { + "auxiliary_loss_clip": 0.01404191, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.27771521, + "balance_loss_mlp": 1.02156425, + "epoch": 0.42970088681797686, + "flos": 15856692076800.0, + "grad_norm": 2.4311601102442646, + "language_loss": 0.83436239, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85876465, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.14483643, + "step": 7147, + "time_per_iteration": 2.7383084297180176 + }, + { + "auxiliary_loss_clip": 0.01411739, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.28225446, + "balance_loss_mlp": 1.02448571, + "epoch": 0.4297610100706448, + "flos": 22387740607800.0, + "grad_norm": 1.9044580512092846, + "language_loss": 0.87872344, + "learning_rate": 2.545078041678131e-06, + "loss": 0.903247, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16131592, + "step": 7148, + "time_per_iteration": 2.742913246154785 + }, + { + "auxiliary_loss_clip": 0.01405704, + "auxiliary_loss_mlp": 0.01040148, + "balance_loss_clip": 1.27729535, + "balance_loss_mlp": 1.02525842, + "epoch": 0.4298211333233128, + "flos": 27931165015680.0, + "grad_norm": 1.519025686421391, + "language_loss": 0.77573025, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80018878, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.14880371, + "step": 7149, + "time_per_iteration": 4.350829839706421 + }, + { + "auxiliary_loss_clip": 0.01400504, + "auxiliary_loss_mlp": 0.01035888, + "balance_loss_clip": 1.27521002, + "balance_loss_mlp": 1.02036095, + "epoch": 0.42988125657598075, + "flos": 24430973894640.0, + "grad_norm": 1.8426581995528253, + "language_loss": 0.79726112, + "learning_rate": 2.544328563349256e-06, + "loss": 0.82162511, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.1552124, + "step": 7150, + "time_per_iteration": 2.7726311683654785 + }, + { + "auxiliary_loss_clip": 0.01420056, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.28525567, + "balance_loss_mlp": 1.02923596, + "epoch": 0.4299413798286487, + "flos": 15854255575200.0, + "grad_norm": 1.593302235324503, + "language_loss": 0.75367439, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77834409, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.17675781, + "step": 7151, + "time_per_iteration": 2.690734624862671 + }, + { + "auxiliary_loss_clip": 0.01416156, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.28469253, + "balance_loss_mlp": 1.0196054, + "epoch": 0.4300015030813167, + "flos": 22314598305480.0, + "grad_norm": 2.083707015091204, + "language_loss": 0.70671231, + "learning_rate": 2.543579002456406e-06, + "loss": 0.73122776, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.15783691, + "step": 7152, + "time_per_iteration": 2.7704203128814697 + }, + { + "auxiliary_loss_clip": 0.01405461, + "auxiliary_loss_mlp": 0.01039657, + "balance_loss_clip": 1.27630138, + "balance_loss_mlp": 1.02442241, + "epoch": 0.43006162633398465, + "flos": 34904478462840.0, + "grad_norm": 1.5300402421108679, + "language_loss": 0.71464419, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73909539, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15234375, + "step": 7153, + "time_per_iteration": 2.8594753742218018 + }, + { + "auxiliary_loss_clip": 0.01407404, + "auxiliary_loss_mlp": 0.01036187, + "balance_loss_clip": 1.27892327, + "balance_loss_mlp": 1.02030826, + "epoch": 0.4301217495866526, + "flos": 15965918229960.0, + "grad_norm": 2.224502853332346, + "language_loss": 0.79098141, + "learning_rate": 2.542829359113276e-06, + "loss": 0.81541729, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.15905762, + "step": 7154, + "time_per_iteration": 2.7316250801086426 + }, + { + "auxiliary_loss_clip": 0.01397801, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.27261424, + "balance_loss_mlp": 1.0230453, + "epoch": 0.43018187283932063, + "flos": 18774993448080.0, + "grad_norm": 1.7085929656921162, + "language_loss": 0.79166871, + "learning_rate": 2.542454506558389e-06, + "loss": 0.81601524, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.13806152, + "step": 7155, + "time_per_iteration": 2.829054594039917 + }, + { + "auxiliary_loss_clip": 0.01393774, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.26845527, + "balance_loss_mlp": 1.01958334, + "epoch": 0.4302419960919886, + "flos": 20155844569680.0, + "grad_norm": 1.7536000266159482, + "language_loss": 0.88552171, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90979707, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14178467, + "step": 7156, + "time_per_iteration": 2.71809720993042 + }, + { + "auxiliary_loss_clip": 0.01411644, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.28064573, + "balance_loss_mlp": 1.02140188, + "epoch": 0.43030211934465656, + "flos": 26438407589160.0, + "grad_norm": 1.7255235948831291, + "language_loss": 0.83144474, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85593629, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.16113281, + "step": 7157, + "time_per_iteration": 2.8062939643859863 + }, + { + "auxiliary_loss_clip": 0.01413154, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.28113699, + "balance_loss_mlp": 1.02130342, + "epoch": 0.43036224259732453, + "flos": 24394605785280.0, + "grad_norm": 1.6861615134046222, + "language_loss": 0.72338355, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74789357, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.16540527, + "step": 7158, + "time_per_iteration": 2.7343883514404297 + }, + { + "auxiliary_loss_clip": 0.01404502, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.27592254, + "balance_loss_mlp": 1.02005506, + "epoch": 0.4304223658499925, + "flos": 17206169917320.0, + "grad_norm": 3.672343900237245, + "language_loss": 0.82983351, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85422969, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15063477, + "step": 7159, + "time_per_iteration": 2.723926544189453 + }, + { + "auxiliary_loss_clip": 0.01405103, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.27596033, + "balance_loss_mlp": 1.01720881, + "epoch": 0.43048248910266046, + "flos": 14907141614160.0, + "grad_norm": 2.0414592863211882, + "language_loss": 0.83635187, + "learning_rate": 2.54057993551933e-06, + "loss": 0.8607316, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15673828, + "step": 7160, + "time_per_iteration": 2.695064067840576 + }, + { + "auxiliary_loss_clip": 0.01416576, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.28468299, + "balance_loss_mlp": 1.02467251, + "epoch": 0.4305426123553284, + "flos": 21584840225040.0, + "grad_norm": 1.8464795411929829, + "language_loss": 0.77380121, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79838699, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.17346191, + "step": 7161, + "time_per_iteration": 2.8176913261413574 + }, + { + "auxiliary_loss_clip": 0.01404812, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.27561951, + "balance_loss_mlp": 1.02388382, + "epoch": 0.4306027356079964, + "flos": 22606395955920.0, + "grad_norm": 1.828984932038046, + "language_loss": 0.73052299, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75495791, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.14794922, + "step": 7162, + "time_per_iteration": 2.7447474002838135 + }, + { + "auxiliary_loss_clip": 0.01216711, + "auxiliary_loss_mlp": 0.01015613, + "balance_loss_clip": 1.16409373, + "balance_loss_mlp": 1.01297843, + "epoch": 0.43066285886066435, + "flos": 70685974211400.0, + "grad_norm": 0.792297848984246, + "language_loss": 0.59083754, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61316079, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02636719, + "step": 7163, + "time_per_iteration": 3.1150882244110107 + }, + { + "auxiliary_loss_clip": 0.01397262, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.27058876, + "balance_loss_mlp": 1.02048945, + "epoch": 0.4307229821133323, + "flos": 26725778928360.0, + "grad_norm": 1.8818812768052424, + "language_loss": 0.79364395, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81796396, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14245605, + "step": 7164, + "time_per_iteration": 2.7676961421966553 + }, + { + "auxiliary_loss_clip": 0.01411712, + "auxiliary_loss_mlp": 0.01039244, + "balance_loss_clip": 1.279477, + "balance_loss_mlp": 1.02336514, + "epoch": 0.4307831053660003, + "flos": 26182734397920.0, + "grad_norm": 1.7910515827749998, + "language_loss": 0.67671973, + "learning_rate": 2.538704852009177e-06, + "loss": 0.70122927, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.15856934, + "step": 7165, + "time_per_iteration": 2.7636165618896484 + }, + { + "auxiliary_loss_clip": 0.01401095, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.2731092, + "balance_loss_mlp": 1.03137684, + "epoch": 0.43084322861866825, + "flos": 18914212198080.0, + "grad_norm": 1.9234513619150284, + "language_loss": 0.74994397, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77441669, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.14794922, + "step": 7166, + "time_per_iteration": 2.7339894771575928 + }, + { + "auxiliary_loss_clip": 0.01400532, + "auxiliary_loss_mlp": 0.01036124, + "balance_loss_clip": 1.27438903, + "balance_loss_mlp": 1.02166378, + "epoch": 0.4309033518713362, + "flos": 26438610630960.0, + "grad_norm": 1.5841546501085635, + "language_loss": 0.71695399, + "learning_rate": 2.537954675511372e-06, + "loss": 0.74132055, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14465332, + "step": 7167, + "time_per_iteration": 2.7713518142700195 + }, + { + "auxiliary_loss_clip": 0.01394223, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.27106333, + "balance_loss_mlp": 1.02305913, + "epoch": 0.43096347512400424, + "flos": 21218032287720.0, + "grad_norm": 1.5734725740635274, + "language_loss": 0.78776658, + "learning_rate": 2.537579556656414e-06, + "loss": 0.81208813, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14868164, + "step": 7168, + "time_per_iteration": 2.8362748622894287 + }, + { + "auxiliary_loss_clip": 0.01400851, + "auxiliary_loss_mlp": 0.01044998, + "balance_loss_clip": 1.2736671, + "balance_loss_mlp": 1.0282433, + "epoch": 0.4310235983766722, + "flos": 16543868968800.0, + "grad_norm": 1.929866448445742, + "language_loss": 0.82190824, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84636676, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.16741943, + "step": 7169, + "time_per_iteration": 2.7013747692108154 + }, + { + "auxiliary_loss_clip": 0.01213481, + "auxiliary_loss_mlp": 0.01003341, + "balance_loss_clip": 1.16066229, + "balance_loss_mlp": 1.00075424, + "epoch": 0.43108372162934017, + "flos": 64790685742680.0, + "grad_norm": 0.7223910499862478, + "language_loss": 0.6082685, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.63043678, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.02587891, + "step": 7170, + "time_per_iteration": 3.376880407333374 + }, + { + "auxiliary_loss_clip": 0.01396898, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.2703253, + "balance_loss_mlp": 1.0180099, + "epoch": 0.43114384488200813, + "flos": 13448112753600.0, + "grad_norm": 1.8290724766299902, + "language_loss": 0.76095593, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78525251, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.14758301, + "step": 7171, + "time_per_iteration": 2.7087955474853516 + }, + { + "auxiliary_loss_clip": 0.01396416, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.27037418, + "balance_loss_mlp": 1.02173817, + "epoch": 0.4312039681346761, + "flos": 26292326026320.0, + "grad_norm": 1.525187415801665, + "language_loss": 0.77493727, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79926574, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14709473, + "step": 7172, + "time_per_iteration": 2.7538278102874756 + }, + { + "auxiliary_loss_clip": 0.01404341, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.27505159, + "balance_loss_mlp": 1.02375555, + "epoch": 0.43126409138734406, + "flos": 20381646989160.0, + "grad_norm": 1.7264154881881413, + "language_loss": 0.77119732, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79564267, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16467285, + "step": 7173, + "time_per_iteration": 2.769361972808838 + }, + { + "auxiliary_loss_clip": 0.0139096, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.26511025, + "balance_loss_mlp": 1.01977944, + "epoch": 0.431324214640012, + "flos": 22127641105320.0, + "grad_norm": 1.4523131722539884, + "language_loss": 0.77301639, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79727137, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14770508, + "step": 7174, + "time_per_iteration": 2.7695884704589844 + }, + { + "auxiliary_loss_clip": 0.01403999, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.27470064, + "balance_loss_mlp": 1.02137995, + "epoch": 0.43138433789268, + "flos": 15235144940520.0, + "grad_norm": 1.6301253381185177, + "language_loss": 0.82743466, + "learning_rate": 2.534953154686407e-06, + "loss": 0.85185438, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16589355, + "step": 7175, + "time_per_iteration": 2.75521183013916 + }, + { + "auxiliary_loss_clip": 0.01408028, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.27558494, + "balance_loss_mlp": 1.01651669, + "epoch": 0.43144446114534796, + "flos": 18154867604400.0, + "grad_norm": 2.429781009466138, + "language_loss": 0.74875045, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77316755, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.17163086, + "step": 7176, + "time_per_iteration": 4.1541337966918945 + }, + { + "auxiliary_loss_clip": 0.01402579, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.27280676, + "balance_loss_mlp": 1.01406217, + "epoch": 0.4315045843980159, + "flos": 22935049016040.0, + "grad_norm": 1.8251502656842673, + "language_loss": 0.74203521, + "learning_rate": 2.534202571340819e-06, + "loss": 0.76635563, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.15393066, + "step": 7177, + "time_per_iteration": 2.774000883102417 + }, + { + "auxiliary_loss_clip": 0.01411245, + "auxiliary_loss_mlp": 0.01042365, + "balance_loss_clip": 1.27495718, + "balance_loss_mlp": 1.02417374, + "epoch": 0.4315647076506839, + "flos": 22131904983120.0, + "grad_norm": 1.8868266553649777, + "language_loss": 0.81965673, + "learning_rate": 2.533827249275387e-06, + "loss": 0.8441928, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.18188477, + "step": 7178, + "time_per_iteration": 2.765620708465576 + }, + { + "auxiliary_loss_clip": 0.01392163, + "auxiliary_loss_mlp": 0.01038853, + "balance_loss_clip": 1.26955581, + "balance_loss_mlp": 1.02312946, + "epoch": 0.43162483090335185, + "flos": 26876936536200.0, + "grad_norm": 1.4645995147580209, + "language_loss": 0.84318191, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86749208, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.15734863, + "step": 7179, + "time_per_iteration": 2.7637674808502197 + }, + { + "auxiliary_loss_clip": 0.0139199, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.26528955, + "balance_loss_mlp": 1.01675642, + "epoch": 0.4316849541560198, + "flos": 13916512472400.0, + "grad_norm": 1.6864525466896634, + "language_loss": 0.7583077, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.7825433, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14801025, + "step": 7180, + "time_per_iteration": 4.31322717666626 + }, + { + "auxiliary_loss_clip": 0.01400293, + "auxiliary_loss_mlp": 0.01042208, + "balance_loss_clip": 1.27011728, + "balance_loss_mlp": 1.0260551, + "epoch": 0.4317450774086878, + "flos": 16439475210480.0, + "grad_norm": 1.8942803353611632, + "language_loss": 0.81914616, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84357119, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.16149902, + "step": 7181, + "time_per_iteration": 2.697594165802002 + }, + { + "auxiliary_loss_clip": 0.01405716, + "auxiliary_loss_mlp": 0.01040058, + "balance_loss_clip": 1.27519703, + "balance_loss_mlp": 1.02382207, + "epoch": 0.4318052006613558, + "flos": 20559426699960.0, + "grad_norm": 1.6514678283620656, + "language_loss": 0.88825488, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91271269, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.16235352, + "step": 7182, + "time_per_iteration": 2.707411527633667 + }, + { + "auxiliary_loss_clip": 0.01393547, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.26844811, + "balance_loss_mlp": 1.01743937, + "epoch": 0.43186532391402377, + "flos": 22825173129120.0, + "grad_norm": 1.5745741254559265, + "language_loss": 0.75918561, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.78344882, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.15319824, + "step": 7183, + "time_per_iteration": 4.153677940368652 + }, + { + "auxiliary_loss_clip": 0.01398474, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.27010894, + "balance_loss_mlp": 1.01764238, + "epoch": 0.43192544716669173, + "flos": 25562080645560.0, + "grad_norm": 1.8753645805422103, + "language_loss": 0.78248298, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.80679095, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.14691162, + "step": 7184, + "time_per_iteration": 2.768752336502075 + }, + { + "auxiliary_loss_clip": 0.01383258, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.26278734, + "balance_loss_mlp": 1.01771379, + "epoch": 0.4319855704193597, + "flos": 30960641741400.0, + "grad_norm": 1.6018948817720733, + "language_loss": 0.73521024, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75936085, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14093018, + "step": 7185, + "time_per_iteration": 2.8246538639068604 + }, + { + "auxiliary_loss_clip": 0.01399066, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.26749063, + "balance_loss_mlp": 1.02244377, + "epoch": 0.43204569367202766, + "flos": 24243488785800.0, + "grad_norm": 2.2316510106287666, + "language_loss": 0.7620039, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78637755, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.15856934, + "step": 7186, + "time_per_iteration": 2.853679656982422 + }, + { + "auxiliary_loss_clip": 0.01395058, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.26782751, + "balance_loss_mlp": 1.02433109, + "epoch": 0.43210581692469563, + "flos": 18411718438080.0, + "grad_norm": 2.1056408356899126, + "language_loss": 0.77082789, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.7951811, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15905762, + "step": 7187, + "time_per_iteration": 2.759042501449585 + }, + { + "auxiliary_loss_clip": 0.01208543, + "auxiliary_loss_mlp": 0.0100874, + "balance_loss_clip": 1.15592289, + "balance_loss_mlp": 1.00628412, + "epoch": 0.4321659401773636, + "flos": 49847477258880.0, + "grad_norm": 0.8558964102284692, + "language_loss": 0.68293655, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70510936, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02453613, + "step": 7188, + "time_per_iteration": 4.721666574478149 + }, + { + "auxiliary_loss_clip": 0.01389737, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.26605153, + "balance_loss_mlp": 1.02001047, + "epoch": 0.43222606343003156, + "flos": 17132987006640.0, + "grad_norm": 1.6781901813082785, + "language_loss": 0.78266943, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80690682, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.13995361, + "step": 7189, + "time_per_iteration": 2.68802809715271 + }, + { + "auxiliary_loss_clip": 0.01403172, + "auxiliary_loss_mlp": 0.01047959, + "balance_loss_clip": 1.27060723, + "balance_loss_mlp": 1.03212833, + "epoch": 0.4322861866826995, + "flos": 22755441929040.0, + "grad_norm": 2.3406322718931873, + "language_loss": 0.72040808, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.74491942, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.1583252, + "step": 7190, + "time_per_iteration": 2.7326526641845703 + }, + { + "auxiliary_loss_clip": 0.01382185, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.25697088, + "balance_loss_mlp": 1.01824856, + "epoch": 0.4323463099353675, + "flos": 27897761316600.0, + "grad_norm": 1.3886684637176037, + "language_loss": 0.79967684, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.8238306, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14935303, + "step": 7191, + "time_per_iteration": 2.8173530101776123 + }, + { + "auxiliary_loss_clip": 0.01383043, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.25803757, + "balance_loss_mlp": 1.0210557, + "epoch": 0.43240643318803546, + "flos": 21619543391640.0, + "grad_norm": 1.7172745439281356, + "language_loss": 0.75540906, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77959633, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14624023, + "step": 7192, + "time_per_iteration": 2.726482629776001 + }, + { + "auxiliary_loss_clip": 0.01391141, + "auxiliary_loss_mlp": 0.01038821, + "balance_loss_clip": 1.26456833, + "balance_loss_mlp": 1.02277541, + "epoch": 0.4324665564407034, + "flos": 17561810555640.0, + "grad_norm": 2.12675652918227, + "language_loss": 0.79229933, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81659889, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.16052246, + "step": 7193, + "time_per_iteration": 2.725900173187256 + }, + { + "auxiliary_loss_clip": 0.0138939, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_clip": 1.26129746, + "balance_loss_mlp": 1.02775025, + "epoch": 0.4325266796933714, + "flos": 18406601784720.0, + "grad_norm": 1.715634267090194, + "language_loss": 0.75796831, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78230542, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.16552734, + "step": 7194, + "time_per_iteration": 2.694603443145752 + }, + { + "auxiliary_loss_clip": 0.0139159, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.2644577, + "balance_loss_mlp": 1.02457452, + "epoch": 0.4325868029460394, + "flos": 22569906021480.0, + "grad_norm": 1.7947044048393501, + "language_loss": 0.59828752, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62259448, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.14520264, + "step": 7195, + "time_per_iteration": 2.787536144256592 + }, + { + "auxiliary_loss_clip": 0.01396137, + "auxiliary_loss_mlp": 0.01045416, + "balance_loss_clip": 1.26532495, + "balance_loss_mlp": 1.0288161, + "epoch": 0.43264692619870737, + "flos": 14608887234480.0, + "grad_norm": 1.9554783057364382, + "language_loss": 0.6523667, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67678225, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.16601562, + "step": 7196, + "time_per_iteration": 2.71042537689209 + }, + { + "auxiliary_loss_clip": 0.01396991, + "auxiliary_loss_mlp": 0.01045344, + "balance_loss_clip": 1.2656455, + "balance_loss_mlp": 1.02954841, + "epoch": 0.43270704945137534, + "flos": 21505972143960.0, + "grad_norm": 2.138338699008844, + "language_loss": 0.72717112, + "learning_rate": 2.526692300132797e-06, + "loss": 0.75159454, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.15795898, + "step": 7197, + "time_per_iteration": 2.8069379329681396 + }, + { + "auxiliary_loss_clip": 0.01382135, + "auxiliary_loss_mlp": 0.01049496, + "balance_loss_clip": 1.25714934, + "balance_loss_mlp": 1.03374839, + "epoch": 0.4327671727040433, + "flos": 25161422317200.0, + "grad_norm": 1.719457818108193, + "language_loss": 0.72825468, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.75257099, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15759277, + "step": 7198, + "time_per_iteration": 2.819653272628784 + }, + { + "auxiliary_loss_clip": 0.01380926, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.25579309, + "balance_loss_mlp": 1.02368426, + "epoch": 0.43282729595671127, + "flos": 25452651450600.0, + "grad_norm": 1.434716682844131, + "language_loss": 0.81413913, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83833313, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.14782715, + "step": 7199, + "time_per_iteration": 2.779003858566284 + }, + { + "auxiliary_loss_clip": 0.01390749, + "auxiliary_loss_mlp": 0.01037441, + "balance_loss_clip": 1.26330757, + "balance_loss_mlp": 1.02259398, + "epoch": 0.43288741920937923, + "flos": 24130445446800.0, + "grad_norm": 2.53829855035307, + "language_loss": 0.68892169, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71320367, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.14837646, + "step": 7200, + "time_per_iteration": 2.7593722343444824 + }, + { + "auxiliary_loss_clip": 0.01386811, + "auxiliary_loss_mlp": 0.010497, + "balance_loss_clip": 1.2589047, + "balance_loss_mlp": 1.03400016, + "epoch": 0.4329475424620472, + "flos": 19209217908960.0, + "grad_norm": 1.827450137498669, + "language_loss": 0.87052858, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89489365, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15698242, + "step": 7201, + "time_per_iteration": 2.7244720458984375 + }, + { + "auxiliary_loss_clip": 0.01402594, + "auxiliary_loss_mlp": 0.01052126, + "balance_loss_clip": 1.27057195, + "balance_loss_mlp": 1.03431666, + "epoch": 0.43300766571471516, + "flos": 22643576232480.0, + "grad_norm": 1.9635273238278472, + "language_loss": 0.64926946, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.67381668, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.17810059, + "step": 7202, + "time_per_iteration": 2.744387626647949 + }, + { + "auxiliary_loss_clip": 0.01382251, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.25530756, + "balance_loss_mlp": 1.02246904, + "epoch": 0.4330677889673831, + "flos": 22125367037160.0, + "grad_norm": 1.8548053186267681, + "language_loss": 0.82552111, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84971517, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.14678955, + "step": 7203, + "time_per_iteration": 2.718841075897217 + }, + { + "auxiliary_loss_clip": 0.01397698, + "auxiliary_loss_mlp": 0.01043098, + "balance_loss_clip": 1.26612735, + "balance_loss_mlp": 1.02708817, + "epoch": 0.4331279122200511, + "flos": 23226724841400.0, + "grad_norm": 1.8057293242464791, + "language_loss": 0.81704879, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.84145665, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.16021729, + "step": 7204, + "time_per_iteration": 2.772852897644043 + }, + { + "auxiliary_loss_clip": 0.01383734, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.25662208, + "balance_loss_mlp": 1.02205276, + "epoch": 0.43318803547271906, + "flos": 18264337407720.0, + "grad_norm": 1.96517893087666, + "language_loss": 0.74332088, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.76753122, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15240479, + "step": 7205, + "time_per_iteration": 2.7154316902160645 + }, + { + "auxiliary_loss_clip": 0.01380375, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.25746655, + "balance_loss_mlp": 1.02639186, + "epoch": 0.433248158725387, + "flos": 27424285552800.0, + "grad_norm": 1.6909698825387292, + "language_loss": 0.74785161, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77206349, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14422607, + "step": 7206, + "time_per_iteration": 2.779513120651245 + }, + { + "auxiliary_loss_clip": 0.01379192, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.25416756, + "balance_loss_mlp": 1.01960373, + "epoch": 0.433308281978055, + "flos": 23222745222120.0, + "grad_norm": 1.8972573563127844, + "language_loss": 0.7957027, + "learning_rate": 2.522934161574342e-06, + "loss": 0.81984353, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15283203, + "step": 7207, + "time_per_iteration": 2.757040023803711 + }, + { + "auxiliary_loss_clip": 0.01397053, + "auxiliary_loss_mlp": 0.01043314, + "balance_loss_clip": 1.26590061, + "balance_loss_mlp": 1.02741098, + "epoch": 0.433368405230723, + "flos": 15856773293520.0, + "grad_norm": 1.6849739678472182, + "language_loss": 0.8064369, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83084059, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15930176, + "step": 7208, + "time_per_iteration": 2.790611505508423 + }, + { + "auxiliary_loss_clip": 0.01389796, + "auxiliary_loss_mlp": 0.0103881, + "balance_loss_clip": 1.26297665, + "balance_loss_mlp": 1.02418268, + "epoch": 0.433428528483391, + "flos": 19030788464400.0, + "grad_norm": 1.9181232015452432, + "language_loss": 0.70986164, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.73414767, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.14624023, + "step": 7209, + "time_per_iteration": 2.707995653152466 + }, + { + "auxiliary_loss_clip": 0.01386219, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.26070988, + "balance_loss_mlp": 1.02137375, + "epoch": 0.43348865173605894, + "flos": 24723746145720.0, + "grad_norm": 1.5678570809498902, + "language_loss": 0.81644595, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.8406837, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.16174316, + "step": 7210, + "time_per_iteration": 2.7624330520629883 + }, + { + "auxiliary_loss_clip": 0.01387531, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.26129746, + "balance_loss_mlp": 1.01883864, + "epoch": 0.4335487749887269, + "flos": 22095739915560.0, + "grad_norm": 1.7343736052964043, + "language_loss": 0.82824123, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.85244906, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14428711, + "step": 7211, + "time_per_iteration": 2.834993839263916 + }, + { + "auxiliary_loss_clip": 0.01386402, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.25950682, + "balance_loss_mlp": 1.0219512, + "epoch": 0.43360889824139487, + "flos": 22388106083040.0, + "grad_norm": 1.810832376214285, + "language_loss": 0.74922109, + "learning_rate": 2.521054347790029e-06, + "loss": 0.77344316, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.13861084, + "step": 7212, + "time_per_iteration": 2.720243453979492 + }, + { + "auxiliary_loss_clip": 0.01389427, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.26337123, + "balance_loss_mlp": 1.02040625, + "epoch": 0.43366902149406283, + "flos": 17532548909280.0, + "grad_norm": 1.6271298430733574, + "language_loss": 0.77033436, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.7945739, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14135742, + "step": 7213, + "time_per_iteration": 4.158345937728882 + }, + { + "auxiliary_loss_clip": 0.01386213, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.25952506, + "balance_loss_mlp": 1.0220505, + "epoch": 0.4337291447467308, + "flos": 19026930670200.0, + "grad_norm": 1.4545309467076732, + "language_loss": 0.65052235, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67475146, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14654541, + "step": 7214, + "time_per_iteration": 2.7308011054992676 + }, + { + "auxiliary_loss_clip": 0.01377552, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.25587642, + "balance_loss_mlp": 1.02127314, + "epoch": 0.43378926799939876, + "flos": 27239643029160.0, + "grad_norm": 1.6194960998064036, + "language_loss": 0.71855938, + "learning_rate": 2.519926222304191e-06, + "loss": 0.74269247, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14483643, + "step": 7215, + "time_per_iteration": 2.7618207931518555 + }, + { + "auxiliary_loss_clip": 0.01381167, + "auxiliary_loss_mlp": 0.01033144, + "balance_loss_clip": 1.25837874, + "balance_loss_mlp": 1.0185287, + "epoch": 0.43384939125206673, + "flos": 15965390321280.0, + "grad_norm": 1.933706239827678, + "language_loss": 0.75545555, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77959865, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14593506, + "step": 7216, + "time_per_iteration": 2.707456588745117 + }, + { + "auxiliary_loss_clip": 0.01409972, + "auxiliary_loss_mlp": 0.01044458, + "balance_loss_clip": 1.27688837, + "balance_loss_mlp": 1.02717257, + "epoch": 0.4339095145047347, + "flos": 21797526144240.0, + "grad_norm": 4.8100271101784156, + "language_loss": 0.76033092, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78487527, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.17297363, + "step": 7217, + "time_per_iteration": 2.7156829833984375 + }, + { + "auxiliary_loss_clip": 0.01390913, + "auxiliary_loss_mlp": 0.01040091, + "balance_loss_clip": 1.26344931, + "balance_loss_mlp": 1.02406955, + "epoch": 0.43396963775740266, + "flos": 14213914076520.0, + "grad_norm": 1.824680543967731, + "language_loss": 0.74119389, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76550394, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.16040039, + "step": 7218, + "time_per_iteration": 2.7543368339538574 + }, + { + "auxiliary_loss_clip": 0.01395188, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.26719427, + "balance_loss_mlp": 1.01939678, + "epoch": 0.4340297610100706, + "flos": 19723812960240.0, + "grad_norm": 1.9235665939564075, + "language_loss": 0.69574004, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.72003555, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.1494751, + "step": 7219, + "time_per_iteration": 4.174927473068237 + }, + { + "auxiliary_loss_clip": 0.0139227, + "auxiliary_loss_mlp": 0.01040142, + "balance_loss_clip": 1.26702666, + "balance_loss_mlp": 1.02559853, + "epoch": 0.4340898842627386, + "flos": 18958255287480.0, + "grad_norm": 1.5360428203787706, + "language_loss": 0.77715778, + "learning_rate": 2.518045619038202e-06, + "loss": 0.80148196, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14556885, + "step": 7220, + "time_per_iteration": 2.766354560852051 + }, + { + "auxiliary_loss_clip": 0.01383081, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.25725698, + "balance_loss_mlp": 1.01579881, + "epoch": 0.4341500075154066, + "flos": 22023450388800.0, + "grad_norm": 1.86083907549872, + "language_loss": 0.6989876, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.7231282, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.15179443, + "step": 7221, + "time_per_iteration": 4.233444452285767 + }, + { + "auxiliary_loss_clip": 0.01397475, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.26955426, + "balance_loss_mlp": 1.02052581, + "epoch": 0.4342101307680746, + "flos": 23587197874560.0, + "grad_norm": 1.8037728767974346, + "language_loss": 0.65272599, + "learning_rate": 2.51729324012157e-06, + "loss": 0.6770547, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.14892578, + "step": 7222, + "time_per_iteration": 2.8195765018463135 + }, + { + "auxiliary_loss_clip": 0.01393669, + "auxiliary_loss_mlp": 0.01040551, + "balance_loss_clip": 1.26632285, + "balance_loss_mlp": 1.02407587, + "epoch": 0.43427025402074254, + "flos": 17972539757280.0, + "grad_norm": 2.056153581110756, + "language_loss": 0.73585325, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.76019549, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.16479492, + "step": 7223, + "time_per_iteration": 2.7293243408203125 + }, + { + "auxiliary_loss_clip": 0.01395682, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.26626372, + "balance_loss_mlp": 1.01383066, + "epoch": 0.4343303772734105, + "flos": 26292001159440.0, + "grad_norm": 1.8605560946465407, + "language_loss": 0.93671882, + "learning_rate": 2.516540782741694e-06, + "loss": 0.96097428, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.16027832, + "step": 7224, + "time_per_iteration": 2.812603235244751 + }, + { + "auxiliary_loss_clip": 0.01387402, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.2617079, + "balance_loss_mlp": 1.02054536, + "epoch": 0.43439050052607847, + "flos": 26839675042920.0, + "grad_norm": 1.609862433783652, + "language_loss": 0.61603594, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.64027023, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.15490723, + "step": 7225, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01390649, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.26467228, + "balance_loss_mlp": 1.02178502, + "epoch": 0.43445062377874644, + "flos": 21402674811360.0, + "grad_norm": 1.9808783891480015, + "language_loss": 0.77818447, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.80245841, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.1496582, + "step": 7226, + "time_per_iteration": 2.725524663925171 + }, + { + "auxiliary_loss_clip": 0.01382083, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.26041651, + "balance_loss_mlp": 1.01765764, + "epoch": 0.4345107470314144, + "flos": 19906790541120.0, + "grad_norm": 1.671151807158633, + "language_loss": 0.8507297, + "learning_rate": 2.515411949802964e-06, + "loss": 0.87487561, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.1484375, + "step": 7227, + "time_per_iteration": 4.271904706954956 + }, + { + "auxiliary_loss_clip": 0.01387955, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.26357424, + "balance_loss_mlp": 1.02291203, + "epoch": 0.43457087028408237, + "flos": 26438285764080.0, + "grad_norm": 1.8967889791518457, + "language_loss": 0.76666963, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79093552, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.15710449, + "step": 7228, + "time_per_iteration": 2.853590726852417 + }, + { + "auxiliary_loss_clip": 0.01384625, + "auxiliary_loss_mlp": 0.01040588, + "balance_loss_clip": 1.26066422, + "balance_loss_mlp": 1.02555561, + "epoch": 0.43463099353675033, + "flos": 31875164170560.0, + "grad_norm": 1.578981370245347, + "language_loss": 0.80919933, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.83345145, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.15039062, + "step": 7229, + "time_per_iteration": 2.831547498703003 + }, + { + "auxiliary_loss_clip": 0.01391586, + "auxiliary_loss_mlp": 0.01042225, + "balance_loss_clip": 1.2649256, + "balance_loss_mlp": 1.02726436, + "epoch": 0.4346911167894183, + "flos": 24576608765520.0, + "grad_norm": 1.8868252289706129, + "language_loss": 0.81791002, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84224814, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.1496582, + "step": 7230, + "time_per_iteration": 2.770005464553833 + }, + { + "auxiliary_loss_clip": 0.01398963, + "auxiliary_loss_mlp": 0.01044058, + "balance_loss_clip": 1.26837921, + "balance_loss_mlp": 1.02848887, + "epoch": 0.43475124004208626, + "flos": 17094913346160.0, + "grad_norm": 2.0335677529295677, + "language_loss": 0.77869123, + "learning_rate": 2.513906565661973e-06, + "loss": 0.80312145, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.15576172, + "step": 7231, + "time_per_iteration": 2.7052299976348877 + }, + { + "auxiliary_loss_clip": 0.01381225, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.25828135, + "balance_loss_mlp": 1.01970172, + "epoch": 0.4348113632947542, + "flos": 26110282437720.0, + "grad_norm": 1.541485765483435, + "language_loss": 0.69090694, + "learning_rate": 2.513530170872575e-06, + "loss": 0.71504962, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.13330078, + "step": 7232, + "time_per_iteration": 2.783498525619507 + }, + { + "auxiliary_loss_clip": 0.01400325, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.27129912, + "balance_loss_mlp": 1.02177167, + "epoch": 0.4348714865474222, + "flos": 34206702788880.0, + "grad_norm": 1.6308192409343742, + "language_loss": 0.72170639, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74608153, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15411377, + "step": 7233, + "time_per_iteration": 2.85168719291687 + }, + { + "auxiliary_loss_clip": 0.01395705, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_clip": 1.26639116, + "balance_loss_mlp": 1.02672315, + "epoch": 0.43493160980009016, + "flos": 31543384266720.0, + "grad_norm": 6.204908930164477, + "language_loss": 0.74665028, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.77102894, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.15429688, + "step": 7234, + "time_per_iteration": 2.9079034328460693 + }, + { + "auxiliary_loss_clip": 0.01401748, + "auxiliary_loss_mlp": 0.01047557, + "balance_loss_clip": 1.26921046, + "balance_loss_mlp": 1.03154111, + "epoch": 0.4349917330527582, + "flos": 24066724284000.0, + "grad_norm": 6.939030147212317, + "language_loss": 0.59079027, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61528337, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.16027832, + "step": 7235, + "time_per_iteration": 2.7932331562042236 + }, + { + "auxiliary_loss_clip": 0.01391939, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.26492786, + "balance_loss_mlp": 1.01580822, + "epoch": 0.43505185630542614, + "flos": 30525564504960.0, + "grad_norm": 3.3454176910219644, + "language_loss": 0.77514088, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79936361, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.14526367, + "step": 7236, + "time_per_iteration": 2.814628839492798 + }, + { + "auxiliary_loss_clip": 0.01384411, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.26081645, + "balance_loss_mlp": 1.02153587, + "epoch": 0.4351119795580941, + "flos": 15738491476080.0, + "grad_norm": 2.747987310108538, + "language_loss": 0.81352842, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83773208, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14416504, + "step": 7237, + "time_per_iteration": 2.7015323638916016 + }, + { + "auxiliary_loss_clip": 0.01383975, + "auxiliary_loss_mlp": 0.0104051, + "balance_loss_clip": 1.25873399, + "balance_loss_mlp": 1.02546597, + "epoch": 0.4351721028107621, + "flos": 18736107620400.0, + "grad_norm": 1.4210631427823976, + "language_loss": 0.63121867, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65546358, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.15063477, + "step": 7238, + "time_per_iteration": 2.7777531147003174 + }, + { + "auxiliary_loss_clip": 0.01379838, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.25724113, + "balance_loss_mlp": 1.01987743, + "epoch": 0.43523222606343004, + "flos": 25232168726280.0, + "grad_norm": 1.7770072404352228, + "language_loss": 0.85749829, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88164258, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.1473999, + "step": 7239, + "time_per_iteration": 2.8641974925994873 + }, + { + "auxiliary_loss_clip": 0.01392745, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.26680398, + "balance_loss_mlp": 1.01903069, + "epoch": 0.435292349316098, + "flos": 22714119599760.0, + "grad_norm": 1.4949767830962797, + "language_loss": 0.72891879, + "learning_rate": 2.510518312724309e-06, + "loss": 0.75318402, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14746094, + "step": 7240, + "time_per_iteration": 2.744690179824829 + }, + { + "auxiliary_loss_clip": 0.01392022, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.26404142, + "balance_loss_mlp": 1.02235126, + "epoch": 0.43535247256876597, + "flos": 25781466944160.0, + "grad_norm": 1.8902632576507683, + "language_loss": 0.82048738, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.84478581, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.15478516, + "step": 7241, + "time_per_iteration": 2.770280599594116 + }, + { + "auxiliary_loss_clip": 0.01397336, + "auxiliary_loss_mlp": 0.01043004, + "balance_loss_clip": 1.26513553, + "balance_loss_mlp": 1.02745342, + "epoch": 0.43541259582143393, + "flos": 17532386475840.0, + "grad_norm": 2.6841532756783586, + "language_loss": 0.79995036, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.82435381, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.15539551, + "step": 7242, + "time_per_iteration": 2.690424919128418 + }, + { + "auxiliary_loss_clip": 0.0139733, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.26698184, + "balance_loss_mlp": 1.01768148, + "epoch": 0.4354727190741019, + "flos": 15199426564920.0, + "grad_norm": 2.756168924895826, + "language_loss": 0.68881536, + "learning_rate": 2.509388546104138e-06, + "loss": 0.71311599, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.15026855, + "step": 7243, + "time_per_iteration": 2.7207467555999756 + }, + { + "auxiliary_loss_clip": 0.01378109, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.25553942, + "balance_loss_mlp": 1.02037537, + "epoch": 0.43553284232676986, + "flos": 16652973296880.0, + "grad_norm": 1.526829104541075, + "language_loss": 0.81543452, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83955717, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13769531, + "step": 7244, + "time_per_iteration": 2.6849889755249023 + }, + { + "auxiliary_loss_clip": 0.01385478, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.26003814, + "balance_loss_mlp": 1.01944637, + "epoch": 0.43559296557943783, + "flos": 23405682194640.0, + "grad_norm": 1.692843276785745, + "language_loss": 0.73521137, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75939792, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.1373291, + "step": 7245, + "time_per_iteration": 2.716989517211914 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01037854, + "balance_loss_clip": 1.26079345, + "balance_loss_mlp": 1.02343547, + "epoch": 0.4356530888321058, + "flos": 22424189933880.0, + "grad_norm": 1.793935162320618, + "language_loss": 0.77280754, + "learning_rate": 2.508258605639389e-06, + "loss": 0.79705429, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14422607, + "step": 7246, + "time_per_iteration": 2.7329440116882324 + }, + { + "auxiliary_loss_clip": 0.01388768, + "auxiliary_loss_mlp": 0.01041203, + "balance_loss_clip": 1.2628144, + "balance_loss_mlp": 1.0257889, + "epoch": 0.43571321208477376, + "flos": 21621127117680.0, + "grad_norm": 1.6728172231963974, + "language_loss": 0.8593955, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.88369524, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15429688, + "step": 7247, + "time_per_iteration": 2.7604281902313232 + }, + { + "auxiliary_loss_clip": 0.01388923, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.26309741, + "balance_loss_mlp": 1.02092433, + "epoch": 0.4357733353374418, + "flos": 23992607381040.0, + "grad_norm": 1.5359475042722055, + "language_loss": 0.72283864, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74707717, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14001465, + "step": 7248, + "time_per_iteration": 2.79072904586792 + }, + { + "auxiliary_loss_clip": 0.01384751, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.25986099, + "balance_loss_mlp": 1.02073884, + "epoch": 0.43583345859010975, + "flos": 25270201778400.0, + "grad_norm": 1.54166240506658, + "language_loss": 0.8730557, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89725906, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14849854, + "step": 7249, + "time_per_iteration": 2.785677433013916 + }, + { + "auxiliary_loss_clip": 0.01396015, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.26664734, + "balance_loss_mlp": 1.03040516, + "epoch": 0.4358935818427777, + "flos": 23701621897800.0, + "grad_norm": 1.8497867466199667, + "language_loss": 0.81500137, + "learning_rate": 2.506751748594683e-06, + "loss": 0.83941323, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.14727783, + "step": 7250, + "time_per_iteration": 2.86088228225708 + }, + { + "auxiliary_loss_clip": 0.01391886, + "auxiliary_loss_mlp": 0.01034333, + "balance_loss_clip": 1.26547766, + "balance_loss_mlp": 1.01982546, + "epoch": 0.4359537050954457, + "flos": 29538752549040.0, + "grad_norm": 1.8264671139165, + "language_loss": 0.85458338, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.87884557, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.14520264, + "step": 7251, + "time_per_iteration": 2.764644145965576 + }, + { + "auxiliary_loss_clip": 0.01383847, + "auxiliary_loss_mlp": 0.01041659, + "balance_loss_clip": 1.25871396, + "balance_loss_mlp": 1.02659082, + "epoch": 0.43601382834811364, + "flos": 22716474884640.0, + "grad_norm": 1.6297465369678124, + "language_loss": 0.69726533, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.72152042, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.15075684, + "step": 7252, + "time_per_iteration": 4.127485036849976 + }, + { + "auxiliary_loss_clip": 0.01383723, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.26063895, + "balance_loss_mlp": 1.0232358, + "epoch": 0.4360739516007816, + "flos": 19103402858040.0, + "grad_norm": 1.6066763913128257, + "language_loss": 0.83775592, + "learning_rate": 2.505621403992348e-06, + "loss": 0.86197466, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14904785, + "step": 7253, + "time_per_iteration": 2.757044792175293 + }, + { + "auxiliary_loss_clip": 0.01388381, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.26390004, + "balance_loss_mlp": 1.02587318, + "epoch": 0.43613407485344957, + "flos": 23409702422280.0, + "grad_norm": 1.574230764889661, + "language_loss": 0.70375991, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72805619, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.15393066, + "step": 7254, + "time_per_iteration": 2.7515885829925537 + }, + { + "auxiliary_loss_clip": 0.01382509, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.25929368, + "balance_loss_mlp": 1.02548885, + "epoch": 0.43619419810611754, + "flos": 22642926498720.0, + "grad_norm": 1.6854927450478503, + "language_loss": 0.81798542, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.84220839, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14306641, + "step": 7255, + "time_per_iteration": 2.764732837677002 + }, + { + "auxiliary_loss_clip": 0.01390574, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.26358628, + "balance_loss_mlp": 1.02601933, + "epoch": 0.4362543213587855, + "flos": 20052831495600.0, + "grad_norm": 1.842048664327556, + "language_loss": 0.77637446, + "learning_rate": 2.504490886831089e-06, + "loss": 0.80068445, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.1439209, + "step": 7256, + "time_per_iteration": 2.7133452892303467 + }, + { + "auxiliary_loss_clip": 0.01386952, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.26406789, + "balance_loss_mlp": 1.01886201, + "epoch": 0.43631444461145347, + "flos": 21366347310360.0, + "grad_norm": 1.5042280843547948, + "language_loss": 0.75805187, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78225392, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14379883, + "step": 7257, + "time_per_iteration": 2.7368900775909424 + }, + { + "auxiliary_loss_clip": 0.01384879, + "auxiliary_loss_mlp": 0.01039098, + "balance_loss_clip": 1.25820851, + "balance_loss_mlp": 1.02336276, + "epoch": 0.43637456786412143, + "flos": 22423580808480.0, + "grad_norm": 1.791591736600125, + "language_loss": 0.73713684, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.76137662, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.1574707, + "step": 7258, + "time_per_iteration": 4.310421466827393 + }, + { + "auxiliary_loss_clip": 0.01397543, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.26996922, + "balance_loss_mlp": 1.02419901, + "epoch": 0.4364346911167894, + "flos": 28554783178320.0, + "grad_norm": 1.6719476740778063, + "language_loss": 0.7733922, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.79775631, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.14678955, + "step": 7259, + "time_per_iteration": 4.2275779247283936 + }, + { + "auxiliary_loss_clip": 0.01203982, + "auxiliary_loss_mlp": 0.01025162, + "balance_loss_clip": 1.15119696, + "balance_loss_mlp": 1.02264619, + "epoch": 0.43649481436945736, + "flos": 62674025895000.0, + "grad_norm": 0.7462754893542038, + "language_loss": 0.57043087, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.5927223, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.02514648, + "step": 7260, + "time_per_iteration": 3.2164363861083984 + }, + { + "auxiliary_loss_clip": 0.01388114, + "auxiliary_loss_mlp": 0.01048111, + "balance_loss_clip": 1.26219344, + "balance_loss_mlp": 1.0326916, + "epoch": 0.4365549376221254, + "flos": 30598341332040.0, + "grad_norm": 1.830543611401372, + "language_loss": 0.70885158, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.73321384, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.15411377, + "step": 7261, + "time_per_iteration": 2.809159994125366 + }, + { + "auxiliary_loss_clip": 0.01387262, + "auxiliary_loss_mlp": 0.0104351, + "balance_loss_clip": 1.26148939, + "balance_loss_mlp": 1.02785778, + "epoch": 0.43661506087479335, + "flos": 17170410933360.0, + "grad_norm": 2.8938093108715375, + "language_loss": 0.6948247, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71913236, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.15661621, + "step": 7262, + "time_per_iteration": 2.6828126907348633 + }, + { + "auxiliary_loss_clip": 0.013776, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.25849915, + "balance_loss_mlp": 1.02124596, + "epoch": 0.4366751841274613, + "flos": 22051696826160.0, + "grad_norm": 1.66437005776567, + "language_loss": 0.79938686, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82350779, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.13250732, + "step": 7263, + "time_per_iteration": 2.73483943939209 + }, + { + "auxiliary_loss_clip": 0.01384941, + "auxiliary_loss_mlp": 0.0104668, + "balance_loss_clip": 1.26158345, + "balance_loss_mlp": 1.03140974, + "epoch": 0.4367353073801293, + "flos": 16001555388840.0, + "grad_norm": 1.7149322914193776, + "language_loss": 0.75774693, + "learning_rate": 2.50147533371401e-06, + "loss": 0.78206319, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.15270996, + "step": 7264, + "time_per_iteration": 2.664813756942749 + }, + { + "auxiliary_loss_clip": 0.0138254, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.25944364, + "balance_loss_mlp": 1.01786876, + "epoch": 0.43679543063279724, + "flos": 38224737630000.0, + "grad_norm": 2.0814949425867213, + "language_loss": 0.61978471, + "learning_rate": 2.501098303852298e-06, + "loss": 0.64393491, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.14624023, + "step": 7265, + "time_per_iteration": 4.472175598144531 + }, + { + "auxiliary_loss_clip": 0.0138094, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.25866413, + "balance_loss_mlp": 1.01741481, + "epoch": 0.4368555538854652, + "flos": 15197193105120.0, + "grad_norm": 1.9759149337658435, + "language_loss": 0.72650033, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75062215, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.1383667, + "step": 7266, + "time_per_iteration": 2.8025693893432617 + }, + { + "auxiliary_loss_clip": 0.01388382, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.26306558, + "balance_loss_mlp": 1.02278864, + "epoch": 0.4369156771381332, + "flos": 23073414990480.0, + "grad_norm": 1.899924532923576, + "language_loss": 0.82692921, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.85119247, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.15179443, + "step": 7267, + "time_per_iteration": 2.7315866947174072 + }, + { + "auxiliary_loss_clip": 0.01379066, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.25705731, + "balance_loss_mlp": 1.01756632, + "epoch": 0.43697580039080114, + "flos": 23446395398520.0, + "grad_norm": 1.836488576950244, + "language_loss": 0.74959308, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.77370429, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14477539, + "step": 7268, + "time_per_iteration": 2.7404704093933105 + }, + { + "auxiliary_loss_clip": 0.0139066, + "auxiliary_loss_mlp": 0.01035597, + "balance_loss_clip": 1.26365006, + "balance_loss_mlp": 1.01999283, + "epoch": 0.4370359236434691, + "flos": 18519239040120.0, + "grad_norm": 2.2572169476114756, + "language_loss": 0.80006385, + "learning_rate": 2.499589994531454e-06, + "loss": 0.8243264, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15612793, + "step": 7269, + "time_per_iteration": 2.704667091369629 + }, + { + "auxiliary_loss_clip": 0.01386368, + "auxiliary_loss_mlp": 0.01035674, + "balance_loss_clip": 1.26333404, + "balance_loss_mlp": 1.02060568, + "epoch": 0.43709604689613707, + "flos": 23227821267120.0, + "grad_norm": 1.7982490745904154, + "language_loss": 0.75450766, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77872813, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.15063477, + "step": 7270, + "time_per_iteration": 2.844820737838745 + }, + { + "auxiliary_loss_clip": 0.01390462, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.26505971, + "balance_loss_mlp": 1.01460099, + "epoch": 0.43715617014880503, + "flos": 23808777024600.0, + "grad_norm": 1.8237674523316574, + "language_loss": 0.79301572, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81720769, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.14147949, + "step": 7271, + "time_per_iteration": 2.769683837890625 + }, + { + "auxiliary_loss_clip": 0.01201169, + "auxiliary_loss_mlp": 0.01016077, + "balance_loss_clip": 1.14909458, + "balance_loss_mlp": 1.01288247, + "epoch": 0.437216293401473, + "flos": 61957303098120.0, + "grad_norm": 0.6905314068865932, + "language_loss": 0.54929841, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.57147086, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.03198242, + "step": 7272, + "time_per_iteration": 3.322129964828491 + }, + { + "auxiliary_loss_clip": 0.01396909, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.26997554, + "balance_loss_mlp": 1.01633036, + "epoch": 0.43727641665414096, + "flos": 21987447754680.0, + "grad_norm": 1.6784392177498755, + "language_loss": 0.70485258, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72914183, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.15686035, + "step": 7273, + "time_per_iteration": 2.771054744720459 + }, + { + "auxiliary_loss_clip": 0.01393063, + "auxiliary_loss_mlp": 0.01037775, + "balance_loss_clip": 1.26497698, + "balance_loss_mlp": 1.02218294, + "epoch": 0.437336539906809, + "flos": 39538212836400.0, + "grad_norm": 1.9802733287494314, + "language_loss": 0.75968611, + "learning_rate": 2.497704181736367e-06, + "loss": 0.7839945, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15588379, + "step": 7274, + "time_per_iteration": 2.868516445159912 + }, + { + "auxiliary_loss_clip": 0.01383864, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.26034832, + "balance_loss_mlp": 1.01694977, + "epoch": 0.43739666315947695, + "flos": 17461640066760.0, + "grad_norm": 1.7079020971884746, + "language_loss": 0.80478716, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82892799, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.13287354, + "step": 7275, + "time_per_iteration": 2.767343282699585 + }, + { + "auxiliary_loss_clip": 0.01387091, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.26343656, + "balance_loss_mlp": 1.02422464, + "epoch": 0.4374567864121449, + "flos": 16362596939040.0, + "grad_norm": 2.027601640600543, + "language_loss": 0.80779231, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83205521, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.14978027, + "step": 7276, + "time_per_iteration": 2.7043159008026123 + }, + { + "auxiliary_loss_clip": 0.01403334, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.27220237, + "balance_loss_mlp": 1.01434195, + "epoch": 0.4375169096648129, + "flos": 30593143461960.0, + "grad_norm": 1.7548532465498974, + "language_loss": 0.73534745, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75967997, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15563965, + "step": 7277, + "time_per_iteration": 2.82812762260437 + }, + { + "auxiliary_loss_clip": 0.01383509, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.26010585, + "balance_loss_mlp": 1.01792312, + "epoch": 0.43757703291748085, + "flos": 30561810789240.0, + "grad_norm": 2.043367942704677, + "language_loss": 0.7236535, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74781954, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.15167236, + "step": 7278, + "time_per_iteration": 2.8376030921936035 + }, + { + "auxiliary_loss_clip": 0.01384118, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.26252031, + "balance_loss_mlp": 1.01965308, + "epoch": 0.4376371561701488, + "flos": 21402349944480.0, + "grad_norm": 1.7013360760696346, + "language_loss": 0.66248655, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68666267, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.13848877, + "step": 7279, + "time_per_iteration": 2.8085286617279053 + }, + { + "auxiliary_loss_clip": 0.01403024, + "auxiliary_loss_mlp": 0.01038503, + "balance_loss_clip": 1.27328813, + "balance_loss_mlp": 1.02334583, + "epoch": 0.4376972794228168, + "flos": 23409661813920.0, + "grad_norm": 1.7382547210367307, + "language_loss": 0.82000679, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.8444221, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.15155029, + "step": 7280, + "time_per_iteration": 2.757688045501709 + }, + { + "auxiliary_loss_clip": 0.0137922, + "auxiliary_loss_mlp": 0.01036441, + "balance_loss_clip": 1.25808775, + "balance_loss_mlp": 1.02176607, + "epoch": 0.43775740267548474, + "flos": 22898071781280.0, + "grad_norm": 1.5163216913590962, + "language_loss": 0.76982474, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79398131, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14678955, + "step": 7281, + "time_per_iteration": 2.7762186527252197 + }, + { + "auxiliary_loss_clip": 0.01385659, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.26061392, + "balance_loss_mlp": 1.02228951, + "epoch": 0.4378175259281527, + "flos": 23299582885200.0, + "grad_norm": 1.8751645729192337, + "language_loss": 0.7615388, + "learning_rate": 2.494685900612569e-06, + "loss": 0.78575927, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14086914, + "step": 7282, + "time_per_iteration": 2.73637318611145 + }, + { + "auxiliary_loss_clip": 0.01387941, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.26181197, + "balance_loss_mlp": 1.01633406, + "epoch": 0.43787764918082067, + "flos": 23882000543640.0, + "grad_norm": 1.901263561764418, + "language_loss": 0.85284048, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87703079, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.14758301, + "step": 7283, + "time_per_iteration": 2.7557029724121094 + }, + { + "auxiliary_loss_clip": 0.01397127, + "auxiliary_loss_mlp": 0.01033476, + "balance_loss_clip": 1.26761222, + "balance_loss_mlp": 1.01845586, + "epoch": 0.43793777243348864, + "flos": 23993500764960.0, + "grad_norm": 2.680309664219635, + "language_loss": 0.80769551, + "learning_rate": 2.49393114246007e-06, + "loss": 0.83200151, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.15032959, + "step": 7284, + "time_per_iteration": 2.8444390296936035 + }, + { + "auxiliary_loss_clip": 0.01385238, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.2614007, + "balance_loss_mlp": 1.02556145, + "epoch": 0.4379978956861566, + "flos": 18628627626720.0, + "grad_norm": 1.5732686896484314, + "language_loss": 0.80231202, + "learning_rate": 2.493553735281787e-06, + "loss": 0.8265636, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14367676, + "step": 7285, + "time_per_iteration": 2.714155912399292 + }, + { + "auxiliary_loss_clip": 0.01387557, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.26330614, + "balance_loss_mlp": 1.01562345, + "epoch": 0.43805801893882457, + "flos": 21986473154040.0, + "grad_norm": 2.164151471739152, + "language_loss": 0.74693996, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77111495, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14331055, + "step": 7286, + "time_per_iteration": 2.7740700244903564 + }, + { + "auxiliary_loss_clip": 0.0139111, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.26355207, + "balance_loss_mlp": 1.0176754, + "epoch": 0.43811814219149253, + "flos": 26398628377560.0, + "grad_norm": 1.7500898362561477, + "language_loss": 0.74164987, + "learning_rate": 2.492798864792712e-06, + "loss": 0.76588762, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.14990234, + "step": 7287, + "time_per_iteration": 2.7821455001831055 + }, + { + "auxiliary_loss_clip": 0.01389697, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_clip": 1.26305497, + "balance_loss_mlp": 1.0271697, + "epoch": 0.43817826544416055, + "flos": 17497764525960.0, + "grad_norm": 1.7309459565282497, + "language_loss": 0.82852989, + "learning_rate": 2.492421401510545e-06, + "loss": 0.85285056, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.15209961, + "step": 7288, + "time_per_iteration": 2.7092700004577637 + }, + { + "auxiliary_loss_clip": 0.01394176, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.26530433, + "balance_loss_mlp": 1.02188611, + "epoch": 0.4382383886968285, + "flos": 21586180300920.0, + "grad_norm": 1.3962827981918018, + "language_loss": 0.8411268, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86544096, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15362549, + "step": 7289, + "time_per_iteration": 2.745349168777466 + }, + { + "auxiliary_loss_clip": 0.01400239, + "auxiliary_loss_mlp": 0.01039132, + "balance_loss_clip": 1.2700907, + "balance_loss_mlp": 1.02483869, + "epoch": 0.4382985119494965, + "flos": 27928972164240.0, + "grad_norm": 1.5260205255219033, + "language_loss": 0.78587496, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.81026864, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.1428833, + "step": 7290, + "time_per_iteration": 2.820230722427368 + }, + { + "auxiliary_loss_clip": 0.01390685, + "auxiliary_loss_mlp": 0.01042474, + "balance_loss_clip": 1.26583862, + "balance_loss_mlp": 1.02832961, + "epoch": 0.43835863520216445, + "flos": 24942320277120.0, + "grad_norm": 1.8239048172241727, + "language_loss": 0.78067112, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80500269, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14135742, + "step": 7291, + "time_per_iteration": 4.259098529815674 + }, + { + "auxiliary_loss_clip": 0.01391032, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.26531482, + "balance_loss_mlp": 1.0184418, + "epoch": 0.4384187584548324, + "flos": 33516805136760.0, + "grad_norm": 1.7716550657672472, + "language_loss": 0.65169048, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67593193, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.14666748, + "step": 7292, + "time_per_iteration": 2.9070518016815186 + }, + { + "auxiliary_loss_clip": 0.01391168, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.26437235, + "balance_loss_mlp": 1.01831293, + "epoch": 0.4384788817075004, + "flos": 23956320488400.0, + "grad_norm": 1.5095776178422855, + "language_loss": 0.74806464, + "learning_rate": 2.49053380529597e-06, + "loss": 0.77230453, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.14508057, + "step": 7293, + "time_per_iteration": 2.7442872524261475 + }, + { + "auxiliary_loss_clip": 0.013899, + "auxiliary_loss_mlp": 0.01043452, + "balance_loss_clip": 1.26547384, + "balance_loss_mlp": 1.02813411, + "epoch": 0.43853900496016834, + "flos": 19103159207880.0, + "grad_norm": 1.8757925039545404, + "language_loss": 0.78851819, + "learning_rate": 2.490156230192516e-06, + "loss": 0.81285173, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.15319824, + "step": 7294, + "time_per_iteration": 2.748120069503784 + }, + { + "auxiliary_loss_clip": 0.01392464, + "auxiliary_loss_mlp": 0.01043385, + "balance_loss_clip": 1.26616859, + "balance_loss_mlp": 1.02897286, + "epoch": 0.4385991282128363, + "flos": 13229376188760.0, + "grad_norm": 1.5910662670613496, + "language_loss": 0.73316646, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75752497, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.144104, + "step": 7295, + "time_per_iteration": 2.803360939025879 + }, + { + "auxiliary_loss_clip": 0.01393671, + "auxiliary_loss_mlp": 0.01045113, + "balance_loss_clip": 1.26726055, + "balance_loss_mlp": 1.02990222, + "epoch": 0.4386592514655043, + "flos": 14324033613600.0, + "grad_norm": 2.187033727601717, + "language_loss": 0.74933904, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77372682, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.15197754, + "step": 7296, + "time_per_iteration": 2.8658106327056885 + }, + { + "auxiliary_loss_clip": 0.01385172, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.2603035, + "balance_loss_mlp": 1.02311587, + "epoch": 0.43871937471817224, + "flos": 22789657795320.0, + "grad_norm": 1.506920488615041, + "language_loss": 0.6944102, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71863925, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.14611816, + "step": 7297, + "time_per_iteration": 4.241605043411255 + }, + { + "auxiliary_loss_clip": 0.01387111, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.26296651, + "balance_loss_mlp": 1.01823795, + "epoch": 0.4387794979708402, + "flos": 28077896312280.0, + "grad_norm": 1.4575853414815878, + "language_loss": 0.70581996, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.73001295, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.13928223, + "step": 7298, + "time_per_iteration": 4.253544569015503 + }, + { + "auxiliary_loss_clip": 0.01385054, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.26188576, + "balance_loss_mlp": 1.0202949, + "epoch": 0.43883962122350817, + "flos": 26255105141400.0, + "grad_norm": 1.7042544273737472, + "language_loss": 0.72272265, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74691403, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.13775635, + "step": 7299, + "time_per_iteration": 2.7690253257751465 + }, + { + "auxiliary_loss_clip": 0.01390903, + "auxiliary_loss_mlp": 0.01039163, + "balance_loss_clip": 1.26384068, + "balance_loss_mlp": 1.02380884, + "epoch": 0.43889974447617613, + "flos": 25889149979640.0, + "grad_norm": 1.7926709632999678, + "language_loss": 0.7686305, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79293114, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.15356445, + "step": 7300, + "time_per_iteration": 2.7884576320648193 + }, + { + "auxiliary_loss_clip": 0.0138447, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.25918365, + "balance_loss_mlp": 1.01933408, + "epoch": 0.43895986772884416, + "flos": 25052155555680.0, + "grad_norm": 1.604310080827778, + "language_loss": 0.70777869, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.73196435, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14758301, + "step": 7301, + "time_per_iteration": 2.7851576805114746 + }, + { + "auxiliary_loss_clip": 0.0139742, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.27139354, + "balance_loss_mlp": 1.02270937, + "epoch": 0.4390199909815121, + "flos": 26000162900640.0, + "grad_norm": 1.806558818698072, + "language_loss": 0.71354496, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.73790318, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.15686035, + "step": 7302, + "time_per_iteration": 2.905829906463623 + }, + { + "auxiliary_loss_clip": 0.01390255, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.26671791, + "balance_loss_mlp": 1.02703476, + "epoch": 0.4390801142341801, + "flos": 29028137117040.0, + "grad_norm": 1.6535769771224313, + "language_loss": 0.82322359, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84753799, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.14147949, + "step": 7303, + "time_per_iteration": 2.802567481994629 + }, + { + "auxiliary_loss_clip": 0.01399186, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.26919556, + "balance_loss_mlp": 1.02271104, + "epoch": 0.43914023748684805, + "flos": 33445205952120.0, + "grad_norm": 2.1072133519091367, + "language_loss": 0.68141627, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70578611, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.15100098, + "step": 7304, + "time_per_iteration": 4.327269554138184 + }, + { + "auxiliary_loss_clip": 0.01377358, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.2555275, + "balance_loss_mlp": 1.02115178, + "epoch": 0.439200360739516, + "flos": 34538482692720.0, + "grad_norm": 1.6274880258746742, + "language_loss": 0.7846247, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80874509, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13525391, + "step": 7305, + "time_per_iteration": 2.8463258743286133 + }, + { + "auxiliary_loss_clip": 0.01385813, + "auxiliary_loss_mlp": 0.01034263, + "balance_loss_clip": 1.26140296, + "balance_loss_mlp": 1.01970768, + "epoch": 0.439260483992184, + "flos": 21912843551400.0, + "grad_norm": 2.0059432809346247, + "language_loss": 0.68908989, + "learning_rate": 2.485623883278308e-06, + "loss": 0.71329057, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14532471, + "step": 7306, + "time_per_iteration": 2.776808500289917 + }, + { + "auxiliary_loss_clip": 0.01391064, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.26540017, + "balance_loss_mlp": 1.01861382, + "epoch": 0.43932060724485195, + "flos": 21001407357600.0, + "grad_norm": 1.873897608768361, + "language_loss": 0.62928152, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.6535278, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.1496582, + "step": 7307, + "time_per_iteration": 2.81684947013855 + }, + { + "auxiliary_loss_clip": 0.01397152, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.2700243, + "balance_loss_mlp": 1.01882827, + "epoch": 0.4393807304975199, + "flos": 17751326082480.0, + "grad_norm": 2.1813613739577473, + "language_loss": 0.72039926, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74469841, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.13934326, + "step": 7308, + "time_per_iteration": 2.737912654876709 + }, + { + "auxiliary_loss_clip": 0.01392074, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.26460183, + "balance_loss_mlp": 1.02590561, + "epoch": 0.4394408537501879, + "flos": 22533659737200.0, + "grad_norm": 1.7110083210222935, + "language_loss": 0.76978874, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79411149, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.14306641, + "step": 7309, + "time_per_iteration": 2.845428228378296 + }, + { + "auxiliary_loss_clip": 0.01379065, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.25957096, + "balance_loss_mlp": 1.01701522, + "epoch": 0.43950097700285584, + "flos": 23445908098200.0, + "grad_norm": 1.7275700877763027, + "language_loss": 0.70765966, + "learning_rate": 2.484112510474251e-06, + "loss": 0.73175168, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13122559, + "step": 7310, + "time_per_iteration": 2.802513360977173 + }, + { + "auxiliary_loss_clip": 0.0139688, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.26957321, + "balance_loss_mlp": 1.02306342, + "epoch": 0.4395611002555238, + "flos": 23185321295400.0, + "grad_norm": 2.296192016587136, + "language_loss": 0.75745749, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78180796, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15100098, + "step": 7311, + "time_per_iteration": 2.865161180496216 + }, + { + "auxiliary_loss_clip": 0.01397282, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.2717222, + "balance_loss_mlp": 1.0265882, + "epoch": 0.43962122350819177, + "flos": 22132473500160.0, + "grad_norm": 2.081974210531132, + "language_loss": 0.82009423, + "learning_rate": 2.483356713869341e-06, + "loss": 0.84446836, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.13555908, + "step": 7312, + "time_per_iteration": 2.8222975730895996 + }, + { + "auxiliary_loss_clip": 0.01388115, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.26470637, + "balance_loss_mlp": 1.01858032, + "epoch": 0.43968134676085974, + "flos": 17425028307240.0, + "grad_norm": 1.9293415459704117, + "language_loss": 0.8533293, + "learning_rate": 2.482978788066318e-06, + "loss": 0.8775382, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.14202881, + "step": 7313, + "time_per_iteration": 2.7148993015289307 + }, + { + "auxiliary_loss_clip": 0.01394646, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.26764584, + "balance_loss_mlp": 1.02521408, + "epoch": 0.43974147001352776, + "flos": 18957280686840.0, + "grad_norm": 2.117546555976463, + "language_loss": 0.68200505, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.70634168, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.13818359, + "step": 7314, + "time_per_iteration": 2.7920334339141846 + }, + { + "auxiliary_loss_clip": 0.01400113, + "auxiliary_loss_mlp": 0.01036099, + "balance_loss_clip": 1.27273512, + "balance_loss_mlp": 1.02086425, + "epoch": 0.4398015932661957, + "flos": 18958255287480.0, + "grad_norm": 1.6830255967320618, + "language_loss": 0.77123052, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79559267, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15234375, + "step": 7315, + "time_per_iteration": 2.742227554321289 + }, + { + "auxiliary_loss_clip": 0.01386603, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.26227713, + "balance_loss_mlp": 1.02018785, + "epoch": 0.4398617165188637, + "flos": 24204359307960.0, + "grad_norm": 3.1370147047418464, + "language_loss": 0.75031507, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.77452588, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14306641, + "step": 7316, + "time_per_iteration": 2.7864034175872803 + }, + { + "auxiliary_loss_clip": 0.0139174, + "auxiliary_loss_mlp": 0.01040963, + "balance_loss_clip": 1.26882935, + "balance_loss_mlp": 1.02679479, + "epoch": 0.43992183977153165, + "flos": 22241780870040.0, + "grad_norm": 3.2670971107803775, + "language_loss": 0.65065169, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67497873, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.1418457, + "step": 7317, + "time_per_iteration": 2.7684438228607178 + }, + { + "auxiliary_loss_clip": 0.01395823, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.27005208, + "balance_loss_mlp": 1.02085805, + "epoch": 0.4399819630241996, + "flos": 18702094795920.0, + "grad_norm": 2.220234320837138, + "language_loss": 0.80408239, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.82839686, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14752197, + "step": 7318, + "time_per_iteration": 2.7798726558685303 + }, + { + "auxiliary_loss_clip": 0.0140014, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.27138901, + "balance_loss_mlp": 1.02736056, + "epoch": 0.4400420862768676, + "flos": 23884802520480.0, + "grad_norm": 1.7458711262172169, + "language_loss": 0.7993679, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.82378966, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14672852, + "step": 7319, + "time_per_iteration": 2.7817773818969727 + }, + { + "auxiliary_loss_clip": 0.01397652, + "auxiliary_loss_mlp": 0.01045122, + "balance_loss_clip": 1.27182198, + "balance_loss_mlp": 1.02951145, + "epoch": 0.44010220952953555, + "flos": 28043436795840.0, + "grad_norm": 1.784949260110848, + "language_loss": 0.79844165, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.8228693, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.15625, + "step": 7320, + "time_per_iteration": 2.7933990955352783 + }, + { + "auxiliary_loss_clip": 0.01392754, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.26871347, + "balance_loss_mlp": 1.0234288, + "epoch": 0.4401623327822035, + "flos": 23774276899800.0, + "grad_norm": 1.5839860970447723, + "language_loss": 0.69868195, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.72298229, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.13861084, + "step": 7321, + "time_per_iteration": 2.7817630767822266 + }, + { + "auxiliary_loss_clip": 0.0121134, + "auxiliary_loss_mlp": 0.0101871, + "balance_loss_clip": 1.15881467, + "balance_loss_mlp": 1.01629055, + "epoch": 0.4402224560348715, + "flos": 70792334152200.0, + "grad_norm": 0.9205014758622663, + "language_loss": 0.56931913, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59161961, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02416992, + "step": 7322, + "time_per_iteration": 3.330634593963623 + }, + { + "auxiliary_loss_clip": 0.01389423, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.26657987, + "balance_loss_mlp": 1.02640581, + "epoch": 0.44028257928753944, + "flos": 22896772313760.0, + "grad_norm": 1.4256221526012638, + "language_loss": 0.76243341, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78672069, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.12908936, + "step": 7323, + "time_per_iteration": 2.803679943084717 + }, + { + "auxiliary_loss_clip": 0.01396461, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.2705195, + "balance_loss_mlp": 1.02859712, + "epoch": 0.4403427025402074, + "flos": 17900412663960.0, + "grad_norm": 1.4974552435230417, + "language_loss": 0.81046474, + "learning_rate": 2.478820398622511e-06, + "loss": 0.83486527, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.14996338, + "step": 7324, + "time_per_iteration": 2.7292535305023193 + }, + { + "auxiliary_loss_clip": 0.01213863, + "auxiliary_loss_mlp": 0.01009647, + "balance_loss_clip": 1.16051602, + "balance_loss_mlp": 1.00709617, + "epoch": 0.4404028257928754, + "flos": 69579070043040.0, + "grad_norm": 0.6769151980205724, + "language_loss": 0.54558718, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56782228, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.0255127, + "step": 7325, + "time_per_iteration": 3.2210021018981934 + }, + { + "auxiliary_loss_clip": 0.0139486, + "auxiliary_loss_mlp": 0.01026613, + "balance_loss_clip": 1.27286315, + "balance_loss_mlp": 1.01406038, + "epoch": 0.44046294904554334, + "flos": 20928671138880.0, + "grad_norm": 1.5279584099788033, + "language_loss": 0.69811559, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.72233027, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.12554932, + "step": 7326, + "time_per_iteration": 2.7492573261260986 + }, + { + "auxiliary_loss_clip": 0.01389919, + "auxiliary_loss_mlp": 0.01027898, + "balance_loss_clip": 1.26813078, + "balance_loss_mlp": 1.01354516, + "epoch": 0.44052307229821136, + "flos": 23628804462360.0, + "grad_norm": 1.4818412501970168, + "language_loss": 0.76616025, + "learning_rate": 2.477685910312432e-06, + "loss": 0.79033834, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14355469, + "step": 7327, + "time_per_iteration": 2.743434429168701 + }, + { + "auxiliary_loss_clip": 0.01393483, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.27180815, + "balance_loss_mlp": 1.01628363, + "epoch": 0.4405831955508793, + "flos": 17601467942160.0, + "grad_norm": 4.697823903807814, + "language_loss": 0.84051013, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86474228, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13452148, + "step": 7328, + "time_per_iteration": 2.715914011001587 + }, + { + "auxiliary_loss_clip": 0.01390744, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.26961136, + "balance_loss_mlp": 1.01350808, + "epoch": 0.4406433188035473, + "flos": 21467451791520.0, + "grad_norm": 1.8785436574860603, + "language_loss": 0.78248268, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80666566, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.14050293, + "step": 7329, + "time_per_iteration": 4.15577507019043 + }, + { + "auxiliary_loss_clip": 0.01393291, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.26779449, + "balance_loss_mlp": 1.01517439, + "epoch": 0.44070344205621526, + "flos": 22678604265960.0, + "grad_norm": 1.6363160927412308, + "language_loss": 0.74279016, + "learning_rate": 2.476551258977278e-06, + "loss": 0.76702422, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14941406, + "step": 7330, + "time_per_iteration": 2.7870171070098877 + }, + { + "auxiliary_loss_clip": 0.01394991, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.27223015, + "balance_loss_mlp": 1.0168848, + "epoch": 0.4407635653088832, + "flos": 23446517223600.0, + "grad_norm": 1.8435237847967347, + "language_loss": 0.75008726, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77434695, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14099121, + "step": 7331, + "time_per_iteration": 2.865180492401123 + }, + { + "auxiliary_loss_clip": 0.01390996, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.26902723, + "balance_loss_mlp": 1.01915467, + "epoch": 0.4408236885615512, + "flos": 24026214121920.0, + "grad_norm": 1.4935226014955978, + "language_loss": 0.76205802, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78629565, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.1361084, + "step": 7332, + "time_per_iteration": 2.8199236392974854 + }, + { + "auxiliary_loss_clip": 0.01388363, + "auxiliary_loss_mlp": 0.01036096, + "balance_loss_clip": 1.2651999, + "balance_loss_mlp": 1.02209473, + "epoch": 0.44088381181421915, + "flos": 12680605879560.0, + "grad_norm": 1.8972900919076423, + "language_loss": 0.73539984, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75964439, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14001465, + "step": 7333, + "time_per_iteration": 2.7360575199127197 + }, + { + "auxiliary_loss_clip": 0.01382345, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.26375806, + "balance_loss_mlp": 1.01595056, + "epoch": 0.4409439350668871, + "flos": 24575025039480.0, + "grad_norm": 1.6502693295738544, + "language_loss": 0.79691303, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.82103205, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13616943, + "step": 7334, + "time_per_iteration": 2.7544257640838623 + }, + { + "auxiliary_loss_clip": 0.0140878, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.27682662, + "balance_loss_mlp": 1.02079201, + "epoch": 0.4410040583195551, + "flos": 22672959703920.0, + "grad_norm": 3.4561679445725217, + "language_loss": 0.75723135, + "learning_rate": 2.47465981219252e-06, + "loss": 0.78170133, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.17431641, + "step": 7335, + "time_per_iteration": 2.9542651176452637 + }, + { + "auxiliary_loss_clip": 0.01388809, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.2651608, + "balance_loss_mlp": 1.02026176, + "epoch": 0.44106418157222305, + "flos": 10856068549200.0, + "grad_norm": 1.7506708351833775, + "language_loss": 0.72298926, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74722779, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.14782715, + "step": 7336, + "time_per_iteration": 4.325063467025757 + }, + { + "auxiliary_loss_clip": 0.01397976, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_clip": 1.27038753, + "balance_loss_mlp": 1.02679372, + "epoch": 0.441124304824891, + "flos": 21731855780160.0, + "grad_norm": 2.1897920405327054, + "language_loss": 0.64134347, + "learning_rate": 2.473903107384165e-06, + "loss": 0.66574073, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.1496582, + "step": 7337, + "time_per_iteration": 4.208892822265625 + }, + { + "auxiliary_loss_clip": 0.01212203, + "auxiliary_loss_mlp": 0.01003589, + "balance_loss_clip": 1.15954995, + "balance_loss_mlp": 1.00091922, + "epoch": 0.441184428077559, + "flos": 63237049738560.0, + "grad_norm": 0.7486404477080293, + "language_loss": 0.5262596, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54841751, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.0267334, + "step": 7338, + "time_per_iteration": 3.2496964931488037 + }, + { + "auxiliary_loss_clip": 0.01400794, + "auxiliary_loss_mlp": 0.01049246, + "balance_loss_clip": 1.27150595, + "balance_loss_mlp": 1.03278363, + "epoch": 0.44124455133022694, + "flos": 21182841820800.0, + "grad_norm": 1.874826335307606, + "language_loss": 0.70937425, + "learning_rate": 2.473146330693997e-06, + "loss": 0.73387468, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.16473389, + "step": 7339, + "time_per_iteration": 2.745863676071167 + }, + { + "auxiliary_loss_clip": 0.01377873, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.26102042, + "balance_loss_mlp": 1.02196085, + "epoch": 0.4413046745828949, + "flos": 17462452233960.0, + "grad_norm": 1.521837512057801, + "language_loss": 0.69820964, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72233796, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.12994385, + "step": 7340, + "time_per_iteration": 2.6988437175750732 + }, + { + "auxiliary_loss_clip": 0.01211593, + "auxiliary_loss_mlp": 0.01004498, + "balance_loss_clip": 1.15902877, + "balance_loss_mlp": 1.00178003, + "epoch": 0.4413647978355629, + "flos": 61598721676680.0, + "grad_norm": 0.9021401525653978, + "language_loss": 0.64015478, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66231567, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02722168, + "step": 7341, + "time_per_iteration": 3.0183942317962646 + }, + { + "auxiliary_loss_clip": 0.01388497, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.26577234, + "balance_loss_mlp": 1.01589823, + "epoch": 0.4414249210882309, + "flos": 27532780755480.0, + "grad_norm": 2.218836279927064, + "language_loss": 0.73901522, + "learning_rate": 2.47201103113145e-06, + "loss": 0.76320231, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14331055, + "step": 7342, + "time_per_iteration": 2.818594217300415 + }, + { + "auxiliary_loss_clip": 0.01383082, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.26045251, + "balance_loss_mlp": 1.01920104, + "epoch": 0.44148504434089886, + "flos": 23519294050680.0, + "grad_norm": 1.8699208270792034, + "language_loss": 0.798572, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.82273805, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14331055, + "step": 7343, + "time_per_iteration": 2.7768912315368652 + }, + { + "auxiliary_loss_clip": 0.0138223, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.26067531, + "balance_loss_mlp": 1.01913166, + "epoch": 0.4415451675935668, + "flos": 21585693000600.0, + "grad_norm": 1.5834629130698559, + "language_loss": 0.76728714, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.79143989, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13909912, + "step": 7344, + "time_per_iteration": 4.380061149597168 + }, + { + "auxiliary_loss_clip": 0.01208423, + "auxiliary_loss_mlp": 0.01007739, + "balance_loss_clip": 1.15558493, + "balance_loss_mlp": 1.00516367, + "epoch": 0.4416052908462348, + "flos": 59019997014360.0, + "grad_norm": 0.7907625503460984, + "language_loss": 0.63831168, + "learning_rate": 2.470875570480556e-06, + "loss": 0.66047329, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.02575684, + "step": 7345, + "time_per_iteration": 2.9429352283477783 + }, + { + "auxiliary_loss_clip": 0.01384691, + "auxiliary_loss_mlp": 0.01028602, + "balance_loss_clip": 1.26032281, + "balance_loss_mlp": 1.01409435, + "epoch": 0.44166541409890275, + "flos": 26363031827040.0, + "grad_norm": 1.780613027728469, + "language_loss": 0.86213481, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88626778, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.14501953, + "step": 7346, + "time_per_iteration": 2.7865610122680664 + }, + { + "auxiliary_loss_clip": 0.01384171, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.26194024, + "balance_loss_mlp": 1.02127814, + "epoch": 0.4417255373515707, + "flos": 20197410549120.0, + "grad_norm": 2.491152259915654, + "language_loss": 0.8050406, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82924789, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.15283203, + "step": 7347, + "time_per_iteration": 2.8086447715759277 + }, + { + "auxiliary_loss_clip": 0.0138353, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.25935531, + "balance_loss_mlp": 1.01796377, + "epoch": 0.4417856606042387, + "flos": 17891519433120.0, + "grad_norm": 2.0245770803423584, + "language_loss": 0.83137393, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.85553455, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14562988, + "step": 7348, + "time_per_iteration": 2.7500052452087402 + }, + { + "auxiliary_loss_clip": 0.01386897, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_clip": 1.26202214, + "balance_loss_mlp": 1.02692246, + "epoch": 0.44184578385690665, + "flos": 27969766584840.0, + "grad_norm": 2.1002123272292232, + "language_loss": 0.70576572, + "learning_rate": 2.469361373033938e-06, + "loss": 0.73005337, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14953613, + "step": 7349, + "time_per_iteration": 2.854509115219116 + }, + { + "auxiliary_loss_clip": 0.01383576, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.25873137, + "balance_loss_mlp": 1.01910448, + "epoch": 0.4419059071095746, + "flos": 23373171879480.0, + "grad_norm": 1.6181841448974297, + "language_loss": 0.74249452, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76667732, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.15594482, + "step": 7350, + "time_per_iteration": 2.7354538440704346 + }, + { + "auxiliary_loss_clip": 0.01379568, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.25593066, + "balance_loss_mlp": 1.02021885, + "epoch": 0.4419660303622426, + "flos": 15016286550600.0, + "grad_norm": 2.0068953200208997, + "language_loss": 0.81488407, + "learning_rate": 2.468604167463827e-06, + "loss": 0.83902383, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14190674, + "step": 7351, + "time_per_iteration": 2.7858376502990723 + }, + { + "auxiliary_loss_clip": 0.01370515, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.25237608, + "balance_loss_mlp": 1.02137434, + "epoch": 0.44202615361491054, + "flos": 25376910213240.0, + "grad_norm": 1.5522192723598918, + "language_loss": 0.73181617, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75586283, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.12774658, + "step": 7352, + "time_per_iteration": 2.837844133377075 + }, + { + "auxiliary_loss_clip": 0.01382642, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.26058459, + "balance_loss_mlp": 1.01758385, + "epoch": 0.4420862768675785, + "flos": 24686525260800.0, + "grad_norm": 2.012846475462606, + "language_loss": 0.87788761, + "learning_rate": 2.467846890815649e-06, + "loss": 0.90203059, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14080811, + "step": 7353, + "time_per_iteration": 2.7686924934387207 + }, + { + "auxiliary_loss_clip": 0.01383478, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.25990832, + "balance_loss_mlp": 1.02528179, + "epoch": 0.44214640012024653, + "flos": 19532104581960.0, + "grad_norm": 1.879455600129492, + "language_loss": 0.76395345, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78817475, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.13378906, + "step": 7354, + "time_per_iteration": 2.816833734512329 + }, + { + "auxiliary_loss_clip": 0.01371333, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.25295401, + "balance_loss_mlp": 1.01767039, + "epoch": 0.4422065233729145, + "flos": 47565795371400.0, + "grad_norm": 2.1144097920063087, + "language_loss": 0.65092438, + "learning_rate": 2.467089543204268e-06, + "loss": 0.6749422, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.12792969, + "step": 7355, + "time_per_iteration": 3.1642119884490967 + }, + { + "auxiliary_loss_clip": 0.01385592, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.25904942, + "balance_loss_mlp": 1.01937425, + "epoch": 0.44226664662558246, + "flos": 19285730705160.0, + "grad_norm": 1.7162773306570631, + "language_loss": 0.7850467, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80923998, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14373779, + "step": 7356, + "time_per_iteration": 2.8698065280914307 + }, + { + "auxiliary_loss_clip": 0.01388121, + "auxiliary_loss_mlp": 0.01042578, + "balance_loss_clip": 1.26329851, + "balance_loss_mlp": 1.02852893, + "epoch": 0.4423267698782504, + "flos": 17826336369360.0, + "grad_norm": 2.101196370433003, + "language_loss": 0.77242529, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79673225, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14044189, + "step": 7357, + "time_per_iteration": 2.7767858505249023 + }, + { + "auxiliary_loss_clip": 0.01377502, + "auxiliary_loss_mlp": 0.01040237, + "balance_loss_clip": 1.25405347, + "balance_loss_mlp": 1.02543736, + "epoch": 0.4423868931309184, + "flos": 29210302530720.0, + "grad_norm": 1.5230687971487313, + "language_loss": 0.73425114, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75842851, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.14801025, + "step": 7358, + "time_per_iteration": 2.813732862472534 + }, + { + "auxiliary_loss_clip": 0.01387404, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.26303351, + "balance_loss_mlp": 1.02119315, + "epoch": 0.44244701638358636, + "flos": 29718806328000.0, + "grad_norm": 1.8145868938249314, + "language_loss": 0.75695229, + "learning_rate": 2.465574635551405e-06, + "loss": 0.78117812, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.13995361, + "step": 7359, + "time_per_iteration": 2.8266427516937256 + }, + { + "auxiliary_loss_clip": 0.01380893, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.25841522, + "balance_loss_mlp": 1.01984453, + "epoch": 0.4425071396362543, + "flos": 22935252057840.0, + "grad_norm": 1.9042793214055267, + "language_loss": 0.70382202, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72797567, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.14630127, + "step": 7360, + "time_per_iteration": 2.7467916011810303 + }, + { + "auxiliary_loss_clip": 0.01381315, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.25738871, + "balance_loss_mlp": 1.02433276, + "epoch": 0.4425672628889223, + "flos": 19797158304360.0, + "grad_norm": 2.4202306626410377, + "language_loss": 0.69645441, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.72064924, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.1383667, + "step": 7361, + "time_per_iteration": 2.7348129749298096 + }, + { + "auxiliary_loss_clip": 0.01378481, + "auxiliary_loss_mlp": 0.01043374, + "balance_loss_clip": 1.2545501, + "balance_loss_mlp": 1.02809763, + "epoch": 0.44262738614159025, + "flos": 13666158976320.0, + "grad_norm": 1.8866232331647264, + "language_loss": 0.8241064, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84832501, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.152771, + "step": 7362, + "time_per_iteration": 2.701253652572632 + }, + { + "auxiliary_loss_clip": 0.01389513, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.26031995, + "balance_loss_mlp": 1.03345835, + "epoch": 0.4426875093942582, + "flos": 14214320160120.0, + "grad_norm": 1.646775108939125, + "language_loss": 0.74815232, + "learning_rate": 2.464059445424366e-06, + "loss": 0.77254522, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.16333008, + "step": 7363, + "time_per_iteration": 2.7929749488830566 + }, + { + "auxiliary_loss_clip": 0.0120856, + "auxiliary_loss_mlp": 0.0101527, + "balance_loss_clip": 1.15443969, + "balance_loss_mlp": 1.01269472, + "epoch": 0.4427476326469262, + "flos": 70135149857040.0, + "grad_norm": 0.6875057515638926, + "language_loss": 0.55720001, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57943827, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.02575684, + "step": 7364, + "time_per_iteration": 3.316953659057617 + }, + { + "auxiliary_loss_clip": 0.01367364, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.24617696, + "balance_loss_mlp": 1.02678537, + "epoch": 0.44280775589959415, + "flos": 25450296165720.0, + "grad_norm": 1.6788014458062352, + "language_loss": 0.74932289, + "learning_rate": 2.463301744720305e-06, + "loss": 0.77339751, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.13317871, + "step": 7365, + "time_per_iteration": 2.817171096801758 + }, + { + "auxiliary_loss_clip": 0.0137211, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.25076437, + "balance_loss_mlp": 1.02524745, + "epoch": 0.4428678791522621, + "flos": 22862434622400.0, + "grad_norm": 1.5124927678785844, + "language_loss": 0.74017984, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.7642982, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14489746, + "step": 7366, + "time_per_iteration": 2.795245409011841 + }, + { + "auxiliary_loss_clip": 0.01374565, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.25301409, + "balance_loss_mlp": 1.02024937, + "epoch": 0.44292800240493013, + "flos": 25818038095320.0, + "grad_norm": 1.9300106577591156, + "language_loss": 0.73797697, + "learning_rate": 2.46254397374245e-06, + "loss": 0.76206428, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.13909912, + "step": 7367, + "time_per_iteration": 2.814859390258789 + }, + { + "auxiliary_loss_clip": 0.01370418, + "auxiliary_loss_mlp": 0.01036922, + "balance_loss_clip": 1.24765253, + "balance_loss_mlp": 1.0230881, + "epoch": 0.4429881256575981, + "flos": 32423406571080.0, + "grad_norm": 1.5602844889059304, + "language_loss": 0.74126148, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.7653349, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.1383667, + "step": 7368, + "time_per_iteration": 4.266419410705566 + }, + { + "auxiliary_loss_clip": 0.01373551, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.25238705, + "balance_loss_mlp": 1.01908469, + "epoch": 0.44304824891026606, + "flos": 22168801001160.0, + "grad_norm": 1.6084829459795789, + "language_loss": 0.80443048, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.8284933, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.13635254, + "step": 7369, + "time_per_iteration": 2.787386417388916 + }, + { + "auxiliary_loss_clip": 0.01365431, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.24561834, + "balance_loss_mlp": 1.01853514, + "epoch": 0.443108372162934, + "flos": 25344156247920.0, + "grad_norm": 1.8247458559511402, + "language_loss": 0.71589482, + "learning_rate": 2.461407185763737e-06, + "loss": 0.7398681, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.13372803, + "step": 7370, + "time_per_iteration": 2.80297589302063 + }, + { + "auxiliary_loss_clip": 0.0136999, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.24858713, + "balance_loss_mlp": 1.01759291, + "epoch": 0.443168495415602, + "flos": 23336154036360.0, + "grad_norm": 1.8095071375191623, + "language_loss": 0.70560288, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72961593, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.13745117, + "step": 7371, + "time_per_iteration": 2.7901291847229004 + }, + { + "auxiliary_loss_clip": 0.01363426, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.24330139, + "balance_loss_mlp": 1.01712275, + "epoch": 0.44322861866826996, + "flos": 21876516050400.0, + "grad_norm": 2.608377262292515, + "language_loss": 0.68686545, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.71079451, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.12347412, + "step": 7372, + "time_per_iteration": 2.743236780166626 + }, + { + "auxiliary_loss_clip": 0.01375949, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.25034964, + "balance_loss_mlp": 1.02269602, + "epoch": 0.4432887419209379, + "flos": 20089280821680.0, + "grad_norm": 1.7154441209719007, + "language_loss": 0.83721745, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.86135864, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15454102, + "step": 7373, + "time_per_iteration": 2.7794687747955322 + }, + { + "auxiliary_loss_clip": 0.01209354, + "auxiliary_loss_mlp": 0.01009887, + "balance_loss_clip": 1.15454245, + "balance_loss_mlp": 1.00716877, + "epoch": 0.4433488651736059, + "flos": 70051977289800.0, + "grad_norm": 0.7870893079404688, + "language_loss": 0.5533489, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57554126, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.02722168, + "step": 7374, + "time_per_iteration": 3.2946219444274902 + }, + { + "auxiliary_loss_clip": 0.01367237, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.24752569, + "balance_loss_mlp": 1.02095318, + "epoch": 0.44340898842627385, + "flos": 16285881101040.0, + "grad_norm": 3.0752190650215687, + "language_loss": 0.82943743, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.85346687, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14752197, + "step": 7375, + "time_per_iteration": 5.623778581619263 + }, + { + "auxiliary_loss_clip": 0.01376067, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.252159, + "balance_loss_mlp": 1.01760578, + "epoch": 0.4434691116789418, + "flos": 16615589978520.0, + "grad_norm": 1.8049107490837886, + "language_loss": 0.8438983, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86797249, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.13745117, + "step": 7376, + "time_per_iteration": 2.760936975479126 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.25189447, + "balance_loss_mlp": 1.01584125, + "epoch": 0.4435292349316098, + "flos": 19068049957680.0, + "grad_norm": 1.6319279755467067, + "language_loss": 0.77370882, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79774702, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14147949, + "step": 7377, + "time_per_iteration": 2.7593159675598145 + }, + { + "auxiliary_loss_clip": 0.01363995, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.24555922, + "balance_loss_mlp": 1.01552498, + "epoch": 0.44358935818427775, + "flos": 21256268381640.0, + "grad_norm": 1.7267629548107466, + "language_loss": 0.76192391, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78585076, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.13153076, + "step": 7378, + "time_per_iteration": 2.7678582668304443 + }, + { + "auxiliary_loss_clip": 0.01372506, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.24978399, + "balance_loss_mlp": 1.01817834, + "epoch": 0.4436494814369457, + "flos": 12499171416360.0, + "grad_norm": 1.9020874402236514, + "language_loss": 0.69655603, + "learning_rate": 2.457995878562982e-06, + "loss": 0.72060829, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14526367, + "step": 7379, + "time_per_iteration": 2.749352216720581 + }, + { + "auxiliary_loss_clip": 0.01377607, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.25444937, + "balance_loss_mlp": 1.01732922, + "epoch": 0.44370960468961373, + "flos": 23665172571720.0, + "grad_norm": 1.7299345170077138, + "language_loss": 0.73283052, + "learning_rate": 2.457616757401656e-06, + "loss": 0.7569195, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.13970947, + "step": 7380, + "time_per_iteration": 2.7373528480529785 + }, + { + "auxiliary_loss_clip": 0.01377879, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.25478578, + "balance_loss_mlp": 1.01972485, + "epoch": 0.4437697279422817, + "flos": 32423365962720.0, + "grad_norm": 2.0114514217936477, + "language_loss": 0.65143073, + "learning_rate": 2.457237618887458e-06, + "loss": 0.67554986, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14294434, + "step": 7381, + "time_per_iteration": 2.831023693084717 + }, + { + "auxiliary_loss_clip": 0.01380598, + "auxiliary_loss_mlp": 0.01036648, + "balance_loss_clip": 1.2565124, + "balance_loss_mlp": 1.02254605, + "epoch": 0.44382985119494966, + "flos": 18117321852600.0, + "grad_norm": 1.8522321725256115, + "language_loss": 0.80594754, + "learning_rate": 2.456858463034763e-06, + "loss": 0.83011997, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14105225, + "step": 7382, + "time_per_iteration": 4.192057132720947 + }, + { + "auxiliary_loss_clip": 0.01377274, + "auxiliary_loss_mlp": 0.01040428, + "balance_loss_clip": 1.25479245, + "balance_loss_mlp": 1.02586067, + "epoch": 0.44388997444761763, + "flos": 30780344312280.0, + "grad_norm": 1.76408455620025, + "language_loss": 0.65700698, + "learning_rate": 2.456479289857949e-06, + "loss": 0.68118393, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.14562988, + "step": 7383, + "time_per_iteration": 2.7843704223632812 + }, + { + "auxiliary_loss_clip": 0.01384434, + "auxiliary_loss_mlp": 0.01039914, + "balance_loss_clip": 1.25915706, + "balance_loss_mlp": 1.02502489, + "epoch": 0.4439500977002856, + "flos": 20343979412280.0, + "grad_norm": 3.3105520094543945, + "language_loss": 0.76519191, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.78943539, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14904785, + "step": 7384, + "time_per_iteration": 2.753124237060547 + }, + { + "auxiliary_loss_clip": 0.01384727, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.25897837, + "balance_loss_mlp": 1.02304649, + "epoch": 0.44401022095295356, + "flos": 20375474518440.0, + "grad_norm": 1.5566687171071953, + "language_loss": 0.81714553, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.84136873, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14562988, + "step": 7385, + "time_per_iteration": 2.7348272800445557 + }, + { + "auxiliary_loss_clip": 0.01374998, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.25231004, + "balance_loss_mlp": 1.02460063, + "epoch": 0.4440703442056215, + "flos": 20235565426320.0, + "grad_norm": 1.5995603858865366, + "language_loss": 0.82184303, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84598547, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14642334, + "step": 7386, + "time_per_iteration": 2.772834539413452 + }, + { + "auxiliary_loss_clip": 0.01391987, + "auxiliary_loss_mlp": 0.01039352, + "balance_loss_clip": 1.26439643, + "balance_loss_mlp": 1.0242784, + "epoch": 0.4441304674582895, + "flos": 39502088377200.0, + "grad_norm": 3.2521156669719007, + "language_loss": 0.69704616, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72135961, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.15075684, + "step": 7387, + "time_per_iteration": 2.9200711250305176 + }, + { + "auxiliary_loss_clip": 0.0137969, + "auxiliary_loss_mlp": 0.01045111, + "balance_loss_clip": 1.25527382, + "balance_loss_mlp": 1.03070498, + "epoch": 0.44419059071095746, + "flos": 14833755661680.0, + "grad_norm": 2.059859465855698, + "language_loss": 0.71774197, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.74199003, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14404297, + "step": 7388, + "time_per_iteration": 2.7234280109405518 + }, + { + "auxiliary_loss_clip": 0.01389844, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.2630738, + "balance_loss_mlp": 1.02174735, + "epoch": 0.4442507139636254, + "flos": 22642967107080.0, + "grad_norm": 1.4482896763986488, + "language_loss": 0.69076937, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71502954, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.14440918, + "step": 7389, + "time_per_iteration": 2.748842716217041 + }, + { + "auxiliary_loss_clip": 0.01376924, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.25377154, + "balance_loss_mlp": 1.02978194, + "epoch": 0.4443108372162934, + "flos": 38297961149040.0, + "grad_norm": 2.5536901902478846, + "language_loss": 0.75078738, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77499139, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.13690186, + "step": 7390, + "time_per_iteration": 2.9502921104431152 + }, + { + "auxiliary_loss_clip": 0.01374259, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.25315523, + "balance_loss_mlp": 1.02573133, + "epoch": 0.44437096046896135, + "flos": 17753518933920.0, + "grad_norm": 1.796150261049021, + "language_loss": 0.81790578, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.84204203, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.13641357, + "step": 7391, + "time_per_iteration": 2.9193363189697266 + }, + { + "auxiliary_loss_clip": 0.01378398, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.25774789, + "balance_loss_mlp": 1.02618933, + "epoch": 0.4444310837216293, + "flos": 13735930784760.0, + "grad_norm": 1.6481021904651416, + "language_loss": 0.74320555, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.76738548, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.1340332, + "step": 7392, + "time_per_iteration": 2.912841796875 + }, + { + "auxiliary_loss_clip": 0.01377075, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.25494432, + "balance_loss_mlp": 1.02131987, + "epoch": 0.44449120697429734, + "flos": 25015787446320.0, + "grad_norm": 7.5552382593322704, + "language_loss": 0.79902625, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.82314384, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.13366699, + "step": 7393, + "time_per_iteration": 2.827472448348999 + }, + { + "auxiliary_loss_clip": 0.0138797, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.26122415, + "balance_loss_mlp": 1.02423501, + "epoch": 0.4445513302269653, + "flos": 32678186378400.0, + "grad_norm": 1.8676671243518763, + "language_loss": 0.8091619, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83343321, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14929199, + "step": 7394, + "time_per_iteration": 2.854750633239746 + }, + { + "auxiliary_loss_clip": 0.01375646, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.25387716, + "balance_loss_mlp": 1.02931476, + "epoch": 0.44461145347963327, + "flos": 11659537449000.0, + "grad_norm": 2.3439632379688327, + "language_loss": 0.80330336, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.82749039, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.13739014, + "step": 7395, + "time_per_iteration": 2.7337026596069336 + }, + { + "auxiliary_loss_clip": 0.01380541, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.25792766, + "balance_loss_mlp": 1.01530266, + "epoch": 0.44467157673230123, + "flos": 20891693904120.0, + "grad_norm": 1.6535121213280795, + "language_loss": 0.68415487, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70825744, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.14416504, + "step": 7396, + "time_per_iteration": 2.886322259902954 + }, + { + "auxiliary_loss_clip": 0.01389417, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.26364279, + "balance_loss_mlp": 1.0180546, + "epoch": 0.4447316999849692, + "flos": 18550043804160.0, + "grad_norm": 1.9914313985811092, + "language_loss": 0.81058848, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83480859, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.1451416, + "step": 7397, + "time_per_iteration": 2.7736990451812744 + }, + { + "auxiliary_loss_clip": 0.01381105, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.25920045, + "balance_loss_mlp": 1.01745307, + "epoch": 0.44479182323763716, + "flos": 23774358116520.0, + "grad_norm": 1.673252091833511, + "language_loss": 0.67850196, + "learning_rate": 2.450789623090293e-06, + "loss": 0.70262569, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.13800049, + "step": 7398, + "time_per_iteration": 2.819304943084717 + }, + { + "auxiliary_loss_clip": 0.01378969, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.25749326, + "balance_loss_mlp": 1.02488065, + "epoch": 0.44485194649030513, + "flos": 16548051629880.0, + "grad_norm": 1.9040877184706677, + "language_loss": 0.70143557, + "learning_rate": 2.450410174683472e-06, + "loss": 0.72561944, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14556885, + "step": 7399, + "time_per_iteration": 2.826676607131958 + }, + { + "auxiliary_loss_clip": 0.01375286, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.25404954, + "balance_loss_mlp": 1.02053535, + "epoch": 0.4449120697429731, + "flos": 22606030480680.0, + "grad_norm": 2.8154000770300014, + "language_loss": 0.72855997, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.75266284, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14471436, + "step": 7400, + "time_per_iteration": 2.797847270965576 + }, + { + "auxiliary_loss_clip": 0.0138305, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.26098561, + "balance_loss_mlp": 1.01762652, + "epoch": 0.44497219299564106, + "flos": 20008869622920.0, + "grad_norm": 1.6108635647040146, + "language_loss": 0.85540897, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87955821, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14251709, + "step": 7401, + "time_per_iteration": 2.7879748344421387 + }, + { + "auxiliary_loss_clip": 0.0137633, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.25578523, + "balance_loss_mlp": 1.02222228, + "epoch": 0.445032316248309, + "flos": 25599910655880.0, + "grad_norm": 2.174485039973306, + "language_loss": 0.8365854, + "learning_rate": 2.449271727042973e-06, + "loss": 0.86069977, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.12872314, + "step": 7402, + "time_per_iteration": 2.782989501953125 + }, + { + "auxiliary_loss_clip": 0.01384975, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.26180577, + "balance_loss_mlp": 1.0176897, + "epoch": 0.445092439500977, + "flos": 21255253172640.0, + "grad_norm": 1.883144253765925, + "language_loss": 0.77328914, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79746103, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.1451416, + "step": 7403, + "time_per_iteration": 2.7484471797943115 + }, + { + "auxiliary_loss_clip": 0.01223617, + "auxiliary_loss_mlp": 0.01001308, + "balance_loss_clip": 1.17069221, + "balance_loss_mlp": 0.9987092, + "epoch": 0.44515256275364495, + "flos": 57778242817680.0, + "grad_norm": 0.7494959254024103, + "language_loss": 0.6013025, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62355179, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.02600098, + "step": 7404, + "time_per_iteration": 3.244381904602051 + }, + { + "auxiliary_loss_clip": 0.01387775, + "auxiliary_loss_mlp": 0.01039468, + "balance_loss_clip": 1.26200843, + "balance_loss_mlp": 1.02356005, + "epoch": 0.4452126860063129, + "flos": 15600206718360.0, + "grad_norm": 1.7659396047636398, + "language_loss": 0.82149899, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84577131, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15899658, + "step": 7405, + "time_per_iteration": 2.809689521789551 + }, + { + "auxiliary_loss_clip": 0.01383849, + "auxiliary_loss_mlp": 0.01032974, + "balance_loss_clip": 1.26057577, + "balance_loss_mlp": 1.01859152, + "epoch": 0.4452728092589809, + "flos": 21622548410280.0, + "grad_norm": 1.7082876479402493, + "language_loss": 0.755229, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77939725, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.14398193, + "step": 7406, + "time_per_iteration": 2.7439351081848145 + }, + { + "auxiliary_loss_clip": 0.01366792, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.24863791, + "balance_loss_mlp": 1.02018332, + "epoch": 0.4453329325116489, + "flos": 29503562082120.0, + "grad_norm": 1.685390852702487, + "language_loss": 0.65405595, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67806119, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.13549805, + "step": 7407, + "time_per_iteration": 4.202820062637329 + }, + { + "auxiliary_loss_clip": 0.01386881, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.26160371, + "balance_loss_mlp": 1.01983738, + "epoch": 0.44539305576431687, + "flos": 21366144268560.0, + "grad_norm": 1.5626512550237261, + "language_loss": 0.68224829, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.70645982, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.14434814, + "step": 7408, + "time_per_iteration": 2.79052472114563 + }, + { + "auxiliary_loss_clip": 0.01380961, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.25670576, + "balance_loss_mlp": 1.01787734, + "epoch": 0.44545317901698483, + "flos": 41435161518600.0, + "grad_norm": 1.7600924060885994, + "language_loss": 0.71929741, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74343097, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.1451416, + "step": 7409, + "time_per_iteration": 2.941253900527954 + }, + { + "auxiliary_loss_clip": 0.01385364, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.26096225, + "balance_loss_mlp": 1.01529813, + "epoch": 0.4455133022696528, + "flos": 22060183973400.0, + "grad_norm": 1.9896944852108005, + "language_loss": 0.65375108, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67790937, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.15179443, + "step": 7410, + "time_per_iteration": 2.7814934253692627 + }, + { + "auxiliary_loss_clip": 0.01396383, + "auxiliary_loss_mlp": 0.01040306, + "balance_loss_clip": 1.26786149, + "balance_loss_mlp": 1.02507079, + "epoch": 0.44557342552232077, + "flos": 23482194990840.0, + "grad_norm": 3.651521266284505, + "language_loss": 0.73912597, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76349282, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.15216064, + "step": 7411, + "time_per_iteration": 2.7593512535095215 + }, + { + "auxiliary_loss_clip": 0.01379752, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.26026869, + "balance_loss_mlp": 1.01396465, + "epoch": 0.44563354877498873, + "flos": 19139283667080.0, + "grad_norm": 2.270101851891928, + "language_loss": 0.79558623, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81965375, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13043213, + "step": 7412, + "time_per_iteration": 2.7607688903808594 + }, + { + "auxiliary_loss_clip": 0.01390632, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.26343238, + "balance_loss_mlp": 1.02313221, + "epoch": 0.4456936720276567, + "flos": 13624796038680.0, + "grad_norm": 3.355197335698521, + "language_loss": 0.80106521, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82534134, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.1383667, + "step": 7413, + "time_per_iteration": 4.216794729232788 + }, + { + "auxiliary_loss_clip": 0.01378047, + "auxiliary_loss_mlp": 0.01034371, + "balance_loss_clip": 1.25794077, + "balance_loss_mlp": 1.02011347, + "epoch": 0.44575379528032466, + "flos": 14716935745200.0, + "grad_norm": 2.3446568812981736, + "language_loss": 0.7699666, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.79409081, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14257812, + "step": 7414, + "time_per_iteration": 4.208416700363159 + }, + { + "auxiliary_loss_clip": 0.01382026, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.26020062, + "balance_loss_mlp": 1.02082705, + "epoch": 0.4458139185329926, + "flos": 24176072262240.0, + "grad_norm": 1.5619527199551273, + "language_loss": 0.8381418, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.86231011, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.13977051, + "step": 7415, + "time_per_iteration": 2.810333251953125 + }, + { + "auxiliary_loss_clip": 0.013776, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.25577128, + "balance_loss_mlp": 1.02131081, + "epoch": 0.4458740417856606, + "flos": 21767249288880.0, + "grad_norm": 1.4987239163581159, + "language_loss": 0.84487081, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86899519, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13531494, + "step": 7416, + "time_per_iteration": 2.8511297702789307 + }, + { + "auxiliary_loss_clip": 0.01389989, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.26289725, + "balance_loss_mlp": 1.02330494, + "epoch": 0.44593416503832856, + "flos": 21073696884360.0, + "grad_norm": 2.379250336318856, + "language_loss": 0.81303436, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83731169, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14434814, + "step": 7417, + "time_per_iteration": 2.7691314220428467 + }, + { + "auxiliary_loss_clip": 0.01388995, + "auxiliary_loss_mlp": 0.01041258, + "balance_loss_clip": 1.26306403, + "balance_loss_mlp": 1.02652419, + "epoch": 0.4459942882909965, + "flos": 22605502572000.0, + "grad_norm": 3.1621612255172, + "language_loss": 0.80982882, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83413136, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14733887, + "step": 7418, + "time_per_iteration": 2.7314541339874268 + }, + { + "auxiliary_loss_clip": 0.01390989, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.26518905, + "balance_loss_mlp": 1.01643705, + "epoch": 0.4460544115436645, + "flos": 26511143807880.0, + "grad_norm": 1.6989229149310814, + "language_loss": 0.77290857, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79712284, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.13983154, + "step": 7419, + "time_per_iteration": 2.7789344787597656 + }, + { + "auxiliary_loss_clip": 0.01393123, + "auxiliary_loss_mlp": 0.01037415, + "balance_loss_clip": 1.27055097, + "balance_loss_mlp": 1.02302599, + "epoch": 0.4461145347963325, + "flos": 17608655621880.0, + "grad_norm": 1.57640467470495, + "language_loss": 0.72563094, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74993634, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14404297, + "step": 7420, + "time_per_iteration": 2.7434935569763184 + }, + { + "auxiliary_loss_clip": 0.01379529, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.26067579, + "balance_loss_mlp": 1.01656961, + "epoch": 0.44617465804900047, + "flos": 27273493420200.0, + "grad_norm": 1.4713776462321564, + "language_loss": 0.75433528, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77844334, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.14709473, + "step": 7421, + "time_per_iteration": 2.841184377670288 + }, + { + "auxiliary_loss_clip": 0.01380412, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.26112914, + "balance_loss_mlp": 1.02331829, + "epoch": 0.44623478130166844, + "flos": 17791064685720.0, + "grad_norm": 1.709526185928011, + "language_loss": 0.75888395, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78305876, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13757324, + "step": 7422, + "time_per_iteration": 4.2665650844573975 + }, + { + "auxiliary_loss_clip": 0.01393857, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.26962209, + "balance_loss_mlp": 1.01859856, + "epoch": 0.4462949045543364, + "flos": 23008231926720.0, + "grad_norm": 1.4418027618060467, + "language_loss": 0.65328962, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67756152, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14733887, + "step": 7423, + "time_per_iteration": 2.7450220584869385 + }, + { + "auxiliary_loss_clip": 0.01377883, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.25925469, + "balance_loss_mlp": 1.01869678, + "epoch": 0.44635502780700437, + "flos": 17824387168080.0, + "grad_norm": 1.405412894745761, + "language_loss": 0.79447311, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81857407, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.13531494, + "step": 7424, + "time_per_iteration": 2.718684673309326 + }, + { + "auxiliary_loss_clip": 0.01381465, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.26239657, + "balance_loss_mlp": 1.01893187, + "epoch": 0.44641515105967233, + "flos": 26693715305160.0, + "grad_norm": 1.3491093001650651, + "language_loss": 0.80455029, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82868898, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.1348877, + "step": 7425, + "time_per_iteration": 2.808556079864502 + }, + { + "auxiliary_loss_clip": 0.01388715, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.26804614, + "balance_loss_mlp": 1.0187397, + "epoch": 0.4464752743123403, + "flos": 18917582691960.0, + "grad_norm": 1.4949302257646537, + "language_loss": 0.77542925, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79964149, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13763428, + "step": 7426, + "time_per_iteration": 2.762773275375366 + }, + { + "auxiliary_loss_clip": 0.01391575, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.26630974, + "balance_loss_mlp": 1.01774538, + "epoch": 0.44653539756500826, + "flos": 29576379517560.0, + "grad_norm": 3.384520680765465, + "language_loss": 0.64672089, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.67095792, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.14385986, + "step": 7427, + "time_per_iteration": 2.8692007064819336 + }, + { + "auxiliary_loss_clip": 0.01388285, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.26853216, + "balance_loss_mlp": 1.01931334, + "epoch": 0.44659552081767623, + "flos": 21473786695680.0, + "grad_norm": 2.089783817982426, + "language_loss": 0.75016969, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77438301, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.137146, + "step": 7428, + "time_per_iteration": 2.8057730197906494 + }, + { + "auxiliary_loss_clip": 0.01386464, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.2658416, + "balance_loss_mlp": 1.01982951, + "epoch": 0.4466556440703442, + "flos": 17936496514800.0, + "grad_norm": 1.6466867032243044, + "language_loss": 0.78002405, + "learning_rate": 2.439018845165806e-06, + "loss": 0.80423677, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.1496582, + "step": 7429, + "time_per_iteration": 2.7457642555236816 + }, + { + "auxiliary_loss_clip": 0.01395942, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.2714622, + "balance_loss_mlp": 1.01570332, + "epoch": 0.44671576732301216, + "flos": 21112623320400.0, + "grad_norm": 1.6585076252748352, + "language_loss": 0.91126359, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93552995, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.14990234, + "step": 7430, + "time_per_iteration": 2.776063919067383 + }, + { + "auxiliary_loss_clip": 0.01402543, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.27372789, + "balance_loss_mlp": 1.02163267, + "epoch": 0.4467758905756801, + "flos": 23513568271920.0, + "grad_norm": 1.5049701801427326, + "language_loss": 0.79825211, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.82265157, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.15759277, + "step": 7431, + "time_per_iteration": 2.770711660385132 + }, + { + "auxiliary_loss_clip": 0.01393463, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.26889968, + "balance_loss_mlp": 1.02321148, + "epoch": 0.4468360138283481, + "flos": 18738665947080.0, + "grad_norm": 5.784959683389509, + "language_loss": 0.7982198, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82254225, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.15563965, + "step": 7432, + "time_per_iteration": 2.7593142986297607 + }, + { + "auxiliary_loss_clip": 0.01392612, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.26607025, + "balance_loss_mlp": 1.02257347, + "epoch": 0.4468961370810161, + "flos": 23482398032640.0, + "grad_norm": 2.1815249020571827, + "language_loss": 0.77246523, + "learning_rate": 2.437498860702301e-06, + "loss": 0.79676586, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14880371, + "step": 7433, + "time_per_iteration": 2.773348808288574 + }, + { + "auxiliary_loss_clip": 0.01376907, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.2588141, + "balance_loss_mlp": 1.01933742, + "epoch": 0.4469562603336841, + "flos": 30080375786880.0, + "grad_norm": 1.7132472666341787, + "language_loss": 0.77571911, + "learning_rate": 2.437118823075398e-06, + "loss": 0.7998147, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13323975, + "step": 7434, + "time_per_iteration": 2.809344530105591 + }, + { + "auxiliary_loss_clip": 0.01398362, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.27413082, + "balance_loss_mlp": 1.01607108, + "epoch": 0.44701638358635204, + "flos": 22461654468960.0, + "grad_norm": 1.748544611701738, + "language_loss": 0.64870811, + "learning_rate": 2.436738768872905e-06, + "loss": 0.67299515, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.1428833, + "step": 7435, + "time_per_iteration": 2.7910988330841064 + }, + { + "auxiliary_loss_clip": 0.01390389, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.26674509, + "balance_loss_mlp": 1.01715159, + "epoch": 0.44707650683902, + "flos": 24062785273080.0, + "grad_norm": 1.5992100823833175, + "language_loss": 0.83531153, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.8595348, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14788818, + "step": 7436, + "time_per_iteration": 2.8292133808135986 + }, + { + "auxiliary_loss_clip": 0.01396371, + "auxiliary_loss_mlp": 0.01038231, + "balance_loss_clip": 1.27013254, + "balance_loss_mlp": 1.02170885, + "epoch": 0.44713663009168797, + "flos": 23771962223280.0, + "grad_norm": 1.7704694877661544, + "language_loss": 0.79725033, + "learning_rate": 2.435978610798798e-06, + "loss": 0.82159638, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.16516113, + "step": 7437, + "time_per_iteration": 2.7959306240081787 + }, + { + "auxiliary_loss_clip": 0.01395523, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.27024722, + "balance_loss_mlp": 1.0186131, + "epoch": 0.44719675334435594, + "flos": 24504725322360.0, + "grad_norm": 4.42549801090819, + "language_loss": 0.7183395, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74262738, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.14660645, + "step": 7438, + "time_per_iteration": 2.8193552494049072 + }, + { + "auxiliary_loss_clip": 0.0139571, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.27001953, + "balance_loss_mlp": 1.01790237, + "epoch": 0.4472568765970239, + "flos": 29786872585320.0, + "grad_norm": 1.6333608002293427, + "language_loss": 0.67408288, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69837761, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.15844727, + "step": 7439, + "time_per_iteration": 2.9525508880615234 + }, + { + "auxiliary_loss_clip": 0.01399558, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.27463651, + "balance_loss_mlp": 1.02091849, + "epoch": 0.44731699984969187, + "flos": 24648329775240.0, + "grad_norm": 1.7141798374032466, + "language_loss": 0.74240363, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.76676834, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.16003418, + "step": 7440, + "time_per_iteration": 2.7802984714508057 + }, + { + "auxiliary_loss_clip": 0.01387599, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.26467919, + "balance_loss_mlp": 1.01732206, + "epoch": 0.44737712310235983, + "flos": 29461062110400.0, + "grad_norm": 1.615730116420664, + "language_loss": 0.74475527, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76895547, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.15093994, + "step": 7441, + "time_per_iteration": 2.794459104537964 + }, + { + "auxiliary_loss_clip": 0.01399105, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.27234364, + "balance_loss_mlp": 1.02252316, + "epoch": 0.4474372463550278, + "flos": 24901850723400.0, + "grad_norm": 1.935715733359407, + "language_loss": 0.75359309, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.7779628, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15344238, + "step": 7442, + "time_per_iteration": 2.772547960281372 + }, + { + "auxiliary_loss_clip": 0.01401021, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.27311611, + "balance_loss_mlp": 1.02243912, + "epoch": 0.44749736960769576, + "flos": 33188030251560.0, + "grad_norm": 1.751348253266604, + "language_loss": 0.74055684, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76494139, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15002441, + "step": 7443, + "time_per_iteration": 2.847011089324951 + }, + { + "auxiliary_loss_clip": 0.01390884, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.26731801, + "balance_loss_mlp": 1.01880336, + "epoch": 0.4475574928603637, + "flos": 21077107986600.0, + "grad_norm": 2.199230304488091, + "language_loss": 0.772569, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79682505, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.15905762, + "step": 7444, + "time_per_iteration": 2.80705189704895 + }, + { + "auxiliary_loss_clip": 0.01385666, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.26460576, + "balance_loss_mlp": 1.01747918, + "epoch": 0.4476176161130317, + "flos": 21866363960400.0, + "grad_norm": 2.281888362719098, + "language_loss": 0.85443044, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.8786025, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.14074707, + "step": 7445, + "time_per_iteration": 4.097264766693115 + }, + { + "auxiliary_loss_clip": 0.01399321, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.27411032, + "balance_loss_mlp": 1.01790059, + "epoch": 0.4476777393656997, + "flos": 22533822170640.0, + "grad_norm": 2.224327768264825, + "language_loss": 0.64512444, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66945809, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.16137695, + "step": 7446, + "time_per_iteration": 2.7381584644317627 + }, + { + "auxiliary_loss_clip": 0.01236812, + "auxiliary_loss_mlp": 0.01007261, + "balance_loss_clip": 1.18591881, + "balance_loss_mlp": 1.00431681, + "epoch": 0.4477378626183677, + "flos": 49030485775200.0, + "grad_norm": 0.7407851559564741, + "language_loss": 0.50309974, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52554047, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02941895, + "step": 7447, + "time_per_iteration": 3.1112864017486572 + }, + { + "auxiliary_loss_clip": 0.01237811, + "auxiliary_loss_mlp": 0.01006405, + "balance_loss_clip": 1.18720341, + "balance_loss_mlp": 1.00378227, + "epoch": 0.44779798587103564, + "flos": 56557686184920.0, + "grad_norm": 0.7731472676344896, + "language_loss": 0.59444332, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61688548, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.02624512, + "step": 7448, + "time_per_iteration": 3.3032329082489014 + }, + { + "auxiliary_loss_clip": 0.01393881, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.27121449, + "balance_loss_mlp": 1.02070081, + "epoch": 0.4478581091237036, + "flos": 46506653280360.0, + "grad_norm": 2.1328095775553155, + "language_loss": 0.59298384, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61727142, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14178467, + "step": 7449, + "time_per_iteration": 2.9888060092926025 + }, + { + "auxiliary_loss_clip": 0.01393087, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.26966274, + "balance_loss_mlp": 1.01341748, + "epoch": 0.4479182323763716, + "flos": 20819363769000.0, + "grad_norm": 2.1333890651588883, + "language_loss": 0.80394208, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.82814926, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14202881, + "step": 7450, + "time_per_iteration": 2.7492594718933105 + }, + { + "auxiliary_loss_clip": 0.01399744, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.2761817, + "balance_loss_mlp": 1.0224359, + "epoch": 0.44797835562903954, + "flos": 14249997927360.0, + "grad_norm": 2.0015361285971935, + "language_loss": 0.79726779, + "learning_rate": 2.430655659114697e-06, + "loss": 0.82163638, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14672852, + "step": 7451, + "time_per_iteration": 2.733909845352173 + }, + { + "auxiliary_loss_clip": 0.0124007, + "auxiliary_loss_mlp": 0.01011122, + "balance_loss_clip": 1.1903609, + "balance_loss_mlp": 1.00834453, + "epoch": 0.4480384788817075, + "flos": 63549297021240.0, + "grad_norm": 1.0244097514453538, + "language_loss": 0.62845975, + "learning_rate": 2.430275325332681e-06, + "loss": 0.65097165, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.02783203, + "step": 7452, + "time_per_iteration": 4.722520351409912 + }, + { + "auxiliary_loss_clip": 0.01394888, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.27206802, + "balance_loss_mlp": 1.0185883, + "epoch": 0.44809860213437547, + "flos": 21657617052120.0, + "grad_norm": 1.8630560252619834, + "language_loss": 0.62689525, + "learning_rate": 2.429894975234582e-06, + "loss": 0.6511873, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.15710449, + "step": 7453, + "time_per_iteration": 4.367917776107788 + }, + { + "auxiliary_loss_clip": 0.01236502, + "auxiliary_loss_mlp": 0.01007489, + "balance_loss_clip": 1.18653965, + "balance_loss_mlp": 1.00441325, + "epoch": 0.44815872538704343, + "flos": 69206211460080.0, + "grad_norm": 0.9909071257098822, + "language_loss": 0.5709247, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59336466, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.03063965, + "step": 7454, + "time_per_iteration": 3.138110399246216 + }, + { + "auxiliary_loss_clip": 0.01401961, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.27776408, + "balance_loss_mlp": 1.01957297, + "epoch": 0.4482188486397114, + "flos": 12601331714880.0, + "grad_norm": 2.3801957470919737, + "language_loss": 0.74881083, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7731688, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14263916, + "step": 7455, + "time_per_iteration": 2.6814560890197754 + }, + { + "auxiliary_loss_clip": 0.01393036, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.27032411, + "balance_loss_mlp": 1.02200437, + "epoch": 0.44827897189237936, + "flos": 34064844495480.0, + "grad_norm": 1.7839066460735464, + "language_loss": 0.76506305, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78935373, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14025879, + "step": 7456, + "time_per_iteration": 2.8339273929595947 + }, + { + "auxiliary_loss_clip": 0.01389546, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.27012014, + "balance_loss_mlp": 1.01681137, + "epoch": 0.44833909514504733, + "flos": 25151838744240.0, + "grad_norm": 1.8607161181819758, + "language_loss": 0.76887918, + "learning_rate": 2.428373411969818e-06, + "loss": 0.7930817, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13916016, + "step": 7457, + "time_per_iteration": 2.726573944091797 + }, + { + "auxiliary_loss_clip": 0.01400737, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.27704144, + "balance_loss_mlp": 1.01753807, + "epoch": 0.4483992183977153, + "flos": 16184289319560.0, + "grad_norm": 2.4131079924704273, + "language_loss": 0.68638086, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.71071804, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.15441895, + "step": 7458, + "time_per_iteration": 2.7532315254211426 + }, + { + "auxiliary_loss_clip": 0.01403914, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.27823949, + "balance_loss_mlp": 1.02019811, + "epoch": 0.44845934165038326, + "flos": 17750189048400.0, + "grad_norm": 1.5352562626895891, + "language_loss": 0.71719337, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74158609, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.1517334, + "step": 7459, + "time_per_iteration": 2.6978888511657715 + }, + { + "auxiliary_loss_clip": 0.01396301, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.27378964, + "balance_loss_mlp": 1.01814437, + "epoch": 0.4485194649030513, + "flos": 21841041324960.0, + "grad_norm": 1.608266837382611, + "language_loss": 0.6992408, + "learning_rate": 2.427232068909154e-06, + "loss": 0.7235347, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14953613, + "step": 7460, + "time_per_iteration": 2.770142078399658 + }, + { + "auxiliary_loss_clip": 0.01398066, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.27370954, + "balance_loss_mlp": 1.02138567, + "epoch": 0.44857958815571924, + "flos": 20089402646760.0, + "grad_norm": 1.8250167284695629, + "language_loss": 0.7775349, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.80187774, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14825439, + "step": 7461, + "time_per_iteration": 4.229188919067383 + }, + { + "auxiliary_loss_clip": 0.01400878, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.27644563, + "balance_loss_mlp": 1.02095342, + "epoch": 0.4486397114083872, + "flos": 27059345600040.0, + "grad_norm": 1.5866417665346804, + "language_loss": 0.68328071, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70764142, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.14233398, + "step": 7462, + "time_per_iteration": 2.7653706073760986 + }, + { + "auxiliary_loss_clip": 0.01235936, + "auxiliary_loss_mlp": 0.01004756, + "balance_loss_clip": 1.18554235, + "balance_loss_mlp": 1.00165617, + "epoch": 0.4486998346610552, + "flos": 67335397580520.0, + "grad_norm": 0.7728779407657025, + "language_loss": 0.54451036, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.5669173, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.03100586, + "step": 7463, + "time_per_iteration": 3.317255973815918 + }, + { + "auxiliary_loss_clip": 0.01391334, + "auxiliary_loss_mlp": 0.01038654, + "balance_loss_clip": 1.26985025, + "balance_loss_mlp": 1.02426004, + "epoch": 0.44875995791372314, + "flos": 27642656642400.0, + "grad_norm": 1.71566229299717, + "language_loss": 0.76100284, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78530276, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.1439209, + "step": 7464, + "time_per_iteration": 2.7801403999328613 + }, + { + "auxiliary_loss_clip": 0.01398124, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.2766031, + "balance_loss_mlp": 1.02666128, + "epoch": 0.4488200811663911, + "flos": 13009868065080.0, + "grad_norm": 1.951956807449323, + "language_loss": 0.74755472, + "learning_rate": 2.425329506653441e-06, + "loss": 0.77193731, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.1348877, + "step": 7465, + "time_per_iteration": 2.755204200744629 + }, + { + "auxiliary_loss_clip": 0.01406914, + "auxiliary_loss_mlp": 0.01046834, + "balance_loss_clip": 1.27909911, + "balance_loss_mlp": 1.03069329, + "epoch": 0.44888020441905907, + "flos": 27495681695640.0, + "grad_norm": 2.2077830138933705, + "language_loss": 0.80166715, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82620466, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.16149902, + "step": 7466, + "time_per_iteration": 2.801396608352661 + }, + { + "auxiliary_loss_clip": 0.01397022, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.27320433, + "balance_loss_mlp": 1.02369118, + "epoch": 0.44894032767172704, + "flos": 18264256191000.0, + "grad_norm": 2.2533145432863866, + "language_loss": 0.80815619, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.83250827, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14501953, + "step": 7467, + "time_per_iteration": 2.7154810428619385 + }, + { + "auxiliary_loss_clip": 0.01387441, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.27047586, + "balance_loss_mlp": 1.02332258, + "epoch": 0.449000450924395, + "flos": 21584840225040.0, + "grad_norm": 1.8589399477159851, + "language_loss": 0.75665653, + "learning_rate": 2.424187775642129e-06, + "loss": 0.78090078, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13659668, + "step": 7468, + "time_per_iteration": 2.8007943630218506 + }, + { + "auxiliary_loss_clip": 0.01388543, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.26746702, + "balance_loss_mlp": 1.02089071, + "epoch": 0.44906057417706297, + "flos": 17972458540560.0, + "grad_norm": 1.7261708767776387, + "language_loss": 0.71132123, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73554105, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.12561035, + "step": 7469, + "time_per_iteration": 2.7125422954559326 + }, + { + "auxiliary_loss_clip": 0.01393418, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.27032828, + "balance_loss_mlp": 1.02173328, + "epoch": 0.44912069742973093, + "flos": 20052141153480.0, + "grad_norm": 1.9129148781826413, + "language_loss": 0.72013354, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74442738, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.14233398, + "step": 7470, + "time_per_iteration": 2.736750364303589 + }, + { + "auxiliary_loss_clip": 0.01393628, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.27022433, + "balance_loss_mlp": 1.02085125, + "epoch": 0.4491808206823989, + "flos": 21038587634160.0, + "grad_norm": 1.7828200543586823, + "language_loss": 0.77332234, + "learning_rate": 2.423045899863634e-06, + "loss": 0.7976079, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.14074707, + "step": 7471, + "time_per_iteration": 2.7459006309509277 + }, + { + "auxiliary_loss_clip": 0.01381799, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.26236844, + "balance_loss_mlp": 1.02467883, + "epoch": 0.44924094393506686, + "flos": 22972432334400.0, + "grad_norm": 1.6398251331126108, + "language_loss": 0.70806128, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.73226297, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13690186, + "step": 7472, + "time_per_iteration": 2.77068829536438 + }, + { + "auxiliary_loss_clip": 0.0123083, + "auxiliary_loss_mlp": 0.01015352, + "balance_loss_clip": 1.18064117, + "balance_loss_mlp": 1.01251495, + "epoch": 0.4493010671877349, + "flos": 59248195327080.0, + "grad_norm": 0.741605179966445, + "language_loss": 0.61689079, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63935268, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.02832031, + "step": 7473, + "time_per_iteration": 3.2021636962890625 + }, + { + "auxiliary_loss_clip": 0.01384699, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.26384187, + "balance_loss_mlp": 1.02917767, + "epoch": 0.44936119044040285, + "flos": 18009516992040.0, + "grad_norm": 2.0867797727087343, + "language_loss": 0.78552675, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80980897, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.14355469, + "step": 7474, + "time_per_iteration": 2.7297091484069824 + }, + { + "auxiliary_loss_clip": 0.01377874, + "auxiliary_loss_mlp": 0.01039851, + "balance_loss_clip": 1.26020861, + "balance_loss_mlp": 1.02628541, + "epoch": 0.4494213136930708, + "flos": 21256836898680.0, + "grad_norm": 1.6603328024925486, + "language_loss": 0.72720587, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.75138307, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13574219, + "step": 7475, + "time_per_iteration": 2.7795956134796143 + }, + { + "auxiliary_loss_clip": 0.01385522, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.26523852, + "balance_loss_mlp": 1.01871777, + "epoch": 0.4494814369457388, + "flos": 27424610419680.0, + "grad_norm": 1.746405056501833, + "language_loss": 0.76875323, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79293418, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13879395, + "step": 7476, + "time_per_iteration": 2.8027827739715576 + }, + { + "auxiliary_loss_clip": 0.01395851, + "auxiliary_loss_mlp": 0.01042425, + "balance_loss_clip": 1.27071714, + "balance_loss_mlp": 1.02725601, + "epoch": 0.44954156019840674, + "flos": 22858861086720.0, + "grad_norm": 1.8792047236572695, + "language_loss": 0.72355086, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.74793357, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.1517334, + "step": 7477, + "time_per_iteration": 2.761638641357422 + }, + { + "auxiliary_loss_clip": 0.01394544, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.26968277, + "balance_loss_mlp": 1.02089858, + "epoch": 0.4496016834510747, + "flos": 17206007483880.0, + "grad_norm": 1.8444545819992133, + "language_loss": 0.67984062, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70414019, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.14538574, + "step": 7478, + "time_per_iteration": 2.6932179927825928 + }, + { + "auxiliary_loss_clip": 0.01378379, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.25935328, + "balance_loss_mlp": 1.02149224, + "epoch": 0.4496618067037427, + "flos": 18921318661080.0, + "grad_norm": 1.8559945748196445, + "language_loss": 0.88719022, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91131842, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.12957764, + "step": 7479, + "time_per_iteration": 2.759570360183716 + }, + { + "auxiliary_loss_clip": 0.01386927, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.26612699, + "balance_loss_mlp": 1.01431179, + "epoch": 0.44972192995641064, + "flos": 21036475999440.0, + "grad_norm": 1.7630343165339124, + "language_loss": 0.75794429, + "learning_rate": 2.419619407822302e-06, + "loss": 0.78209996, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14331055, + "step": 7480, + "time_per_iteration": 2.7651515007019043 + }, + { + "auxiliary_loss_clip": 0.01388298, + "auxiliary_loss_mlp": 0.01037878, + "balance_loss_clip": 1.26503682, + "balance_loss_mlp": 1.02279758, + "epoch": 0.4497820532090786, + "flos": 20781980450640.0, + "grad_norm": 1.9020025199643298, + "language_loss": 0.80211151, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82637334, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.15087891, + "step": 7481, + "time_per_iteration": 2.750746726989746 + }, + { + "auxiliary_loss_clip": 0.01372922, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.2559123, + "balance_loss_mlp": 1.01623631, + "epoch": 0.44984217646174657, + "flos": 33809211912600.0, + "grad_norm": 1.9267657030310157, + "language_loss": 0.68660927, + "learning_rate": 2.418857789743758e-06, + "loss": 0.71064782, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.14703369, + "step": 7482, + "time_per_iteration": 2.815671443939209 + }, + { + "auxiliary_loss_clip": 0.01388401, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.2661289, + "balance_loss_mlp": 1.02347088, + "epoch": 0.44990229971441453, + "flos": 15521947762680.0, + "grad_norm": 1.8852176308145951, + "language_loss": 0.8487075, + "learning_rate": 2.418476956872571e-06, + "loss": 0.8729735, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14733887, + "step": 7483, + "time_per_iteration": 4.2065510749816895 + }, + { + "auxiliary_loss_clip": 0.01388844, + "auxiliary_loss_mlp": 0.01038688, + "balance_loss_clip": 1.26538551, + "balance_loss_mlp": 1.02366209, + "epoch": 0.4499624229670825, + "flos": 29867974126200.0, + "grad_norm": 1.5829198381519247, + "language_loss": 0.80846375, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.83273911, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.15032959, + "step": 7484, + "time_per_iteration": 2.784602165222168 + }, + { + "auxiliary_loss_clip": 0.01392799, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.26694727, + "balance_loss_mlp": 1.01547313, + "epoch": 0.45002254621975046, + "flos": 18518102006040.0, + "grad_norm": 3.479450728814097, + "language_loss": 0.75752664, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.78175914, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14978027, + "step": 7485, + "time_per_iteration": 2.678377866744995 + }, + { + "auxiliary_loss_clip": 0.01220851, + "auxiliary_loss_mlp": 0.01017292, + "balance_loss_clip": 1.17046654, + "balance_loss_mlp": 1.01432407, + "epoch": 0.4500826694724185, + "flos": 70434727331400.0, + "grad_norm": 0.7972764447984149, + "language_loss": 0.58681601, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60919744, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.02966309, + "step": 7486, + "time_per_iteration": 3.2577977180480957 + }, + { + "auxiliary_loss_clip": 0.01383594, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.26194739, + "balance_loss_mlp": 1.02019787, + "epoch": 0.45014279272508645, + "flos": 15783184299240.0, + "grad_norm": 2.038699480404849, + "language_loss": 0.83876193, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.86294907, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14916992, + "step": 7487, + "time_per_iteration": 2.7352747917175293 + }, + { + "auxiliary_loss_clip": 0.01378896, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.25868583, + "balance_loss_mlp": 1.01848841, + "epoch": 0.4502029159777544, + "flos": 21804713823960.0, + "grad_norm": 1.4792434485988528, + "language_loss": 0.77298254, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79710376, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14733887, + "step": 7488, + "time_per_iteration": 2.8432700634002686 + }, + { + "auxiliary_loss_clip": 0.01397355, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.26957214, + "balance_loss_mlp": 1.02119935, + "epoch": 0.4502630392304224, + "flos": 28773966435120.0, + "grad_norm": 1.9017800095238981, + "language_loss": 0.72216499, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74650013, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.14959717, + "step": 7489, + "time_per_iteration": 2.8400003910064697 + }, + { + "auxiliary_loss_clip": 0.01390452, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.26682174, + "balance_loss_mlp": 1.0181036, + "epoch": 0.45032316248309034, + "flos": 15847473979080.0, + "grad_norm": 2.9176350786527023, + "language_loss": 0.69478679, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71903521, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.1628418, + "step": 7490, + "time_per_iteration": 2.7480628490448 + }, + { + "auxiliary_loss_clip": 0.01220997, + "auxiliary_loss_mlp": 0.01010152, + "balance_loss_clip": 1.169662, + "balance_loss_mlp": 1.00709999, + "epoch": 0.4503832857357583, + "flos": 57869009148240.0, + "grad_norm": 0.7316944288087046, + "language_loss": 0.56700814, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58931965, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.03051758, + "step": 7491, + "time_per_iteration": 4.672500371932983 + }, + { + "auxiliary_loss_clip": 0.01376572, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.25772095, + "balance_loss_mlp": 1.01919603, + "epoch": 0.4504434089884263, + "flos": 23883340619520.0, + "grad_norm": 1.6368316649340562, + "language_loss": 0.79438174, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81848025, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.14080811, + "step": 7492, + "time_per_iteration": 4.24560284614563 + }, + { + "auxiliary_loss_clip": 0.01399915, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.27195299, + "balance_loss_mlp": 1.0192703, + "epoch": 0.45050353224109424, + "flos": 17789562176400.0, + "grad_norm": 2.0628809822302374, + "language_loss": 0.92600501, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.95035142, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.15454102, + "step": 7493, + "time_per_iteration": 2.7562661170959473 + }, + { + "auxiliary_loss_clip": 0.01221588, + "auxiliary_loss_mlp": 0.01006915, + "balance_loss_clip": 1.17078137, + "balance_loss_mlp": 1.0041374, + "epoch": 0.4505636554937622, + "flos": 65077894648440.0, + "grad_norm": 2.022337939418252, + "language_loss": 0.62795854, + "learning_rate": 2.4142867511336e-06, + "loss": 0.65024352, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02783203, + "step": 7494, + "time_per_iteration": 3.218916177749634 + }, + { + "auxiliary_loss_clip": 0.013768, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.25713062, + "balance_loss_mlp": 1.01772857, + "epoch": 0.45062377874643017, + "flos": 22205169110520.0, + "grad_norm": 1.4483071717875942, + "language_loss": 0.82390487, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84798843, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.1383667, + "step": 7495, + "time_per_iteration": 2.77876615524292 + }, + { + "auxiliary_loss_clip": 0.0138478, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.26287913, + "balance_loss_mlp": 1.01632166, + "epoch": 0.45068390199909814, + "flos": 37677429221760.0, + "grad_norm": 1.8115296605235454, + "language_loss": 0.85998511, + "learning_rate": 2.41352469075395e-06, + "loss": 0.88414437, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14825439, + "step": 7496, + "time_per_iteration": 2.8698618412017822 + }, + { + "auxiliary_loss_clip": 0.01389212, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.266294, + "balance_loss_mlp": 1.01240671, + "epoch": 0.4507440252517661, + "flos": 22306679675280.0, + "grad_norm": 1.8857777129576672, + "language_loss": 0.76600188, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.79016614, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14813232, + "step": 7497, + "time_per_iteration": 2.752713203430176 + }, + { + "auxiliary_loss_clip": 0.01388381, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.26243353, + "balance_loss_mlp": 1.02087545, + "epoch": 0.45080414850443407, + "flos": 13192317737280.0, + "grad_norm": 1.8802376738062558, + "language_loss": 0.75216228, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77640557, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.15087891, + "step": 7498, + "time_per_iteration": 2.716737747192383 + }, + { + "auxiliary_loss_clip": 0.01383675, + "auxiliary_loss_mlp": 0.010365, + "balance_loss_clip": 1.25978374, + "balance_loss_mlp": 1.02163494, + "epoch": 0.4508642717571021, + "flos": 21950023827960.0, + "grad_norm": 2.3076862683541033, + "language_loss": 0.70802319, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.73222506, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.14849854, + "step": 7499, + "time_per_iteration": 2.8278353214263916 + }, + { + "auxiliary_loss_clip": 0.01388344, + "auxiliary_loss_mlp": 0.01039549, + "balance_loss_clip": 1.2621659, + "balance_loss_mlp": 1.0241766, + "epoch": 0.45092439500977005, + "flos": 23372684579160.0, + "grad_norm": 2.0928378442157305, + "language_loss": 0.77224886, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79652774, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.15350342, + "step": 7500, + "time_per_iteration": 4.255283832550049 + }, + { + "auxiliary_loss_clip": 0.01381945, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.2600702, + "balance_loss_mlp": 1.02299023, + "epoch": 0.450984518262438, + "flos": 20777310489240.0, + "grad_norm": 2.690700686898216, + "language_loss": 0.62850648, + "learning_rate": 2.411619265641992e-06, + "loss": 0.6526981, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14221191, + "step": 7501, + "time_per_iteration": 2.70289945602417 + }, + { + "auxiliary_loss_clip": 0.01390082, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.26507533, + "balance_loss_mlp": 1.0236454, + "epoch": 0.451044641515106, + "flos": 17711222004000.0, + "grad_norm": 2.193871955435139, + "language_loss": 0.84635669, + "learning_rate": 2.411238133735863e-06, + "loss": 0.87064576, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15179443, + "step": 7502, + "time_per_iteration": 2.776434898376465 + }, + { + "auxiliary_loss_clip": 0.01379076, + "auxiliary_loss_mlp": 0.01040702, + "balance_loss_clip": 1.25843501, + "balance_loss_mlp": 1.02692175, + "epoch": 0.45110476476777395, + "flos": 20599571386800.0, + "grad_norm": 1.3576002639726492, + "language_loss": 0.79535162, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81954944, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.13781738, + "step": 7503, + "time_per_iteration": 2.785438060760498 + }, + { + "auxiliary_loss_clip": 0.01378643, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_clip": 1.26028061, + "balance_loss_mlp": 1.02853155, + "epoch": 0.4511648880204419, + "flos": 16038329581800.0, + "grad_norm": 1.7597022835006675, + "language_loss": 0.81480587, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83902335, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14562988, + "step": 7504, + "time_per_iteration": 2.742410898208618 + }, + { + "auxiliary_loss_clip": 0.01376093, + "auxiliary_loss_mlp": 0.01045083, + "balance_loss_clip": 1.25667357, + "balance_loss_mlp": 1.03134441, + "epoch": 0.4512250112731099, + "flos": 23982942591360.0, + "grad_norm": 1.5600555792674733, + "language_loss": 0.63648331, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.66069508, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13757324, + "step": 7505, + "time_per_iteration": 2.780336856842041 + }, + { + "auxiliary_loss_clip": 0.01214576, + "auxiliary_loss_mlp": 0.01022849, + "balance_loss_clip": 1.16483903, + "balance_loss_mlp": 1.02019024, + "epoch": 0.45128513452577784, + "flos": 71479429827480.0, + "grad_norm": 0.8422317675123553, + "language_loss": 0.58991945, + "learning_rate": 2.409713450313968e-06, + "loss": 0.61229372, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.02661133, + "step": 7506, + "time_per_iteration": 3.3055765628814697 + }, + { + "auxiliary_loss_clip": 0.01377707, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.25726748, + "balance_loss_mlp": 1.02870762, + "epoch": 0.4513452577784458, + "flos": 22095942957360.0, + "grad_norm": 1.7303174874496337, + "language_loss": 0.79350364, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81770754, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13977051, + "step": 7507, + "time_per_iteration": 2.779566764831543 + }, + { + "auxiliary_loss_clip": 0.0138098, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.25838184, + "balance_loss_mlp": 1.02966523, + "epoch": 0.4514053810311138, + "flos": 24281318796120.0, + "grad_norm": 1.588378782247222, + "language_loss": 0.74221587, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76648092, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.1583252, + "step": 7508, + "time_per_iteration": 2.751991033554077 + }, + { + "auxiliary_loss_clip": 0.01375889, + "auxiliary_loss_mlp": 0.01040298, + "balance_loss_clip": 1.25825405, + "balance_loss_mlp": 1.02637398, + "epoch": 0.45146550428378174, + "flos": 17890260573960.0, + "grad_norm": 1.8994626809676234, + "language_loss": 0.79611063, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.82027256, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13922119, + "step": 7509, + "time_per_iteration": 2.7810709476470947 + }, + { + "auxiliary_loss_clip": 0.01377796, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.25786924, + "balance_loss_mlp": 1.02397704, + "epoch": 0.4515256275364497, + "flos": 24248808480960.0, + "grad_norm": 2.3567009731018036, + "language_loss": 0.7370891, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.76124454, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.13763428, + "step": 7510, + "time_per_iteration": 2.932177782058716 + }, + { + "auxiliary_loss_clip": 0.01387009, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.26265562, + "balance_loss_mlp": 1.02176416, + "epoch": 0.45158575078911767, + "flos": 20635898887800.0, + "grad_norm": 1.7873859781397552, + "language_loss": 0.77448213, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79872012, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.15026855, + "step": 7511, + "time_per_iteration": 2.851473331451416 + }, + { + "auxiliary_loss_clip": 0.01380706, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.25775492, + "balance_loss_mlp": 1.02131927, + "epoch": 0.45164587404178563, + "flos": 23332539892320.0, + "grad_norm": 1.516283781665111, + "language_loss": 0.78430748, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80847991, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.15197754, + "step": 7512, + "time_per_iteration": 2.809431791305542 + }, + { + "auxiliary_loss_clip": 0.01393251, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.26419401, + "balance_loss_mlp": 1.02299333, + "epoch": 0.45170599729445365, + "flos": 23811335351280.0, + "grad_norm": 2.6645248906298766, + "language_loss": 0.87560779, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89992821, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.15808105, + "step": 7513, + "time_per_iteration": 2.752169132232666 + }, + { + "auxiliary_loss_clip": 0.01367738, + "auxiliary_loss_mlp": 0.01036488, + "balance_loss_clip": 1.25314295, + "balance_loss_mlp": 1.02282631, + "epoch": 0.4517661205471216, + "flos": 23518116408240.0, + "grad_norm": 1.5608639114251064, + "language_loss": 0.67455184, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69859409, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13659668, + "step": 7514, + "time_per_iteration": 2.765134811401367 + }, + { + "auxiliary_loss_clip": 0.01385899, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.26383066, + "balance_loss_mlp": 1.0191853, + "epoch": 0.4518262437997896, + "flos": 23519253442320.0, + "grad_norm": 1.8410790999979194, + "language_loss": 0.69964713, + "learning_rate": 2.406282005146318e-06, + "loss": 0.72385103, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.15307617, + "step": 7515, + "time_per_iteration": 2.7510480880737305 + }, + { + "auxiliary_loss_clip": 0.01386452, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.2606082, + "balance_loss_mlp": 1.02279019, + "epoch": 0.45188636705245755, + "flos": 14571300874320.0, + "grad_norm": 2.7027047983887615, + "language_loss": 0.81938201, + "learning_rate": 2.405900656236963e-06, + "loss": 0.84363317, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15869141, + "step": 7516, + "time_per_iteration": 2.708725690841675 + }, + { + "auxiliary_loss_clip": 0.01375527, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.25693512, + "balance_loss_mlp": 1.01914597, + "epoch": 0.4519464903051255, + "flos": 19906343849160.0, + "grad_norm": 1.5478811092501494, + "language_loss": 0.65611374, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.68020654, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14605713, + "step": 7517, + "time_per_iteration": 2.718350410461426 + }, + { + "auxiliary_loss_clip": 0.01376521, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.2595067, + "balance_loss_mlp": 1.01822817, + "epoch": 0.4520066135577935, + "flos": 18849150959400.0, + "grad_norm": 1.64121699862965, + "language_loss": 0.62800217, + "learning_rate": 2.405137912257333e-06, + "loss": 0.6520763, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.12664795, + "step": 7518, + "time_per_iteration": 2.7693533897399902 + }, + { + "auxiliary_loss_clip": 0.01382596, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.26137304, + "balance_loss_mlp": 1.02283382, + "epoch": 0.45206673681046144, + "flos": 48222898449840.0, + "grad_norm": 1.3485673644751364, + "language_loss": 0.59926766, + "learning_rate": 2.404756517215982e-06, + "loss": 0.62346625, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14440918, + "step": 7519, + "time_per_iteration": 2.9841387271881104 + }, + { + "auxiliary_loss_clip": 0.01386055, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.26356077, + "balance_loss_mlp": 1.02614748, + "epoch": 0.4521268600631294, + "flos": 23847500418840.0, + "grad_norm": 1.31596173676494, + "language_loss": 0.72737253, + "learning_rate": 2.404375106826223e-06, + "loss": 0.75164557, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.15106201, + "step": 7520, + "time_per_iteration": 2.7683029174804688 + }, + { + "auxiliary_loss_clip": 0.01379551, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.25882185, + "balance_loss_mlp": 1.02341294, + "epoch": 0.4521869833157974, + "flos": 18848298183840.0, + "grad_norm": 1.8870624932448523, + "language_loss": 0.76064944, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.78481185, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.13287354, + "step": 7521, + "time_per_iteration": 2.760409116744995 + }, + { + "auxiliary_loss_clip": 0.01391252, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.26546955, + "balance_loss_mlp": 1.02412701, + "epoch": 0.45224710656846534, + "flos": 19792325909520.0, + "grad_norm": 1.8183245012126346, + "language_loss": 0.68259436, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70690584, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15783691, + "step": 7522, + "time_per_iteration": 4.173171758651733 + }, + { + "auxiliary_loss_clip": 0.01371427, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.25119221, + "balance_loss_mlp": 1.02179384, + "epoch": 0.4523072298211333, + "flos": 28261929710520.0, + "grad_norm": 1.623137141530016, + "language_loss": 0.60827953, + "learning_rate": 2.403230783711134e-06, + "loss": 0.63235384, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14215088, + "step": 7523, + "time_per_iteration": 2.866584539413452 + }, + { + "auxiliary_loss_clip": 0.01388159, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.26422977, + "balance_loss_mlp": 1.01866162, + "epoch": 0.45236735307380127, + "flos": 11184640392600.0, + "grad_norm": 2.3526061389093194, + "language_loss": 0.7865845, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.81080556, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.15283203, + "step": 7524, + "time_per_iteration": 2.9668877124786377 + }, + { + "auxiliary_loss_clip": 0.01384327, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.26426256, + "balance_loss_mlp": 1.01956415, + "epoch": 0.45242747632646924, + "flos": 22606517781000.0, + "grad_norm": 1.7969751558770823, + "language_loss": 0.63458228, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65876585, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14465332, + "step": 7525, + "time_per_iteration": 2.75065016746521 + }, + { + "auxiliary_loss_clip": 0.01376305, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.25808311, + "balance_loss_mlp": 1.02428234, + "epoch": 0.45248759957913726, + "flos": 18260398396800.0, + "grad_norm": 2.2860043808306627, + "language_loss": 0.79194319, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81608391, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.1348877, + "step": 7526, + "time_per_iteration": 2.7531657218933105 + }, + { + "auxiliary_loss_clip": 0.01379045, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.26077735, + "balance_loss_mlp": 1.01552749, + "epoch": 0.4525477228318052, + "flos": 22454507397600.0, + "grad_norm": 1.5403090049606294, + "language_loss": 0.81233507, + "learning_rate": 2.40170480555747e-06, + "loss": 0.83641744, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.13665771, + "step": 7527, + "time_per_iteration": 2.758354902267456 + }, + { + "auxiliary_loss_clip": 0.01379163, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.26094687, + "balance_loss_mlp": 1.01465654, + "epoch": 0.4526078460844732, + "flos": 29651146154280.0, + "grad_norm": 1.5498043290086918, + "language_loss": 0.65470076, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67877495, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13586426, + "step": 7528, + "time_per_iteration": 2.8133466243743896 + }, + { + "auxiliary_loss_clip": 0.01377035, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.25836861, + "balance_loss_mlp": 1.02116466, + "epoch": 0.45266796933714115, + "flos": 23045006119680.0, + "grad_norm": 1.507174480340682, + "language_loss": 0.75533557, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77945852, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14123535, + "step": 7529, + "time_per_iteration": 2.7517263889312744 + }, + { + "auxiliary_loss_clip": 0.01383996, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.26153815, + "balance_loss_mlp": 1.02176833, + "epoch": 0.4527280925898091, + "flos": 14432447599560.0, + "grad_norm": 2.08107612915086, + "language_loss": 0.73083133, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75502485, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.13604736, + "step": 7530, + "time_per_iteration": 4.143756151199341 + }, + { + "auxiliary_loss_clip": 0.01378163, + "auxiliary_loss_mlp": 0.0102894, + "balance_loss_clip": 1.25864387, + "balance_loss_mlp": 1.01550543, + "epoch": 0.4527882158424771, + "flos": 22930135404480.0, + "grad_norm": 2.075897461248577, + "language_loss": 0.76570392, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78977495, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.13446045, + "step": 7531, + "time_per_iteration": 4.301951885223389 + }, + { + "auxiliary_loss_clip": 0.0137546, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.2577455, + "balance_loss_mlp": 1.0253973, + "epoch": 0.45284833909514505, + "flos": 25560659352960.0, + "grad_norm": 1.558082856426503, + "language_loss": 0.67122221, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69537342, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.1427002, + "step": 7532, + "time_per_iteration": 2.8132143020629883 + }, + { + "auxiliary_loss_clip": 0.01374619, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.25571609, + "balance_loss_mlp": 1.02220082, + "epoch": 0.452908462347813, + "flos": 18154542737520.0, + "grad_norm": 2.1525703338745172, + "language_loss": 0.78643894, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81055158, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.14447021, + "step": 7533, + "time_per_iteration": 2.703139066696167 + }, + { + "auxiliary_loss_clip": 0.01396498, + "auxiliary_loss_mlp": 0.01036104, + "balance_loss_clip": 1.26821494, + "balance_loss_mlp": 1.01998711, + "epoch": 0.452968585600481, + "flos": 19067481440640.0, + "grad_norm": 2.4853841052480266, + "language_loss": 0.83510649, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85943258, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.16125488, + "step": 7534, + "time_per_iteration": 2.765625238418579 + }, + { + "auxiliary_loss_clip": 0.01385084, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.26234782, + "balance_loss_mlp": 1.02127624, + "epoch": 0.45302870885314894, + "flos": 22056488612640.0, + "grad_norm": 1.750271144963746, + "language_loss": 0.76602048, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.79024166, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.15771484, + "step": 7535, + "time_per_iteration": 2.7523722648620605 + }, + { + "auxiliary_loss_clip": 0.01379208, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.26032805, + "balance_loss_mlp": 1.02144289, + "epoch": 0.4530888321058169, + "flos": 20381119080480.0, + "grad_norm": 1.5379017010692548, + "language_loss": 0.8053813, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82951939, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13165283, + "step": 7536, + "time_per_iteration": 2.7391762733459473 + }, + { + "auxiliary_loss_clip": 0.01381103, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.25682425, + "balance_loss_mlp": 1.02058017, + "epoch": 0.4531489553584849, + "flos": 14834608437240.0, + "grad_norm": 1.7168099560325394, + "language_loss": 0.76592177, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.79008025, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14160156, + "step": 7537, + "time_per_iteration": 2.7041373252868652 + }, + { + "auxiliary_loss_clip": 0.0138195, + "auxiliary_loss_mlp": 0.01040345, + "balance_loss_clip": 1.25991368, + "balance_loss_mlp": 1.02662432, + "epoch": 0.45320907861115284, + "flos": 21950064436320.0, + "grad_norm": 1.7620626343984787, + "language_loss": 0.7592693, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78349221, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.13720703, + "step": 7538, + "time_per_iteration": 4.29032039642334 + }, + { + "auxiliary_loss_clip": 0.01211402, + "auxiliary_loss_mlp": 0.01002325, + "balance_loss_clip": 1.16071939, + "balance_loss_mlp": 0.99910682, + "epoch": 0.45326920186382086, + "flos": 66268759924080.0, + "grad_norm": 0.804958269760761, + "language_loss": 0.62404883, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64618611, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.03222656, + "step": 7539, + "time_per_iteration": 3.271812915802002 + }, + { + "auxiliary_loss_clip": 0.01376962, + "auxiliary_loss_mlp": 0.01040033, + "balance_loss_clip": 1.25675833, + "balance_loss_mlp": 1.02650213, + "epoch": 0.4533293251164888, + "flos": 14688364440960.0, + "grad_norm": 1.8426310060520275, + "language_loss": 0.65517855, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67934853, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13537598, + "step": 7540, + "time_per_iteration": 2.7334532737731934 + }, + { + "auxiliary_loss_clip": 0.01387556, + "auxiliary_loss_mlp": 0.01043851, + "balance_loss_clip": 1.26137567, + "balance_loss_mlp": 1.02865231, + "epoch": 0.4533894483691568, + "flos": 22606314739200.0, + "grad_norm": 2.308166859211855, + "language_loss": 0.85229635, + "learning_rate": 2.396361968778424e-06, + "loss": 0.8766104, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15185547, + "step": 7541, + "time_per_iteration": 2.8049256801605225 + }, + { + "auxiliary_loss_clip": 0.01380568, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.25954342, + "balance_loss_mlp": 1.01986587, + "epoch": 0.45344957162182475, + "flos": 34758396900000.0, + "grad_norm": 1.6243944653199291, + "language_loss": 0.77128094, + "learning_rate": 2.395980224383889e-06, + "loss": 0.79542565, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14050293, + "step": 7542, + "time_per_iteration": 2.883923053741455 + }, + { + "auxiliary_loss_clip": 0.01382694, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.26152599, + "balance_loss_mlp": 1.01689577, + "epoch": 0.4535096948744927, + "flos": 23555499726600.0, + "grad_norm": 1.5552999145683435, + "language_loss": 0.80341268, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82755709, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14831543, + "step": 7543, + "time_per_iteration": 2.7576518058776855 + }, + { + "auxiliary_loss_clip": 0.01386364, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.26326621, + "balance_loss_mlp": 1.0233258, + "epoch": 0.4535698181271607, + "flos": 25563177071280.0, + "grad_norm": 1.6015691189859882, + "language_loss": 0.76148897, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78572667, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14074707, + "step": 7544, + "time_per_iteration": 2.9275920391082764 + }, + { + "auxiliary_loss_clip": 0.01388363, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.26486325, + "balance_loss_mlp": 1.02309608, + "epoch": 0.45362994137982865, + "flos": 24869624666760.0, + "grad_norm": 1.9764667822618296, + "language_loss": 0.75271034, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77696645, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14154053, + "step": 7545, + "time_per_iteration": 2.7993834018707275 + }, + { + "auxiliary_loss_clip": 0.01379945, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.25914323, + "balance_loss_mlp": 1.02573681, + "epoch": 0.4536900646324966, + "flos": 30812489152200.0, + "grad_norm": 1.536056793870655, + "language_loss": 0.72466582, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74885893, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13616943, + "step": 7546, + "time_per_iteration": 2.871920347213745 + }, + { + "auxiliary_loss_clip": 0.01397312, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.27113712, + "balance_loss_mlp": 1.01886785, + "epoch": 0.4537501878851646, + "flos": 23409580597200.0, + "grad_norm": 1.482423569307124, + "language_loss": 0.75603831, + "learning_rate": 2.394071277466609e-06, + "loss": 0.7803511, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.15093994, + "step": 7547, + "time_per_iteration": 2.82485294342041 + }, + { + "auxiliary_loss_clip": 0.01388946, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.26557851, + "balance_loss_mlp": 1.01555228, + "epoch": 0.45381031113783254, + "flos": 18153730570320.0, + "grad_norm": 2.469947908159835, + "language_loss": 0.69702512, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72122115, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.15100098, + "step": 7548, + "time_per_iteration": 2.821718215942383 + }, + { + "auxiliary_loss_clip": 0.01379053, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.25871849, + "balance_loss_mlp": 1.02428651, + "epoch": 0.4538704343905005, + "flos": 25342004004840.0, + "grad_norm": 1.9028247015546715, + "language_loss": 0.72834063, + "learning_rate": 2.393307593995794e-06, + "loss": 0.75252098, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.14697266, + "step": 7549, + "time_per_iteration": 2.833170175552368 + }, + { + "auxiliary_loss_clip": 0.01379058, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.25872433, + "balance_loss_mlp": 1.01575899, + "epoch": 0.4539305576431685, + "flos": 28737435892320.0, + "grad_norm": 1.5413618552116497, + "language_loss": 0.6534574, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67754334, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13775635, + "step": 7550, + "time_per_iteration": 2.8233468532562256 + }, + { + "auxiliary_loss_clip": 0.01372636, + "auxiliary_loss_mlp": 0.01038705, + "balance_loss_clip": 1.25457954, + "balance_loss_mlp": 1.02516294, + "epoch": 0.45399068089583644, + "flos": 22497575886360.0, + "grad_norm": 2.223218934278946, + "language_loss": 0.69156384, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.7156772, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13531494, + "step": 7551, + "time_per_iteration": 2.8935697078704834 + }, + { + "auxiliary_loss_clip": 0.01385216, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.26259851, + "balance_loss_mlp": 1.02026224, + "epoch": 0.45405080414850446, + "flos": 12896905942800.0, + "grad_norm": 1.6825760828406906, + "language_loss": 0.79106641, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81527197, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.15081787, + "step": 7552, + "time_per_iteration": 2.752368211746216 + }, + { + "auxiliary_loss_clip": 0.01213813, + "auxiliary_loss_mlp": 0.01004028, + "balance_loss_clip": 1.16376877, + "balance_loss_mlp": 1.00052357, + "epoch": 0.4541109274011724, + "flos": 59779910125080.0, + "grad_norm": 0.8173973768466717, + "language_loss": 0.5791539, + "learning_rate": 2.39178004819885e-06, + "loss": 0.60133231, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.03515625, + "step": 7553, + "time_per_iteration": 3.2197015285491943 + }, + { + "auxiliary_loss_clip": 0.01373939, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.25465345, + "balance_loss_mlp": 1.02312911, + "epoch": 0.4541710506538404, + "flos": 28517602901760.0, + "grad_norm": 1.3597324663047423, + "language_loss": 0.76982301, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.79392993, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.1362915, + "step": 7554, + "time_per_iteration": 2.7985527515411377 + }, + { + "auxiliary_loss_clip": 0.01385258, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.26211655, + "balance_loss_mlp": 1.01471853, + "epoch": 0.45423117390650836, + "flos": 17680051764720.0, + "grad_norm": 6.6588102719606805, + "language_loss": 0.77382702, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79798675, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.15979004, + "step": 7555, + "time_per_iteration": 2.720820188522339 + }, + { + "auxiliary_loss_clip": 0.01373781, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.25339627, + "balance_loss_mlp": 1.02006412, + "epoch": 0.4542912971591763, + "flos": 28078180570800.0, + "grad_norm": 1.9521990370125397, + "language_loss": 0.72996819, + "learning_rate": 2.390634232808903e-06, + "loss": 0.75405395, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.14733887, + "step": 7556, + "time_per_iteration": 2.809880256652832 + }, + { + "auxiliary_loss_clip": 0.01386296, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.26142716, + "balance_loss_mlp": 1.01827443, + "epoch": 0.4543514204118443, + "flos": 22676573847960.0, + "grad_norm": 2.2393037010036467, + "language_loss": 0.6367628, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.66095084, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14233398, + "step": 7557, + "time_per_iteration": 2.7661514282226562 + }, + { + "auxiliary_loss_clip": 0.01212293, + "auxiliary_loss_mlp": 0.0100477, + "balance_loss_clip": 1.16181254, + "balance_loss_mlp": 1.00172997, + "epoch": 0.45441154366451225, + "flos": 58231756249560.0, + "grad_norm": 0.6769067133087425, + "language_loss": 0.57662278, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59879339, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.03039551, + "step": 7558, + "time_per_iteration": 3.220029592514038 + }, + { + "auxiliary_loss_clip": 0.01381257, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.25636029, + "balance_loss_mlp": 1.01660752, + "epoch": 0.4544716669171802, + "flos": 16768737396000.0, + "grad_norm": 2.5456885677077103, + "language_loss": 0.56749928, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.59163702, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.15893555, + "step": 7559, + "time_per_iteration": 2.753877639770508 + }, + { + "auxiliary_loss_clip": 0.01371305, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.25216675, + "balance_loss_mlp": 1.02092242, + "epoch": 0.4545317901698482, + "flos": 15929590728960.0, + "grad_norm": 1.9425336772983566, + "language_loss": 0.71818489, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74225372, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14660645, + "step": 7560, + "time_per_iteration": 4.2181501388549805 + }, + { + "auxiliary_loss_clip": 0.01386454, + "auxiliary_loss_mlp": 0.01028581, + "balance_loss_clip": 1.26079893, + "balance_loss_mlp": 1.01342988, + "epoch": 0.45459191342251615, + "flos": 17644292780760.0, + "grad_norm": 2.077661519922031, + "language_loss": 0.70109069, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.72524107, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.15155029, + "step": 7561, + "time_per_iteration": 2.809398889541626 + }, + { + "auxiliary_loss_clip": 0.01368988, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.25083542, + "balance_loss_mlp": 1.01631975, + "epoch": 0.4546520366751841, + "flos": 16180472133720.0, + "grad_norm": 1.650481580601511, + "language_loss": 0.85230893, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87629306, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13110352, + "step": 7562, + "time_per_iteration": 2.7447257041931152 + }, + { + "auxiliary_loss_clip": 0.01368259, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.25122797, + "balance_loss_mlp": 1.01649618, + "epoch": 0.4547121599278521, + "flos": 19755673541640.0, + "grad_norm": 1.7704763869858982, + "language_loss": 0.89805651, + "learning_rate": 2.38796014579055e-06, + "loss": 0.92204523, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.14117432, + "step": 7563, + "time_per_iteration": 2.758574962615967 + }, + { + "auxiliary_loss_clip": 0.01374559, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.25349867, + "balance_loss_mlp": 1.02250218, + "epoch": 0.45477228318052004, + "flos": 19942184049840.0, + "grad_norm": 3.521949615766514, + "language_loss": 0.71456432, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73867953, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14471436, + "step": 7564, + "time_per_iteration": 2.749366283416748 + }, + { + "auxiliary_loss_clip": 0.01377925, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.25657618, + "balance_loss_mlp": 1.02409947, + "epoch": 0.454832406433188, + "flos": 21293326833120.0, + "grad_norm": 2.128944458034954, + "language_loss": 0.68574911, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70991135, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14221191, + "step": 7565, + "time_per_iteration": 2.7732510566711426 + }, + { + "auxiliary_loss_clip": 0.01372429, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.25360739, + "balance_loss_mlp": 1.02014792, + "epoch": 0.45489252968585603, + "flos": 24504116196960.0, + "grad_norm": 1.5690926969818122, + "language_loss": 0.80173004, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82579088, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.1350708, + "step": 7566, + "time_per_iteration": 2.786045551300049 + }, + { + "auxiliary_loss_clip": 0.01379162, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.25741875, + "balance_loss_mlp": 1.02171803, + "epoch": 0.454952652938524, + "flos": 17096943764160.0, + "grad_norm": 1.62147158620524, + "language_loss": 0.7361002, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.76026022, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.15118408, + "step": 7567, + "time_per_iteration": 2.740273952484131 + }, + { + "auxiliary_loss_clip": 0.01379214, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.25740695, + "balance_loss_mlp": 1.02684534, + "epoch": 0.45501277619119196, + "flos": 27635387745960.0, + "grad_norm": 1.6097308548692684, + "language_loss": 0.81378746, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83798802, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14013672, + "step": 7568, + "time_per_iteration": 2.8890345096588135 + }, + { + "auxiliary_loss_clip": 0.01384636, + "auxiliary_loss_mlp": 0.01045226, + "balance_loss_clip": 1.25835443, + "balance_loss_mlp": 1.02921581, + "epoch": 0.4550728994438599, + "flos": 19979486151480.0, + "grad_norm": 1.8746751098817687, + "language_loss": 0.80303562, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.82733428, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.16003418, + "step": 7569, + "time_per_iteration": 2.837995767593384 + }, + { + "auxiliary_loss_clip": 0.01380479, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.25686908, + "balance_loss_mlp": 1.02364397, + "epoch": 0.4551330226965279, + "flos": 26072371210680.0, + "grad_norm": 1.4636140653427756, + "language_loss": 0.75107992, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77527028, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.14910889, + "step": 7570, + "time_per_iteration": 5.822457313537598 + }, + { + "auxiliary_loss_clip": 0.01371805, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_clip": 1.25495863, + "balance_loss_mlp": 1.03095019, + "epoch": 0.45519314594919585, + "flos": 32787574965000.0, + "grad_norm": 1.5788356424875372, + "language_loss": 0.74769711, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77187252, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14782715, + "step": 7571, + "time_per_iteration": 2.9216020107269287 + }, + { + "auxiliary_loss_clip": 0.01363781, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.2488091, + "balance_loss_mlp": 1.02153969, + "epoch": 0.4552532692018638, + "flos": 19177479152640.0, + "grad_norm": 1.5718550418787913, + "language_loss": 0.80922914, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83321631, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13415527, + "step": 7572, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.01383647, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.25978327, + "balance_loss_mlp": 1.0256505, + "epoch": 0.4553133924545318, + "flos": 26032226523840.0, + "grad_norm": 1.7037604251691436, + "language_loss": 0.73405182, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75829375, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14880371, + "step": 7573, + "time_per_iteration": 2.7823781967163086 + }, + { + "auxiliary_loss_clip": 0.01380848, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.25822151, + "balance_loss_mlp": 1.02354264, + "epoch": 0.45537351570719975, + "flos": 30667300973280.0, + "grad_norm": 2.036320474696273, + "language_loss": 0.75124311, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.77544105, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.15393066, + "step": 7574, + "time_per_iteration": 2.8620431423187256 + }, + { + "auxiliary_loss_clip": 0.01382398, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.25996077, + "balance_loss_mlp": 1.02107286, + "epoch": 0.4554336389598677, + "flos": 24358846801320.0, + "grad_norm": 1.495764951448372, + "language_loss": 0.71561593, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73979491, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14440918, + "step": 7575, + "time_per_iteration": 2.826725959777832 + }, + { + "auxiliary_loss_clip": 0.01376102, + "auxiliary_loss_mlp": 0.01040627, + "balance_loss_clip": 1.25579405, + "balance_loss_mlp": 1.02634537, + "epoch": 0.4554937622125357, + "flos": 20562756585480.0, + "grad_norm": 1.689669332301829, + "language_loss": 0.73808199, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.76224923, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14282227, + "step": 7576, + "time_per_iteration": 2.7518954277038574 + }, + { + "auxiliary_loss_clip": 0.01370177, + "auxiliary_loss_mlp": 0.01036093, + "balance_loss_clip": 1.25244606, + "balance_loss_mlp": 1.02231193, + "epoch": 0.45555388546520365, + "flos": 22825985296320.0, + "grad_norm": 1.7557174310758707, + "language_loss": 0.67207253, + "learning_rate": 2.382609814135511e-06, + "loss": 0.69613522, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13769531, + "step": 7577, + "time_per_iteration": 4.2403364181518555 + }, + { + "auxiliary_loss_clip": 0.0137939, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.2584815, + "balance_loss_mlp": 1.02703357, + "epoch": 0.4556140087178716, + "flos": 21731043612960.0, + "grad_norm": 2.036035508271107, + "language_loss": 0.74168372, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76590639, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.1583252, + "step": 7578, + "time_per_iteration": 2.7594897747039795 + }, + { + "auxiliary_loss_clip": 0.01373119, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.25485098, + "balance_loss_mlp": 1.01853991, + "epoch": 0.45567413197053963, + "flos": 26000081683920.0, + "grad_norm": 1.9838997941436796, + "language_loss": 0.71103746, + "learning_rate": 2.381845247976697e-06, + "loss": 0.73508847, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 1.18310547, + "router_z_loss_mlp": 0.13439941, + "step": 7579, + "time_per_iteration": 2.8098652362823486 + }, + { + "auxiliary_loss_clip": 0.01372994, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.25489235, + "balance_loss_mlp": 1.02089655, + "epoch": 0.4557342552232076, + "flos": 21541852953000.0, + "grad_norm": 1.7367534183679672, + "language_loss": 0.78863204, + "learning_rate": 2.381462943170627e-06, + "loss": 0.81270516, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.13439941, + "step": 7580, + "time_per_iteration": 2.7437047958374023 + }, + { + "auxiliary_loss_clip": 0.01376108, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.25755119, + "balance_loss_mlp": 1.02172732, + "epoch": 0.45579437847587556, + "flos": 40008967840080.0, + "grad_norm": 1.6391117169060594, + "language_loss": 0.69016105, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71428508, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14575195, + "step": 7581, + "time_per_iteration": 2.922001361846924 + }, + { + "auxiliary_loss_clip": 0.01372323, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.25519967, + "balance_loss_mlp": 1.02131283, + "epoch": 0.4558545017285435, + "flos": 31144675139640.0, + "grad_norm": 1.6033143982570623, + "language_loss": 0.73706663, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.76114315, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.14025879, + "step": 7582, + "time_per_iteration": 2.9116055965423584 + }, + { + "auxiliary_loss_clip": 0.01383162, + "auxiliary_loss_mlp": 0.01045789, + "balance_loss_clip": 1.259799, + "balance_loss_mlp": 1.03048277, + "epoch": 0.4559146249812115, + "flos": 21730799962800.0, + "grad_norm": 1.8141399817473063, + "language_loss": 0.72987533, + "learning_rate": 2.380315942019729e-06, + "loss": 0.75416481, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.15307617, + "step": 7583, + "time_per_iteration": 2.8372628688812256 + }, + { + "auxiliary_loss_clip": 0.01382701, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.25973773, + "balance_loss_mlp": 1.02119803, + "epoch": 0.45597474823387946, + "flos": 23811335351280.0, + "grad_norm": 1.8464078577531584, + "language_loss": 0.73011953, + "learning_rate": 2.379933579440195e-06, + "loss": 0.75430804, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14941406, + "step": 7584, + "time_per_iteration": 2.8102431297302246 + }, + { + "auxiliary_loss_clip": 0.01380243, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.26106894, + "balance_loss_mlp": 1.01624489, + "epoch": 0.4560348714865474, + "flos": 31912060188600.0, + "grad_norm": 2.299350951673306, + "language_loss": 0.67831117, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70241588, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13989258, + "step": 7585, + "time_per_iteration": 2.839444398880005 + }, + { + "auxiliary_loss_clip": 0.01380934, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.26017451, + "balance_loss_mlp": 1.01717091, + "epoch": 0.4560949947392154, + "flos": 22053199335480.0, + "grad_norm": 1.4685138998867382, + "language_loss": 0.76812923, + "learning_rate": 2.379168811074267e-06, + "loss": 0.79225445, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14422607, + "step": 7586, + "time_per_iteration": 2.777942657470703 + }, + { + "auxiliary_loss_clip": 0.01375615, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.25700545, + "balance_loss_mlp": 1.01550651, + "epoch": 0.45615511799188335, + "flos": 24577217890920.0, + "grad_norm": 1.6520595779754028, + "language_loss": 0.78602403, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.81006479, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.12963867, + "step": 7587, + "time_per_iteration": 2.7757465839385986 + }, + { + "auxiliary_loss_clip": 0.01395876, + "auxiliary_loss_mlp": 0.01037193, + "balance_loss_clip": 1.26789546, + "balance_loss_mlp": 1.02211285, + "epoch": 0.4562152412445513, + "flos": 18335043208440.0, + "grad_norm": 2.2405388841919027, + "language_loss": 0.6989373, + "learning_rate": 2.378403985195863e-06, + "loss": 0.72326803, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.15075684, + "step": 7588, + "time_per_iteration": 2.7613346576690674 + }, + { + "auxiliary_loss_clip": 0.01380785, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.26262772, + "balance_loss_mlp": 1.02264142, + "epoch": 0.4562753644972193, + "flos": 13520483497080.0, + "grad_norm": 1.599644615099816, + "language_loss": 0.79997337, + "learning_rate": 2.378021550725735e-06, + "loss": 0.8241418, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13427734, + "step": 7589, + "time_per_iteration": 2.6918468475341797 + }, + { + "auxiliary_loss_clip": 0.0138712, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.26688039, + "balance_loss_mlp": 1.01850724, + "epoch": 0.45633548774988725, + "flos": 29645298550440.0, + "grad_norm": 3.5814482933707645, + "language_loss": 0.63359869, + "learning_rate": 2.377639101920992e-06, + "loss": 0.65780163, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14678955, + "step": 7590, + "time_per_iteration": 2.8749139308929443 + }, + { + "auxiliary_loss_clip": 0.01385474, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.26428509, + "balance_loss_mlp": 1.02199912, + "epoch": 0.4563956110025552, + "flos": 22238085509280.0, + "grad_norm": 1.7107745104611796, + "language_loss": 0.73090732, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75511843, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.13635254, + "step": 7591, + "time_per_iteration": 2.743680953979492 + }, + { + "auxiliary_loss_clip": 0.01390284, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.2688098, + "balance_loss_mlp": 1.02652383, + "epoch": 0.45645573425522323, + "flos": 17096456463840.0, + "grad_norm": 1.7687695882782144, + "language_loss": 0.77541286, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.7997303, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14923096, + "step": 7592, + "time_per_iteration": 2.7346105575561523 + }, + { + "auxiliary_loss_clip": 0.01385616, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.26424015, + "balance_loss_mlp": 1.01950383, + "epoch": 0.4565158575078912, + "flos": 20336263823880.0, + "grad_norm": 1.9546508602342452, + "language_loss": 0.69792378, + "learning_rate": 2.376491669644098e-06, + "loss": 0.72211725, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14227295, + "step": 7593, + "time_per_iteration": 2.837620735168457 + }, + { + "auxiliary_loss_clip": 0.01375454, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.25927794, + "balance_loss_mlp": 1.017308, + "epoch": 0.45657598076055916, + "flos": 23987693769480.0, + "grad_norm": 2.011010715241869, + "language_loss": 0.84095967, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86500996, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.12249756, + "step": 7594, + "time_per_iteration": 2.7722692489624023 + }, + { + "auxiliary_loss_clip": 0.0122141, + "auxiliary_loss_mlp": 0.01013838, + "balance_loss_clip": 1.17223811, + "balance_loss_mlp": 1.01076257, + "epoch": 0.45663610401322713, + "flos": 69378752692440.0, + "grad_norm": 0.8076091058663245, + "language_loss": 0.52753693, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54988939, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.03063965, + "step": 7595, + "time_per_iteration": 3.314152240753174 + }, + { + "auxiliary_loss_clip": 0.01398354, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.27298796, + "balance_loss_mlp": 1.01703846, + "epoch": 0.4566962272658951, + "flos": 15152053590000.0, + "grad_norm": 2.4251690006752034, + "language_loss": 0.87623608, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.90053695, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14691162, + "step": 7596, + "time_per_iteration": 2.7341346740722656 + }, + { + "auxiliary_loss_clip": 0.01393306, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.27120602, + "balance_loss_mlp": 1.02109563, + "epoch": 0.45675635051856306, + "flos": 18702094795920.0, + "grad_norm": 2.036189715707202, + "language_loss": 0.77840984, + "learning_rate": 2.374961560136843e-06, + "loss": 0.80269498, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.14135742, + "step": 7597, + "time_per_iteration": 2.7810416221618652 + }, + { + "auxiliary_loss_clip": 0.0138839, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.26524973, + "balance_loss_mlp": 1.017555, + "epoch": 0.456816473771231, + "flos": 19102915557720.0, + "grad_norm": 1.5800694868978944, + "language_loss": 0.78867608, + "learning_rate": 2.374578997177314e-06, + "loss": 0.8128854, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.14984131, + "step": 7598, + "time_per_iteration": 2.766942262649536 + }, + { + "auxiliary_loss_clip": 0.0138136, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.26119947, + "balance_loss_mlp": 1.01576674, + "epoch": 0.456876597023899, + "flos": 28956050632080.0, + "grad_norm": 2.2398928020861284, + "language_loss": 0.71384311, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73795557, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14105225, + "step": 7599, + "time_per_iteration": 4.154489994049072 + }, + { + "auxiliary_loss_clip": 0.01384084, + "auxiliary_loss_mlp": 0.01037303, + "balance_loss_clip": 1.26326728, + "balance_loss_mlp": 1.02256835, + "epoch": 0.45693672027656695, + "flos": 23294019539880.0, + "grad_norm": 2.1624128165095415, + "language_loss": 0.70130444, + "learning_rate": 2.373813828660544e-06, + "loss": 0.72551823, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14733887, + "step": 7600, + "time_per_iteration": 2.7605690956115723 + }, + { + "auxiliary_loss_clip": 0.0138845, + "auxiliary_loss_mlp": 0.01040755, + "balance_loss_clip": 1.26715279, + "balance_loss_mlp": 1.02677202, + "epoch": 0.4569968435292349, + "flos": 20563081452360.0, + "grad_norm": 1.737924508704715, + "language_loss": 0.79033113, + "learning_rate": 2.373431223132319e-06, + "loss": 0.81462312, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.13989258, + "step": 7601, + "time_per_iteration": 2.8237855434417725 + }, + { + "auxiliary_loss_clip": 0.01394261, + "auxiliary_loss_mlp": 0.0103429, + "balance_loss_clip": 1.27054703, + "balance_loss_mlp": 1.01993728, + "epoch": 0.4570569667819029, + "flos": 41290257598200.0, + "grad_norm": 1.6418923777199164, + "language_loss": 0.72177994, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.7460655, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14343262, + "step": 7602, + "time_per_iteration": 3.0065243244171143 + }, + { + "auxiliary_loss_clip": 0.01395639, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.27312613, + "balance_loss_mlp": 1.01846838, + "epoch": 0.45711709003457085, + "flos": 26037221352120.0, + "grad_norm": 2.0570769452883613, + "language_loss": 0.73634404, + "learning_rate": 2.372665969608729e-06, + "loss": 0.7606467, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.16149902, + "step": 7603, + "time_per_iteration": 2.913341522216797 + }, + { + "auxiliary_loss_clip": 0.01388732, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.26668632, + "balance_loss_mlp": 1.02488542, + "epoch": 0.4571772132872388, + "flos": 22162425488640.0, + "grad_norm": 1.7062836056432367, + "language_loss": 0.83458477, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85888028, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.15936279, + "step": 7604, + "time_per_iteration": 2.7378714084625244 + }, + { + "auxiliary_loss_clip": 0.0140235, + "auxiliary_loss_mlp": 0.01040862, + "balance_loss_clip": 1.27474225, + "balance_loss_mlp": 1.02479243, + "epoch": 0.45723733653990684, + "flos": 23884234003440.0, + "grad_norm": 5.0708316611849, + "language_loss": 0.86631095, + "learning_rate": 2.371900659559016e-06, + "loss": 0.89074308, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.1607666, + "step": 7605, + "time_per_iteration": 2.822519063949585 + }, + { + "auxiliary_loss_clip": 0.01394486, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.26961589, + "balance_loss_mlp": 1.02468348, + "epoch": 0.4572974597925748, + "flos": 16876014347880.0, + "grad_norm": 1.6737353357741678, + "language_loss": 0.73885071, + "learning_rate": 2.371517983373138e-06, + "loss": 0.76319438, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.15197754, + "step": 7606, + "time_per_iteration": 2.7691781520843506 + }, + { + "auxiliary_loss_clip": 0.01395918, + "auxiliary_loss_mlp": 0.01041126, + "balance_loss_clip": 1.27076936, + "balance_loss_mlp": 1.02558064, + "epoch": 0.45735758304524277, + "flos": 13775425737840.0, + "grad_norm": 2.1327469461422432, + "language_loss": 0.80442804, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82879847, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15551758, + "step": 7607, + "time_per_iteration": 4.21094012260437 + }, + { + "auxiliary_loss_clip": 0.0139486, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.27312458, + "balance_loss_mlp": 1.02739608, + "epoch": 0.45741770629791073, + "flos": 21105070165440.0, + "grad_norm": 1.886057234485845, + "language_loss": 0.8073501, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83171827, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14556885, + "step": 7608, + "time_per_iteration": 2.8003220558166504 + }, + { + "auxiliary_loss_clip": 0.01388716, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.26613927, + "balance_loss_mlp": 1.02464056, + "epoch": 0.4574778295505787, + "flos": 23118189030360.0, + "grad_norm": 1.6691402015008152, + "language_loss": 0.68224633, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70652997, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.15002441, + "step": 7609, + "time_per_iteration": 4.322239875793457 + }, + { + "auxiliary_loss_clip": 0.01391873, + "auxiliary_loss_mlp": 0.01041612, + "balance_loss_clip": 1.2676475, + "balance_loss_mlp": 1.02706218, + "epoch": 0.45753795280324666, + "flos": 24358318892640.0, + "grad_norm": 1.826285117399353, + "language_loss": 0.81000614, + "learning_rate": 2.369987137894757e-06, + "loss": 0.83434099, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14550781, + "step": 7610, + "time_per_iteration": 2.75929856300354 + }, + { + "auxiliary_loss_clip": 0.01398576, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.27308476, + "balance_loss_mlp": 1.02214289, + "epoch": 0.4575980760559146, + "flos": 16658089950240.0, + "grad_norm": 2.2079751679869477, + "language_loss": 0.82639974, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.85076356, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15661621, + "step": 7611, + "time_per_iteration": 2.7741901874542236 + }, + { + "auxiliary_loss_clip": 0.01389882, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.26530814, + "balance_loss_mlp": 1.01978636, + "epoch": 0.4576581993085826, + "flos": 35917344004680.0, + "grad_norm": 1.8636888921548267, + "language_loss": 0.7415241, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76577544, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.15478516, + "step": 7612, + "time_per_iteration": 2.8829898834228516 + }, + { + "auxiliary_loss_clip": 0.01388302, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.26550198, + "balance_loss_mlp": 1.02526879, + "epoch": 0.45771832256125056, + "flos": 20084773293720.0, + "grad_norm": 2.879384757582686, + "language_loss": 0.84960401, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87389171, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.15209961, + "step": 7613, + "time_per_iteration": 2.825198173522949 + }, + { + "auxiliary_loss_clip": 0.01390615, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.26808047, + "balance_loss_mlp": 1.01895237, + "epoch": 0.4577784458139185, + "flos": 10748020038480.0, + "grad_norm": 2.5104568948100643, + "language_loss": 0.75554025, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77977931, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14355469, + "step": 7614, + "time_per_iteration": 2.7139954566955566 + }, + { + "auxiliary_loss_clip": 0.01383756, + "auxiliary_loss_mlp": 0.01035219, + "balance_loss_clip": 1.26353729, + "balance_loss_mlp": 1.02106905, + "epoch": 0.4578385690665865, + "flos": 21912193817640.0, + "grad_norm": 1.3499944040191718, + "language_loss": 0.74914217, + "learning_rate": 2.368073265481791e-06, + "loss": 0.773332, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14172363, + "step": 7615, + "time_per_iteration": 2.7657549381256104 + }, + { + "auxiliary_loss_clip": 0.0123646, + "auxiliary_loss_mlp": 0.01003089, + "balance_loss_clip": 1.1866926, + "balance_loss_mlp": 0.99975109, + "epoch": 0.45789869231925445, + "flos": 64770990688080.0, + "grad_norm": 0.7809749263216023, + "language_loss": 0.57654381, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.5989393, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.03344727, + "step": 7616, + "time_per_iteration": 4.724584341049194 + }, + { + "auxiliary_loss_clip": 0.01383749, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.26105452, + "balance_loss_mlp": 1.02160192, + "epoch": 0.4579588155719224, + "flos": 16148367902160.0, + "grad_norm": 1.5068310139799488, + "language_loss": 0.71248007, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73668337, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14978027, + "step": 7617, + "time_per_iteration": 2.7410085201263428 + }, + { + "auxiliary_loss_clip": 0.01391769, + "auxiliary_loss_mlp": 0.0103945, + "balance_loss_clip": 1.26839733, + "balance_loss_mlp": 1.02452493, + "epoch": 0.45801893882459044, + "flos": 21400278918120.0, + "grad_norm": 2.5099951325535104, + "language_loss": 0.76934671, + "learning_rate": 2.36692477442939e-06, + "loss": 0.79365885, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.14923096, + "step": 7618, + "time_per_iteration": 2.79498028755188 + }, + { + "auxiliary_loss_clip": 0.01392367, + "auxiliary_loss_mlp": 0.01048347, + "balance_loss_clip": 1.2684052, + "balance_loss_mlp": 1.0342567, + "epoch": 0.4580790620772584, + "flos": 19541282071320.0, + "grad_norm": 2.0776955942226563, + "language_loss": 0.7697562, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79416335, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14074707, + "step": 7619, + "time_per_iteration": 2.9006595611572266 + }, + { + "auxiliary_loss_clip": 0.01382957, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.26170528, + "balance_loss_mlp": 1.02543449, + "epoch": 0.45813918532992637, + "flos": 16585394339880.0, + "grad_norm": 1.7888869251681716, + "language_loss": 0.71690625, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74111807, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.12780762, + "step": 7620, + "time_per_iteration": 2.7404565811157227 + }, + { + "auxiliary_loss_clip": 0.01382256, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.26189721, + "balance_loss_mlp": 1.0233593, + "epoch": 0.45819930858259433, + "flos": 42237940076280.0, + "grad_norm": 1.6277906812368415, + "language_loss": 0.78705728, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.81124938, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13598633, + "step": 7621, + "time_per_iteration": 2.9802498817443848 + }, + { + "auxiliary_loss_clip": 0.01232593, + "auxiliary_loss_mlp": 0.01004341, + "balance_loss_clip": 1.18313909, + "balance_loss_mlp": 1.00062132, + "epoch": 0.4582594318352623, + "flos": 63729269583480.0, + "grad_norm": 0.7803094973006508, + "language_loss": 0.65078038, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.6731497, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.03710938, + "step": 7622, + "time_per_iteration": 3.268301248550415 + }, + { + "auxiliary_loss_clip": 0.01391818, + "auxiliary_loss_mlp": 0.01036973, + "balance_loss_clip": 1.26770616, + "balance_loss_mlp": 1.02203584, + "epoch": 0.45831955508793026, + "flos": 26875637068680.0, + "grad_norm": 1.826412299871196, + "language_loss": 0.80049777, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.82478571, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14941406, + "step": 7623, + "time_per_iteration": 2.866976737976074 + }, + { + "auxiliary_loss_clip": 0.01393814, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.26800346, + "balance_loss_mlp": 1.02603924, + "epoch": 0.45837967834059823, + "flos": 18738584730360.0, + "grad_norm": 4.1605628161610575, + "language_loss": 0.70655811, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.73089516, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.13848877, + "step": 7624, + "time_per_iteration": 2.7843503952026367 + }, + { + "auxiliary_loss_clip": 0.01385662, + "auxiliary_loss_mlp": 0.01035874, + "balance_loss_clip": 1.26158381, + "balance_loss_mlp": 1.02168822, + "epoch": 0.4584398015932662, + "flos": 21183004254240.0, + "grad_norm": 3.7862372811518528, + "language_loss": 0.73337114, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75758648, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14196777, + "step": 7625, + "time_per_iteration": 2.79488468170166 + }, + { + "auxiliary_loss_clip": 0.01385294, + "auxiliary_loss_mlp": 0.0104813, + "balance_loss_clip": 1.26279092, + "balance_loss_mlp": 1.03415298, + "epoch": 0.45849992484593416, + "flos": 19794559369320.0, + "grad_norm": 2.7192146757783546, + "language_loss": 0.78179616, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80613041, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.13970947, + "step": 7626, + "time_per_iteration": 2.7843832969665527 + }, + { + "auxiliary_loss_clip": 0.01398911, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.27268434, + "balance_loss_mlp": 1.02808189, + "epoch": 0.4585600480986021, + "flos": 18227847473280.0, + "grad_norm": 1.7312121862242331, + "language_loss": 0.85020876, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87462682, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.14813232, + "step": 7627, + "time_per_iteration": 2.759261131286621 + }, + { + "auxiliary_loss_clip": 0.01399744, + "auxiliary_loss_mlp": 0.01043551, + "balance_loss_clip": 1.27164066, + "balance_loss_mlp": 1.02833998, + "epoch": 0.4586201713512701, + "flos": 29028096508680.0, + "grad_norm": 1.987165538957976, + "language_loss": 0.69437778, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71881074, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15209961, + "step": 7628, + "time_per_iteration": 2.8326895236968994 + }, + { + "auxiliary_loss_clip": 0.01384621, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.26286244, + "balance_loss_mlp": 1.01914811, + "epoch": 0.45868029460393805, + "flos": 23409905464080.0, + "grad_norm": 1.512635533707474, + "language_loss": 0.78241742, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80659783, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14282227, + "step": 7629, + "time_per_iteration": 2.754533529281616 + }, + { + "auxiliary_loss_clip": 0.01396742, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.26781285, + "balance_loss_mlp": 1.0230298, + "epoch": 0.458740417856606, + "flos": 18226385572320.0, + "grad_norm": 1.8359587767907106, + "language_loss": 0.79843926, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.82279706, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.16003418, + "step": 7630, + "time_per_iteration": 2.8068912029266357 + }, + { + "auxiliary_loss_clip": 0.01393467, + "auxiliary_loss_mlp": 0.01040568, + "balance_loss_clip": 1.26613164, + "balance_loss_mlp": 1.02580953, + "epoch": 0.458800541109274, + "flos": 34575459927480.0, + "grad_norm": 1.9007383795799335, + "language_loss": 0.72411656, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74845695, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14758301, + "step": 7631, + "time_per_iteration": 2.8889520168304443 + }, + { + "auxiliary_loss_clip": 0.01390739, + "auxiliary_loss_mlp": 0.01043353, + "balance_loss_clip": 1.26642489, + "balance_loss_mlp": 1.0287199, + "epoch": 0.458860664361942, + "flos": 17716582307520.0, + "grad_norm": 2.470006726026464, + "language_loss": 0.72358739, + "learning_rate": 2.361563500108531e-06, + "loss": 0.74792832, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.1463623, + "step": 7632, + "time_per_iteration": 2.7416441440582275 + }, + { + "auxiliary_loss_clip": 0.01392339, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.26395202, + "balance_loss_mlp": 1.01687908, + "epoch": 0.45892078761460997, + "flos": 18446462213040.0, + "grad_norm": 2.225564561581343, + "language_loss": 0.69416642, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71841377, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.1552124, + "step": 7633, + "time_per_iteration": 2.7489166259765625 + }, + { + "auxiliary_loss_clip": 0.01386119, + "auxiliary_loss_mlp": 0.01042681, + "balance_loss_clip": 1.2615428, + "balance_loss_mlp": 1.02791142, + "epoch": 0.45898091086727794, + "flos": 22677913923840.0, + "grad_norm": 1.521969826872868, + "language_loss": 0.81010568, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.83439362, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14758301, + "step": 7634, + "time_per_iteration": 2.8036346435546875 + }, + { + "auxiliary_loss_clip": 0.01405317, + "auxiliary_loss_mlp": 0.01042708, + "balance_loss_clip": 1.27518511, + "balance_loss_mlp": 1.02716291, + "epoch": 0.4590410341199459, + "flos": 21657982527360.0, + "grad_norm": 1.6543636790428815, + "language_loss": 0.81572181, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84020203, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15551758, + "step": 7635, + "time_per_iteration": 2.8009653091430664 + }, + { + "auxiliary_loss_clip": 0.01382114, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.2594285, + "balance_loss_mlp": 1.03184152, + "epoch": 0.45910115737261387, + "flos": 36541855551240.0, + "grad_norm": 1.338612742625119, + "language_loss": 0.64871603, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67300326, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14770508, + "step": 7636, + "time_per_iteration": 2.866009473800659 + }, + { + "auxiliary_loss_clip": 0.01382781, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.26137209, + "balance_loss_mlp": 1.01933658, + "epoch": 0.45916128062528183, + "flos": 24424476557040.0, + "grad_norm": 1.4675104423561636, + "language_loss": 0.80539548, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82956308, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14648438, + "step": 7637, + "time_per_iteration": 2.8003923892974854 + }, + { + "auxiliary_loss_clip": 0.01394716, + "auxiliary_loss_mlp": 0.010442, + "balance_loss_clip": 1.26805854, + "balance_loss_mlp": 1.02845287, + "epoch": 0.4592214038779498, + "flos": 23227212141720.0, + "grad_norm": 1.5004157838833752, + "language_loss": 0.75510877, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77949786, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.1574707, + "step": 7638, + "time_per_iteration": 4.224390983581543 + }, + { + "auxiliary_loss_clip": 0.01379303, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.2579037, + "balance_loss_mlp": 1.02177119, + "epoch": 0.45928152713061776, + "flos": 19176870027240.0, + "grad_norm": 1.6817328417422377, + "language_loss": 0.74271321, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76686168, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.13781738, + "step": 7639, + "time_per_iteration": 2.721055030822754 + }, + { + "auxiliary_loss_clip": 0.01391466, + "auxiliary_loss_mlp": 0.01038784, + "balance_loss_clip": 1.2659061, + "balance_loss_mlp": 1.02363253, + "epoch": 0.4593416503832857, + "flos": 22419235713960.0, + "grad_norm": 1.529176645298296, + "language_loss": 0.68231857, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70662111, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.15148926, + "step": 7640, + "time_per_iteration": 2.862985610961914 + }, + { + "auxiliary_loss_clip": 0.01393172, + "auxiliary_loss_mlp": 0.01038622, + "balance_loss_clip": 1.26656651, + "balance_loss_mlp": 1.0242157, + "epoch": 0.4594017736359537, + "flos": 18884909943360.0, + "grad_norm": 1.5269521471118888, + "language_loss": 0.75657952, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.78089744, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.14404297, + "step": 7641, + "time_per_iteration": 2.7407026290893555 + }, + { + "auxiliary_loss_clip": 0.01397641, + "auxiliary_loss_mlp": 0.01040067, + "balance_loss_clip": 1.27057052, + "balance_loss_mlp": 1.02391434, + "epoch": 0.45946189688862166, + "flos": 20523017982240.0, + "grad_norm": 1.635383839673315, + "language_loss": 0.74786246, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77223951, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.16137695, + "step": 7642, + "time_per_iteration": 2.8077645301818848 + }, + { + "auxiliary_loss_clip": 0.01221558, + "auxiliary_loss_mlp": 0.0100424, + "balance_loss_clip": 1.17231858, + "balance_loss_mlp": 1.00121236, + "epoch": 0.4595220201412896, + "flos": 61418830331160.0, + "grad_norm": 0.8252217542283492, + "language_loss": 0.582515, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60477298, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.03027344, + "step": 7643, + "time_per_iteration": 3.0638253688812256 + }, + { + "auxiliary_loss_clip": 0.01403639, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.27249122, + "balance_loss_mlp": 1.02279174, + "epoch": 0.4595821433939576, + "flos": 23336154036360.0, + "grad_norm": 1.5267694985072082, + "language_loss": 0.93372273, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95813411, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.14709473, + "step": 7644, + "time_per_iteration": 2.841905355453491 + }, + { + "auxiliary_loss_clip": 0.01390165, + "auxiliary_loss_mlp": 0.01037293, + "balance_loss_clip": 1.26306736, + "balance_loss_mlp": 1.02197433, + "epoch": 0.4596422666466256, + "flos": 14286366036720.0, + "grad_norm": 1.7899603304349112, + "language_loss": 0.82760608, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.85188067, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15307617, + "step": 7645, + "time_per_iteration": 2.813481330871582 + }, + { + "auxiliary_loss_clip": 0.01217399, + "auxiliary_loss_mlp": 0.01003129, + "balance_loss_clip": 1.16879869, + "balance_loss_mlp": 0.9997434, + "epoch": 0.4597023898992936, + "flos": 65742835140360.0, + "grad_norm": 0.759256819416298, + "language_loss": 0.59922767, + "learning_rate": 2.356199538526593e-06, + "loss": 0.62143302, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.03393555, + "step": 7646, + "time_per_iteration": 4.625898838043213 + }, + { + "auxiliary_loss_clip": 0.01387218, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.26146209, + "balance_loss_mlp": 1.01718163, + "epoch": 0.45976251315196154, + "flos": 26912817345240.0, + "grad_norm": 1.5102204842860647, + "language_loss": 0.72885841, + "learning_rate": 2.355816296637939e-06, + "loss": 0.75305176, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14929199, + "step": 7647, + "time_per_iteration": 4.249419689178467 + }, + { + "auxiliary_loss_clip": 0.01400534, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.27340889, + "balance_loss_mlp": 1.02251816, + "epoch": 0.4598226364046295, + "flos": 26624146538520.0, + "grad_norm": 1.6496866883135495, + "language_loss": 0.6683172, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69269812, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.15045166, + "step": 7648, + "time_per_iteration": 2.826284408569336 + }, + { + "auxiliary_loss_clip": 0.0138784, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.2634902, + "balance_loss_mlp": 1.01730824, + "epoch": 0.45988275965729747, + "flos": 24392940842520.0, + "grad_norm": 1.4432641462948423, + "language_loss": 0.78892016, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81312174, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.15014648, + "step": 7649, + "time_per_iteration": 2.776054620742798 + }, + { + "auxiliary_loss_clip": 0.01392503, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.26851332, + "balance_loss_mlp": 1.02486384, + "epoch": 0.45994288290996543, + "flos": 24541296473520.0, + "grad_norm": 1.8329857242441512, + "language_loss": 0.69805372, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.72237802, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.1505127, + "step": 7650, + "time_per_iteration": 2.7807157039642334 + }, + { + "auxiliary_loss_clip": 0.01402134, + "auxiliary_loss_mlp": 0.01040721, + "balance_loss_clip": 1.27217031, + "balance_loss_mlp": 1.02381706, + "epoch": 0.4600030061626334, + "flos": 14834405395440.0, + "grad_norm": 1.8126483471715242, + "language_loss": 0.8447299, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86915851, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.16906738, + "step": 7651, + "time_per_iteration": 2.7215888500213623 + }, + { + "auxiliary_loss_clip": 0.0138725, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.2643218, + "balance_loss_mlp": 1.01397312, + "epoch": 0.46006312941530136, + "flos": 18118418278320.0, + "grad_norm": 2.3104367201249656, + "language_loss": 0.75612438, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.78028464, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14807129, + "step": 7652, + "time_per_iteration": 2.7220773696899414 + }, + { + "auxiliary_loss_clip": 0.01391686, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.26434171, + "balance_loss_mlp": 1.01294982, + "epoch": 0.46012325266796933, + "flos": 21980909808720.0, + "grad_norm": 1.575663402570377, + "language_loss": 0.7639209, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78812295, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15563965, + "step": 7653, + "time_per_iteration": 2.8478078842163086 + }, + { + "auxiliary_loss_clip": 0.01412248, + "auxiliary_loss_mlp": 0.01037633, + "balance_loss_clip": 1.27994919, + "balance_loss_mlp": 1.02032423, + "epoch": 0.4601833759206373, + "flos": 15272284608720.0, + "grad_norm": 2.7834567245875705, + "language_loss": 0.65972346, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68422228, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.17297363, + "step": 7654, + "time_per_iteration": 2.790635585784912 + }, + { + "auxiliary_loss_clip": 0.01387743, + "auxiliary_loss_mlp": 0.01034076, + "balance_loss_clip": 1.26331341, + "balance_loss_mlp": 1.0193119, + "epoch": 0.46024349917330526, + "flos": 27094698500400.0, + "grad_norm": 1.621666985415053, + "language_loss": 0.79034591, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81456411, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14782715, + "step": 7655, + "time_per_iteration": 4.270195245742798 + }, + { + "auxiliary_loss_clip": 0.01384957, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.26296842, + "balance_loss_mlp": 1.01766562, + "epoch": 0.4603036224259732, + "flos": 24468397821360.0, + "grad_norm": 3.7197691685930643, + "language_loss": 0.67779303, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.70196491, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.14562988, + "step": 7656, + "time_per_iteration": 2.846954107284546 + }, + { + "auxiliary_loss_clip": 0.0138538, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.26060641, + "balance_loss_mlp": 1.02239656, + "epoch": 0.4603637456786412, + "flos": 28114954763760.0, + "grad_norm": 1.6933105489142375, + "language_loss": 0.81126863, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83549201, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.14550781, + "step": 7657, + "time_per_iteration": 2.8476412296295166 + }, + { + "auxiliary_loss_clip": 0.01388866, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.26397121, + "balance_loss_mlp": 1.01775897, + "epoch": 0.4604238689313092, + "flos": 24353811364680.0, + "grad_norm": 2.0053930155581554, + "language_loss": 0.70723522, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.73145366, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.15222168, + "step": 7658, + "time_per_iteration": 2.8343217372894287 + }, + { + "auxiliary_loss_clip": 0.01213854, + "auxiliary_loss_mlp": 0.01005333, + "balance_loss_clip": 1.16454649, + "balance_loss_mlp": 1.00209069, + "epoch": 0.4604839921839772, + "flos": 53619121242000.0, + "grad_norm": 0.9641891884241026, + "language_loss": 0.62154818, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64374006, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.0324707, + "step": 7659, + "time_per_iteration": 3.3924105167388916 + }, + { + "auxiliary_loss_clip": 0.01387051, + "auxiliary_loss_mlp": 0.01031716, + "balance_loss_clip": 1.26476908, + "balance_loss_mlp": 1.01602757, + "epoch": 0.46054411543664514, + "flos": 31254672851640.0, + "grad_norm": 1.575298834269094, + "language_loss": 0.68689841, + "learning_rate": 2.350832929550336e-06, + "loss": 0.71108609, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.15686035, + "step": 7660, + "time_per_iteration": 2.860966444015503 + }, + { + "auxiliary_loss_clip": 0.01390761, + "auxiliary_loss_mlp": 0.01037303, + "balance_loss_clip": 1.26518559, + "balance_loss_mlp": 1.02263415, + "epoch": 0.4606042386893131, + "flos": 24097610264760.0, + "grad_norm": 1.7299536777648514, + "language_loss": 0.7740643, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79834491, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14660645, + "step": 7661, + "time_per_iteration": 2.820289373397827 + }, + { + "auxiliary_loss_clip": 0.01383401, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.26373374, + "balance_loss_mlp": 1.0241766, + "epoch": 0.46066436194198107, + "flos": 26584123676760.0, + "grad_norm": 1.8909173328040105, + "language_loss": 0.75154144, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77575207, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.13500977, + "step": 7662, + "time_per_iteration": 2.773458957672119 + }, + { + "auxiliary_loss_clip": 0.01402499, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.27127147, + "balance_loss_mlp": 1.02021778, + "epoch": 0.46072448519464904, + "flos": 17778841569360.0, + "grad_norm": 4.272006868555279, + "language_loss": 0.80054528, + "learning_rate": 2.349682601310998e-06, + "loss": 0.82494414, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.17163086, + "step": 7663, + "time_per_iteration": 2.791964054107666 + }, + { + "auxiliary_loss_clip": 0.0138373, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.26325595, + "balance_loss_mlp": 1.02330971, + "epoch": 0.460784608447317, + "flos": 15090362845200.0, + "grad_norm": 1.8669525456110152, + "language_loss": 0.73800075, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.76221484, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14367676, + "step": 7664, + "time_per_iteration": 2.784712314605713 + }, + { + "auxiliary_loss_clip": 0.01386928, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.26386273, + "balance_loss_mlp": 1.01811075, + "epoch": 0.46084473169998497, + "flos": 18592909251120.0, + "grad_norm": 1.4384726434761808, + "language_loss": 0.72450435, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74869734, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.1427002, + "step": 7665, + "time_per_iteration": 2.8847298622131348 + }, + { + "auxiliary_loss_clip": 0.01383051, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.25856483, + "balance_loss_mlp": 1.02065635, + "epoch": 0.46090485495265293, + "flos": 19498822707960.0, + "grad_norm": 1.7657379110939284, + "language_loss": 0.77974683, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80392742, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14355469, + "step": 7666, + "time_per_iteration": 2.7834928035736084 + }, + { + "auxiliary_loss_clip": 0.01383943, + "auxiliary_loss_mlp": 0.0103571, + "balance_loss_clip": 1.26213717, + "balance_loss_mlp": 1.01999831, + "epoch": 0.4609649782053209, + "flos": 33371495132760.0, + "grad_norm": 1.3135344712995654, + "language_loss": 0.74334753, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76754409, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.15722656, + "step": 7667, + "time_per_iteration": 2.8631365299224854 + }, + { + "auxiliary_loss_clip": 0.01384763, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.26054442, + "balance_loss_mlp": 1.02226424, + "epoch": 0.46102510145798886, + "flos": 23774479941600.0, + "grad_norm": 2.519819288236139, + "language_loss": 0.76748025, + "learning_rate": 2.347765122572676e-06, + "loss": 0.79169035, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.13970947, + "step": 7668, + "time_per_iteration": 2.77877140045166 + }, + { + "auxiliary_loss_clip": 0.01382956, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.26382792, + "balance_loss_mlp": 1.0183382, + "epoch": 0.4610852247106568, + "flos": 23300070185520.0, + "grad_norm": 1.867717436054439, + "language_loss": 0.78149211, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80563062, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.12542725, + "step": 7669, + "time_per_iteration": 2.763645887374878 + }, + { + "auxiliary_loss_clip": 0.01386885, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.2618165, + "balance_loss_mlp": 1.02055788, + "epoch": 0.4611453479633248, + "flos": 25452976317480.0, + "grad_norm": 1.7022023303689136, + "language_loss": 0.83284247, + "learning_rate": 2.34699803866453e-06, + "loss": 0.8570534, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.13659668, + "step": 7670, + "time_per_iteration": 2.8337419033050537 + }, + { + "auxiliary_loss_clip": 0.01382856, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.26228213, + "balance_loss_mlp": 1.02533436, + "epoch": 0.4612054712159928, + "flos": 21144240251640.0, + "grad_norm": 1.5740320828615866, + "language_loss": 0.64004558, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.66426444, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.13702393, + "step": 7671, + "time_per_iteration": 2.742290735244751 + }, + { + "auxiliary_loss_clip": 0.01219819, + "auxiliary_loss_mlp": 0.01007996, + "balance_loss_clip": 1.17151141, + "balance_loss_mlp": 1.00561225, + "epoch": 0.4612655944686608, + "flos": 69975261451800.0, + "grad_norm": 0.7113054722180343, + "language_loss": 0.55950165, + "learning_rate": 2.346230902123583e-06, + "loss": 0.58177984, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.02380371, + "step": 7672, + "time_per_iteration": 3.321553945541382 + }, + { + "auxiliary_loss_clip": 0.01389166, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.26370597, + "balance_loss_mlp": 1.02500975, + "epoch": 0.46132571772132874, + "flos": 16841595439800.0, + "grad_norm": 1.8764237464296771, + "language_loss": 0.71823192, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.74251544, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14172363, + "step": 7673, + "time_per_iteration": 2.7364397048950195 + }, + { + "auxiliary_loss_clip": 0.01388326, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.26732159, + "balance_loss_mlp": 1.01957202, + "epoch": 0.4613858409739967, + "flos": 35815752223200.0, + "grad_norm": 1.9243049442428473, + "language_loss": 0.71323246, + "learning_rate": 2.345463713066195e-06, + "loss": 0.73744977, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.13824463, + "step": 7674, + "time_per_iteration": 2.8583621978759766 + }, + { + "auxiliary_loss_clip": 0.01384556, + "auxiliary_loss_mlp": 0.01041848, + "balance_loss_clip": 1.26041794, + "balance_loss_mlp": 1.02683926, + "epoch": 0.4614459642266647, + "flos": 35274169593720.0, + "grad_norm": 1.605316573099223, + "language_loss": 0.65290642, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67717052, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.15002441, + "step": 7675, + "time_per_iteration": 2.9431955814361572 + }, + { + "auxiliary_loss_clip": 0.01217058, + "auxiliary_loss_mlp": 0.01010348, + "balance_loss_clip": 1.16991854, + "balance_loss_mlp": 1.00788069, + "epoch": 0.46150608747933264, + "flos": 66719210747760.0, + "grad_norm": 0.7923726112552516, + "language_loss": 0.58644068, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.6087147, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.0246582, + "step": 7676, + "time_per_iteration": 3.256633996963501 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01009008, + "balance_loss_clip": 1.17091, + "balance_loss_mlp": 1.00640965, + "epoch": 0.4615662107320006, + "flos": 55843139258280.0, + "grad_norm": 0.7856064948254774, + "language_loss": 0.62755513, + "learning_rate": 2.344312831266341e-06, + "loss": 0.6498276, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.02600098, + "step": 7677, + "time_per_iteration": 4.621450662612915 + }, + { + "auxiliary_loss_clip": 0.01384367, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.26270008, + "balance_loss_mlp": 1.02180195, + "epoch": 0.46162633398466857, + "flos": 15487000945920.0, + "grad_norm": 2.227769338970245, + "language_loss": 0.76511705, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78932571, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14715576, + "step": 7678, + "time_per_iteration": 2.8117244243621826 + }, + { + "auxiliary_loss_clip": 0.01393969, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.26949859, + "balance_loss_mlp": 1.02086186, + "epoch": 0.46168645723733653, + "flos": 20016300952800.0, + "grad_norm": 2.020146331730863, + "language_loss": 0.66312426, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68742418, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.15161133, + "step": 7679, + "time_per_iteration": 2.77763032913208 + }, + { + "auxiliary_loss_clip": 0.01386346, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.26394188, + "balance_loss_mlp": 1.02884543, + "epoch": 0.4617465804900045, + "flos": 20302819516440.0, + "grad_norm": 2.104794323147823, + "language_loss": 0.70408547, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.72837722, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.13989258, + "step": 7680, + "time_per_iteration": 2.843435049057007 + }, + { + "auxiliary_loss_clip": 0.01396393, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_clip": 1.26918411, + "balance_loss_mlp": 1.0314579, + "epoch": 0.46180670374267246, + "flos": 22351453715160.0, + "grad_norm": 1.758252952932543, + "language_loss": 0.63590729, + "learning_rate": 2.342778139478487e-06, + "loss": 0.6603387, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.1529541, + "step": 7681, + "time_per_iteration": 2.7700912952423096 + }, + { + "auxiliary_loss_clip": 0.01383582, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.26296151, + "balance_loss_mlp": 1.02103853, + "epoch": 0.46186682699534043, + "flos": 19899724686480.0, + "grad_norm": 1.4740952592377983, + "language_loss": 0.67416716, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69835246, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13916016, + "step": 7682, + "time_per_iteration": 2.795598030090332 + }, + { + "auxiliary_loss_clip": 0.01391847, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.26831675, + "balance_loss_mlp": 1.02078533, + "epoch": 0.4619269502480084, + "flos": 31509818134200.0, + "grad_norm": 2.200701680972394, + "language_loss": 0.74394268, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76821446, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.14550781, + "step": 7683, + "time_per_iteration": 2.8876681327819824 + }, + { + "auxiliary_loss_clip": 0.01384175, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.26199889, + "balance_loss_mlp": 1.02446103, + "epoch": 0.46198707350067636, + "flos": 25014934670760.0, + "grad_norm": 1.740036495302102, + "language_loss": 0.76802576, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.79225534, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.14318848, + "step": 7684, + "time_per_iteration": 2.8111488819122314 + }, + { + "auxiliary_loss_clip": 0.01402083, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.27329469, + "balance_loss_mlp": 1.02167368, + "epoch": 0.4620471967533444, + "flos": 18296441639280.0, + "grad_norm": 1.6761655026805782, + "language_loss": 0.80248749, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82687473, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.1496582, + "step": 7685, + "time_per_iteration": 4.275589466094971 + }, + { + "auxiliary_loss_clip": 0.01381049, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.26164079, + "balance_loss_mlp": 1.02068567, + "epoch": 0.46210732000601235, + "flos": 33991742801520.0, + "grad_norm": 1.6217158255133324, + "language_loss": 0.67264563, + "learning_rate": 2.340859482393731e-06, + "loss": 0.69681811, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.15527344, + "step": 7686, + "time_per_iteration": 4.334856748580933 + }, + { + "auxiliary_loss_clip": 0.01387762, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.26256943, + "balance_loss_mlp": 1.02438891, + "epoch": 0.4621674432586803, + "flos": 25014853454040.0, + "grad_norm": 2.9127610716788372, + "language_loss": 0.7413938, + "learning_rate": 2.340475712142296e-06, + "loss": 0.76566309, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14788818, + "step": 7687, + "time_per_iteration": 2.7410950660705566 + }, + { + "auxiliary_loss_clip": 0.01385883, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.26396298, + "balance_loss_mlp": 1.01608324, + "epoch": 0.4622275665113483, + "flos": 22018617993960.0, + "grad_norm": 1.996403515530144, + "language_loss": 0.75142783, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.7755971, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.14953613, + "step": 7688, + "time_per_iteration": 2.8110382556915283 + }, + { + "auxiliary_loss_clip": 0.01382115, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.25977433, + "balance_loss_mlp": 1.01950812, + "epoch": 0.46228768976401624, + "flos": 24063881698800.0, + "grad_norm": 1.5912396510787772, + "language_loss": 0.79349709, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.81766737, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.15393066, + "step": 7689, + "time_per_iteration": 2.8243589401245117 + }, + { + "auxiliary_loss_clip": 0.01389376, + "auxiliary_loss_mlp": 0.01036699, + "balance_loss_clip": 1.26343179, + "balance_loss_mlp": 1.0211308, + "epoch": 0.4623478130166842, + "flos": 26657062937280.0, + "grad_norm": 2.346444648287068, + "language_loss": 0.57013881, + "learning_rate": 2.339324323980964e-06, + "loss": 0.59439957, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.15563965, + "step": 7690, + "time_per_iteration": 2.818662405014038 + }, + { + "auxiliary_loss_clip": 0.01389129, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.26268959, + "balance_loss_mlp": 1.01849914, + "epoch": 0.46240793626935217, + "flos": 20563325102520.0, + "grad_norm": 2.0809325489694417, + "language_loss": 0.83310771, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85733539, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.15142822, + "step": 7691, + "time_per_iteration": 2.758272886276245 + }, + { + "auxiliary_loss_clip": 0.01386986, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.26322556, + "balance_loss_mlp": 1.0145781, + "epoch": 0.46246805952202014, + "flos": 22461248385360.0, + "grad_norm": 1.7308174774546168, + "language_loss": 0.75529122, + "learning_rate": 2.338556667513091e-06, + "loss": 0.7794528, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14587402, + "step": 7692, + "time_per_iteration": 2.7836756706237793 + }, + { + "auxiliary_loss_clip": 0.01385074, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.2602098, + "balance_loss_mlp": 1.02182078, + "epoch": 0.4625281827746881, + "flos": 35047839265560.0, + "grad_norm": 2.414126876151916, + "language_loss": 0.74172658, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76595199, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.15649414, + "step": 7693, + "time_per_iteration": 2.8769659996032715 + }, + { + "auxiliary_loss_clip": 0.01386846, + "auxiliary_loss_mlp": 0.01039867, + "balance_loss_clip": 1.26529789, + "balance_loss_mlp": 1.02509081, + "epoch": 0.46258830602735607, + "flos": 21073534450920.0, + "grad_norm": 1.4892069400243642, + "language_loss": 0.85857499, + "learning_rate": 2.337788959692808e-06, + "loss": 0.88284206, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14764404, + "step": 7694, + "time_per_iteration": 4.304639101028442 + }, + { + "auxiliary_loss_clip": 0.01388177, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.26356936, + "balance_loss_mlp": 1.02003276, + "epoch": 0.46264842928002403, + "flos": 26182856223000.0, + "grad_norm": 2.0129112715183126, + "language_loss": 0.79208565, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81630629, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.13861084, + "step": 7695, + "time_per_iteration": 2.805072546005249 + }, + { + "auxiliary_loss_clip": 0.01375938, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.25545883, + "balance_loss_mlp": 1.01796985, + "epoch": 0.462708552532692, + "flos": 16768656179280.0, + "grad_norm": 1.6106319319126288, + "language_loss": 0.72168648, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74576855, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14276123, + "step": 7696, + "time_per_iteration": 2.762927293777466 + }, + { + "auxiliary_loss_clip": 0.01391502, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.26714563, + "balance_loss_mlp": 1.02569532, + "epoch": 0.46276867578535996, + "flos": 15564772601280.0, + "grad_norm": 1.6455686660620965, + "language_loss": 0.69988602, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.72420919, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.15112305, + "step": 7697, + "time_per_iteration": 2.7775559425354004 + }, + { + "auxiliary_loss_clip": 0.01387109, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.26461244, + "balance_loss_mlp": 1.01890254, + "epoch": 0.462828799038028, + "flos": 22420047881160.0, + "grad_norm": 1.849754677583956, + "language_loss": 0.85016221, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.87436509, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14263916, + "step": 7698, + "time_per_iteration": 2.796754837036133 + }, + { + "auxiliary_loss_clip": 0.01386043, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.26469541, + "balance_loss_mlp": 1.01916122, + "epoch": 0.46288892229069595, + "flos": 21074590268280.0, + "grad_norm": 1.825056140035757, + "language_loss": 0.71461499, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73880959, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14251709, + "step": 7699, + "time_per_iteration": 2.783036708831787 + }, + { + "auxiliary_loss_clip": 0.01392482, + "auxiliary_loss_mlp": 0.01036875, + "balance_loss_clip": 1.26586652, + "balance_loss_mlp": 1.02172935, + "epoch": 0.4629490455433639, + "flos": 23190965857440.0, + "grad_norm": 2.3286489416720886, + "language_loss": 0.7180016, + "learning_rate": 2.335485529281996e-06, + "loss": 0.74229515, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.15148926, + "step": 7700, + "time_per_iteration": 2.8807532787323 + }, + { + "auxiliary_loss_clip": 0.01381013, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.25966835, + "balance_loss_mlp": 1.02288926, + "epoch": 0.4630091687960319, + "flos": 18839161302840.0, + "grad_norm": 2.0101042813178216, + "language_loss": 0.72851276, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.7526902, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.1385498, + "step": 7701, + "time_per_iteration": 2.8744587898254395 + }, + { + "auxiliary_loss_clip": 0.0139052, + "auxiliary_loss_mlp": 0.01036378, + "balance_loss_clip": 1.26404738, + "balance_loss_mlp": 1.02220392, + "epoch": 0.46306929204869984, + "flos": 38913823114920.0, + "grad_norm": 1.8763586259406697, + "language_loss": 0.65399957, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67826855, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.14172363, + "step": 7702, + "time_per_iteration": 2.891047716140747 + }, + { + "auxiliary_loss_clip": 0.01376832, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.25644505, + "balance_loss_mlp": 1.01739192, + "epoch": 0.4631294153013678, + "flos": 19648680848280.0, + "grad_norm": 1.9796568848381408, + "language_loss": 0.73230654, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75638849, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13970947, + "step": 7703, + "time_per_iteration": 2.781620502471924 + }, + { + "auxiliary_loss_clip": 0.0139093, + "auxiliary_loss_mlp": 0.01040591, + "balance_loss_clip": 1.2649343, + "balance_loss_mlp": 1.02517724, + "epoch": 0.4631895385540358, + "flos": 24613667217000.0, + "grad_norm": 1.626753952928389, + "language_loss": 0.6863752, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.71069038, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.15423584, + "step": 7704, + "time_per_iteration": 2.8003382682800293 + }, + { + "auxiliary_loss_clip": 0.01387845, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.26209581, + "balance_loss_mlp": 1.02098036, + "epoch": 0.46324966180670374, + "flos": 26325364250160.0, + "grad_norm": 1.9414125447156274, + "language_loss": 0.81419009, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83841795, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.13970947, + "step": 7705, + "time_per_iteration": 2.8851070404052734 + }, + { + "auxiliary_loss_clip": 0.01393537, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.26746929, + "balance_loss_mlp": 1.02044272, + "epoch": 0.4633097850593717, + "flos": 19244652026040.0, + "grad_norm": 1.754113873961086, + "language_loss": 0.78010583, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.80438471, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.13934326, + "step": 7706, + "time_per_iteration": 2.767740249633789 + }, + { + "auxiliary_loss_clip": 0.01376585, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.25761151, + "balance_loss_mlp": 1.02048922, + "epoch": 0.46336990831203967, + "flos": 22788236502720.0, + "grad_norm": 2.042198238909929, + "language_loss": 0.70363748, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.7277528, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14447021, + "step": 7707, + "time_per_iteration": 2.8289918899536133 + }, + { + "auxiliary_loss_clip": 0.0139444, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.26796472, + "balance_loss_mlp": 1.02168441, + "epoch": 0.46343003156470763, + "flos": 38216900216520.0, + "grad_norm": 2.052338388234154, + "language_loss": 0.62017292, + "learning_rate": 2.332413576865791e-06, + "loss": 0.64448953, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.15527344, + "step": 7708, + "time_per_iteration": 2.8726253509521484 + }, + { + "auxiliary_loss_clip": 0.01384979, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.26168609, + "balance_loss_mlp": 1.02099562, + "epoch": 0.4634901548173756, + "flos": 31944489287040.0, + "grad_norm": 1.9350345873931973, + "language_loss": 0.77407557, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79827785, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.1427002, + "step": 7709, + "time_per_iteration": 2.901211738586426 + }, + { + "auxiliary_loss_clip": 0.01385518, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.2595464, + "balance_loss_mlp": 1.02190018, + "epoch": 0.46355027807004356, + "flos": 20087169186960.0, + "grad_norm": 1.6597799402128137, + "language_loss": 0.77524799, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79947418, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.15197754, + "step": 7710, + "time_per_iteration": 2.7645153999328613 + }, + { + "auxiliary_loss_clip": 0.01399431, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.27255881, + "balance_loss_mlp": 1.01762986, + "epoch": 0.4636104013227116, + "flos": 24066764892360.0, + "grad_norm": 1.9899145987426663, + "language_loss": 0.73504919, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75938344, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.16369629, + "step": 7711, + "time_per_iteration": 2.7908737659454346 + }, + { + "auxiliary_loss_clip": 0.01384937, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.26238441, + "balance_loss_mlp": 1.02039337, + "epoch": 0.46367052457537955, + "flos": 23919911770680.0, + "grad_norm": 1.279376816079651, + "language_loss": 0.71329111, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73749018, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.14562988, + "step": 7712, + "time_per_iteration": 2.822768211364746 + }, + { + "auxiliary_loss_clip": 0.01397866, + "auxiliary_loss_mlp": 0.01044675, + "balance_loss_clip": 1.2692759, + "balance_loss_mlp": 1.02831316, + "epoch": 0.4637306478280475, + "flos": 26402242521600.0, + "grad_norm": 2.1922383026641428, + "language_loss": 0.7315833, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75600863, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.16351318, + "step": 7713, + "time_per_iteration": 2.8566062450408936 + }, + { + "auxiliary_loss_clip": 0.01398957, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.27025294, + "balance_loss_mlp": 1.01857138, + "epoch": 0.4637907710807155, + "flos": 21985701595200.0, + "grad_norm": 1.5540510982345745, + "language_loss": 0.59294713, + "learning_rate": 2.3301090827294e-06, + "loss": 0.61728084, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.15820312, + "step": 7714, + "time_per_iteration": 2.7856950759887695 + }, + { + "auxiliary_loss_clip": 0.01385207, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.26215911, + "balance_loss_mlp": 1.01711345, + "epoch": 0.46385089433338345, + "flos": 12426150939120.0, + "grad_norm": 1.804474506640396, + "language_loss": 0.71118414, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.73534852, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14123535, + "step": 7715, + "time_per_iteration": 2.751729726791382 + }, + { + "auxiliary_loss_clip": 0.01400282, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.27009165, + "balance_loss_mlp": 1.02656448, + "epoch": 0.4639110175860514, + "flos": 23921170629840.0, + "grad_norm": 2.124091179872099, + "language_loss": 0.68612486, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.7105518, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.15844727, + "step": 7716, + "time_per_iteration": 4.275898694992065 + }, + { + "auxiliary_loss_clip": 0.01390311, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.26416206, + "balance_loss_mlp": 1.01680493, + "epoch": 0.4639711408387194, + "flos": 25305757720560.0, + "grad_norm": 1.5077045947257663, + "language_loss": 0.81097221, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83519852, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15509033, + "step": 7717, + "time_per_iteration": 2.8311448097229004 + }, + { + "auxiliary_loss_clip": 0.01393242, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.26694632, + "balance_loss_mlp": 1.02035451, + "epoch": 0.46403126409138734, + "flos": 21216367344960.0, + "grad_norm": 1.8318718986450835, + "language_loss": 0.73690367, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.76118416, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.14447021, + "step": 7718, + "time_per_iteration": 2.8067092895507812 + }, + { + "auxiliary_loss_clip": 0.01385784, + "auxiliary_loss_mlp": 0.01038713, + "balance_loss_clip": 1.26142621, + "balance_loss_mlp": 1.02313805, + "epoch": 0.4640913873440553, + "flos": 35852404591080.0, + "grad_norm": 1.751813110617302, + "language_loss": 0.70818913, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.73243409, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.15582275, + "step": 7719, + "time_per_iteration": 2.941709518432617 + }, + { + "auxiliary_loss_clip": 0.01392001, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.26600909, + "balance_loss_mlp": 1.02238965, + "epoch": 0.46415151059672327, + "flos": 19170169647840.0, + "grad_norm": 1.8564590554116813, + "language_loss": 0.86768496, + "learning_rate": 2.327804137953357e-06, + "loss": 0.89197618, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14733887, + "step": 7720, + "time_per_iteration": 2.8598124980926514 + }, + { + "auxiliary_loss_clip": 0.01214268, + "auxiliary_loss_mlp": 0.01003543, + "balance_loss_clip": 1.16671109, + "balance_loss_mlp": 1.00101531, + "epoch": 0.46421163384939124, + "flos": 58928435497800.0, + "grad_norm": 0.7167767574831819, + "language_loss": 0.55144715, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57362533, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.02526855, + "step": 7721, + "time_per_iteration": 3.2958712577819824 + }, + { + "auxiliary_loss_clip": 0.0139065, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.26768541, + "balance_loss_mlp": 1.02375913, + "epoch": 0.4642717571020592, + "flos": 20162220082200.0, + "grad_norm": 1.7600727971494674, + "language_loss": 0.79606575, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.82035732, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14746094, + "step": 7722, + "time_per_iteration": 2.7981715202331543 + }, + { + "auxiliary_loss_clip": 0.01391553, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.26416266, + "balance_loss_mlp": 1.02028942, + "epoch": 0.46433188035472717, + "flos": 25051668255360.0, + "grad_norm": 1.475862674601044, + "language_loss": 0.77722228, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80148607, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14538574, + "step": 7723, + "time_per_iteration": 4.301342964172363 + }, + { + "auxiliary_loss_clip": 0.01384328, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.26284242, + "balance_loss_mlp": 1.02401745, + "epoch": 0.4643920036073952, + "flos": 28081672889760.0, + "grad_norm": 1.54384139605223, + "language_loss": 0.68442202, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70864964, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.144104, + "step": 7724, + "time_per_iteration": 2.888577938079834 + }, + { + "auxiliary_loss_clip": 0.01388639, + "auxiliary_loss_mlp": 0.01034668, + "balance_loss_clip": 1.26568246, + "balance_loss_mlp": 1.02042854, + "epoch": 0.46445212686006315, + "flos": 18373969644480.0, + "grad_norm": 3.1035345297239783, + "language_loss": 0.67224276, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69647586, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14245605, + "step": 7725, + "time_per_iteration": 4.162615537643433 + }, + { + "auxiliary_loss_clip": 0.01378734, + "auxiliary_loss_mlp": 0.01038715, + "balance_loss_clip": 1.25894856, + "balance_loss_mlp": 1.02572751, + "epoch": 0.4645122501127311, + "flos": 31727783140200.0, + "grad_norm": 1.864396824584672, + "language_loss": 0.65148211, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67565656, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.12976074, + "step": 7726, + "time_per_iteration": 2.8907201290130615 + }, + { + "auxiliary_loss_clip": 0.0139065, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.26753104, + "balance_loss_mlp": 1.02067745, + "epoch": 0.4645723733653991, + "flos": 23774073858000.0, + "grad_norm": 2.2243210612318514, + "language_loss": 0.75360447, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77786255, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.14483643, + "step": 7727, + "time_per_iteration": 2.7776756286621094 + }, + { + "auxiliary_loss_clip": 0.01390151, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.26657391, + "balance_loss_mlp": 1.02012169, + "epoch": 0.46463249661806705, + "flos": 33151987009080.0, + "grad_norm": 2.1810355182030254, + "language_loss": 0.7863887, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81064367, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.15234375, + "step": 7728, + "time_per_iteration": 2.8419952392578125 + }, + { + "auxiliary_loss_clip": 0.01391336, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.26726627, + "balance_loss_mlp": 1.01851773, + "epoch": 0.464692619870735, + "flos": 18300583692000.0, + "grad_norm": 1.8466944511044951, + "language_loss": 0.76426792, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78851914, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.15283203, + "step": 7729, + "time_per_iteration": 2.693669080734253 + }, + { + "auxiliary_loss_clip": 0.01393135, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.26898837, + "balance_loss_mlp": 1.01934409, + "epoch": 0.464752743123403, + "flos": 22643210757240.0, + "grad_norm": 4.142072579986189, + "language_loss": 0.79820222, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82247287, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14587402, + "step": 7730, + "time_per_iteration": 2.7747867107391357 + }, + { + "auxiliary_loss_clip": 0.01388964, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.26515365, + "balance_loss_mlp": 1.01736259, + "epoch": 0.46481286637607094, + "flos": 20416959281160.0, + "grad_norm": 1.5956925342985555, + "language_loss": 0.77445984, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79867136, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.1484375, + "step": 7731, + "time_per_iteration": 2.7529900074005127 + }, + { + "auxiliary_loss_clip": 0.01383331, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.26126051, + "balance_loss_mlp": 1.01557612, + "epoch": 0.4648729896287389, + "flos": 34281875509200.0, + "grad_norm": 1.661696331324261, + "language_loss": 0.65982473, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68395686, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.1428833, + "step": 7732, + "time_per_iteration": 2.852496385574341 + }, + { + "auxiliary_loss_clip": 0.01397084, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.26910233, + "balance_loss_mlp": 1.02032757, + "epoch": 0.4649331128814069, + "flos": 21326405665320.0, + "grad_norm": 2.4137314183721443, + "language_loss": 0.73527229, + "learning_rate": 2.32280855998725e-06, + "loss": 0.75961018, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.16381836, + "step": 7733, + "time_per_iteration": 4.279908895492554 + }, + { + "auxiliary_loss_clip": 0.01211181, + "auxiliary_loss_mlp": 0.01004141, + "balance_loss_clip": 1.16315711, + "balance_loss_mlp": 1.00166154, + "epoch": 0.46499323613407484, + "flos": 58321449781560.0, + "grad_norm": 1.216692396692004, + "language_loss": 0.51987123, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54202449, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02478027, + "step": 7734, + "time_per_iteration": 3.2462337017059326 + }, + { + "auxiliary_loss_clip": 0.01391233, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.2678659, + "balance_loss_mlp": 1.01896441, + "epoch": 0.4650533593867428, + "flos": 10893289434120.0, + "grad_norm": 1.9193035756086352, + "language_loss": 0.75469339, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77893698, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.1416626, + "step": 7735, + "time_per_iteration": 2.848625659942627 + }, + { + "auxiliary_loss_clip": 0.01382361, + "auxiliary_loss_mlp": 0.01042428, + "balance_loss_clip": 1.26080728, + "balance_loss_mlp": 1.02784836, + "epoch": 0.46511348263941077, + "flos": 19979404934760.0, + "grad_norm": 1.822037584971617, + "language_loss": 0.69912279, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72337061, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14569092, + "step": 7736, + "time_per_iteration": 2.827688455581665 + }, + { + "auxiliary_loss_clip": 0.01379612, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.2599113, + "balance_loss_mlp": 1.01847744, + "epoch": 0.46517360589207873, + "flos": 19682612456040.0, + "grad_norm": 1.643348339707758, + "language_loss": 0.72086906, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74498677, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.13684082, + "step": 7737, + "time_per_iteration": 2.749910354614258 + }, + { + "auxiliary_loss_clip": 0.01395299, + "auxiliary_loss_mlp": 0.01040297, + "balance_loss_clip": 1.27091074, + "balance_loss_mlp": 1.02501416, + "epoch": 0.46523372914474675, + "flos": 16876988948520.0, + "grad_norm": 2.0027696455109636, + "language_loss": 0.84198606, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.86634195, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.15283203, + "step": 7738, + "time_per_iteration": 2.7645490169525146 + }, + { + "auxiliary_loss_clip": 0.01210056, + "auxiliary_loss_mlp": 0.01006689, + "balance_loss_clip": 1.16176867, + "balance_loss_mlp": 1.00429344, + "epoch": 0.4652938523974147, + "flos": 53454011339520.0, + "grad_norm": 0.7700598301866831, + "language_loss": 0.57881403, + "learning_rate": 2.320502208946932e-06, + "loss": 0.60098147, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.02392578, + "step": 7739, + "time_per_iteration": 3.342672348022461 + }, + { + "auxiliary_loss_clip": 0.0139347, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.26946425, + "balance_loss_mlp": 1.02531385, + "epoch": 0.4653539756500827, + "flos": 15235469807400.0, + "grad_norm": 2.7074826969954335, + "language_loss": 0.84848416, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87282342, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.15118408, + "step": 7740, + "time_per_iteration": 2.7479665279388428 + }, + { + "auxiliary_loss_clip": 0.01384798, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.26370013, + "balance_loss_mlp": 1.02794039, + "epoch": 0.46541409890275065, + "flos": 23737502706840.0, + "grad_norm": 1.8708776204758895, + "language_loss": 0.76000327, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78427851, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14782715, + "step": 7741, + "time_per_iteration": 2.8045578002929688 + }, + { + "auxiliary_loss_clip": 0.01399332, + "auxiliary_loss_mlp": 0.0104032, + "balance_loss_clip": 1.27214885, + "balance_loss_mlp": 1.02585936, + "epoch": 0.4654742221554186, + "flos": 20851874084160.0, + "grad_norm": 1.7036029139423703, + "language_loss": 0.81213772, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83653426, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.14440918, + "step": 7742, + "time_per_iteration": 2.7482850551605225 + }, + { + "auxiliary_loss_clip": 0.01396553, + "auxiliary_loss_mlp": 0.01044514, + "balance_loss_clip": 1.26975167, + "balance_loss_mlp": 1.02917218, + "epoch": 0.4655343454080866, + "flos": 20709812748960.0, + "grad_norm": 1.9584813512451762, + "language_loss": 0.73113829, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.75554895, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15344238, + "step": 7743, + "time_per_iteration": 2.801295757293701 + }, + { + "auxiliary_loss_clip": 0.01391284, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.26798213, + "balance_loss_mlp": 1.0196625, + "epoch": 0.46559446866075455, + "flos": 18994582788480.0, + "grad_norm": 2.64660124536533, + "language_loss": 0.71598738, + "learning_rate": 2.318579915392483e-06, + "loss": 0.74024433, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.14733887, + "step": 7744, + "time_per_iteration": 2.7340810298919678 + }, + { + "auxiliary_loss_clip": 0.01385767, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.26456499, + "balance_loss_mlp": 1.01860261, + "epoch": 0.4656545919134225, + "flos": 34503129792360.0, + "grad_norm": 1.6612663037494986, + "language_loss": 0.85696656, + "learning_rate": 2.31819542038153e-06, + "loss": 0.88114423, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.1340332, + "step": 7745, + "time_per_iteration": 2.878335475921631 + }, + { + "auxiliary_loss_clip": 0.01385715, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.26506066, + "balance_loss_mlp": 1.02294886, + "epoch": 0.4657147151660905, + "flos": 24315169187160.0, + "grad_norm": 1.3011867644050419, + "language_loss": 0.73210561, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75633305, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14093018, + "step": 7746, + "time_per_iteration": 2.7568891048431396 + }, + { + "auxiliary_loss_clip": 0.01383421, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.26343799, + "balance_loss_mlp": 1.02375817, + "epoch": 0.46577483841875844, + "flos": 58804979437440.0, + "grad_norm": 1.543032398897915, + "language_loss": 0.70214558, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72636169, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14416504, + "step": 7747, + "time_per_iteration": 3.103206157684326 + }, + { + "auxiliary_loss_clip": 0.01384952, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.26351893, + "balance_loss_mlp": 1.02040815, + "epoch": 0.4658349616714264, + "flos": 31328302454280.0, + "grad_norm": 1.4120664848940143, + "language_loss": 0.67311817, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69731307, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14129639, + "step": 7748, + "time_per_iteration": 2.8397672176361084 + }, + { + "auxiliary_loss_clip": 0.01397661, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.26975775, + "balance_loss_mlp": 1.02231431, + "epoch": 0.46589508492409437, + "flos": 14863667041800.0, + "grad_norm": 1.8868198711114736, + "language_loss": 0.64474398, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66910535, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.16186523, + "step": 7749, + "time_per_iteration": 2.7166049480438232 + }, + { + "auxiliary_loss_clip": 0.01394054, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.26896739, + "balance_loss_mlp": 1.02202439, + "epoch": 0.46595520817676234, + "flos": 12899829744720.0, + "grad_norm": 1.9440704377010145, + "language_loss": 0.74509454, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76940662, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.15136719, + "step": 7750, + "time_per_iteration": 2.7582147121429443 + }, + { + "auxiliary_loss_clip": 0.01398493, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.27212608, + "balance_loss_mlp": 1.01860619, + "epoch": 0.46601533142943036, + "flos": 32860676658960.0, + "grad_norm": 2.1673142959588683, + "language_loss": 0.74455184, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76887649, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.15380859, + "step": 7751, + "time_per_iteration": 2.898545742034912 + }, + { + "auxiliary_loss_clip": 0.0139729, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.26948595, + "balance_loss_mlp": 1.0211606, + "epoch": 0.4660754546820983, + "flos": 19971405087840.0, + "grad_norm": 1.7758406995850828, + "language_loss": 0.73441637, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.7587558, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.15490723, + "step": 7752, + "time_per_iteration": 2.7335472106933594 + }, + { + "auxiliary_loss_clip": 0.01397207, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.2698667, + "balance_loss_mlp": 1.02532077, + "epoch": 0.4661355779347663, + "flos": 26693877738600.0, + "grad_norm": 2.115679850426568, + "language_loss": 0.69244957, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71682572, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.15087891, + "step": 7753, + "time_per_iteration": 2.784217357635498 + }, + { + "auxiliary_loss_clip": 0.01384767, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.26354074, + "balance_loss_mlp": 1.01782167, + "epoch": 0.46619570118743425, + "flos": 20964348906120.0, + "grad_norm": 1.639423486141154, + "language_loss": 0.73673427, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.76090419, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14404297, + "step": 7754, + "time_per_iteration": 2.7495670318603516 + }, + { + "auxiliary_loss_clip": 0.01399596, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.27361822, + "balance_loss_mlp": 1.01669025, + "epoch": 0.4662558244401022, + "flos": 24431501803320.0, + "grad_norm": 1.4743603303122297, + "language_loss": 0.79193676, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81625152, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.1519165, + "step": 7755, + "time_per_iteration": 4.3170859813690186 + }, + { + "auxiliary_loss_clip": 0.01384913, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.26487958, + "balance_loss_mlp": 1.01884818, + "epoch": 0.4663159476927702, + "flos": 20600424162360.0, + "grad_norm": 1.5238421382366276, + "language_loss": 0.72656119, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.75074285, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14404297, + "step": 7756, + "time_per_iteration": 2.799072742462158 + }, + { + "auxiliary_loss_clip": 0.01384265, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.26231074, + "balance_loss_mlp": 1.02140236, + "epoch": 0.46637607094543815, + "flos": 25666839879120.0, + "grad_norm": 5.068187606622146, + "language_loss": 0.7846086, + "learning_rate": 2.313580543272274e-06, + "loss": 0.8088131, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14782715, + "step": 7757, + "time_per_iteration": 2.8643181324005127 + }, + { + "auxiliary_loss_clip": 0.0138851, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.26557755, + "balance_loss_mlp": 1.02176571, + "epoch": 0.4664361941981061, + "flos": 24278435602560.0, + "grad_norm": 1.8839896072998796, + "language_loss": 0.66852427, + "learning_rate": 2.313195892540705e-06, + "loss": 0.69276685, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.13989258, + "step": 7758, + "time_per_iteration": 2.7782928943634033 + }, + { + "auxiliary_loss_clip": 0.01386332, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.26392925, + "balance_loss_mlp": 1.02170467, + "epoch": 0.4664963174507741, + "flos": 18410500187280.0, + "grad_norm": 2.2547640595051512, + "language_loss": 0.75120926, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.77543432, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14465332, + "step": 7759, + "time_per_iteration": 2.7952558994293213 + }, + { + "auxiliary_loss_clip": 0.01386008, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.26470089, + "balance_loss_mlp": 1.01980686, + "epoch": 0.46655644070344204, + "flos": 22460192568000.0, + "grad_norm": 3.1188876598306488, + "language_loss": 0.77720809, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80140907, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14282227, + "step": 7760, + "time_per_iteration": 2.806399345397949 + }, + { + "auxiliary_loss_clip": 0.01386609, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.26599896, + "balance_loss_mlp": 1.01583171, + "epoch": 0.46661656395611, + "flos": 13812402972600.0, + "grad_norm": 1.5964543905828443, + "language_loss": 0.74217802, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76634789, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.14538574, + "step": 7761, + "time_per_iteration": 2.7626705169677734 + }, + { + "auxiliary_loss_clip": 0.0139333, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.26726174, + "balance_loss_mlp": 1.02201819, + "epoch": 0.466676687208778, + "flos": 21657089143440.0, + "grad_norm": 1.5559193551371575, + "language_loss": 0.78963596, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81395864, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.16906738, + "step": 7762, + "time_per_iteration": 4.184962034225464 + }, + { + "auxiliary_loss_clip": 0.01205334, + "auxiliary_loss_mlp": 0.0100282, + "balance_loss_clip": 1.15744841, + "balance_loss_mlp": 1.0002569, + "epoch": 0.46673681046144594, + "flos": 68549798723760.0, + "grad_norm": 0.80159277828717, + "language_loss": 0.59779155, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61987305, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.02563477, + "step": 7763, + "time_per_iteration": 4.680582284927368 + }, + { + "auxiliary_loss_clip": 0.01392123, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.26627421, + "balance_loss_mlp": 1.01913893, + "epoch": 0.46679693371411396, + "flos": 15818618416320.0, + "grad_norm": 2.153793291026309, + "language_loss": 0.79347241, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.8177464, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.16125488, + "step": 7764, + "time_per_iteration": 2.7574329376220703 + }, + { + "auxiliary_loss_clip": 0.01388423, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.26668024, + "balance_loss_mlp": 1.02208018, + "epoch": 0.4668570569667819, + "flos": 18519117215040.0, + "grad_norm": 1.7221764656607956, + "language_loss": 0.72766352, + "learning_rate": 2.310503005696839e-06, + "loss": 0.75190938, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14093018, + "step": 7765, + "time_per_iteration": 2.7448205947875977 + }, + { + "auxiliary_loss_clip": 0.01395267, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.27067494, + "balance_loss_mlp": 1.02485585, + "epoch": 0.4669171802194499, + "flos": 19211126501880.0, + "grad_norm": 2.033656557984596, + "language_loss": 0.78241372, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.80676293, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.14788818, + "step": 7766, + "time_per_iteration": 2.7510814666748047 + }, + { + "auxiliary_loss_clip": 0.01386172, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.26312244, + "balance_loss_mlp": 1.01968813, + "epoch": 0.46697730347211786, + "flos": 12279947551200.0, + "grad_norm": 2.080558533963307, + "language_loss": 0.6507293, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67493093, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.1428833, + "step": 7767, + "time_per_iteration": 2.7580556869506836 + }, + { + "auxiliary_loss_clip": 0.01391719, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.26914155, + "balance_loss_mlp": 1.01966214, + "epoch": 0.4670374267247858, + "flos": 23592030269400.0, + "grad_norm": 1.9159667595680303, + "language_loss": 0.74416256, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76841921, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.1427002, + "step": 7768, + "time_per_iteration": 2.779956579208374 + }, + { + "auxiliary_loss_clip": 0.01387669, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.26343977, + "balance_loss_mlp": 1.02412724, + "epoch": 0.4670975499774538, + "flos": 15994530142560.0, + "grad_norm": 1.5604192220387703, + "language_loss": 0.71279037, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73704886, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14050293, + "step": 7769, + "time_per_iteration": 2.7009894847869873 + }, + { + "auxiliary_loss_clip": 0.01387068, + "auxiliary_loss_mlp": 0.01037284, + "balance_loss_clip": 1.2631098, + "balance_loss_mlp": 1.02291882, + "epoch": 0.46715767323012175, + "flos": 15381835628760.0, + "grad_norm": 1.6751895018398322, + "language_loss": 0.81278217, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83702564, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14367676, + "step": 7770, + "time_per_iteration": 2.7358498573303223 + }, + { + "auxiliary_loss_clip": 0.0120606, + "auxiliary_loss_mlp": 0.01004347, + "balance_loss_clip": 1.15801477, + "balance_loss_mlp": 1.00215364, + "epoch": 0.4672177964827897, + "flos": 60266705430960.0, + "grad_norm": 0.8300531269453454, + "language_loss": 0.55649287, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57859689, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02197266, + "step": 7771, + "time_per_iteration": 4.770530462265015 + }, + { + "auxiliary_loss_clip": 0.01390458, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.2685976, + "balance_loss_mlp": 1.02715218, + "epoch": 0.4672779197354577, + "flos": 27641885083560.0, + "grad_norm": 1.6600707944483244, + "language_loss": 0.66083461, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.68515199, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14141846, + "step": 7772, + "time_per_iteration": 2.8202314376831055 + }, + { + "auxiliary_loss_clip": 0.01384833, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.26445115, + "balance_loss_mlp": 1.02406955, + "epoch": 0.46733804298812565, + "flos": 31400510764320.0, + "grad_norm": 1.9704806708250124, + "language_loss": 0.63888371, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.66311419, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.14123535, + "step": 7773, + "time_per_iteration": 2.858264923095703 + }, + { + "auxiliary_loss_clip": 0.01388128, + "auxiliary_loss_mlp": 0.01039541, + "balance_loss_clip": 1.26419878, + "balance_loss_mlp": 1.02482426, + "epoch": 0.4673981662407936, + "flos": 19505401262280.0, + "grad_norm": 2.347343036616811, + "language_loss": 0.80155444, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82583117, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14715576, + "step": 7774, + "time_per_iteration": 2.733593463897705 + }, + { + "auxiliary_loss_clip": 0.01392492, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.26746476, + "balance_loss_mlp": 1.01987469, + "epoch": 0.4674582894934616, + "flos": 20526713343000.0, + "grad_norm": 1.5124024635253401, + "language_loss": 0.77983254, + "learning_rate": 2.306655024915726e-06, + "loss": 0.80410397, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14794922, + "step": 7775, + "time_per_iteration": 2.8008880615234375 + }, + { + "auxiliary_loss_clip": 0.01385187, + "auxiliary_loss_mlp": 0.01036498, + "balance_loss_clip": 1.26385713, + "balance_loss_mlp": 1.02244306, + "epoch": 0.46751841274612954, + "flos": 22096064782440.0, + "grad_norm": 1.7763933425713105, + "language_loss": 0.70141309, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72562993, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14050293, + "step": 7776, + "time_per_iteration": 2.785327672958374 + }, + { + "auxiliary_loss_clip": 0.01383338, + "auxiliary_loss_mlp": 0.01041034, + "balance_loss_clip": 1.26172876, + "balance_loss_mlp": 1.02745581, + "epoch": 0.46757853599879756, + "flos": 26985837822480.0, + "grad_norm": 1.9955772225263424, + "language_loss": 0.74041176, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.76465547, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.13568115, + "step": 7777, + "time_per_iteration": 2.929382562637329 + }, + { + "auxiliary_loss_clip": 0.01386224, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_clip": 1.26323164, + "balance_loss_mlp": 1.02933121, + "epoch": 0.4676386592514655, + "flos": 24139419894360.0, + "grad_norm": 2.6187119607739575, + "language_loss": 0.69991237, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.72420591, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.13793945, + "step": 7778, + "time_per_iteration": 2.8208634853363037 + }, + { + "auxiliary_loss_clip": 0.01397795, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_clip": 1.27301574, + "balance_loss_mlp": 1.03076255, + "epoch": 0.4676987825041335, + "flos": 25489222601760.0, + "grad_norm": 1.5941761475186547, + "language_loss": 0.74088097, + "learning_rate": 2.305115506191206e-06, + "loss": 0.76531291, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.14611816, + "step": 7779, + "time_per_iteration": 2.8029263019561768 + }, + { + "auxiliary_loss_clip": 0.01383082, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.26303291, + "balance_loss_mlp": 1.02806127, + "epoch": 0.46775890575680146, + "flos": 21950429911560.0, + "grad_norm": 1.4620413977567737, + "language_loss": 0.72301388, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74726105, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.13592529, + "step": 7780, + "time_per_iteration": 2.83439564704895 + }, + { + "auxiliary_loss_clip": 0.01400335, + "auxiliary_loss_mlp": 0.01044404, + "balance_loss_clip": 1.27230179, + "balance_loss_mlp": 1.02992034, + "epoch": 0.4678190290094694, + "flos": 25233874277400.0, + "grad_norm": 2.629244198908891, + "language_loss": 0.74437129, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76881874, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.14477539, + "step": 7781, + "time_per_iteration": 2.835522413253784 + }, + { + "auxiliary_loss_clip": 0.01398361, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.27143085, + "balance_loss_mlp": 1.02319002, + "epoch": 0.4678791522621374, + "flos": 32274523031400.0, + "grad_norm": 1.79054126197256, + "language_loss": 0.63346183, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65782607, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.14880371, + "step": 7782, + "time_per_iteration": 2.9447028636932373 + }, + { + "auxiliary_loss_clip": 0.01395499, + "auxiliary_loss_mlp": 0.01049017, + "balance_loss_clip": 1.2684437, + "balance_loss_mlp": 1.03444314, + "epoch": 0.46793927551480535, + "flos": 27051386361480.0, + "grad_norm": 1.9970282609900065, + "language_loss": 0.63497525, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65942037, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.14569092, + "step": 7783, + "time_per_iteration": 2.897850275039673 + }, + { + "auxiliary_loss_clip": 0.01399977, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.27058339, + "balance_loss_mlp": 1.02741039, + "epoch": 0.4679993987674733, + "flos": 17461924325280.0, + "grad_norm": 2.6056445865485203, + "language_loss": 0.68036091, + "learning_rate": 2.303190847569801e-06, + "loss": 0.70479393, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.15899658, + "step": 7784, + "time_per_iteration": 2.8112194538116455 + }, + { + "auxiliary_loss_clip": 0.0138698, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.26458216, + "balance_loss_mlp": 1.0220598, + "epoch": 0.4680595220201413, + "flos": 17169558157800.0, + "grad_norm": 1.9127205500327196, + "language_loss": 0.85108089, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.87530649, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.13519287, + "step": 7785, + "time_per_iteration": 2.7196030616760254 + }, + { + "auxiliary_loss_clip": 0.0138945, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.26462233, + "balance_loss_mlp": 1.02519488, + "epoch": 0.46811964527280925, + "flos": 11331290472480.0, + "grad_norm": 1.7576692002040333, + "language_loss": 0.77659017, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.80088621, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14959717, + "step": 7786, + "time_per_iteration": 2.7210564613342285 + }, + { + "auxiliary_loss_clip": 0.01378718, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.2594974, + "balance_loss_mlp": 1.0152837, + "epoch": 0.4681797685254772, + "flos": 24284039556240.0, + "grad_norm": 1.8571183091621073, + "language_loss": 0.74144012, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76551497, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.1348877, + "step": 7787, + "time_per_iteration": 2.7510485649108887 + }, + { + "auxiliary_loss_clip": 0.01389131, + "auxiliary_loss_mlp": 0.01040877, + "balance_loss_clip": 1.2653867, + "balance_loss_mlp": 1.02698886, + "epoch": 0.4682398917781452, + "flos": 31656752472600.0, + "grad_norm": 1.652195874464723, + "language_loss": 0.65864766, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.6829477, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.13885498, + "step": 7788, + "time_per_iteration": 2.8641517162323 + }, + { + "auxiliary_loss_clip": 0.01382318, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.26067615, + "balance_loss_mlp": 1.01982343, + "epoch": 0.46830001503081314, + "flos": 28116619706520.0, + "grad_norm": 1.4239908263348986, + "language_loss": 0.64240766, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66655731, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.12823486, + "step": 7789, + "time_per_iteration": 2.8087618350982666 + }, + { + "auxiliary_loss_clip": 0.01203261, + "auxiliary_loss_mlp": 0.0100284, + "balance_loss_clip": 1.1554482, + "balance_loss_mlp": 1.00031269, + "epoch": 0.4683601382834811, + "flos": 57896037334800.0, + "grad_norm": 0.714451100017156, + "language_loss": 0.61905372, + "learning_rate": 2.300880877982825e-06, + "loss": 0.64111471, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.02526855, + "step": 7790, + "time_per_iteration": 3.3503360748291016 + }, + { + "auxiliary_loss_clip": 0.01382818, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.26252937, + "balance_loss_mlp": 1.01891661, + "epoch": 0.46842026153614913, + "flos": 21877003350720.0, + "grad_norm": 1.515077796605691, + "language_loss": 0.7906549, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81481481, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.14263916, + "step": 7791, + "time_per_iteration": 2.7602365016937256 + }, + { + "auxiliary_loss_clip": 0.01383374, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.26218092, + "balance_loss_mlp": 1.02055836, + "epoch": 0.4684803847888171, + "flos": 24906358251360.0, + "grad_norm": 1.5924727375590315, + "language_loss": 0.74960226, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77377951, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.13806152, + "step": 7792, + "time_per_iteration": 2.8257858753204346 + }, + { + "auxiliary_loss_clip": 0.0137605, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.25684083, + "balance_loss_mlp": 1.02250993, + "epoch": 0.46854050804148506, + "flos": 26257501034640.0, + "grad_norm": 1.5720173273733469, + "language_loss": 0.68198645, + "learning_rate": 2.299725738964898e-06, + "loss": 0.7061038, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13183594, + "step": 7793, + "time_per_iteration": 2.8087475299835205 + }, + { + "auxiliary_loss_clip": 0.01380678, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.25948596, + "balance_loss_mlp": 1.01758528, + "epoch": 0.468600631294153, + "flos": 21584799616680.0, + "grad_norm": 1.5186033335759657, + "language_loss": 0.7425428, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76666081, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.13531494, + "step": 7794, + "time_per_iteration": 4.197694540023804 + }, + { + "auxiliary_loss_clip": 0.01388551, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.26798344, + "balance_loss_mlp": 1.0252924, + "epoch": 0.468660754546821, + "flos": 25891261614360.0, + "grad_norm": 1.6587085068659695, + "language_loss": 0.64163184, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.66591513, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14483643, + "step": 7795, + "time_per_iteration": 2.8980026245117188 + }, + { + "auxiliary_loss_clip": 0.01377535, + "auxiliary_loss_mlp": 0.01029367, + "balance_loss_clip": 1.25820875, + "balance_loss_mlp": 1.01569414, + "epoch": 0.46872087779948896, + "flos": 35481048517440.0, + "grad_norm": 1.8150843372006022, + "language_loss": 0.67911518, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70318419, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13671875, + "step": 7796, + "time_per_iteration": 2.9128530025482178 + }, + { + "auxiliary_loss_clip": 0.01380944, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.25856137, + "balance_loss_mlp": 1.0143981, + "epoch": 0.4687810010521569, + "flos": 26402120696520.0, + "grad_norm": 1.660961984246654, + "language_loss": 0.70361161, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.7277016, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.13653564, + "step": 7797, + "time_per_iteration": 2.825673818588257 + }, + { + "auxiliary_loss_clip": 0.01387819, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.26512706, + "balance_loss_mlp": 1.01713538, + "epoch": 0.4688411243048249, + "flos": 19977333908400.0, + "grad_norm": 2.1396510999276797, + "language_loss": 0.6718539, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69605064, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14697266, + "step": 7798, + "time_per_iteration": 2.7868943214416504 + }, + { + "auxiliary_loss_clip": 0.01200244, + "auxiliary_loss_mlp": 0.01010425, + "balance_loss_clip": 1.15289211, + "balance_loss_mlp": 1.00816023, + "epoch": 0.46890124755749285, + "flos": 63992414712960.0, + "grad_norm": 0.9106420359732729, + "language_loss": 0.64586622, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66797292, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.02270508, + "step": 7799, + "time_per_iteration": 3.4024240970611572 + }, + { + "auxiliary_loss_clip": 0.01376474, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.25634551, + "balance_loss_mlp": 1.01338148, + "epoch": 0.4689613708101608, + "flos": 23774276899800.0, + "grad_norm": 1.2872776115722113, + "language_loss": 0.72743058, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.75146616, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.13696289, + "step": 7800, + "time_per_iteration": 2.828841209411621 + }, + { + "auxiliary_loss_clip": 0.01379005, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.26078463, + "balance_loss_mlp": 1.01966012, + "epoch": 0.4690214940628288, + "flos": 24793883429400.0, + "grad_norm": 1.731012459240698, + "language_loss": 0.72686315, + "learning_rate": 2.296644869233568e-06, + "loss": 0.7509765, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.12670898, + "step": 7801, + "time_per_iteration": 5.719025135040283 + }, + { + "auxiliary_loss_clip": 0.01392545, + "auxiliary_loss_mlp": 0.01041269, + "balance_loss_clip": 1.26569819, + "balance_loss_mlp": 1.02635598, + "epoch": 0.46908161731549675, + "flos": 18081887735520.0, + "grad_norm": 2.02157470117435, + "language_loss": 0.63260251, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.6569407, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.14923096, + "step": 7802, + "time_per_iteration": 2.713799238204956 + }, + { + "auxiliary_loss_clip": 0.01390451, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.26737046, + "balance_loss_mlp": 1.02817142, + "epoch": 0.4691417405681647, + "flos": 25708974375600.0, + "grad_norm": 1.7572072978163742, + "language_loss": 0.73903942, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76336688, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14117432, + "step": 7803, + "time_per_iteration": 2.782299280166626 + }, + { + "auxiliary_loss_clip": 0.01376791, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.25629175, + "balance_loss_mlp": 1.02446365, + "epoch": 0.46920186382083273, + "flos": 17461518241680.0, + "grad_norm": 3.4270666752639984, + "language_loss": 0.77490962, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.7990582, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13598633, + "step": 7804, + "time_per_iteration": 2.7034969329833984 + }, + { + "auxiliary_loss_clip": 0.01377969, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.25993156, + "balance_loss_mlp": 1.01455522, + "epoch": 0.4692619870735007, + "flos": 20344304279160.0, + "grad_norm": 1.7565383010820323, + "language_loss": 0.77325499, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79731488, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13476562, + "step": 7805, + "time_per_iteration": 2.7752745151519775 + }, + { + "auxiliary_loss_clip": 0.01395363, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.26809585, + "balance_loss_mlp": 1.03090096, + "epoch": 0.46932211032616866, + "flos": 29502506264760.0, + "grad_norm": 1.6460706424297789, + "language_loss": 0.82857871, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85299683, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15551758, + "step": 7806, + "time_per_iteration": 2.870241403579712 + }, + { + "auxiliary_loss_clip": 0.01389453, + "auxiliary_loss_mlp": 0.01042517, + "balance_loss_clip": 1.2657938, + "balance_loss_mlp": 1.02821231, + "epoch": 0.4693822335788366, + "flos": 36218237927760.0, + "grad_norm": 1.9566817949477848, + "language_loss": 0.77633947, + "learning_rate": 2.294333744076472e-06, + "loss": 0.80065924, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.14306641, + "step": 7807, + "time_per_iteration": 2.970170736312866 + }, + { + "auxiliary_loss_clip": 0.01390089, + "auxiliary_loss_mlp": 0.01044792, + "balance_loss_clip": 1.26799834, + "balance_loss_mlp": 1.03055894, + "epoch": 0.4694423568315046, + "flos": 20343776370480.0, + "grad_norm": 1.8492067747944894, + "language_loss": 0.51513219, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53948104, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.14239502, + "step": 7808, + "time_per_iteration": 2.8346447944641113 + }, + { + "auxiliary_loss_clip": 0.01198086, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.15008307, + "balance_loss_mlp": 1.02865243, + "epoch": 0.46950248008417256, + "flos": 64339771245840.0, + "grad_norm": 0.8419352438165443, + "language_loss": 0.5778476, + "learning_rate": 2.293563279578978e-06, + "loss": 0.60013705, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02209473, + "step": 7809, + "time_per_iteration": 3.0956270694732666 + }, + { + "auxiliary_loss_clip": 0.01388488, + "auxiliary_loss_mlp": 0.0104434, + "balance_loss_clip": 1.26434302, + "balance_loss_mlp": 1.03003454, + "epoch": 0.4695626033368405, + "flos": 19203735780360.0, + "grad_norm": 2.0969690501209897, + "language_loss": 0.72419429, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.74852252, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14312744, + "step": 7810, + "time_per_iteration": 2.768331289291382 + }, + { + "auxiliary_loss_clip": 0.01388108, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.26486456, + "balance_loss_mlp": 1.02826571, + "epoch": 0.4696227265895085, + "flos": 23007582192960.0, + "grad_norm": 2.232192088433093, + "language_loss": 0.81537378, + "learning_rate": 2.29279277055369e-06, + "loss": 0.8396771, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.13970947, + "step": 7811, + "time_per_iteration": 4.288309335708618 + }, + { + "auxiliary_loss_clip": 0.01394004, + "auxiliary_loss_mlp": 0.01042948, + "balance_loss_clip": 1.27120531, + "balance_loss_mlp": 1.02886343, + "epoch": 0.46968284984217645, + "flos": 21875785099920.0, + "grad_norm": 1.9606157845452539, + "language_loss": 0.80717814, + "learning_rate": 2.292407499379644e-06, + "loss": 0.83154762, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14074707, + "step": 7812, + "time_per_iteration": 2.76420521736145 + }, + { + "auxiliary_loss_clip": 0.01386987, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.26708806, + "balance_loss_mlp": 1.02829993, + "epoch": 0.4697429730948444, + "flos": 19979851626720.0, + "grad_norm": 1.6549483339634137, + "language_loss": 0.74139655, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76568168, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.13232422, + "step": 7813, + "time_per_iteration": 2.7545762062072754 + }, + { + "auxiliary_loss_clip": 0.01388417, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.26704788, + "balance_loss_mlp": 1.02560759, + "epoch": 0.4698030963475124, + "flos": 15160094045280.0, + "grad_norm": 6.975821593025779, + "language_loss": 0.84643352, + "learning_rate": 2.291636923781798e-06, + "loss": 0.87072086, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14733887, + "step": 7814, + "time_per_iteration": 2.697709798812866 + }, + { + "auxiliary_loss_clip": 0.01378685, + "auxiliary_loss_mlp": 0.01041099, + "balance_loss_clip": 1.26025295, + "balance_loss_mlp": 1.02790856, + "epoch": 0.46986321960018035, + "flos": 15154408874880.0, + "grad_norm": 1.7624677997898226, + "language_loss": 0.81654584, + "learning_rate": 2.291251619387217e-06, + "loss": 0.84074366, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13189697, + "step": 7815, + "time_per_iteration": 2.709482192993164 + }, + { + "auxiliary_loss_clip": 0.01388526, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.26619744, + "balance_loss_mlp": 1.02510941, + "epoch": 0.4699233428528483, + "flos": 23113681502400.0, + "grad_norm": 1.9248644676034168, + "language_loss": 0.78104258, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.8053211, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14208984, + "step": 7816, + "time_per_iteration": 2.7439217567443848 + }, + { + "auxiliary_loss_clip": 0.01200764, + "auxiliary_loss_mlp": 0.01007878, + "balance_loss_clip": 1.15260148, + "balance_loss_mlp": 1.00552964, + "epoch": 0.46998346610551633, + "flos": 68122234033920.0, + "grad_norm": 0.8381626012625324, + "language_loss": 0.59102917, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61311555, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.0234375, + "step": 7817, + "time_per_iteration": 3.3059144020080566 + }, + { + "auxiliary_loss_clip": 0.01378386, + "auxiliary_loss_mlp": 0.01039057, + "balance_loss_clip": 1.26141346, + "balance_loss_mlp": 1.02481103, + "epoch": 0.4700435893581843, + "flos": 24134222024280.0, + "grad_norm": 1.8229567983727135, + "language_loss": 0.79592085, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.8200953, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.14245605, + "step": 7818, + "time_per_iteration": 2.868603467941284 + }, + { + "auxiliary_loss_clip": 0.01382596, + "auxiliary_loss_mlp": 0.01037791, + "balance_loss_clip": 1.26159692, + "balance_loss_mlp": 1.02405775, + "epoch": 0.47010371261085226, + "flos": 20154504493800.0, + "grad_norm": 1.8340032657255245, + "language_loss": 0.83829117, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86249506, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.13739014, + "step": 7819, + "time_per_iteration": 2.74753999710083 + }, + { + "auxiliary_loss_clip": 0.01391761, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.26856732, + "balance_loss_mlp": 1.01882672, + "epoch": 0.47016383586352023, + "flos": 15126406087680.0, + "grad_norm": 3.770750774483199, + "language_loss": 0.76581681, + "learning_rate": 2.289324932042186e-06, + "loss": 0.79007524, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.15246582, + "step": 7820, + "time_per_iteration": 2.7760610580444336 + }, + { + "auxiliary_loss_clip": 0.01381128, + "auxiliary_loss_mlp": 0.01042349, + "balance_loss_clip": 1.26241422, + "balance_loss_mlp": 1.02877665, + "epoch": 0.4702239591161882, + "flos": 13556932823160.0, + "grad_norm": 1.9150503864541386, + "language_loss": 0.74558765, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76982242, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.13562012, + "step": 7821, + "time_per_iteration": 2.7176411151885986 + }, + { + "auxiliary_loss_clip": 0.01383282, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.26463687, + "balance_loss_mlp": 1.02656794, + "epoch": 0.47028408236885616, + "flos": 24281440621200.0, + "grad_norm": 1.789255733323864, + "language_loss": 0.89414555, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91837484, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.13079834, + "step": 7822, + "time_per_iteration": 2.8009424209594727 + }, + { + "auxiliary_loss_clip": 0.01385259, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.26620066, + "balance_loss_mlp": 1.0182128, + "epoch": 0.4703442056215241, + "flos": 22862109755520.0, + "grad_norm": 1.5256718253900945, + "language_loss": 0.7979635, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.82212466, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.12634277, + "step": 7823, + "time_per_iteration": 2.7641408443450928 + }, + { + "auxiliary_loss_clip": 0.01196163, + "auxiliary_loss_mlp": 0.01001713, + "balance_loss_clip": 1.14864874, + "balance_loss_mlp": 0.99932843, + "epoch": 0.4704043288741921, + "flos": 69257523445920.0, + "grad_norm": 0.7063077624015578, + "language_loss": 0.56714201, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58912075, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.02380371, + "step": 7824, + "time_per_iteration": 3.3487191200256348 + }, + { + "auxiliary_loss_clip": 0.01386611, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.26453888, + "balance_loss_mlp": 1.02205467, + "epoch": 0.47046445212686006, + "flos": 18045682059600.0, + "grad_norm": 2.2581633885485357, + "language_loss": 0.80955029, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83378696, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.15002441, + "step": 7825, + "time_per_iteration": 2.7866601943969727 + }, + { + "auxiliary_loss_clip": 0.01387845, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.26611578, + "balance_loss_mlp": 1.01609373, + "epoch": 0.470524575379528, + "flos": 23956767180360.0, + "grad_norm": 2.0471358943289655, + "language_loss": 0.67046511, + "learning_rate": 2.287012545338324e-06, + "loss": 0.69464713, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14263916, + "step": 7826, + "time_per_iteration": 2.7777152061462402 + }, + { + "auxiliary_loss_clip": 0.01388618, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.26505172, + "balance_loss_mlp": 1.02061415, + "epoch": 0.470584698632196, + "flos": 18118093411440.0, + "grad_norm": 1.7454079854877262, + "language_loss": 0.84453607, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.8687669, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.13848877, + "step": 7827, + "time_per_iteration": 2.8055419921875 + }, + { + "auxiliary_loss_clip": 0.01193594, + "auxiliary_loss_mlp": 0.01004529, + "balance_loss_clip": 1.14647818, + "balance_loss_mlp": 1.00209713, + "epoch": 0.47064482188486395, + "flos": 57264906625560.0, + "grad_norm": 0.8211166769960141, + "language_loss": 0.55673349, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57871473, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.02429199, + "step": 7828, + "time_per_iteration": 3.2115318775177 + }, + { + "auxiliary_loss_clip": 0.0138145, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.26117599, + "balance_loss_mlp": 1.01556563, + "epoch": 0.4707049451375319, + "flos": 17899641105120.0, + "grad_norm": 1.9403171714363125, + "language_loss": 0.81836903, + "learning_rate": 2.285856204861245e-06, + "loss": 0.84247291, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13378906, + "step": 7829, + "time_per_iteration": 2.796151876449585 + }, + { + "auxiliary_loss_clip": 0.01378396, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.25917006, + "balance_loss_mlp": 1.01638305, + "epoch": 0.47076506839019994, + "flos": 25238706672240.0, + "grad_norm": 1.2873783432729815, + "language_loss": 0.76051068, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78458482, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.12634277, + "step": 7830, + "time_per_iteration": 2.86566162109375 + }, + { + "auxiliary_loss_clip": 0.01378293, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.26116347, + "balance_loss_mlp": 1.02068031, + "epoch": 0.4708251916428679, + "flos": 13483587479040.0, + "grad_norm": 2.258083215041965, + "language_loss": 0.78570426, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80983317, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13922119, + "step": 7831, + "time_per_iteration": 2.7642955780029297 + }, + { + "auxiliary_loss_clip": 0.01395992, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.2696023, + "balance_loss_mlp": 1.02353799, + "epoch": 0.47088531489553587, + "flos": 30153152613960.0, + "grad_norm": 1.6717096033743413, + "language_loss": 0.76138484, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78572339, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.14312744, + "step": 7832, + "time_per_iteration": 2.824925184249878 + }, + { + "auxiliary_loss_clip": 0.01373738, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.2553159, + "balance_loss_mlp": 1.01778245, + "epoch": 0.47094543814820383, + "flos": 21803251923000.0, + "grad_norm": 1.3287834940829462, + "language_loss": 0.74992573, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.77397025, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.12921143, + "step": 7833, + "time_per_iteration": 4.164145231246948 + }, + { + "auxiliary_loss_clip": 0.01382006, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.26242745, + "balance_loss_mlp": 1.01958406, + "epoch": 0.4710055614008718, + "flos": 23008231926720.0, + "grad_norm": 1.534626258164241, + "language_loss": 0.75849789, + "learning_rate": 2.283928754133762e-06, + "loss": 0.78265351, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13977051, + "step": 7834, + "time_per_iteration": 2.8302669525146484 + }, + { + "auxiliary_loss_clip": 0.01378064, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.25956583, + "balance_loss_mlp": 1.02423739, + "epoch": 0.47106568465353976, + "flos": 42749083416960.0, + "grad_norm": 1.346711440749231, + "language_loss": 0.66397887, + "learning_rate": 2.283543231629972e-06, + "loss": 0.6881305, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.12866211, + "step": 7835, + "time_per_iteration": 2.9986093044281006 + }, + { + "auxiliary_loss_clip": 0.01190731, + "auxiliary_loss_mlp": 0.0100482, + "balance_loss_clip": 1.14289582, + "balance_loss_mlp": 1.00243628, + "epoch": 0.4711258079062077, + "flos": 68567365162440.0, + "grad_norm": 0.8835528854347774, + "language_loss": 0.62185913, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64381462, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.02380371, + "step": 7836, + "time_per_iteration": 3.230506181716919 + }, + { + "auxiliary_loss_clip": 0.01390211, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.26386917, + "balance_loss_mlp": 1.01925409, + "epoch": 0.4711859311588757, + "flos": 25452001716840.0, + "grad_norm": 1.5594087217015078, + "language_loss": 0.69948864, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.72372842, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.14520264, + "step": 7837, + "time_per_iteration": 2.814164638519287 + }, + { + "auxiliary_loss_clip": 0.01383383, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.26143575, + "balance_loss_mlp": 1.02405047, + "epoch": 0.47124605441154366, + "flos": 21987041671080.0, + "grad_norm": 1.6840103326227667, + "language_loss": 0.66690314, + "learning_rate": 2.282386599665153e-06, + "loss": 0.6911217, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.14422607, + "step": 7838, + "time_per_iteration": 2.820089101791382 + }, + { + "auxiliary_loss_clip": 0.01385582, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.26190126, + "balance_loss_mlp": 1.01518917, + "epoch": 0.4713061776642116, + "flos": 25418435584320.0, + "grad_norm": 1.7371013665957586, + "language_loss": 0.77929509, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.80344146, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.13861084, + "step": 7839, + "time_per_iteration": 2.926967144012451 + }, + { + "auxiliary_loss_clip": 0.01373286, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.2551322, + "balance_loss_mlp": 1.02184141, + "epoch": 0.4713663009168796, + "flos": 26547958609200.0, + "grad_norm": 1.85573910867909, + "language_loss": 0.72562009, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74970257, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13140869, + "step": 7840, + "time_per_iteration": 5.664055585861206 + }, + { + "auxiliary_loss_clip": 0.01381546, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.26136971, + "balance_loss_mlp": 1.01570249, + "epoch": 0.47142642416954755, + "flos": 23628926287440.0, + "grad_norm": 1.5509973282414589, + "language_loss": 0.75466323, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77877337, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13769531, + "step": 7841, + "time_per_iteration": 2.777651071548462 + }, + { + "auxiliary_loss_clip": 0.01382274, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.26156688, + "balance_loss_mlp": 1.01815724, + "epoch": 0.4714865474222155, + "flos": 22315329255960.0, + "grad_norm": 1.64780841457419, + "language_loss": 0.70123607, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72537446, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13415527, + "step": 7842, + "time_per_iteration": 2.740056037902832 + }, + { + "auxiliary_loss_clip": 0.01381439, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.26133454, + "balance_loss_mlp": 1.02305746, + "epoch": 0.4715466706748835, + "flos": 17827026711480.0, + "grad_norm": 2.081198026985538, + "language_loss": 0.7920965, + "learning_rate": 2.280458665756177e-06, + "loss": 0.81627518, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.13360596, + "step": 7843, + "time_per_iteration": 2.754146099090576 + }, + { + "auxiliary_loss_clip": 0.01377702, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.25740075, + "balance_loss_mlp": 1.02101898, + "epoch": 0.4716067939275515, + "flos": 23664807096480.0, + "grad_norm": 1.5327907701907966, + "language_loss": 0.74573058, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76984793, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13018799, + "step": 7844, + "time_per_iteration": 2.7512929439544678 + }, + { + "auxiliary_loss_clip": 0.0137819, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.25739253, + "balance_loss_mlp": 1.02336717, + "epoch": 0.47166691718021947, + "flos": 17934547313520.0, + "grad_norm": 1.5465396565648193, + "language_loss": 0.79062855, + "learning_rate": 2.279687417645088e-06, + "loss": 0.81478429, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.14019775, + "step": 7845, + "time_per_iteration": 2.747934341430664 + }, + { + "auxiliary_loss_clip": 0.01376363, + "auxiliary_loss_mlp": 0.01031762, + "balance_loss_clip": 1.2579298, + "balance_loss_mlp": 1.01932895, + "epoch": 0.47172704043288743, + "flos": 26620207527600.0, + "grad_norm": 1.3132513887177129, + "language_loss": 0.73273575, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75681698, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.12438965, + "step": 7846, + "time_per_iteration": 2.7888500690460205 + }, + { + "auxiliary_loss_clip": 0.01370576, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.25419044, + "balance_loss_mlp": 1.02039504, + "epoch": 0.4717871636855554, + "flos": 27927794521800.0, + "grad_norm": 1.271783456259162, + "language_loss": 0.74268544, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76672602, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.1307373, + "step": 7847, + "time_per_iteration": 2.7707386016845703 + }, + { + "auxiliary_loss_clip": 0.0138009, + "auxiliary_loss_mlp": 0.01040114, + "balance_loss_clip": 1.25984359, + "balance_loss_mlp": 1.02704847, + "epoch": 0.47184728693822336, + "flos": 14506402069080.0, + "grad_norm": 1.81795462562802, + "language_loss": 0.80916905, + "learning_rate": 2.278530465971703e-06, + "loss": 0.83337116, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.1305542, + "step": 7848, + "time_per_iteration": 4.184573411941528 + }, + { + "auxiliary_loss_clip": 0.01387585, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.26632428, + "balance_loss_mlp": 1.01941514, + "epoch": 0.47190741019089133, + "flos": 17860836494160.0, + "grad_norm": 1.8623962850645408, + "language_loss": 0.70395136, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72815561, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.13439941, + "step": 7849, + "time_per_iteration": 2.7505035400390625 + }, + { + "auxiliary_loss_clip": 0.01388119, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.26340222, + "balance_loss_mlp": 1.0253737, + "epoch": 0.4719675334435593, + "flos": 17900087797080.0, + "grad_norm": 1.849895163118929, + "language_loss": 0.69943392, + "learning_rate": 2.277759112022224e-06, + "loss": 0.72371799, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.14904785, + "step": 7850, + "time_per_iteration": 2.7564682960510254 + }, + { + "auxiliary_loss_clip": 0.01386984, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.26386714, + "balance_loss_mlp": 1.02021098, + "epoch": 0.47202765669622726, + "flos": 20709284840280.0, + "grad_norm": 1.8344421105937019, + "language_loss": 0.75379092, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.77800775, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.14501953, + "step": 7851, + "time_per_iteration": 2.9177684783935547 + }, + { + "auxiliary_loss_clip": 0.01387024, + "auxiliary_loss_mlp": 0.01040648, + "balance_loss_clip": 1.26359701, + "balance_loss_mlp": 1.02611089, + "epoch": 0.4720877799488952, + "flos": 16364302490160.0, + "grad_norm": 1.7613255653776372, + "language_loss": 0.76595938, + "learning_rate": 2.276987715942132e-06, + "loss": 0.79023612, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.14538574, + "step": 7852, + "time_per_iteration": 2.8178060054779053 + }, + { + "auxiliary_loss_clip": 0.01375777, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.25541377, + "balance_loss_mlp": 1.0205121, + "epoch": 0.4721479032015632, + "flos": 20673119772720.0, + "grad_norm": 1.6208923428044466, + "language_loss": 0.69655412, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.72065806, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.14111328, + "step": 7853, + "time_per_iteration": 2.863424777984619 + }, + { + "auxiliary_loss_clip": 0.01200925, + "auxiliary_loss_mlp": 0.01005983, + "balance_loss_clip": 1.15292144, + "balance_loss_mlp": 1.00362289, + "epoch": 0.47220802645423116, + "flos": 67768339555080.0, + "grad_norm": 0.6889926085335027, + "language_loss": 0.50179321, + "learning_rate": 2.276216277848432e-06, + "loss": 0.5238623, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02355957, + "step": 7854, + "time_per_iteration": 3.399444580078125 + }, + { + "auxiliary_loss_clip": 0.01387934, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.26561832, + "balance_loss_mlp": 1.01478148, + "epoch": 0.4722681497068991, + "flos": 20926153420560.0, + "grad_norm": 1.7634591324695728, + "language_loss": 0.63692778, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.661098, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.1428833, + "step": 7855, + "time_per_iteration": 2.780148983001709 + }, + { + "auxiliary_loss_clip": 0.01382016, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.25978494, + "balance_loss_mlp": 1.01972246, + "epoch": 0.4723282729595671, + "flos": 28298947553640.0, + "grad_norm": 1.9761166342644538, + "language_loss": 0.75739104, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7815547, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.14611816, + "step": 7856, + "time_per_iteration": 2.8472073078155518 + }, + { + "auxiliary_loss_clip": 0.01379944, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.26075912, + "balance_loss_mlp": 1.02298021, + "epoch": 0.4723883962122351, + "flos": 27130701134520.0, + "grad_norm": 4.395258604175369, + "language_loss": 0.74854803, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.77270806, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.13067627, + "step": 7857, + "time_per_iteration": 2.829941511154175 + }, + { + "auxiliary_loss_clip": 0.01372648, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.25485158, + "balance_loss_mlp": 1.02296329, + "epoch": 0.47244851946490307, + "flos": 31543140616560.0, + "grad_norm": 1.4091262653853613, + "language_loss": 0.64998901, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.67407334, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.12823486, + "step": 7858, + "time_per_iteration": 2.8632419109344482 + }, + { + "auxiliary_loss_clip": 0.01368366, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.2509985, + "balance_loss_mlp": 1.02103353, + "epoch": 0.47250864271757104, + "flos": 20891450253960.0, + "grad_norm": 2.5191811231137935, + "language_loss": 0.7049861, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72901642, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.1362915, + "step": 7859, + "time_per_iteration": 2.819915771484375 + }, + { + "auxiliary_loss_clip": 0.01388364, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.26326656, + "balance_loss_mlp": 1.02600551, + "epoch": 0.472568765970239, + "flos": 20526997601520.0, + "grad_norm": 1.7334634311189203, + "language_loss": 0.62638062, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.65066433, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14013672, + "step": 7860, + "time_per_iteration": 2.8290460109710693 + }, + { + "auxiliary_loss_clip": 0.01381399, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.25880647, + "balance_loss_mlp": 1.02968311, + "epoch": 0.47262888922290697, + "flos": 35811528953760.0, + "grad_norm": 2.042403918441267, + "language_loss": 0.7254678, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74972576, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.14715576, + "step": 7861, + "time_per_iteration": 2.8863253593444824 + }, + { + "auxiliary_loss_clip": 0.01378475, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.25704312, + "balance_loss_mlp": 1.02553344, + "epoch": 0.47268901247557493, + "flos": 20672632472400.0, + "grad_norm": 2.0452525216675257, + "language_loss": 0.84966147, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87383997, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.1385498, + "step": 7862, + "time_per_iteration": 2.766456365585327 + }, + { + "auxiliary_loss_clip": 0.01381106, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.25754237, + "balance_loss_mlp": 1.02479029, + "epoch": 0.4727491357282429, + "flos": 19578056264280.0, + "grad_norm": 1.6911254134977627, + "language_loss": 0.8459897, + "learning_rate": 2.272744289645927e-06, + "loss": 0.870179, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.13018799, + "step": 7863, + "time_per_iteration": 2.753786325454712 + }, + { + "auxiliary_loss_clip": 0.0137831, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.25762606, + "balance_loss_mlp": 1.02691221, + "epoch": 0.47280925898091086, + "flos": 18221025268800.0, + "grad_norm": 1.78626636395158, + "language_loss": 0.65713525, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68132031, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.13287354, + "step": 7864, + "time_per_iteration": 2.719395399093628 + }, + { + "auxiliary_loss_clip": 0.01375854, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.25557184, + "balance_loss_mlp": 1.02567625, + "epoch": 0.4728693822335788, + "flos": 17826336369360.0, + "grad_norm": 1.7014719322190224, + "language_loss": 0.65144181, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67559409, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13684082, + "step": 7865, + "time_per_iteration": 2.735562801361084 + }, + { + "auxiliary_loss_clip": 0.01369403, + "auxiliary_loss_mlp": 0.01040603, + "balance_loss_clip": 1.25128818, + "balance_loss_mlp": 1.02714396, + "epoch": 0.4729295054862468, + "flos": 20599977470400.0, + "grad_norm": 1.7230903011651637, + "language_loss": 0.74307638, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76717645, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13482666, + "step": 7866, + "time_per_iteration": 2.773721694946289 + }, + { + "auxiliary_loss_clip": 0.01382618, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.25844812, + "balance_loss_mlp": 1.02291775, + "epoch": 0.47298962873891476, + "flos": 23373374921280.0, + "grad_norm": 1.7409812419557291, + "language_loss": 0.83196157, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85616255, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14556885, + "step": 7867, + "time_per_iteration": 2.8097610473632812 + }, + { + "auxiliary_loss_clip": 0.013695, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.25097442, + "balance_loss_mlp": 1.0270865, + "epoch": 0.4730497519915827, + "flos": 22057057129680.0, + "grad_norm": 1.9898642688266996, + "language_loss": 0.79871249, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.82281077, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13220215, + "step": 7868, + "time_per_iteration": 2.7714574337005615 + }, + { + "auxiliary_loss_clip": 0.0138423, + "auxiliary_loss_mlp": 0.01040336, + "balance_loss_clip": 1.25868356, + "balance_loss_mlp": 1.02634048, + "epoch": 0.4731098752442507, + "flos": 21074752701720.0, + "grad_norm": 1.837765798860331, + "language_loss": 0.75642002, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.7806657, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.13977051, + "step": 7869, + "time_per_iteration": 2.780276298522949 + }, + { + "auxiliary_loss_clip": 0.01377213, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.25441647, + "balance_loss_mlp": 1.03256297, + "epoch": 0.4731699984969187, + "flos": 22533943995720.0, + "grad_norm": 1.4812733768900672, + "language_loss": 0.7350812, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75932962, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.15063477, + "step": 7870, + "time_per_iteration": 2.77423357963562 + }, + { + "auxiliary_loss_clip": 0.01389543, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.26456189, + "balance_loss_mlp": 1.02600884, + "epoch": 0.4732301217495867, + "flos": 24903312624360.0, + "grad_norm": 1.8523106895645045, + "language_loss": 0.8176595, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.84196198, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14697266, + "step": 7871, + "time_per_iteration": 4.173172473907471 + }, + { + "auxiliary_loss_clip": 0.01376271, + "auxiliary_loss_mlp": 0.01040527, + "balance_loss_clip": 1.25608408, + "balance_loss_mlp": 1.02624595, + "epoch": 0.47329024500225464, + "flos": 22789657795320.0, + "grad_norm": 1.580543814640291, + "language_loss": 0.75972658, + "learning_rate": 2.269271463701879e-06, + "loss": 0.78389454, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14282227, + "step": 7872, + "time_per_iteration": 2.803405284881592 + }, + { + "auxiliary_loss_clip": 0.01379706, + "auxiliary_loss_mlp": 0.01040277, + "balance_loss_clip": 1.25745916, + "balance_loss_mlp": 1.02562618, + "epoch": 0.4733503682549226, + "flos": 38703695522400.0, + "grad_norm": 1.618962031059032, + "language_loss": 0.68045378, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70465356, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14642334, + "step": 7873, + "time_per_iteration": 2.9802427291870117 + }, + { + "auxiliary_loss_clip": 0.01376405, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.25766206, + "balance_loss_mlp": 1.02497363, + "epoch": 0.47341049150759057, + "flos": 22972107467520.0, + "grad_norm": 1.7495323310905742, + "language_loss": 0.72619528, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75034237, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13317871, + "step": 7874, + "time_per_iteration": 2.8285961151123047 + }, + { + "auxiliary_loss_clip": 0.01384218, + "auxiliary_loss_mlp": 0.01042306, + "balance_loss_clip": 1.26141143, + "balance_loss_mlp": 1.02878213, + "epoch": 0.47347061476025853, + "flos": 14542607745000.0, + "grad_norm": 2.247452376202121, + "language_loss": 0.65520465, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67946988, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.13513184, + "step": 7875, + "time_per_iteration": 2.7189042568206787 + }, + { + "auxiliary_loss_clip": 0.01382423, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.26177669, + "balance_loss_mlp": 1.02272987, + "epoch": 0.4735307380129265, + "flos": 30269363405040.0, + "grad_norm": 2.095283228202885, + "language_loss": 0.81338418, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.8375808, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14520264, + "step": 7876, + "time_per_iteration": 2.829224109649658 + }, + { + "auxiliary_loss_clip": 0.01373235, + "auxiliary_loss_mlp": 0.01037512, + "balance_loss_clip": 1.25025213, + "balance_loss_mlp": 1.02351129, + "epoch": 0.47359086126559446, + "flos": 19395809633880.0, + "grad_norm": 1.9824244603438361, + "language_loss": 0.79066688, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81477433, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14007568, + "step": 7877, + "time_per_iteration": 2.758301019668579 + }, + { + "auxiliary_loss_clip": 0.01370892, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.2511965, + "balance_loss_mlp": 1.01877999, + "epoch": 0.47365098451826243, + "flos": 21943770140520.0, + "grad_norm": 1.8447648054310328, + "language_loss": 0.71028239, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.73431349, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13446045, + "step": 7878, + "time_per_iteration": 2.7994587421417236 + }, + { + "auxiliary_loss_clip": 0.01372984, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.25453448, + "balance_loss_mlp": 1.01529551, + "epoch": 0.4737111077709304, + "flos": 25850142326880.0, + "grad_norm": 1.4948634142223038, + "language_loss": 0.75299346, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.7770105, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13427734, + "step": 7879, + "time_per_iteration": 4.37247109413147 + }, + { + "auxiliary_loss_clip": 0.01196483, + "auxiliary_loss_mlp": 0.01003517, + "balance_loss_clip": 1.14827251, + "balance_loss_mlp": 1.0007987, + "epoch": 0.47377123102359836, + "flos": 67775388428520.0, + "grad_norm": 0.7268458127278281, + "language_loss": 0.61253917, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63453913, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02722168, + "step": 7880, + "time_per_iteration": 4.830432176589966 + }, + { + "auxiliary_loss_clip": 0.01373341, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.25436473, + "balance_loss_mlp": 1.02461743, + "epoch": 0.4738313542762663, + "flos": 24321301049520.0, + "grad_norm": 1.4784989237794361, + "language_loss": 0.67910439, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70322788, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14398193, + "step": 7881, + "time_per_iteration": 2.8687846660614014 + }, + { + "auxiliary_loss_clip": 0.01366569, + "auxiliary_loss_mlp": 0.01026486, + "balance_loss_clip": 1.24937987, + "balance_loss_mlp": 1.01342058, + "epoch": 0.4738914775289343, + "flos": 20710381266000.0, + "grad_norm": 1.7801590473716697, + "language_loss": 0.77561045, + "learning_rate": 2.265411798646092e-06, + "loss": 0.799541, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13061523, + "step": 7882, + "time_per_iteration": 2.7770750522613525 + }, + { + "auxiliary_loss_clip": 0.01372521, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.25152671, + "balance_loss_mlp": 1.01857686, + "epoch": 0.4739516007816023, + "flos": 25451717458320.0, + "grad_norm": 1.4267912802868041, + "language_loss": 0.76405227, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78810334, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14031982, + "step": 7883, + "time_per_iteration": 2.8215880393981934 + }, + { + "auxiliary_loss_clip": 0.01371444, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.25256443, + "balance_loss_mlp": 1.01940393, + "epoch": 0.4740117240342703, + "flos": 19979120676240.0, + "grad_norm": 2.7290573589706595, + "language_loss": 0.72155213, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74559331, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13275146, + "step": 7884, + "time_per_iteration": 2.7475857734680176 + }, + { + "auxiliary_loss_clip": 0.01386022, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.25997519, + "balance_loss_mlp": 1.01732934, + "epoch": 0.47407184728693824, + "flos": 15664333964760.0, + "grad_norm": 1.871522372891562, + "language_loss": 0.8236618, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84784019, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14489746, + "step": 7885, + "time_per_iteration": 2.7484917640686035 + }, + { + "auxiliary_loss_clip": 0.0138071, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.26155686, + "balance_loss_mlp": 1.02020895, + "epoch": 0.4741319705396062, + "flos": 18593518376520.0, + "grad_norm": 1.9025870562456781, + "language_loss": 0.73787951, + "learning_rate": 2.263867649999751e-06, + "loss": 0.76202857, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.13983154, + "step": 7886, + "time_per_iteration": 2.7073090076446533 + }, + { + "auxiliary_loss_clip": 0.01387558, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.26192248, + "balance_loss_mlp": 1.01793432, + "epoch": 0.47419209379227417, + "flos": 13265013347640.0, + "grad_norm": 1.7473124399177868, + "language_loss": 0.74244487, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76664734, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14752197, + "step": 7887, + "time_per_iteration": 4.205507278442383 + }, + { + "auxiliary_loss_clip": 0.01372067, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.25387025, + "balance_loss_mlp": 1.014642, + "epoch": 0.47425221704494214, + "flos": 20048405184360.0, + "grad_norm": 1.62045714309833, + "language_loss": 0.77245486, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79645395, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13195801, + "step": 7888, + "time_per_iteration": 2.782179832458496 + }, + { + "auxiliary_loss_clip": 0.0137671, + "auxiliary_loss_mlp": 0.01031673, + "balance_loss_clip": 1.25548089, + "balance_loss_mlp": 1.01721871, + "epoch": 0.4743123402976101, + "flos": 27277676081280.0, + "grad_norm": 1.611075800167396, + "language_loss": 0.73260653, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.75669032, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14453125, + "step": 7889, + "time_per_iteration": 2.826162815093994 + }, + { + "auxiliary_loss_clip": 0.0119004, + "auxiliary_loss_mlp": 0.01004505, + "balance_loss_clip": 1.14252448, + "balance_loss_mlp": 1.00116754, + "epoch": 0.47437246355027807, + "flos": 55407087421200.0, + "grad_norm": 0.7237813353967574, + "language_loss": 0.56098962, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58293509, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.03344727, + "step": 7890, + "time_per_iteration": 3.384066343307495 + }, + { + "auxiliary_loss_clip": 0.01379787, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.25754547, + "balance_loss_mlp": 1.01882911, + "epoch": 0.47443258680294603, + "flos": 23883787311480.0, + "grad_norm": 1.8712672700754955, + "language_loss": 0.65757811, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.6817168, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.15264893, + "step": 7891, + "time_per_iteration": 2.8218064308166504 + }, + { + "auxiliary_loss_clip": 0.01384828, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.25935972, + "balance_loss_mlp": 1.02265668, + "epoch": 0.474492710055614, + "flos": 21982371709680.0, + "grad_norm": 2.247228978638203, + "language_loss": 0.70169622, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72593069, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15960693, + "step": 7892, + "time_per_iteration": 2.7381420135498047 + }, + { + "auxiliary_loss_clip": 0.01191597, + "auxiliary_loss_mlp": 0.01010952, + "balance_loss_clip": 1.14335871, + "balance_loss_mlp": 1.00701833, + "epoch": 0.47455283330828196, + "flos": 62572799588760.0, + "grad_norm": 0.8458572621091331, + "language_loss": 0.58692789, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60895336, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.03930664, + "step": 7893, + "time_per_iteration": 3.327625036239624 + }, + { + "auxiliary_loss_clip": 0.0137831, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.25755239, + "balance_loss_mlp": 1.01724923, + "epoch": 0.47461295656094993, + "flos": 12097416662280.0, + "grad_norm": 2.038477114985805, + "language_loss": 0.78156894, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.80566585, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14117432, + "step": 7894, + "time_per_iteration": 2.733419179916382 + }, + { + "auxiliary_loss_clip": 0.01379148, + "auxiliary_loss_mlp": 0.01026923, + "balance_loss_clip": 1.25885081, + "balance_loss_mlp": 1.01287413, + "epoch": 0.4746730798136179, + "flos": 20889216794160.0, + "grad_norm": 1.9769335073746126, + "language_loss": 0.74865222, + "learning_rate": 2.260392731628497e-06, + "loss": 0.77271295, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.14038086, + "step": 7895, + "time_per_iteration": 2.837221145629883 + }, + { + "auxiliary_loss_clip": 0.01377013, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.25638676, + "balance_loss_mlp": 1.01677406, + "epoch": 0.4747332030662859, + "flos": 19979648584920.0, + "grad_norm": 1.847529400074067, + "language_loss": 0.82915378, + "learning_rate": 2.260006580021429e-06, + "loss": 0.85323274, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14117432, + "step": 7896, + "time_per_iteration": 2.7894623279571533 + }, + { + "auxiliary_loss_clip": 0.01379407, + "auxiliary_loss_mlp": 0.01024894, + "balance_loss_clip": 1.25949574, + "balance_loss_mlp": 1.0105114, + "epoch": 0.4747933263189539, + "flos": 16038492015240.0, + "grad_norm": 1.9409440008262326, + "language_loss": 0.75760949, + "learning_rate": 2.259620418554886e-06, + "loss": 0.78165251, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.14398193, + "step": 7897, + "time_per_iteration": 2.7501683235168457 + }, + { + "auxiliary_loss_clip": 0.0138574, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.2590065, + "balance_loss_mlp": 1.02343452, + "epoch": 0.47485344957162184, + "flos": 13959215485920.0, + "grad_norm": 2.0200597464399097, + "language_loss": 0.63586569, + "learning_rate": 2.25923424724351e-06, + "loss": 0.66010314, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.14575195, + "step": 7898, + "time_per_iteration": 2.7219977378845215 + }, + { + "auxiliary_loss_clip": 0.01378895, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.25637543, + "balance_loss_mlp": 1.02564132, + "epoch": 0.4749135728242898, + "flos": 20453977124280.0, + "grad_norm": 2.363701628868789, + "language_loss": 0.71051466, + "learning_rate": 2.258848066101946e-06, + "loss": 0.73472023, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.16009521, + "step": 7899, + "time_per_iteration": 2.7545018196105957 + }, + { + "auxiliary_loss_clip": 0.01379776, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.25770044, + "balance_loss_mlp": 1.01918006, + "epoch": 0.4749736960769578, + "flos": 28955928807000.0, + "grad_norm": 3.5899580939653726, + "language_loss": 0.69282895, + "learning_rate": 2.258461875144837e-06, + "loss": 0.71696824, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.14978027, + "step": 7900, + "time_per_iteration": 2.811438798904419 + }, + { + "auxiliary_loss_clip": 0.01381526, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.26002145, + "balance_loss_mlp": 1.02137852, + "epoch": 0.47503381932962574, + "flos": 31944732937200.0, + "grad_norm": 1.9477327649059664, + "language_loss": 0.7100538, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.73422801, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14520264, + "step": 7901, + "time_per_iteration": 2.8603312969207764 + }, + { + "auxiliary_loss_clip": 0.01375197, + "auxiliary_loss_mlp": 0.01042867, + "balance_loss_clip": 1.25423968, + "balance_loss_mlp": 1.0289315, + "epoch": 0.4750939425822937, + "flos": 22132351675080.0, + "grad_norm": 1.568143936095653, + "language_loss": 0.73877823, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.76295888, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.1394043, + "step": 7902, + "time_per_iteration": 2.7915878295898438 + }, + { + "auxiliary_loss_clip": 0.01371792, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.25354373, + "balance_loss_mlp": 1.02439022, + "epoch": 0.47515406583496167, + "flos": 20855000927880.0, + "grad_norm": 2.2002686441592685, + "language_loss": 0.69097996, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71507323, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13140869, + "step": 7903, + "time_per_iteration": 2.7222421169281006 + }, + { + "auxiliary_loss_clip": 0.01375738, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.25730765, + "balance_loss_mlp": 1.02487016, + "epoch": 0.47521418908762963, + "flos": 17528609898360.0, + "grad_norm": 1.5245486427482138, + "language_loss": 0.72107959, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74521148, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.12591553, + "step": 7904, + "time_per_iteration": 2.755680799484253 + }, + { + "auxiliary_loss_clip": 0.01373779, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.25657892, + "balance_loss_mlp": 1.02022064, + "epoch": 0.4752743123402976, + "flos": 20564462136600.0, + "grad_norm": 3.343614116048499, + "language_loss": 0.86580205, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88987613, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13415527, + "step": 7905, + "time_per_iteration": 2.7301361560821533 + }, + { + "auxiliary_loss_clip": 0.01367571, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.25103259, + "balance_loss_mlp": 1.02041793, + "epoch": 0.47533443559296557, + "flos": 26366402320920.0, + "grad_norm": 1.6659007913146051, + "language_loss": 0.82340205, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84741807, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13623047, + "step": 7906, + "time_per_iteration": 2.8126285076141357 + }, + { + "auxiliary_loss_clip": 0.01194224, + "auxiliary_loss_mlp": 0.01019952, + "balance_loss_clip": 1.14348233, + "balance_loss_mlp": 1.01724565, + "epoch": 0.47539455884563353, + "flos": 65965632541200.0, + "grad_norm": 0.9572958012262137, + "language_loss": 0.59030229, + "learning_rate": 2.255758264840002e-06, + "loss": 0.6124441, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02709961, + "step": 7907, + "time_per_iteration": 3.409456253051758 + }, + { + "auxiliary_loss_clip": 0.01374752, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.25628245, + "balance_loss_mlp": 1.02207518, + "epoch": 0.4754546820983015, + "flos": 17242578635040.0, + "grad_norm": 1.9674869899567997, + "language_loss": 0.81634504, + "learning_rate": 2.255371995885765e-06, + "loss": 0.84045112, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13775635, + "step": 7908, + "time_per_iteration": 2.6868269443511963 + }, + { + "auxiliary_loss_clip": 0.01384416, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.26261353, + "balance_loss_mlp": 1.02513123, + "epoch": 0.47551480535096946, + "flos": 19830318353280.0, + "grad_norm": 1.6996480937661174, + "language_loss": 0.74126494, + "learning_rate": 2.254985717247797e-06, + "loss": 0.76550174, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14123535, + "step": 7909, + "time_per_iteration": 2.853665590286255 + }, + { + "auxiliary_loss_clip": 0.01380219, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_clip": 1.25997055, + "balance_loss_mlp": 1.01745737, + "epoch": 0.4755749286036375, + "flos": 22169085259680.0, + "grad_norm": 1.4516563698115053, + "language_loss": 0.75379229, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77790433, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.13537598, + "step": 7910, + "time_per_iteration": 4.214813232421875 + }, + { + "auxiliary_loss_clip": 0.01378601, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.2590971, + "balance_loss_mlp": 1.01818919, + "epoch": 0.47563505185630545, + "flos": 21652703440560.0, + "grad_norm": 1.864555861331475, + "language_loss": 0.79415697, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81825399, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.12908936, + "step": 7911, + "time_per_iteration": 2.8248791694641113 + }, + { + "auxiliary_loss_clip": 0.01392266, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.26855445, + "balance_loss_mlp": 1.01982164, + "epoch": 0.4756951751089734, + "flos": 20633340561120.0, + "grad_norm": 1.7190470132229159, + "language_loss": 0.76622409, + "learning_rate": 2.253826823377983e-06, + "loss": 0.79050314, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.15820312, + "step": 7912, + "time_per_iteration": 2.842562437057495 + }, + { + "auxiliary_loss_clip": 0.01381582, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.26060343, + "balance_loss_mlp": 1.02507162, + "epoch": 0.4757552983616414, + "flos": 25854365596320.0, + "grad_norm": 1.6340575524277823, + "language_loss": 0.74624151, + "learning_rate": 2.253440506151569e-06, + "loss": 0.77044368, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.13574219, + "step": 7913, + "time_per_iteration": 2.861149311065674 + }, + { + "auxiliary_loss_clip": 0.01379006, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.25793695, + "balance_loss_mlp": 1.01498401, + "epoch": 0.47581542161430934, + "flos": 18227563214760.0, + "grad_norm": 1.860909249597147, + "language_loss": 0.72930658, + "learning_rate": 2.253054179314666e-06, + "loss": 0.75338972, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14324951, + "step": 7914, + "time_per_iteration": 2.856146812438965 + }, + { + "auxiliary_loss_clip": 0.01380472, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.25934803, + "balance_loss_mlp": 1.0199163, + "epoch": 0.4758755448669773, + "flos": 21584759008320.0, + "grad_norm": 2.2354874683789725, + "language_loss": 0.64771163, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67185241, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.13696289, + "step": 7915, + "time_per_iteration": 2.8464808464050293 + }, + { + "auxiliary_loss_clip": 0.01372496, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.2548064, + "balance_loss_mlp": 1.0208497, + "epoch": 0.47593566811964527, + "flos": 15235307373960.0, + "grad_norm": 1.6942288692912484, + "language_loss": 0.77099198, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79506475, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13946533, + "step": 7916, + "time_per_iteration": 2.7500007152557373 + }, + { + "auxiliary_loss_clip": 0.01370935, + "auxiliary_loss_mlp": 0.01032489, + "balance_loss_clip": 1.25207996, + "balance_loss_mlp": 1.01871419, + "epoch": 0.47599579137231324, + "flos": 21548228465520.0, + "grad_norm": 2.118904533842515, + "language_loss": 0.6429882, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66702241, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13781738, + "step": 7917, + "time_per_iteration": 4.223572492599487 + }, + { + "auxiliary_loss_clip": 0.01186033, + "auxiliary_loss_mlp": 0.01004685, + "balance_loss_clip": 1.13669229, + "balance_loss_mlp": 1.00186014, + "epoch": 0.4760559146249812, + "flos": 64569269026080.0, + "grad_norm": 0.8347807172935608, + "language_loss": 0.65755892, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67946613, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.02819824, + "step": 7918, + "time_per_iteration": 4.732922077178955 + }, + { + "auxiliary_loss_clip": 0.01383249, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.2622689, + "balance_loss_mlp": 1.01655698, + "epoch": 0.47611603787764917, + "flos": 22238613417960.0, + "grad_norm": 1.7524610391473672, + "language_loss": 0.69315833, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.71729302, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.13665771, + "step": 7919, + "time_per_iteration": 2.75034761428833 + }, + { + "auxiliary_loss_clip": 0.01383612, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.26185, + "balance_loss_mlp": 1.02238178, + "epoch": 0.47617616113031713, + "flos": 22784662967040.0, + "grad_norm": 1.4996206133240402, + "language_loss": 0.75092542, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7751174, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13214111, + "step": 7920, + "time_per_iteration": 2.7970664501190186 + }, + { + "auxiliary_loss_clip": 0.01387677, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.2632072, + "balance_loss_mlp": 1.01786685, + "epoch": 0.4762362843829851, + "flos": 24139135635840.0, + "grad_norm": 1.5574245285774144, + "language_loss": 0.77905893, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.80326825, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.1539917, + "step": 7921, + "time_per_iteration": 2.826404571533203 + }, + { + "auxiliary_loss_clip": 0.0137755, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.25554109, + "balance_loss_mlp": 1.0247283, + "epoch": 0.47629640763565306, + "flos": 22456740857400.0, + "grad_norm": 2.904831035281948, + "language_loss": 0.78455186, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80871677, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.14221191, + "step": 7922, + "time_per_iteration": 2.8069207668304443 + }, + { + "auxiliary_loss_clip": 0.01381686, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.25651455, + "balance_loss_mlp": 1.02126098, + "epoch": 0.4763565308883211, + "flos": 11185046476200.0, + "grad_norm": 1.5415320673349395, + "language_loss": 0.72470051, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74887466, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14477539, + "step": 7923, + "time_per_iteration": 2.8637144565582275 + }, + { + "auxiliary_loss_clip": 0.01376403, + "auxiliary_loss_mlp": 0.01037167, + "balance_loss_clip": 1.25399375, + "balance_loss_mlp": 1.02372646, + "epoch": 0.47641665414098905, + "flos": 22387212699120.0, + "grad_norm": 1.7114654440526862, + "language_loss": 0.81906104, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84319675, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13433838, + "step": 7924, + "time_per_iteration": 2.7847909927368164 + }, + { + "auxiliary_loss_clip": 0.01390847, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.26475954, + "balance_loss_mlp": 1.02811694, + "epoch": 0.476476777393657, + "flos": 25051627647000.0, + "grad_norm": 2.277116941057754, + "language_loss": 0.80516303, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82950437, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.15161133, + "step": 7925, + "time_per_iteration": 2.7919692993164062 + }, + { + "auxiliary_loss_clip": 0.01380074, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.25726676, + "balance_loss_mlp": 1.01872396, + "epoch": 0.476536900646325, + "flos": 27275442621480.0, + "grad_norm": 1.5032205108915824, + "language_loss": 0.72366112, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.7477864, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.1373291, + "step": 7926, + "time_per_iteration": 4.456258773803711 + }, + { + "auxiliary_loss_clip": 0.01391101, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.26629972, + "balance_loss_mlp": 1.01830065, + "epoch": 0.47659702389899294, + "flos": 25306894754640.0, + "grad_norm": 2.147380101922304, + "language_loss": 0.68733448, + "learning_rate": 2.248031062546432e-06, + "loss": 0.71157622, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.14770508, + "step": 7927, + "time_per_iteration": 2.7740447521209717 + }, + { + "auxiliary_loss_clip": 0.01371724, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.25335765, + "balance_loss_mlp": 1.01635087, + "epoch": 0.4766571471516609, + "flos": 25998213699360.0, + "grad_norm": 1.9058805035878157, + "language_loss": 0.68352765, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70754063, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13220215, + "step": 7928, + "time_per_iteration": 2.8097338676452637 + }, + { + "auxiliary_loss_clip": 0.01376956, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.2555027, + "balance_loss_mlp": 1.0145762, + "epoch": 0.4767172704043289, + "flos": 16036258555440.0, + "grad_norm": 1.9789343793374419, + "language_loss": 0.7875756, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81162733, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.13641357, + "step": 7929, + "time_per_iteration": 2.7034013271331787 + }, + { + "auxiliary_loss_clip": 0.01376334, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.25723791, + "balance_loss_mlp": 1.01793575, + "epoch": 0.47677739365699684, + "flos": 39242191916520.0, + "grad_norm": 1.7650071417056763, + "language_loss": 0.66636759, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.69044781, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.13757324, + "step": 7930, + "time_per_iteration": 2.948775291442871 + }, + { + "auxiliary_loss_clip": 0.01374073, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.25358021, + "balance_loss_mlp": 1.01420939, + "epoch": 0.4768375169096648, + "flos": 24723218237040.0, + "grad_norm": 2.248450651285002, + "language_loss": 0.80254728, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82656574, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.13562012, + "step": 7931, + "time_per_iteration": 2.790978193283081 + }, + { + "auxiliary_loss_clip": 0.01384456, + "auxiliary_loss_mlp": 0.01031723, + "balance_loss_clip": 1.26195574, + "balance_loss_mlp": 1.01788282, + "epoch": 0.47689764016233277, + "flos": 22533822170640.0, + "grad_norm": 1.7306190252395584, + "language_loss": 0.76494539, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78910714, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13824463, + "step": 7932, + "time_per_iteration": 2.7559399604797363 + }, + { + "auxiliary_loss_clip": 0.01371523, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.25236952, + "balance_loss_mlp": 1.02071476, + "epoch": 0.47695776341500074, + "flos": 15124619319840.0, + "grad_norm": 1.5974925466858239, + "language_loss": 0.7980907, + "learning_rate": 2.245712162906593e-06, + "loss": 0.82215273, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13983154, + "step": 7933, + "time_per_iteration": 2.7437243461608887 + }, + { + "auxiliary_loss_clip": 0.01390953, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.26390123, + "balance_loss_mlp": 1.01860428, + "epoch": 0.4770178866676687, + "flos": 14682354403680.0, + "grad_norm": 1.6957906296047103, + "language_loss": 0.74267125, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76692021, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.15356445, + "step": 7934, + "time_per_iteration": 2.716243028640747 + }, + { + "auxiliary_loss_clip": 0.01380845, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.25771391, + "balance_loss_mlp": 1.01981497, + "epoch": 0.47707800992033667, + "flos": 22570799405400.0, + "grad_norm": 1.973870840254721, + "language_loss": 0.79985213, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82399607, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.13757324, + "step": 7935, + "time_per_iteration": 2.7816951274871826 + }, + { + "auxiliary_loss_clip": 0.01394683, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.26741827, + "balance_loss_mlp": 1.02262664, + "epoch": 0.4771381331730047, + "flos": 30924192415320.0, + "grad_norm": 3.493536382790743, + "language_loss": 0.71429491, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73862082, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.15283203, + "step": 7936, + "time_per_iteration": 2.8162841796875 + }, + { + "auxiliary_loss_clip": 0.01387427, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.26305389, + "balance_loss_mlp": 1.02116179, + "epoch": 0.47719825642567265, + "flos": 25744408492680.0, + "grad_norm": 2.0693910996700766, + "language_loss": 0.68959081, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.71382391, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14703369, + "step": 7937, + "time_per_iteration": 2.8076694011688232 + }, + { + "auxiliary_loss_clip": 0.01187825, + "auxiliary_loss_mlp": 0.01007811, + "balance_loss_clip": 1.13875484, + "balance_loss_mlp": 1.00484228, + "epoch": 0.4772583796783406, + "flos": 66371691781440.0, + "grad_norm": 0.7053270508853839, + "language_loss": 0.5641712, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58612758, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.02966309, + "step": 7938, + "time_per_iteration": 3.42166805267334 + }, + { + "auxiliary_loss_clip": 0.01380539, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.26025116, + "balance_loss_mlp": 1.01823807, + "epoch": 0.4773185029310086, + "flos": 22055757662160.0, + "grad_norm": 1.5812827417292727, + "language_loss": 0.88946682, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91360295, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.1484375, + "step": 7939, + "time_per_iteration": 2.7559974193573 + }, + { + "auxiliary_loss_clip": 0.01378658, + "auxiliary_loss_mlp": 0.01040483, + "balance_loss_clip": 1.25777471, + "balance_loss_mlp": 1.02665472, + "epoch": 0.47737862618367655, + "flos": 16732247461560.0, + "grad_norm": 1.7522134579619386, + "language_loss": 0.77288723, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79707873, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.1383667, + "step": 7940, + "time_per_iteration": 2.771657943725586 + }, + { + "auxiliary_loss_clip": 0.01373769, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.25590754, + "balance_loss_mlp": 1.01646399, + "epoch": 0.4774387494363445, + "flos": 19614180723480.0, + "grad_norm": 1.594247376856362, + "language_loss": 0.85311675, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.8771503, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13128662, + "step": 7941, + "time_per_iteration": 2.7388572692871094 + }, + { + "auxiliary_loss_clip": 0.01391559, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.26729524, + "balance_loss_mlp": 1.02453184, + "epoch": 0.4774988726890125, + "flos": 16658292992040.0, + "grad_norm": 1.7337675207156338, + "language_loss": 0.760023, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78433031, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.1463623, + "step": 7942, + "time_per_iteration": 2.7277328968048096 + }, + { + "auxiliary_loss_clip": 0.01391537, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.26872599, + "balance_loss_mlp": 1.02215493, + "epoch": 0.47755899594168044, + "flos": 20490467058720.0, + "grad_norm": 1.78916789479074, + "language_loss": 0.64029217, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66457188, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14263916, + "step": 7943, + "time_per_iteration": 2.7072207927703857 + }, + { + "auxiliary_loss_clip": 0.01394562, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.26953888, + "balance_loss_mlp": 1.01924622, + "epoch": 0.4776191191943484, + "flos": 21657820093920.0, + "grad_norm": 1.6175331430727837, + "language_loss": 0.73459065, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75887895, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.15032959, + "step": 7944, + "time_per_iteration": 2.760560989379883 + }, + { + "auxiliary_loss_clip": 0.01388578, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.26601267, + "balance_loss_mlp": 1.02049398, + "epoch": 0.4776792424470164, + "flos": 18774831014640.0, + "grad_norm": 2.7137686829852856, + "language_loss": 0.68185771, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70609123, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14257812, + "step": 7945, + "time_per_iteration": 2.719479560852051 + }, + { + "auxiliary_loss_clip": 0.01387154, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.26360643, + "balance_loss_mlp": 1.02373433, + "epoch": 0.47773936569968434, + "flos": 29721851955000.0, + "grad_norm": 2.5656790089875905, + "language_loss": 0.75595152, + "learning_rate": 2.240686733875009e-06, + "loss": 0.78019714, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.13671875, + "step": 7946, + "time_per_iteration": 2.82271146774292 + }, + { + "auxiliary_loss_clip": 0.01390089, + "auxiliary_loss_mlp": 0.01039057, + "balance_loss_clip": 1.26661396, + "balance_loss_mlp": 1.02415574, + "epoch": 0.4777994889523523, + "flos": 24797091489840.0, + "grad_norm": 1.707873955505227, + "language_loss": 0.79546571, + "learning_rate": 2.240300098112506e-06, + "loss": 0.8197571, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.14892578, + "step": 7947, + "time_per_iteration": 2.7422139644622803 + }, + { + "auxiliary_loss_clip": 0.01379224, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.26032567, + "balance_loss_mlp": 1.01788342, + "epoch": 0.47785961220502027, + "flos": 17862663870360.0, + "grad_norm": 1.9764294814520735, + "language_loss": 0.73956621, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.76367974, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.14239502, + "step": 7948, + "time_per_iteration": 4.224491119384766 + }, + { + "auxiliary_loss_clip": 0.01389733, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.26668406, + "balance_loss_mlp": 1.01789522, + "epoch": 0.4779197354576883, + "flos": 20271405627000.0, + "grad_norm": 1.418134949009432, + "language_loss": 0.78447318, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80869877, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.14923096, + "step": 7949, + "time_per_iteration": 2.808101177215576 + }, + { + "auxiliary_loss_clip": 0.01380422, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.26216316, + "balance_loss_mlp": 1.01636839, + "epoch": 0.47797985871035625, + "flos": 17061144171840.0, + "grad_norm": 2.0138346892358134, + "language_loss": 0.74265206, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.76674998, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13000488, + "step": 7950, + "time_per_iteration": 2.802473783493042 + }, + { + "auxiliary_loss_clip": 0.01381222, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.26017416, + "balance_loss_mlp": 1.02354693, + "epoch": 0.4780399819630242, + "flos": 31364751780360.0, + "grad_norm": 1.6595750495012307, + "language_loss": 0.75177616, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.77597451, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.15075684, + "step": 7951, + "time_per_iteration": 2.896885871887207 + }, + { + "auxiliary_loss_clip": 0.01393383, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.26896095, + "balance_loss_mlp": 1.02147591, + "epoch": 0.4781001052156922, + "flos": 24904977567120.0, + "grad_norm": 2.4049535614248807, + "language_loss": 0.80357397, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82786989, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.1473999, + "step": 7952, + "time_per_iteration": 2.8003506660461426 + }, + { + "auxiliary_loss_clip": 0.01389727, + "auxiliary_loss_mlp": 0.01038422, + "balance_loss_clip": 1.26601541, + "balance_loss_mlp": 1.02372384, + "epoch": 0.47816022846836015, + "flos": 18702379054440.0, + "grad_norm": 1.6040832936167888, + "language_loss": 0.78938174, + "learning_rate": 2.23798009269438e-06, + "loss": 0.81366324, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.14697266, + "step": 7953, + "time_per_iteration": 2.7713286876678467 + }, + { + "auxiliary_loss_clip": 0.01390255, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.26506448, + "balance_loss_mlp": 1.01941347, + "epoch": 0.4782203517210281, + "flos": 11980474920720.0, + "grad_norm": 2.0403106416761516, + "language_loss": 0.84275496, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86699188, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.14007568, + "step": 7954, + "time_per_iteration": 2.723890781402588 + }, + { + "auxiliary_loss_clip": 0.01381882, + "auxiliary_loss_mlp": 0.01037096, + "balance_loss_clip": 1.26027536, + "balance_loss_mlp": 1.0232079, + "epoch": 0.4782804749736961, + "flos": 20818389168360.0, + "grad_norm": 1.3443969540020313, + "language_loss": 0.70371091, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72790074, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13891602, + "step": 7955, + "time_per_iteration": 2.729809045791626 + }, + { + "auxiliary_loss_clip": 0.01385799, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.26300716, + "balance_loss_mlp": 1.02081823, + "epoch": 0.47834059822636404, + "flos": 23845388784120.0, + "grad_norm": 1.45636618580801, + "language_loss": 0.82013345, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84434021, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.140625, + "step": 7956, + "time_per_iteration": 4.299575090408325 + }, + { + "auxiliary_loss_clip": 0.01385137, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.26439226, + "balance_loss_mlp": 1.02357173, + "epoch": 0.478400721479032, + "flos": 22638378362400.0, + "grad_norm": 1.9334316504916833, + "language_loss": 0.85273147, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87696552, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.1472168, + "step": 7957, + "time_per_iteration": 4.2101545333862305 + }, + { + "auxiliary_loss_clip": 0.01381709, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.26077867, + "balance_loss_mlp": 1.0203197, + "epoch": 0.4784608447317, + "flos": 19359766391400.0, + "grad_norm": 1.4827998693774533, + "language_loss": 0.8007561, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.82491589, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.13934326, + "step": 7958, + "time_per_iteration": 2.7444779872894287 + }, + { + "auxiliary_loss_clip": 0.01379314, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.25742126, + "balance_loss_mlp": 1.02167177, + "epoch": 0.47852096798436794, + "flos": 24026011080120.0, + "grad_norm": 1.8209118995218834, + "language_loss": 0.8341521, + "learning_rate": 2.235659762404047e-06, + "loss": 0.85830307, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14099121, + "step": 7959, + "time_per_iteration": 2.7430639266967773 + }, + { + "auxiliary_loss_clip": 0.01376639, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.2605474, + "balance_loss_mlp": 1.01592708, + "epoch": 0.4785810912370359, + "flos": 25671997140840.0, + "grad_norm": 2.064669816779829, + "language_loss": 0.72608805, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75013572, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.12188721, + "step": 7960, + "time_per_iteration": 2.7778491973876953 + }, + { + "auxiliary_loss_clip": 0.01377055, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.25859082, + "balance_loss_mlp": 1.02302694, + "epoch": 0.47864121448970387, + "flos": 21436890677640.0, + "grad_norm": 1.5730503651226493, + "language_loss": 0.76983386, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.7939657, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13116455, + "step": 7961, + "time_per_iteration": 2.747222423553467 + }, + { + "auxiliary_loss_clip": 0.01382271, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.26229143, + "balance_loss_mlp": 1.01621008, + "epoch": 0.47870133774237184, + "flos": 16148246077080.0, + "grad_norm": 2.01673195112787, + "language_loss": 0.77832484, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.80244833, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.1385498, + "step": 7962, + "time_per_iteration": 2.793403148651123 + }, + { + "auxiliary_loss_clip": 0.01381066, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.25890183, + "balance_loss_mlp": 1.02166581, + "epoch": 0.47876146099503986, + "flos": 26912492478360.0, + "grad_norm": 1.8682205469995208, + "language_loss": 0.65230328, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67646766, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.137146, + "step": 7963, + "time_per_iteration": 2.8159115314483643 + }, + { + "auxiliary_loss_clip": 0.0138233, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.26039803, + "balance_loss_mlp": 1.02257013, + "epoch": 0.4788215842477078, + "flos": 45340193629080.0, + "grad_norm": 2.192749834481651, + "language_loss": 0.78159803, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80578589, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.13897705, + "step": 7964, + "time_per_iteration": 2.9835309982299805 + }, + { + "auxiliary_loss_clip": 0.01388028, + "auxiliary_loss_mlp": 0.01034357, + "balance_loss_clip": 1.26335669, + "balance_loss_mlp": 1.01843095, + "epoch": 0.4788817075003758, + "flos": 22242349387080.0, + "grad_norm": 1.7403245298898635, + "language_loss": 0.76596224, + "learning_rate": 2.233339110409044e-06, + "loss": 0.79018611, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.15930176, + "step": 7965, + "time_per_iteration": 4.251121759414673 + }, + { + "auxiliary_loss_clip": 0.01384938, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.26391888, + "balance_loss_mlp": 1.0206368, + "epoch": 0.47894183075304375, + "flos": 16475437236240.0, + "grad_norm": 1.7152288850442639, + "language_loss": 0.75009197, + "learning_rate": 2.232952304022137e-06, + "loss": 0.77428246, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.1348877, + "step": 7966, + "time_per_iteration": 2.7089290618896484 + }, + { + "auxiliary_loss_clip": 0.01378666, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.25846684, + "balance_loss_mlp": 1.02205276, + "epoch": 0.4790019540057117, + "flos": 24288425259120.0, + "grad_norm": 1.6941804768608582, + "language_loss": 0.73153901, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75568515, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13885498, + "step": 7967, + "time_per_iteration": 2.7894625663757324 + }, + { + "auxiliary_loss_clip": 0.01368702, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.25345039, + "balance_loss_mlp": 1.01821446, + "epoch": 0.4790620772583797, + "flos": 25672037749200.0, + "grad_norm": 2.041553678877497, + "language_loss": 0.79946339, + "learning_rate": 2.232178664762267e-06, + "loss": 0.82346851, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13586426, + "step": 7968, + "time_per_iteration": 2.7815146446228027 + }, + { + "auxiliary_loss_clip": 0.0119516, + "auxiliary_loss_mlp": 0.01013445, + "balance_loss_clip": 1.1463418, + "balance_loss_mlp": 1.01081014, + "epoch": 0.47912220051104765, + "flos": 69446104980480.0, + "grad_norm": 0.7640433598924056, + "language_loss": 0.62261301, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.6446991, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.02636719, + "step": 7969, + "time_per_iteration": 3.399709463119507 + }, + { + "auxiliary_loss_clip": 0.01369368, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.25281501, + "balance_loss_mlp": 1.01636279, + "epoch": 0.4791823237637156, + "flos": 24174082452600.0, + "grad_norm": 1.3693702681322155, + "language_loss": 0.77541018, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79940212, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13452148, + "step": 7970, + "time_per_iteration": 2.7890989780426025 + }, + { + "auxiliary_loss_clip": 0.01375961, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.25523496, + "balance_loss_mlp": 1.02083302, + "epoch": 0.4792424470163836, + "flos": 24756540719400.0, + "grad_norm": 2.663775436870609, + "language_loss": 0.70431131, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72841597, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13659668, + "step": 7971, + "time_per_iteration": 2.7727794647216797 + }, + { + "auxiliary_loss_clip": 0.01374736, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.25644708, + "balance_loss_mlp": 1.01816201, + "epoch": 0.47930257026905154, + "flos": 23263092950760.0, + "grad_norm": 1.274183635612342, + "language_loss": 0.80164015, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82571781, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14855957, + "step": 7972, + "time_per_iteration": 2.8012571334838867 + }, + { + "auxiliary_loss_clip": 0.0138215, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.25992179, + "balance_loss_mlp": 1.01915562, + "epoch": 0.4793626935217195, + "flos": 14068157380560.0, + "grad_norm": 2.355476499166705, + "language_loss": 0.70824826, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.7324031, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.1416626, + "step": 7973, + "time_per_iteration": 2.807779550552368 + }, + { + "auxiliary_loss_clip": 0.01366185, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.25018549, + "balance_loss_mlp": 1.01990175, + "epoch": 0.4794228167743875, + "flos": 21803901656760.0, + "grad_norm": 3.50752007422033, + "language_loss": 0.79318267, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.81717753, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.1340332, + "step": 7974, + "time_per_iteration": 2.8084707260131836 + }, + { + "auxiliary_loss_clip": 0.01191469, + "auxiliary_loss_mlp": 0.010079, + "balance_loss_clip": 1.14309025, + "balance_loss_mlp": 1.00507474, + "epoch": 0.47948294002705544, + "flos": 66984589337040.0, + "grad_norm": 0.7570224013879095, + "language_loss": 0.54061055, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56260425, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02819824, + "step": 7975, + "time_per_iteration": 3.3142807483673096 + }, + { + "auxiliary_loss_clip": 0.01391452, + "auxiliary_loss_mlp": 0.01042871, + "balance_loss_clip": 1.26627862, + "balance_loss_mlp": 1.02757645, + "epoch": 0.47954306327972346, + "flos": 12426353980920.0, + "grad_norm": 2.1327612732107375, + "language_loss": 0.90048075, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.924824, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.1529541, + "step": 7976, + "time_per_iteration": 2.7389822006225586 + }, + { + "auxiliary_loss_clip": 0.01387283, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.26414227, + "balance_loss_mlp": 1.02210677, + "epoch": 0.4796031865323914, + "flos": 18365604322320.0, + "grad_norm": 2.4921450321404843, + "language_loss": 0.73905742, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.76329768, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14624023, + "step": 7977, + "time_per_iteration": 2.8116190433502197 + }, + { + "auxiliary_loss_clip": 0.01369604, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.25246227, + "balance_loss_mlp": 1.0217495, + "epoch": 0.4796633097850594, + "flos": 21840107332680.0, + "grad_norm": 1.5579179368643858, + "language_loss": 0.78441036, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80845392, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13000488, + "step": 7978, + "time_per_iteration": 2.7775793075561523 + }, + { + "auxiliary_loss_clip": 0.01379478, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.25875604, + "balance_loss_mlp": 1.02245545, + "epoch": 0.47972343303772735, + "flos": 23442172129080.0, + "grad_norm": 1.7044276261605058, + "language_loss": 0.89558017, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91974115, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14160156, + "step": 7979, + "time_per_iteration": 2.7938506603240967 + }, + { + "auxiliary_loss_clip": 0.01385206, + "auxiliary_loss_mlp": 0.01042904, + "balance_loss_clip": 1.26153564, + "balance_loss_mlp": 1.02715611, + "epoch": 0.4797835562903953, + "flos": 24905180608920.0, + "grad_norm": 1.4367659662438683, + "language_loss": 0.76824844, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79252952, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.1574707, + "step": 7980, + "time_per_iteration": 2.7896034717559814 + }, + { + "auxiliary_loss_clip": 0.01390426, + "auxiliary_loss_mlp": 0.01040319, + "balance_loss_clip": 1.26577914, + "balance_loss_mlp": 1.02464294, + "epoch": 0.4798436795430633, + "flos": 35049747858480.0, + "grad_norm": 1.6875632118985249, + "language_loss": 0.719657, + "learning_rate": 2.227149156404295e-06, + "loss": 0.74396443, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.15673828, + "step": 7981, + "time_per_iteration": 2.889667510986328 + }, + { + "auxiliary_loss_clip": 0.01375169, + "auxiliary_loss_mlp": 0.01035551, + "balance_loss_clip": 1.25729239, + "balance_loss_mlp": 1.02283168, + "epoch": 0.47990380279573125, + "flos": 20594495341800.0, + "grad_norm": 2.034021688650788, + "language_loss": 0.70400548, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.7281127, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.12719727, + "step": 7982, + "time_per_iteration": 2.7089052200317383 + }, + { + "auxiliary_loss_clip": 0.01367211, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.2517029, + "balance_loss_mlp": 1.01947904, + "epoch": 0.4799639260483992, + "flos": 26365021636680.0, + "grad_norm": 1.7829615073256457, + "language_loss": 0.71442831, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73841596, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.12078857, + "step": 7983, + "time_per_iteration": 2.8160500526428223 + }, + { + "auxiliary_loss_clip": 0.01186093, + "auxiliary_loss_mlp": 0.01005814, + "balance_loss_clip": 1.13747442, + "balance_loss_mlp": 1.00326335, + "epoch": 0.4800240493010672, + "flos": 70994908589760.0, + "grad_norm": 0.871776110169087, + "language_loss": 0.5951122, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61703128, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.0255127, + "step": 7984, + "time_per_iteration": 3.2245676517486572 + }, + { + "auxiliary_loss_clip": 0.01373737, + "auxiliary_loss_mlp": 0.01040083, + "balance_loss_clip": 1.25431609, + "balance_loss_mlp": 1.02618313, + "epoch": 0.48008417255373514, + "flos": 17090365209840.0, + "grad_norm": 1.8770660988399837, + "language_loss": 0.66935968, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.6934979, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13903809, + "step": 7985, + "time_per_iteration": 2.8301570415496826 + }, + { + "auxiliary_loss_clip": 0.01382178, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.25923204, + "balance_loss_mlp": 1.02144909, + "epoch": 0.4801442958064031, + "flos": 15417594612720.0, + "grad_norm": 1.6366867002662115, + "language_loss": 0.70413184, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72831666, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14868164, + "step": 7986, + "time_per_iteration": 2.7871317863464355 + }, + { + "auxiliary_loss_clip": 0.01388961, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.26351571, + "balance_loss_mlp": 1.02209163, + "epoch": 0.4802044190590711, + "flos": 11477493860400.0, + "grad_norm": 2.012981922217179, + "language_loss": 0.79299486, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.8172605, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.15515137, + "step": 7987, + "time_per_iteration": 4.077197790145874 + }, + { + "auxiliary_loss_clip": 0.01377857, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.2576015, + "balance_loss_mlp": 1.02677667, + "epoch": 0.48026454231173904, + "flos": 20954805941520.0, + "grad_norm": 2.1090781401427443, + "language_loss": 0.75589508, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.78008461, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14318848, + "step": 7988, + "time_per_iteration": 2.7288689613342285 + }, + { + "auxiliary_loss_clip": 0.01380824, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.25927019, + "balance_loss_mlp": 1.01858497, + "epoch": 0.48032466556440706, + "flos": 20452880698560.0, + "grad_norm": 2.711988248323821, + "language_loss": 0.79729986, + "learning_rate": 2.224053348748365e-06, + "loss": 0.82142729, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.13317871, + "step": 7989, + "time_per_iteration": 2.776054859161377 + }, + { + "auxiliary_loss_clip": 0.01390386, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.26549006, + "balance_loss_mlp": 1.0254283, + "epoch": 0.480384788817075, + "flos": 37127603095200.0, + "grad_norm": 2.0390708153386314, + "language_loss": 0.73475093, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75905442, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.14526367, + "step": 7990, + "time_per_iteration": 2.8894307613372803 + }, + { + "auxiliary_loss_clip": 0.01183703, + "auxiliary_loss_mlp": 0.0100276, + "balance_loss_clip": 1.1342423, + "balance_loss_mlp": 0.99986315, + "epoch": 0.480444912069743, + "flos": 69567821527320.0, + "grad_norm": 0.7683661624191728, + "language_loss": 0.59089434, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61275887, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.02893066, + "step": 7991, + "time_per_iteration": 3.3963348865509033 + }, + { + "auxiliary_loss_clip": 0.01377432, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.25746691, + "balance_loss_mlp": 1.01586378, + "epoch": 0.48050503532241096, + "flos": 29827870047720.0, + "grad_norm": 2.1530198712431114, + "language_loss": 0.6764729, + "learning_rate": 2.222892280287768e-06, + "loss": 0.70054823, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14227295, + "step": 7992, + "time_per_iteration": 2.818230628967285 + }, + { + "auxiliary_loss_clip": 0.01386717, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.26351857, + "balance_loss_mlp": 1.02295446, + "epoch": 0.4805651585750789, + "flos": 23953477903200.0, + "grad_norm": 1.5562955953001016, + "language_loss": 0.76516593, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78940904, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.14630127, + "step": 7993, + "time_per_iteration": 2.8153469562530518 + }, + { + "auxiliary_loss_clip": 0.01375088, + "auxiliary_loss_mlp": 0.01035206, + "balance_loss_clip": 1.25680935, + "balance_loss_mlp": 1.02096629, + "epoch": 0.4806252818277469, + "flos": 25670860106760.0, + "grad_norm": 1.5544558035996905, + "language_loss": 0.78356481, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80766785, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14251709, + "step": 7994, + "time_per_iteration": 4.258812427520752 + }, + { + "auxiliary_loss_clip": 0.01381439, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.26024771, + "balance_loss_mlp": 1.0159595, + "epoch": 0.48068540508041485, + "flos": 13155949627920.0, + "grad_norm": 1.9665131125535416, + "language_loss": 0.79922777, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.82333803, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.13616943, + "step": 7995, + "time_per_iteration": 2.71743106842041 + }, + { + "auxiliary_loss_clip": 0.01380717, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.26088262, + "balance_loss_mlp": 1.01839566, + "epoch": 0.4807455283330828, + "flos": 21181461136560.0, + "grad_norm": 1.4007494249527852, + "language_loss": 0.83098412, + "learning_rate": 2.2213440707461e-06, + "loss": 0.85511088, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.13580322, + "step": 7996, + "time_per_iteration": 4.2812817096710205 + }, + { + "auxiliary_loss_clip": 0.01374988, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.25616288, + "balance_loss_mlp": 1.01863432, + "epoch": 0.4808056515857508, + "flos": 12279785117760.0, + "grad_norm": 1.5728159833159725, + "language_loss": 0.80685991, + "learning_rate": 2.220956997340516e-06, + "loss": 0.83093572, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.1395874, + "step": 7997, + "time_per_iteration": 2.8143584728240967 + }, + { + "auxiliary_loss_clip": 0.0138184, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.26002598, + "balance_loss_mlp": 1.01841354, + "epoch": 0.48086577483841875, + "flos": 24831347964480.0, + "grad_norm": 1.9972784096262943, + "language_loss": 0.7297852, + "learning_rate": 2.220569915556221e-06, + "loss": 0.75393373, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14611816, + "step": 7998, + "time_per_iteration": 2.755794048309326 + }, + { + "auxiliary_loss_clip": 0.01375866, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.25481415, + "balance_loss_mlp": 1.02126765, + "epoch": 0.4809258980910867, + "flos": 24471159189840.0, + "grad_norm": 1.784094449920008, + "language_loss": 0.71072316, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73484421, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14959717, + "step": 7999, + "time_per_iteration": 2.7788681983947754 + }, + { + "auxiliary_loss_clip": 0.01386688, + "auxiliary_loss_mlp": 0.01038671, + "balance_loss_clip": 1.26114058, + "balance_loss_mlp": 1.02334106, + "epoch": 0.4809860213437547, + "flos": 21221037306360.0, + "grad_norm": 1.4227702717719244, + "language_loss": 0.7133112, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73756474, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.15319824, + "step": 8000, + "time_per_iteration": 2.738022804260254 + }, + { + "auxiliary_loss_clip": 0.01387188, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.26460338, + "balance_loss_mlp": 1.02141345, + "epoch": 0.48104614459642264, + "flos": 37638990086040.0, + "grad_norm": 1.2991864824763057, + "language_loss": 0.74975789, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77400374, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.1595459, + "step": 8001, + "time_per_iteration": 2.8908193111419678 + }, + { + "auxiliary_loss_clip": 0.01389847, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.26621008, + "balance_loss_mlp": 1.02687573, + "epoch": 0.48110626784909066, + "flos": 18410784445800.0, + "grad_norm": 1.7171466290774138, + "language_loss": 0.81722617, + "learning_rate": 2.219021504925493e-06, + "loss": 0.84155452, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.16113281, + "step": 8002, + "time_per_iteration": 2.7462236881256104 + }, + { + "auxiliary_loss_clip": 0.01393294, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.26879764, + "balance_loss_mlp": 1.01902485, + "epoch": 0.48116639110175863, + "flos": 28445800675320.0, + "grad_norm": 1.9647746973225273, + "language_loss": 0.71991551, + "learning_rate": 2.218634381467819e-06, + "loss": 0.74419367, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.1550293, + "step": 8003, + "time_per_iteration": 2.7607028484344482 + }, + { + "auxiliary_loss_clip": 0.01370609, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.25186086, + "balance_loss_mlp": 1.02329707, + "epoch": 0.4812265143544266, + "flos": 21730190837400.0, + "grad_norm": 1.5934940414428467, + "language_loss": 0.82226199, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84634185, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14080811, + "step": 8004, + "time_per_iteration": 4.247179985046387 + }, + { + "auxiliary_loss_clip": 0.01401264, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.27053785, + "balance_loss_mlp": 1.02634442, + "epoch": 0.48128663760709456, + "flos": 13228604629920.0, + "grad_norm": 1.9524183815792173, + "language_loss": 0.7799688, + "learning_rate": 2.217860109695239e-06, + "loss": 0.80441761, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.17266846, + "step": 8005, + "time_per_iteration": 2.7043495178222656 + }, + { + "auxiliary_loss_clip": 0.01388511, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.26405168, + "balance_loss_mlp": 1.02233887, + "epoch": 0.4813467608597625, + "flos": 24248808480960.0, + "grad_norm": 1.6513276992996513, + "language_loss": 0.70936763, + "learning_rate": 2.217472961409692e-06, + "loss": 0.73362017, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14404297, + "step": 8006, + "time_per_iteration": 2.7671921253204346 + }, + { + "auxiliary_loss_clip": 0.01384664, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.26014447, + "balance_loss_mlp": 1.01783633, + "epoch": 0.4814068841124305, + "flos": 27484879871880.0, + "grad_norm": 2.627443047576624, + "language_loss": 0.70451438, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72869146, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.15185547, + "step": 8007, + "time_per_iteration": 2.7719764709472656 + }, + { + "auxiliary_loss_clip": 0.01386451, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.26161718, + "balance_loss_mlp": 1.02076101, + "epoch": 0.48146700736509845, + "flos": 19577609572320.0, + "grad_norm": 1.7438382871077212, + "language_loss": 0.72016162, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.74438083, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.14709473, + "step": 8008, + "time_per_iteration": 2.8209543228149414 + }, + { + "auxiliary_loss_clip": 0.01386056, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.26122284, + "balance_loss_mlp": 1.02469862, + "epoch": 0.4815271306177664, + "flos": 20632203527040.0, + "grad_norm": 1.6406702770108421, + "language_loss": 0.60987771, + "learning_rate": 2.216311467132199e-06, + "loss": 0.63414097, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.15563965, + "step": 8009, + "time_per_iteration": 2.8455681800842285 + }, + { + "auxiliary_loss_clip": 0.01181266, + "auxiliary_loss_mlp": 0.01020732, + "balance_loss_clip": 1.13106537, + "balance_loss_mlp": 1.01710796, + "epoch": 0.4815872538704344, + "flos": 67705982095320.0, + "grad_norm": 0.8689735483182274, + "language_loss": 0.6138562, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63587618, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.03613281, + "step": 8010, + "time_per_iteration": 3.272695779800415 + }, + { + "auxiliary_loss_clip": 0.0138763, + "auxiliary_loss_mlp": 0.01041727, + "balance_loss_clip": 1.26440573, + "balance_loss_mlp": 1.02553844, + "epoch": 0.48164737712310235, + "flos": 22825985296320.0, + "grad_norm": 1.6184240646581058, + "language_loss": 0.73664498, + "learning_rate": 2.215537096576639e-06, + "loss": 0.76093864, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.16186523, + "step": 8011, + "time_per_iteration": 2.821640968322754 + }, + { + "auxiliary_loss_clip": 0.01376424, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.25770867, + "balance_loss_mlp": 1.0207088, + "epoch": 0.4817075003757703, + "flos": 23739330083040.0, + "grad_norm": 1.68809081738365, + "language_loss": 0.79190266, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81601578, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.14178467, + "step": 8012, + "time_per_iteration": 2.8165407180786133 + }, + { + "auxiliary_loss_clip": 0.01382786, + "auxiliary_loss_mlp": 0.01041332, + "balance_loss_clip": 1.26032448, + "balance_loss_mlp": 1.02606106, + "epoch": 0.4817676236284383, + "flos": 28188665583120.0, + "grad_norm": 1.6436793356130184, + "language_loss": 0.73927402, + "learning_rate": 2.214762693328326e-06, + "loss": 0.76351523, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.15270996, + "step": 8013, + "time_per_iteration": 2.812962055206299 + }, + { + "auxiliary_loss_clip": 0.0138065, + "auxiliary_loss_mlp": 0.01033439, + "balance_loss_clip": 1.26040924, + "balance_loss_mlp": 1.01946795, + "epoch": 0.48182774688110624, + "flos": 17096131596960.0, + "grad_norm": 1.8465741840054788, + "language_loss": 0.91142875, + "learning_rate": 2.214375479481094e-06, + "loss": 0.93556964, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13977051, + "step": 8014, + "time_per_iteration": 2.9556398391723633 + }, + { + "auxiliary_loss_clip": 0.01389322, + "auxiliary_loss_mlp": 0.01043274, + "balance_loss_clip": 1.26261568, + "balance_loss_mlp": 1.02766931, + "epoch": 0.4818878701337742, + "flos": 12571704593280.0, + "grad_norm": 2.2159645505835357, + "language_loss": 0.74603701, + "learning_rate": 2.213988257504722e-06, + "loss": 0.77036297, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.15612793, + "step": 8015, + "time_per_iteration": 2.736574172973633 + }, + { + "auxiliary_loss_clip": 0.0139455, + "auxiliary_loss_mlp": 0.01047489, + "balance_loss_clip": 1.26605105, + "balance_loss_mlp": 1.03159809, + "epoch": 0.48194799338644223, + "flos": 24613951475520.0, + "grad_norm": 2.5737678920868956, + "language_loss": 0.80487096, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82929134, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.15905762, + "step": 8016, + "time_per_iteration": 2.8190064430236816 + }, + { + "auxiliary_loss_clip": 0.01373829, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.25611222, + "balance_loss_mlp": 1.01525974, + "epoch": 0.4820081166391102, + "flos": 21110105602080.0, + "grad_norm": 1.6969157041463958, + "language_loss": 0.78139532, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.80542195, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13568115, + "step": 8017, + "time_per_iteration": 2.774292469024658 + }, + { + "auxiliary_loss_clip": 0.01375195, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.25689769, + "balance_loss_mlp": 1.01898003, + "epoch": 0.48206823989177816, + "flos": 25270039344960.0, + "grad_norm": 1.7787929935211646, + "language_loss": 0.80405146, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82813972, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14648438, + "step": 8018, + "time_per_iteration": 2.8292367458343506 + }, + { + "auxiliary_loss_clip": 0.01383612, + "auxiliary_loss_mlp": 0.0103571, + "balance_loss_clip": 1.26092362, + "balance_loss_mlp": 1.02187538, + "epoch": 0.4821283631444461, + "flos": 24650522626680.0, + "grad_norm": 1.6455843475577818, + "language_loss": 0.76572078, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78991401, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.1383667, + "step": 8019, + "time_per_iteration": 2.8479278087615967 + }, + { + "auxiliary_loss_clip": 0.0138095, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.25793064, + "balance_loss_mlp": 1.02515817, + "epoch": 0.4821884863971141, + "flos": 23957498130840.0, + "grad_norm": 12.45266409077769, + "language_loss": 0.79506695, + "learning_rate": 2.212052026199701e-06, + "loss": 0.8192687, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.140625, + "step": 8020, + "time_per_iteration": 2.767828941345215 + }, + { + "auxiliary_loss_clip": 0.0137465, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.2551477, + "balance_loss_mlp": 1.02114439, + "epoch": 0.48224860964978206, + "flos": 17164522721160.0, + "grad_norm": 3.9660135380878128, + "language_loss": 0.69557261, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71968019, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.1496582, + "step": 8021, + "time_per_iteration": 2.7948474884033203 + }, + { + "auxiliary_loss_clip": 0.01388878, + "auxiliary_loss_mlp": 0.01040847, + "balance_loss_clip": 1.26346231, + "balance_loss_mlp": 1.02495599, + "epoch": 0.48230873290245, + "flos": 23081049362160.0, + "grad_norm": 1.6486344009262215, + "language_loss": 0.6294291, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65372634, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.15881348, + "step": 8022, + "time_per_iteration": 2.848177909851074 + }, + { + "auxiliary_loss_clip": 0.01373596, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.25386429, + "balance_loss_mlp": 1.02280617, + "epoch": 0.482368856155118, + "flos": 19358142057000.0, + "grad_norm": 2.1983529732347478, + "language_loss": 0.66601497, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.69012189, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14294434, + "step": 8023, + "time_per_iteration": 2.735689401626587 + }, + { + "auxiliary_loss_clip": 0.01379688, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.25721633, + "balance_loss_mlp": 1.02151108, + "epoch": 0.48242897940778595, + "flos": 20083148959320.0, + "grad_norm": 1.7016584305043545, + "language_loss": 0.77047098, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.79462755, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.14453125, + "step": 8024, + "time_per_iteration": 2.7349328994750977 + }, + { + "auxiliary_loss_clip": 0.01387333, + "auxiliary_loss_mlp": 0.0103938, + "balance_loss_clip": 1.26378798, + "balance_loss_mlp": 1.02440751, + "epoch": 0.4824891026604539, + "flos": 23409215121960.0, + "grad_norm": 1.390217269715307, + "language_loss": 0.75339234, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77765942, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.1496582, + "step": 8025, + "time_per_iteration": 2.8017237186431885 + }, + { + "auxiliary_loss_clip": 0.01388837, + "auxiliary_loss_mlp": 0.01036662, + "balance_loss_clip": 1.26685965, + "balance_loss_mlp": 1.02226758, + "epoch": 0.4825492259131219, + "flos": 20372591324880.0, + "grad_norm": 1.67230994838379, + "language_loss": 0.71103346, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73528838, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.1439209, + "step": 8026, + "time_per_iteration": 4.133430480957031 + }, + { + "auxiliary_loss_clip": 0.01387632, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.26233792, + "balance_loss_mlp": 1.02609921, + "epoch": 0.48260934916578985, + "flos": 14323059012960.0, + "grad_norm": 1.9839308929515749, + "language_loss": 0.75470936, + "learning_rate": 2.209340965060465e-06, + "loss": 0.77899361, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.14685059, + "step": 8027, + "time_per_iteration": 2.697770833969116 + }, + { + "auxiliary_loss_clip": 0.01385351, + "auxiliary_loss_mlp": 0.01039441, + "balance_loss_clip": 1.26092958, + "balance_loss_mlp": 1.02516019, + "epoch": 0.4826694724184578, + "flos": 22125448253880.0, + "grad_norm": 1.6006170395043702, + "language_loss": 0.67256027, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69680816, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14294434, + "step": 8028, + "time_per_iteration": 2.7874932289123535 + }, + { + "auxiliary_loss_clip": 0.01382872, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.25990796, + "balance_loss_mlp": 1.0212698, + "epoch": 0.48272959567112583, + "flos": 16185629395440.0, + "grad_norm": 1.7671763589099012, + "language_loss": 0.7315855, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75577855, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.15185547, + "step": 8029, + "time_per_iteration": 2.843121290206909 + }, + { + "auxiliary_loss_clip": 0.01379666, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.2554189, + "balance_loss_mlp": 1.01705599, + "epoch": 0.4827897189237938, + "flos": 23185037036880.0, + "grad_norm": 2.2781892130762467, + "language_loss": 0.84582996, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.86995149, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.1541748, + "step": 8030, + "time_per_iteration": 2.7934858798980713 + }, + { + "auxiliary_loss_clip": 0.01382639, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.26049209, + "balance_loss_mlp": 1.01943278, + "epoch": 0.48284984217646176, + "flos": 21657617052120.0, + "grad_norm": 1.9507570042048032, + "language_loss": 0.74393523, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.7680977, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.1418457, + "step": 8031, + "time_per_iteration": 2.730426073074341 + }, + { + "auxiliary_loss_clip": 0.01392506, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.26554465, + "balance_loss_mlp": 1.02447319, + "epoch": 0.48290996542912973, + "flos": 31473734283360.0, + "grad_norm": 1.778323998280319, + "language_loss": 0.71954072, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.7438674, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.15692139, + "step": 8032, + "time_per_iteration": 4.176281213760376 + }, + { + "auxiliary_loss_clip": 0.01376784, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.25546622, + "balance_loss_mlp": 1.01832068, + "epoch": 0.4829700886817977, + "flos": 24467301395640.0, + "grad_norm": 2.3716162494871904, + "language_loss": 0.74389857, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76799631, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14660645, + "step": 8033, + "time_per_iteration": 2.8751211166381836 + }, + { + "auxiliary_loss_clip": 0.01393314, + "auxiliary_loss_mlp": 0.01041002, + "balance_loss_clip": 1.26532972, + "balance_loss_mlp": 1.02599335, + "epoch": 0.48303021193446566, + "flos": 25707634299720.0, + "grad_norm": 1.6571522110479924, + "language_loss": 0.83535314, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85969627, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.14996338, + "step": 8034, + "time_per_iteration": 2.861973524093628 + }, + { + "auxiliary_loss_clip": 0.01373407, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.25536108, + "balance_loss_mlp": 1.02431917, + "epoch": 0.4830903351871336, + "flos": 20090214813960.0, + "grad_norm": 1.6811479037505241, + "language_loss": 0.79831338, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.82243341, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.14257812, + "step": 8035, + "time_per_iteration": 4.233685255050659 + }, + { + "auxiliary_loss_clip": 0.01378481, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.25623202, + "balance_loss_mlp": 1.02312171, + "epoch": 0.4831504584398016, + "flos": 39458816846640.0, + "grad_norm": 1.91244091307846, + "language_loss": 0.69791645, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.72210181, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.16918945, + "step": 8036, + "time_per_iteration": 2.9119396209716797 + }, + { + "auxiliary_loss_clip": 0.01378071, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.25698054, + "balance_loss_mlp": 1.02127576, + "epoch": 0.48321058169246955, + "flos": 20010818824200.0, + "grad_norm": 1.7762700461360745, + "language_loss": 0.73279357, + "learning_rate": 2.205467347074847e-06, + "loss": 0.75693226, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.14526367, + "step": 8037, + "time_per_iteration": 2.806584596633911 + }, + { + "auxiliary_loss_clip": 0.01389121, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.26149321, + "balance_loss_mlp": 1.01876354, + "epoch": 0.4832707049451375, + "flos": 20746465116840.0, + "grad_norm": 2.0918269190739256, + "language_loss": 0.69456887, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71880293, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.15515137, + "step": 8038, + "time_per_iteration": 2.7521438598632812 + }, + { + "auxiliary_loss_clip": 0.01376549, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.25476503, + "balance_loss_mlp": 1.02120304, + "epoch": 0.4833308281978055, + "flos": 33152068225800.0, + "grad_norm": 1.4969783637553478, + "language_loss": 0.7896536, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81377757, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14630127, + "step": 8039, + "time_per_iteration": 2.8303754329681396 + }, + { + "auxiliary_loss_clip": 0.01382079, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.25924015, + "balance_loss_mlp": 1.0174526, + "epoch": 0.48339095145047345, + "flos": 19103971375080.0, + "grad_norm": 1.4929724020261048, + "language_loss": 0.77607757, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.8002196, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14660645, + "step": 8040, + "time_per_iteration": 2.746321439743042 + }, + { + "auxiliary_loss_clip": 0.01380563, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.25772154, + "balance_loss_mlp": 1.01806808, + "epoch": 0.4834510747031414, + "flos": 34466152557600.0, + "grad_norm": 1.5469092656571466, + "language_loss": 0.75953686, + "learning_rate": 2.203917680900409e-06, + "loss": 0.78367531, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.15209961, + "step": 8041, + "time_per_iteration": 2.857083797454834 + }, + { + "auxiliary_loss_clip": 0.01371389, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.25262272, + "balance_loss_mlp": 1.0204258, + "epoch": 0.48351119795580944, + "flos": 27386577367560.0, + "grad_norm": 1.6622366342999932, + "language_loss": 0.67097944, + "learning_rate": 2.203530244988624e-06, + "loss": 0.69504273, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14501953, + "step": 8042, + "time_per_iteration": 4.353826284408569 + }, + { + "auxiliary_loss_clip": 0.01175391, + "auxiliary_loss_mlp": 0.01012185, + "balance_loss_clip": 1.12684798, + "balance_loss_mlp": 1.00980043, + "epoch": 0.4835713212084774, + "flos": 67158795512160.0, + "grad_norm": 0.7010581906160426, + "language_loss": 0.58618045, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60805619, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02380371, + "step": 8043, + "time_per_iteration": 3.3194875717163086 + }, + { + "auxiliary_loss_clip": 0.01380186, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.25570631, + "balance_loss_mlp": 1.02013278, + "epoch": 0.48363144446114537, + "flos": 17971971240240.0, + "grad_norm": 2.2748747036115304, + "language_loss": 0.72389328, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74804205, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.14550781, + "step": 8044, + "time_per_iteration": 2.7498536109924316 + }, + { + "auxiliary_loss_clip": 0.01374891, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.25391579, + "balance_loss_mlp": 1.02158952, + "epoch": 0.48369156771381333, + "flos": 20598231310920.0, + "grad_norm": 1.283389421702452, + "language_loss": 0.76269066, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78681242, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.15698242, + "step": 8045, + "time_per_iteration": 2.8348171710968018 + }, + { + "auxiliary_loss_clip": 0.01377971, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.25424945, + "balance_loss_mlp": 1.01729822, + "epoch": 0.4837516909664813, + "flos": 22680309817080.0, + "grad_norm": 1.5757147417617032, + "language_loss": 0.69381893, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71792173, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.15014648, + "step": 8046, + "time_per_iteration": 2.827948570251465 + }, + { + "auxiliary_loss_clip": 0.01376003, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.25351238, + "balance_loss_mlp": 1.01983595, + "epoch": 0.48381181421914926, + "flos": 25523925768360.0, + "grad_norm": 1.728249990404039, + "language_loss": 0.82110035, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84521484, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.15606689, + "step": 8047, + "time_per_iteration": 2.742241144180298 + }, + { + "auxiliary_loss_clip": 0.01369306, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.24909711, + "balance_loss_mlp": 1.01595962, + "epoch": 0.4838719374718172, + "flos": 24212927671920.0, + "grad_norm": 1.4867045749047954, + "language_loss": 0.80455041, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82854784, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14489746, + "step": 8048, + "time_per_iteration": 2.7965805530548096 + }, + { + "auxiliary_loss_clip": 0.01378119, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.25271773, + "balance_loss_mlp": 1.02031446, + "epoch": 0.4839320607244852, + "flos": 26730286456320.0, + "grad_norm": 1.7728171159133124, + "language_loss": 0.81695396, + "learning_rate": 2.200817978328054e-06, + "loss": 0.84109455, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.15637207, + "step": 8049, + "time_per_iteration": 2.838285446166992 + }, + { + "auxiliary_loss_clip": 0.01366334, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.24763286, + "balance_loss_mlp": 1.01576853, + "epoch": 0.48399218397715316, + "flos": 20453774082480.0, + "grad_norm": 1.7650584050779725, + "language_loss": 0.73007572, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.75402528, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.12854004, + "step": 8050, + "time_per_iteration": 2.769401788711548 + }, + { + "auxiliary_loss_clip": 0.01177441, + "auxiliary_loss_mlp": 0.01013722, + "balance_loss_clip": 1.12816334, + "balance_loss_mlp": 1.01098049, + "epoch": 0.4840523072298211, + "flos": 67195447880040.0, + "grad_norm": 0.7080341937841695, + "language_loss": 0.56442583, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58633745, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.02746582, + "step": 8051, + "time_per_iteration": 3.3128693103790283 + }, + { + "auxiliary_loss_clip": 0.01378659, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.2556026, + "balance_loss_mlp": 1.01890528, + "epoch": 0.4841124304824891, + "flos": 22415621569920.0, + "grad_norm": 2.3048419079367632, + "language_loss": 0.75784022, + "learning_rate": 2.199655463811236e-06, + "loss": 0.78196472, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14880371, + "step": 8052, + "time_per_iteration": 2.778818130493164 + }, + { + "auxiliary_loss_clip": 0.0137511, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.25336099, + "balance_loss_mlp": 1.01949692, + "epoch": 0.48417255373515705, + "flos": 13847837089680.0, + "grad_norm": 2.722205704951374, + "language_loss": 0.67552632, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.69961393, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14129639, + "step": 8053, + "time_per_iteration": 2.7833378314971924 + }, + { + "auxiliary_loss_clip": 0.0136615, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.24720097, + "balance_loss_mlp": 1.01965964, + "epoch": 0.484232676987825, + "flos": 31655980913760.0, + "grad_norm": 2.641482681729904, + "language_loss": 0.69713247, + "learning_rate": 2.198880416254091e-06, + "loss": 0.72113377, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.14306641, + "step": 8054, + "time_per_iteration": 2.8627493381500244 + }, + { + "auxiliary_loss_clip": 0.01370035, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.24854755, + "balance_loss_mlp": 1.0214479, + "epoch": 0.48429280024049304, + "flos": 24100452849960.0, + "grad_norm": 1.5204117097279375, + "language_loss": 0.69888973, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.72293884, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.13427734, + "step": 8055, + "time_per_iteration": 2.8499579429626465 + }, + { + "auxiliary_loss_clip": 0.01372169, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.25032818, + "balance_loss_mlp": 1.01834083, + "epoch": 0.484352923493161, + "flos": 17534579327280.0, + "grad_norm": 2.0379282731387645, + "language_loss": 0.62807298, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65212333, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14508057, + "step": 8056, + "time_per_iteration": 2.7590689659118652 + }, + { + "auxiliary_loss_clip": 0.01377043, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.25435472, + "balance_loss_mlp": 1.01802444, + "epoch": 0.48441304674582897, + "flos": 29172594345480.0, + "grad_norm": 2.2326099142642377, + "language_loss": 0.67527127, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69937265, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.15075684, + "step": 8057, + "time_per_iteration": 2.8583223819732666 + }, + { + "auxiliary_loss_clip": 0.01363255, + "auxiliary_loss_mlp": 0.01037295, + "balance_loss_clip": 1.24351978, + "balance_loss_mlp": 1.02297795, + "epoch": 0.48447316999849693, + "flos": 15890786118000.0, + "grad_norm": 1.5844730649785186, + "language_loss": 0.8159439, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83994937, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14324951, + "step": 8058, + "time_per_iteration": 2.812981367111206 + }, + { + "auxiliary_loss_clip": 0.01379966, + "auxiliary_loss_mlp": 0.0104088, + "balance_loss_clip": 1.2550112, + "balance_loss_mlp": 1.02636075, + "epoch": 0.4845332932511649, + "flos": 24385753162800.0, + "grad_norm": 1.6318461645787348, + "language_loss": 0.80147815, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82568657, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.1451416, + "step": 8059, + "time_per_iteration": 2.7309799194335938 + }, + { + "auxiliary_loss_clip": 0.01388998, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_clip": 1.26284337, + "balance_loss_mlp": 1.02884853, + "epoch": 0.48459341650383286, + "flos": 37122689483640.0, + "grad_norm": 3.2199726521168746, + "language_loss": 0.67298865, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69732511, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.15802002, + "step": 8060, + "time_per_iteration": 2.853262186050415 + }, + { + "auxiliary_loss_clip": 0.01380566, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.25819039, + "balance_loss_mlp": 1.02991021, + "epoch": 0.48465353975650083, + "flos": 22972026250800.0, + "grad_norm": 1.842267819255455, + "language_loss": 0.67589808, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.70014626, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14349365, + "step": 8061, + "time_per_iteration": 2.7221641540527344 + }, + { + "auxiliary_loss_clip": 0.01375662, + "auxiliary_loss_mlp": 0.01043424, + "balance_loss_clip": 1.25384283, + "balance_loss_mlp": 1.0286777, + "epoch": 0.4847136630091688, + "flos": 17711587479240.0, + "grad_norm": 1.978592811737933, + "language_loss": 0.82510066, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84929156, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.1473999, + "step": 8062, + "time_per_iteration": 2.717024326324463 + }, + { + "auxiliary_loss_clip": 0.01368877, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.24891984, + "balance_loss_mlp": 1.02337992, + "epoch": 0.48477378626183676, + "flos": 22023409780440.0, + "grad_norm": 1.6980962585459007, + "language_loss": 0.74396765, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76802969, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.13946533, + "step": 8063, + "time_per_iteration": 2.710099935531616 + }, + { + "auxiliary_loss_clip": 0.01374112, + "auxiliary_loss_mlp": 0.01046036, + "balance_loss_clip": 1.25188911, + "balance_loss_mlp": 1.03223205, + "epoch": 0.4848339095145047, + "flos": 27968548334040.0, + "grad_norm": 1.7469201169488695, + "language_loss": 0.79166794, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81586945, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.13806152, + "step": 8064, + "time_per_iteration": 2.799694776535034 + }, + { + "auxiliary_loss_clip": 0.01367805, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_clip": 1.25057101, + "balance_loss_mlp": 1.03212237, + "epoch": 0.4848940327671727, + "flos": 21693700902960.0, + "grad_norm": 1.7399232660338868, + "language_loss": 0.79587221, + "learning_rate": 2.194617118620173e-06, + "loss": 0.82000124, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.12994385, + "step": 8065, + "time_per_iteration": 2.7266292572021484 + }, + { + "auxiliary_loss_clip": 0.0135829, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.24256635, + "balance_loss_mlp": 1.02503371, + "epoch": 0.48495415601984065, + "flos": 20636508013200.0, + "grad_norm": 1.6051859178108818, + "language_loss": 0.76456845, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78853083, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.12921143, + "step": 8066, + "time_per_iteration": 4.171602487564087 + }, + { + "auxiliary_loss_clip": 0.01371973, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.25416064, + "balance_loss_mlp": 1.02564359, + "epoch": 0.4850142792725086, + "flos": 25633476788400.0, + "grad_norm": 1.4156342076857855, + "language_loss": 0.72249156, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74660492, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.137146, + "step": 8067, + "time_per_iteration": 2.81026029586792 + }, + { + "auxiliary_loss_clip": 0.0137426, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.25320339, + "balance_loss_mlp": 1.02650654, + "epoch": 0.4850744025251766, + "flos": 13775953646520.0, + "grad_norm": 1.9871987867998453, + "language_loss": 0.79891515, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.82306302, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14019775, + "step": 8068, + "time_per_iteration": 2.758716106414795 + }, + { + "auxiliary_loss_clip": 0.01368448, + "auxiliary_loss_mlp": 0.01045498, + "balance_loss_clip": 1.24908328, + "balance_loss_mlp": 1.03236115, + "epoch": 0.4851345257778446, + "flos": 20265517414800.0, + "grad_norm": 1.3975142174445612, + "language_loss": 0.84517932, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86931884, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13134766, + "step": 8069, + "time_per_iteration": 2.825043201446533 + }, + { + "auxiliary_loss_clip": 0.01369313, + "auxiliary_loss_mlp": 0.01038031, + "balance_loss_clip": 1.25010681, + "balance_loss_mlp": 1.02478671, + "epoch": 0.48519464903051257, + "flos": 27095470059240.0, + "grad_norm": 1.7835192662715658, + "language_loss": 0.78609979, + "learning_rate": 2.192678959687493e-06, + "loss": 0.81017321, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13238525, + "step": 8070, + "time_per_iteration": 2.911099672317505 + }, + { + "auxiliary_loss_clip": 0.01365372, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.24671006, + "balance_loss_mlp": 1.02441156, + "epoch": 0.48525477228318054, + "flos": 17131484497320.0, + "grad_norm": 1.9921040805315955, + "language_loss": 0.77722764, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80126774, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14251709, + "step": 8071, + "time_per_iteration": 4.171106338500977 + }, + { + "auxiliary_loss_clip": 0.01371494, + "auxiliary_loss_mlp": 0.010369, + "balance_loss_clip": 1.24999738, + "balance_loss_mlp": 1.02125978, + "epoch": 0.4853148955358485, + "flos": 28185741781200.0, + "grad_norm": 3.2385771556282794, + "language_loss": 0.7203055, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74438947, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.15637207, + "step": 8072, + "time_per_iteration": 2.8742029666900635 + }, + { + "auxiliary_loss_clip": 0.01375425, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.25293207, + "balance_loss_mlp": 1.02534199, + "epoch": 0.48537501878851647, + "flos": 17497764525960.0, + "grad_norm": 1.840207329484294, + "language_loss": 0.87904465, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.90319794, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.14556885, + "step": 8073, + "time_per_iteration": 2.9242358207702637 + }, + { + "auxiliary_loss_clip": 0.0136277, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.24702549, + "balance_loss_mlp": 1.02203393, + "epoch": 0.48543514204118443, + "flos": 28590745204080.0, + "grad_norm": 1.788785502421064, + "language_loss": 0.61286026, + "learning_rate": 2.19112830093786e-06, + "loss": 0.63683873, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13043213, + "step": 8074, + "time_per_iteration": 4.253251314163208 + }, + { + "auxiliary_loss_clip": 0.01368338, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_clip": 1.2460053, + "balance_loss_mlp": 1.02857101, + "epoch": 0.4854952652938524, + "flos": 20964958031520.0, + "grad_norm": 1.8192847506066578, + "language_loss": 0.73429525, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75841355, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.14904785, + "step": 8075, + "time_per_iteration": 2.7601661682128906 + }, + { + "auxiliary_loss_clip": 0.01355555, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.24001074, + "balance_loss_mlp": 1.02044821, + "epoch": 0.48555538854652036, + "flos": 66540317630040.0, + "grad_norm": 1.5954822599585916, + "language_loss": 0.81672966, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84061933, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.12976074, + "step": 8076, + "time_per_iteration": 3.2290985584259033 + }, + { + "auxiliary_loss_clip": 0.01371158, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.25066137, + "balance_loss_mlp": 1.02181089, + "epoch": 0.4856155117991883, + "flos": 15929306470440.0, + "grad_norm": 1.7463626485284944, + "language_loss": 0.8706997, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.89478099, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.15161133, + "step": 8077, + "time_per_iteration": 2.7186155319213867 + }, + { + "auxiliary_loss_clip": 0.01177379, + "auxiliary_loss_mlp": 0.00999764, + "balance_loss_clip": 1.12650478, + "balance_loss_mlp": 0.99736768, + "epoch": 0.4856756350518563, + "flos": 71062690588560.0, + "grad_norm": 0.9099055393856281, + "language_loss": 0.58749819, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60926962, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02392578, + "step": 8078, + "time_per_iteration": 3.2419447898864746 + }, + { + "auxiliary_loss_clip": 0.01381318, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.25735426, + "balance_loss_mlp": 1.02334309, + "epoch": 0.48573575830452426, + "flos": 29831768450280.0, + "grad_norm": 1.7049807854446706, + "language_loss": 0.72598398, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.75016826, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.13757324, + "step": 8079, + "time_per_iteration": 2.7684381008148193 + }, + { + "auxiliary_loss_clip": 0.01372667, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.25174892, + "balance_loss_mlp": 1.01887739, + "epoch": 0.4857958815571922, + "flos": 17644008522240.0, + "grad_norm": 2.2174852361344253, + "language_loss": 0.80322325, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.82728052, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14178467, + "step": 8080, + "time_per_iteration": 2.7349908351898193 + }, + { + "auxiliary_loss_clip": 0.01370258, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.25006175, + "balance_loss_mlp": 1.01497591, + "epoch": 0.4858560048098602, + "flos": 21110430468960.0, + "grad_norm": 2.2245274794809657, + "language_loss": 0.84231889, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86631334, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14215088, + "step": 8081, + "time_per_iteration": 2.8035390377044678 + }, + { + "auxiliary_loss_clip": 0.01367144, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.24650931, + "balance_loss_mlp": 1.01506793, + "epoch": 0.4859161280625282, + "flos": 22095942957360.0, + "grad_norm": 1.5837640512458195, + "language_loss": 0.83663952, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.86061823, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.15661621, + "step": 8082, + "time_per_iteration": 4.23409628868103 + }, + { + "auxiliary_loss_clip": 0.0136211, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.24524868, + "balance_loss_mlp": 1.01956809, + "epoch": 0.4859762513151962, + "flos": 17497926959400.0, + "grad_norm": 1.9201960574012364, + "language_loss": 0.87501734, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89896476, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13079834, + "step": 8083, + "time_per_iteration": 2.7381210327148438 + }, + { + "auxiliary_loss_clip": 0.01362172, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.24369931, + "balance_loss_mlp": 1.02447641, + "epoch": 0.48603637456786414, + "flos": 18008786041560.0, + "grad_norm": 1.7746126475096404, + "language_loss": 0.81136721, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83537233, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.13873291, + "step": 8084, + "time_per_iteration": 2.71321702003479 + }, + { + "auxiliary_loss_clip": 0.01374239, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.25332546, + "balance_loss_mlp": 1.02130198, + "epoch": 0.4860964978205321, + "flos": 22497169802760.0, + "grad_norm": 2.7579661358810883, + "language_loss": 0.68390125, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70799255, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.13592529, + "step": 8085, + "time_per_iteration": 2.730328321456909 + }, + { + "auxiliary_loss_clip": 0.01365639, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.24668324, + "balance_loss_mlp": 1.02147245, + "epoch": 0.48615662107320007, + "flos": 23378410357920.0, + "grad_norm": 1.4866057618354638, + "language_loss": 0.77474809, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79875714, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13800049, + "step": 8086, + "time_per_iteration": 2.849838972091675 + }, + { + "auxiliary_loss_clip": 0.01365103, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.24454033, + "balance_loss_mlp": 1.01563764, + "epoch": 0.48621674432586803, + "flos": 34424992661760.0, + "grad_norm": 1.9621363041144757, + "language_loss": 0.70066273, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72461522, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.1451416, + "step": 8087, + "time_per_iteration": 2.856651544570923 + }, + { + "auxiliary_loss_clip": 0.01384663, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.25801504, + "balance_loss_mlp": 1.02134967, + "epoch": 0.486276867578536, + "flos": 33113832131880.0, + "grad_norm": 1.8906829246119337, + "language_loss": 0.73259032, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.75680196, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.15161133, + "step": 8088, + "time_per_iteration": 2.8834049701690674 + }, + { + "auxiliary_loss_clip": 0.01366255, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.24635482, + "balance_loss_mlp": 1.02205324, + "epoch": 0.48633699083120396, + "flos": 21475735896960.0, + "grad_norm": 1.442776148074805, + "language_loss": 0.75760806, + "learning_rate": 2.185312305524892e-06, + "loss": 0.78163135, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14019775, + "step": 8089, + "time_per_iteration": 2.7389657497406006 + }, + { + "auxiliary_loss_clip": 0.01370455, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.24810195, + "balance_loss_mlp": 1.0167042, + "epoch": 0.48639711408387193, + "flos": 20089199604960.0, + "grad_norm": 1.5895543773828762, + "language_loss": 0.84366715, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86768574, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.14727783, + "step": 8090, + "time_per_iteration": 2.772550106048584 + }, + { + "auxiliary_loss_clip": 0.01359421, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.24294758, + "balance_loss_mlp": 1.01742673, + "epoch": 0.4864572373365399, + "flos": 20784010868640.0, + "grad_norm": 1.6508024250145328, + "language_loss": 0.76493734, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78884423, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.1383667, + "step": 8091, + "time_per_iteration": 2.7565159797668457 + }, + { + "auxiliary_loss_clip": 0.01367124, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.24736011, + "balance_loss_mlp": 1.01473308, + "epoch": 0.48651736058920786, + "flos": 26030196105840.0, + "grad_norm": 1.4326429761512218, + "language_loss": 0.80734748, + "learning_rate": 2.184148915123631e-06, + "loss": 0.83130729, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14117432, + "step": 8092, + "time_per_iteration": 2.8365554809570312 + }, + { + "auxiliary_loss_clip": 0.01375035, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.25334859, + "balance_loss_mlp": 1.01858294, + "epoch": 0.4865774838418758, + "flos": 20490507667080.0, + "grad_norm": 1.393696379592476, + "language_loss": 0.72156638, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.74564642, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14379883, + "step": 8093, + "time_per_iteration": 2.7892236709594727 + }, + { + "auxiliary_loss_clip": 0.01368412, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.24907613, + "balance_loss_mlp": 1.01849139, + "epoch": 0.4866376070945438, + "flos": 23552657141400.0, + "grad_norm": 1.762871934459386, + "language_loss": 0.67901629, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70302743, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14215088, + "step": 8094, + "time_per_iteration": 2.824982166290283 + }, + { + "auxiliary_loss_clip": 0.01376206, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.25320959, + "balance_loss_mlp": 1.02063107, + "epoch": 0.4866977303472118, + "flos": 16694945359920.0, + "grad_norm": 1.9203836053326047, + "language_loss": 0.66446126, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68858534, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.15576172, + "step": 8095, + "time_per_iteration": 2.8403005599975586 + }, + { + "auxiliary_loss_clip": 0.01377285, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.25441647, + "balance_loss_mlp": 1.01592684, + "epoch": 0.4867578535998798, + "flos": 17900696922480.0, + "grad_norm": 2.1184434594948995, + "language_loss": 0.78954589, + "learning_rate": 2.182597630229345e-06, + "loss": 0.81362879, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.15075684, + "step": 8096, + "time_per_iteration": 2.745486259460449 + }, + { + "auxiliary_loss_clip": 0.01369303, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.24975038, + "balance_loss_mlp": 1.01967072, + "epoch": 0.48681797685254774, + "flos": 22642926498720.0, + "grad_norm": 1.928483860687496, + "language_loss": 0.6795857, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.70362002, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14471436, + "step": 8097, + "time_per_iteration": 2.788846969604492 + }, + { + "auxiliary_loss_clip": 0.0136929, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.24954319, + "balance_loss_mlp": 1.0211755, + "epoch": 0.4868781001052157, + "flos": 20890922345280.0, + "grad_norm": 1.8186774804216244, + "language_loss": 0.71601915, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.74006462, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14068604, + "step": 8098, + "time_per_iteration": 2.707125663757324 + }, + { + "auxiliary_loss_clip": 0.01386879, + "auxiliary_loss_mlp": 0.0103877, + "balance_loss_clip": 1.25966525, + "balance_loss_mlp": 1.02348757, + "epoch": 0.48693822335788367, + "flos": 41982551143560.0, + "grad_norm": 1.718935248967261, + "language_loss": 0.66204566, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68630219, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.15283203, + "step": 8099, + "time_per_iteration": 2.9287352561950684 + }, + { + "auxiliary_loss_clip": 0.01377977, + "auxiliary_loss_mlp": 0.01039259, + "balance_loss_clip": 1.25586593, + "balance_loss_mlp": 1.02538931, + "epoch": 0.48699834661055164, + "flos": 24248646047520.0, + "grad_norm": 1.783065290989076, + "language_loss": 0.67130715, + "learning_rate": 2.181046234549138e-06, + "loss": 0.69547951, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.13873291, + "step": 8100, + "time_per_iteration": 2.755357265472412 + }, + { + "auxiliary_loss_clip": 0.01373617, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.25548697, + "balance_loss_mlp": 1.01816297, + "epoch": 0.4870584698632196, + "flos": 25929741358440.0, + "grad_norm": 1.3095657249776735, + "language_loss": 0.76741099, + "learning_rate": 2.180658368429088e-06, + "loss": 0.79146808, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13928223, + "step": 8101, + "time_per_iteration": 2.8755438327789307 + }, + { + "auxiliary_loss_clip": 0.01180834, + "auxiliary_loss_mlp": 0.01003299, + "balance_loss_clip": 1.13081384, + "balance_loss_mlp": 1.00086689, + "epoch": 0.48711859311588757, + "flos": 70227604757520.0, + "grad_norm": 0.6999852779381087, + "language_loss": 0.52371299, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54555428, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.02429199, + "step": 8102, + "time_per_iteration": 3.379239559173584 + }, + { + "auxiliary_loss_clip": 0.01374901, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.2523948, + "balance_loss_mlp": 1.02041698, + "epoch": 0.48717871636855553, + "flos": 12345983390520.0, + "grad_norm": 1.8348631052768865, + "language_loss": 0.73788458, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.76197803, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.14019775, + "step": 8103, + "time_per_iteration": 2.810453176498413 + }, + { + "auxiliary_loss_clip": 0.01379718, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.25611687, + "balance_loss_mlp": 1.02848983, + "epoch": 0.4872388396212235, + "flos": 23482398032640.0, + "grad_norm": 1.7862652198252145, + "language_loss": 0.6293354, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65357381, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.15649414, + "step": 8104, + "time_per_iteration": 2.8189918994903564 + }, + { + "auxiliary_loss_clip": 0.01378211, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.25771642, + "balance_loss_mlp": 1.02290809, + "epoch": 0.48729896287389146, + "flos": 31433427163080.0, + "grad_norm": 1.9383448829496197, + "language_loss": 0.69224739, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71640438, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14581299, + "step": 8105, + "time_per_iteration": 4.294919729232788 + }, + { + "auxiliary_loss_clip": 0.01375008, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.25619292, + "balance_loss_mlp": 1.01648664, + "epoch": 0.4873590861265594, + "flos": 19062039920400.0, + "grad_norm": 1.6075736689909002, + "language_loss": 0.7405681, + "learning_rate": 2.178718935364259e-06, + "loss": 0.76462108, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13793945, + "step": 8106, + "time_per_iteration": 2.864489793777466 + }, + { + "auxiliary_loss_clip": 0.01388863, + "auxiliary_loss_mlp": 0.01038001, + "balance_loss_clip": 1.26558411, + "balance_loss_mlp": 1.02299821, + "epoch": 0.4874192093792274, + "flos": 24353080414200.0, + "grad_norm": 1.5629580357459096, + "language_loss": 0.76931953, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.7935881, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.15008545, + "step": 8107, + "time_per_iteration": 2.964948892593384 + }, + { + "auxiliary_loss_clip": 0.01375481, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.25780809, + "balance_loss_mlp": 1.01831365, + "epoch": 0.4874793326318954, + "flos": 23117864163480.0, + "grad_norm": 1.5385001582185494, + "language_loss": 0.75077164, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77484977, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13995361, + "step": 8108, + "time_per_iteration": 2.8390345573425293 + }, + { + "auxiliary_loss_clip": 0.01376873, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.25938928, + "balance_loss_mlp": 1.02557635, + "epoch": 0.4875394558845634, + "flos": 19030869681120.0, + "grad_norm": 1.6549631103271258, + "language_loss": 0.73888171, + "learning_rate": 2.177555194083212e-06, + "loss": 0.76302958, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.12341309, + "step": 8109, + "time_per_iteration": 2.84364914894104 + }, + { + "auxiliary_loss_clip": 0.01367695, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.25094092, + "balance_loss_mlp": 1.02385259, + "epoch": 0.48759957913723134, + "flos": 21438555620400.0, + "grad_norm": 1.93045772828583, + "language_loss": 0.78346336, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80751258, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13378906, + "step": 8110, + "time_per_iteration": 4.096255302429199 + }, + { + "auxiliary_loss_clip": 0.01378731, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.25867474, + "balance_loss_mlp": 1.02984905, + "epoch": 0.4876597023898993, + "flos": 17753153458680.0, + "grad_norm": 1.9171059391644993, + "language_loss": 0.73204559, + "learning_rate": 2.176779332873444e-06, + "loss": 0.7562691, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.13781738, + "step": 8111, + "time_per_iteration": 2.7532849311828613 + }, + { + "auxiliary_loss_clip": 0.01384361, + "auxiliary_loss_mlp": 0.01044079, + "balance_loss_clip": 1.26705933, + "balance_loss_mlp": 1.03079271, + "epoch": 0.4877198256425673, + "flos": 17024126328720.0, + "grad_norm": 1.5172888078779894, + "language_loss": 0.7617988, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78608322, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13287354, + "step": 8112, + "time_per_iteration": 4.276312351226807 + }, + { + "auxiliary_loss_clip": 0.01396849, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.27124584, + "balance_loss_mlp": 1.02903342, + "epoch": 0.48777994889523524, + "flos": 22389405550560.0, + "grad_norm": 2.201914479599482, + "language_loss": 0.7539413, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77834857, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.14837646, + "step": 8113, + "time_per_iteration": 2.7810544967651367 + }, + { + "auxiliary_loss_clip": 0.01196379, + "auxiliary_loss_mlp": 0.01005889, + "balance_loss_clip": 1.1467464, + "balance_loss_mlp": 1.00351644, + "epoch": 0.4878400721479032, + "flos": 61256969097480.0, + "grad_norm": 0.7865276888177731, + "language_loss": 0.48865449, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.51067716, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.02368164, + "step": 8114, + "time_per_iteration": 3.1452183723449707 + }, + { + "auxiliary_loss_clip": 0.01387524, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.26525521, + "balance_loss_mlp": 1.02989388, + "epoch": 0.48790019540057117, + "flos": 24542189857440.0, + "grad_norm": 1.6895888476141097, + "language_loss": 0.76863706, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.79295927, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14807129, + "step": 8115, + "time_per_iteration": 2.8480169773101807 + }, + { + "auxiliary_loss_clip": 0.01394595, + "auxiliary_loss_mlp": 0.01042282, + "balance_loss_clip": 1.27032065, + "balance_loss_mlp": 1.02766705, + "epoch": 0.48796031865323913, + "flos": 21838807865160.0, + "grad_norm": 1.871421498111904, + "language_loss": 0.72204274, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.7464115, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14611816, + "step": 8116, + "time_per_iteration": 2.958312749862671 + }, + { + "auxiliary_loss_clip": 0.01381228, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.26247931, + "balance_loss_mlp": 1.02135849, + "epoch": 0.4880204419059071, + "flos": 18593640201600.0, + "grad_norm": 2.6695758251183057, + "language_loss": 0.63104236, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65520394, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.13568115, + "step": 8117, + "time_per_iteration": 2.9496521949768066 + }, + { + "auxiliary_loss_clip": 0.01380254, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.26085925, + "balance_loss_mlp": 1.02243924, + "epoch": 0.48808056515857506, + "flos": 19176910635600.0, + "grad_norm": 1.694896707836316, + "language_loss": 0.79778767, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.82194799, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.13354492, + "step": 8118, + "time_per_iteration": 2.799206256866455 + }, + { + "auxiliary_loss_clip": 0.01388583, + "auxiliary_loss_mlp": 0.01042329, + "balance_loss_clip": 1.26602125, + "balance_loss_mlp": 1.02732074, + "epoch": 0.48814068841124303, + "flos": 20125324064160.0, + "grad_norm": 1.8001141850352922, + "language_loss": 0.64198917, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.66629827, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.15002441, + "step": 8119, + "time_per_iteration": 2.7843759059906006 + }, + { + "auxiliary_loss_clip": 0.01386874, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.26548529, + "balance_loss_mlp": 1.01643443, + "epoch": 0.488200811663911, + "flos": 22970686174920.0, + "grad_norm": 2.9137007547879725, + "language_loss": 0.72957116, + "learning_rate": 2.173287627305878e-06, + "loss": 0.7537446, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14038086, + "step": 8120, + "time_per_iteration": 4.301827430725098 + }, + { + "auxiliary_loss_clip": 0.01388258, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.26518273, + "balance_loss_mlp": 1.01866555, + "epoch": 0.48826093491657896, + "flos": 33917138598240.0, + "grad_norm": 2.1073690072370708, + "language_loss": 0.63800031, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.66221702, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14746094, + "step": 8121, + "time_per_iteration": 2.8777449131011963 + }, + { + "auxiliary_loss_clip": 0.01397808, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.27353573, + "balance_loss_mlp": 1.02221763, + "epoch": 0.488321058169247, + "flos": 23074795674720.0, + "grad_norm": 4.469008740893893, + "language_loss": 0.82993615, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.85428774, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.15118408, + "step": 8122, + "time_per_iteration": 2.734567165374756 + }, + { + "auxiliary_loss_clip": 0.01402386, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.27717113, + "balance_loss_mlp": 1.02483344, + "epoch": 0.48838118142191494, + "flos": 19322423681400.0, + "grad_norm": 1.7052577854159017, + "language_loss": 0.85783118, + "learning_rate": 2.172123606640866e-06, + "loss": 0.88224804, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.14477539, + "step": 8123, + "time_per_iteration": 2.778984785079956 + }, + { + "auxiliary_loss_clip": 0.01398209, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.27465844, + "balance_loss_mlp": 1.02067661, + "epoch": 0.4884413046745829, + "flos": 25416161516160.0, + "grad_norm": 1.396827687801489, + "language_loss": 0.85573727, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.88006514, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.13897705, + "step": 8124, + "time_per_iteration": 2.7805874347686768 + }, + { + "auxiliary_loss_clip": 0.01394644, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.27200758, + "balance_loss_mlp": 1.02546763, + "epoch": 0.4885014279272509, + "flos": 20995884620640.0, + "grad_norm": 2.07577590779544, + "language_loss": 0.79920399, + "learning_rate": 2.171347560204948e-06, + "loss": 0.82354331, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13848877, + "step": 8125, + "time_per_iteration": 2.801624059677124 + }, + { + "auxiliary_loss_clip": 0.01388413, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.26784873, + "balance_loss_mlp": 1.02646625, + "epoch": 0.48856155117991884, + "flos": 13775547562920.0, + "grad_norm": 1.8657272452787714, + "language_loss": 0.72474384, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74903238, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13983154, + "step": 8126, + "time_per_iteration": 2.807130813598633 + }, + { + "auxiliary_loss_clip": 0.01397933, + "auxiliary_loss_mlp": 0.01037571, + "balance_loss_clip": 1.27266705, + "balance_loss_mlp": 1.02362907, + "epoch": 0.4886216744325868, + "flos": 32094144385560.0, + "grad_norm": 1.6444266494784436, + "language_loss": 0.68884861, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.71320367, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.1394043, + "step": 8127, + "time_per_iteration": 2.8776607513427734 + }, + { + "auxiliary_loss_clip": 0.01393014, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.26792264, + "balance_loss_mlp": 1.01871002, + "epoch": 0.48868179768525477, + "flos": 19614871065600.0, + "grad_norm": 1.6577016899853376, + "language_loss": 0.76908922, + "learning_rate": 2.170183441856481e-06, + "loss": 0.79335463, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.14813232, + "step": 8128, + "time_per_iteration": 2.7759814262390137 + }, + { + "auxiliary_loss_clip": 0.01396821, + "auxiliary_loss_mlp": 0.01036771, + "balance_loss_clip": 1.2722044, + "balance_loss_mlp": 1.02311563, + "epoch": 0.48874192093792274, + "flos": 21291743107080.0, + "grad_norm": 2.1412891121492925, + "language_loss": 0.76323122, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78756714, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.13665771, + "step": 8129, + "time_per_iteration": 2.8189890384674072 + }, + { + "auxiliary_loss_clip": 0.01397103, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.27288747, + "balance_loss_mlp": 1.0216229, + "epoch": 0.4888020441905907, + "flos": 14177505358800.0, + "grad_norm": 1.9561825010215892, + "language_loss": 0.64928055, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67361617, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14837646, + "step": 8130, + "time_per_iteration": 2.8094215393066406 + }, + { + "auxiliary_loss_clip": 0.01390976, + "auxiliary_loss_mlp": 0.01034577, + "balance_loss_clip": 1.27038097, + "balance_loss_mlp": 1.02004528, + "epoch": 0.48886216744325867, + "flos": 24103295435160.0, + "grad_norm": 1.8053790700589474, + "language_loss": 0.72583222, + "learning_rate": 2.169019265427658e-06, + "loss": 0.75008774, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.14538574, + "step": 8131, + "time_per_iteration": 2.8400559425354004 + }, + { + "auxiliary_loss_clip": 0.01402859, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.27718425, + "balance_loss_mlp": 1.02729261, + "epoch": 0.48892229069592663, + "flos": 38437992066240.0, + "grad_norm": 1.4774036256747733, + "language_loss": 0.69520563, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71965468, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.1473999, + "step": 8132, + "time_per_iteration": 2.9311163425445557 + }, + { + "auxiliary_loss_clip": 0.01387576, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.26746583, + "balance_loss_mlp": 1.01920199, + "epoch": 0.4889824139485946, + "flos": 23848799886360.0, + "grad_norm": 1.3960637566052634, + "language_loss": 0.70665109, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.73086059, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14154053, + "step": 8133, + "time_per_iteration": 2.8836259841918945 + }, + { + "auxiliary_loss_clip": 0.01388074, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.2671864, + "balance_loss_mlp": 1.02416825, + "epoch": 0.48904253720126256, + "flos": 24431217544800.0, + "grad_norm": 1.5228688472068408, + "language_loss": 0.71398944, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.73825407, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.14233398, + "step": 8134, + "time_per_iteration": 2.7986090183258057 + }, + { + "auxiliary_loss_clip": 0.01397757, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.27145767, + "balance_loss_mlp": 1.01970327, + "epoch": 0.4891026604539306, + "flos": 24176072262240.0, + "grad_norm": 1.7997134302115, + "language_loss": 0.80692768, + "learning_rate": 2.167466940528718e-06, + "loss": 0.8312577, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.1552124, + "step": 8135, + "time_per_iteration": 2.795926094055176 + }, + { + "auxiliary_loss_clip": 0.01386099, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.2652576, + "balance_loss_mlp": 1.02127457, + "epoch": 0.48916278370659855, + "flos": 21476020155480.0, + "grad_norm": 1.5342095304148695, + "language_loss": 0.74327517, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76748419, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.13537598, + "step": 8136, + "time_per_iteration": 2.7286832332611084 + }, + { + "auxiliary_loss_clip": 0.01387445, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.26716042, + "balance_loss_mlp": 1.02302969, + "epoch": 0.4892229069592665, + "flos": 22314882564000.0, + "grad_norm": 1.7500262855761008, + "language_loss": 0.73597026, + "learning_rate": 2.166690739918204e-06, + "loss": 0.76021194, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13677979, + "step": 8137, + "time_per_iteration": 2.8428192138671875 + }, + { + "auxiliary_loss_clip": 0.01390711, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.2679261, + "balance_loss_mlp": 1.0250839, + "epoch": 0.4892830302119345, + "flos": 12790725416640.0, + "grad_norm": 2.3541240601438425, + "language_loss": 0.75600553, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.78030825, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.1449585, + "step": 8138, + "time_per_iteration": 2.755157470703125 + }, + { + "auxiliary_loss_clip": 0.01391165, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.27029192, + "balance_loss_mlp": 1.02086139, + "epoch": 0.48934315346460244, + "flos": 20818835860320.0, + "grad_norm": 1.7605658801480273, + "language_loss": 0.74333006, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76758695, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13659668, + "step": 8139, + "time_per_iteration": 2.7681496143341064 + }, + { + "auxiliary_loss_clip": 0.01392828, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.27038646, + "balance_loss_mlp": 1.01873171, + "epoch": 0.4894032767172704, + "flos": 19760302894680.0, + "grad_norm": 1.9362728571683396, + "language_loss": 0.62303364, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64728498, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13568115, + "step": 8140, + "time_per_iteration": 2.7315311431884766 + }, + { + "auxiliary_loss_clip": 0.01389705, + "auxiliary_loss_mlp": 0.01041067, + "balance_loss_clip": 1.26481795, + "balance_loss_mlp": 1.02582026, + "epoch": 0.4894633999699384, + "flos": 17823128308920.0, + "grad_norm": 1.7667898398694721, + "language_loss": 0.82188606, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84619379, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.15258789, + "step": 8141, + "time_per_iteration": 2.7798380851745605 + }, + { + "auxiliary_loss_clip": 0.01388029, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.26530349, + "balance_loss_mlp": 1.02203214, + "epoch": 0.48952352322260634, + "flos": 25529123638440.0, + "grad_norm": 1.6077975337693948, + "language_loss": 0.72730392, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.75154841, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14404297, + "step": 8142, + "time_per_iteration": 2.8685784339904785 + }, + { + "auxiliary_loss_clip": 0.01386673, + "auxiliary_loss_mlp": 0.01039055, + "balance_loss_clip": 1.26475608, + "balance_loss_mlp": 1.02465498, + "epoch": 0.4895836464752743, + "flos": 29060972299080.0, + "grad_norm": 2.0725661943509004, + "language_loss": 0.67030001, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69455725, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14379883, + "step": 8143, + "time_per_iteration": 2.855039119720459 + }, + { + "auxiliary_loss_clip": 0.01382105, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.26278234, + "balance_loss_mlp": 1.01983762, + "epoch": 0.48964376972794227, + "flos": 33553498113000.0, + "grad_norm": 1.6662846027915705, + "language_loss": 0.75466114, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77881491, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.13433838, + "step": 8144, + "time_per_iteration": 4.273553848266602 + }, + { + "auxiliary_loss_clip": 0.01384849, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.26400971, + "balance_loss_mlp": 1.02013445, + "epoch": 0.48970389298061023, + "flos": 22059087547680.0, + "grad_norm": 1.4926895008155772, + "language_loss": 0.76172554, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78591311, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.13775635, + "step": 8145, + "time_per_iteration": 2.8583192825317383 + }, + { + "auxiliary_loss_clip": 0.01389665, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.26682329, + "balance_loss_mlp": 1.0207963, + "epoch": 0.4897640162332782, + "flos": 20089158996600.0, + "grad_norm": 1.936089238253767, + "language_loss": 0.80535543, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82961142, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.15148926, + "step": 8146, + "time_per_iteration": 2.79252552986145 + }, + { + "auxiliary_loss_clip": 0.0137918, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.26176167, + "balance_loss_mlp": 1.01973772, + "epoch": 0.48982413948594616, + "flos": 23811619609800.0, + "grad_norm": 1.6267220946943848, + "language_loss": 0.7442162, + "learning_rate": 2.162809359964687e-06, + "loss": 0.768345, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13964844, + "step": 8147, + "time_per_iteration": 2.90564227104187 + }, + { + "auxiliary_loss_clip": 0.01385702, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.26494503, + "balance_loss_mlp": 1.01969552, + "epoch": 0.4898842627386142, + "flos": 17644170955680.0, + "grad_norm": 2.562460579912791, + "language_loss": 0.82961166, + "learning_rate": 2.162421187770864e-06, + "loss": 0.85380292, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.13726807, + "step": 8148, + "time_per_iteration": 4.2061614990234375 + }, + { + "auxiliary_loss_clip": 0.01377384, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.26008642, + "balance_loss_mlp": 1.01789618, + "epoch": 0.48994438599128215, + "flos": 16622290357920.0, + "grad_norm": 1.833291150880187, + "language_loss": 0.74082434, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76490402, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.12695312, + "step": 8149, + "time_per_iteration": 2.7235569953918457 + }, + { + "auxiliary_loss_clip": 0.0139606, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.27196956, + "balance_loss_mlp": 1.01717556, + "epoch": 0.4900045092439501, + "flos": 26620491786120.0, + "grad_norm": 1.6511134307986581, + "language_loss": 0.76410449, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78838396, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.14733887, + "step": 8150, + "time_per_iteration": 2.804429292678833 + }, + { + "auxiliary_loss_clip": 0.01387251, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.26431262, + "balance_loss_mlp": 1.01973915, + "epoch": 0.4900646324966181, + "flos": 19906993582920.0, + "grad_norm": 1.8923281450272575, + "language_loss": 0.72987801, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75408936, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.14147949, + "step": 8151, + "time_per_iteration": 2.777583122253418 + }, + { + "auxiliary_loss_clip": 0.01207633, + "auxiliary_loss_mlp": 0.01009467, + "balance_loss_clip": 1.15708172, + "balance_loss_mlp": 1.00707054, + "epoch": 0.49012475574928605, + "flos": 59203380678840.0, + "grad_norm": 0.8614895081348015, + "language_loss": 0.54433203, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56650299, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02392578, + "step": 8152, + "time_per_iteration": 4.722673177719116 + }, + { + "auxiliary_loss_clip": 0.01391133, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.26746821, + "balance_loss_mlp": 1.01608181, + "epoch": 0.490184879001954, + "flos": 45267985319040.0, + "grad_norm": 1.6464321262664654, + "language_loss": 0.61396259, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63817722, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14239502, + "step": 8153, + "time_per_iteration": 3.040480136871338 + }, + { + "auxiliary_loss_clip": 0.01388664, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.2675755, + "balance_loss_mlp": 1.02438414, + "epoch": 0.490245002254622, + "flos": 28008083895480.0, + "grad_norm": 1.4643405969014012, + "language_loss": 0.77057612, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79485863, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.15209961, + "step": 8154, + "time_per_iteration": 2.8485426902770996 + }, + { + "auxiliary_loss_clip": 0.01207491, + "auxiliary_loss_mlp": 0.01015907, + "balance_loss_clip": 1.15848088, + "balance_loss_mlp": 1.01352262, + "epoch": 0.49030512550728994, + "flos": 58967280717120.0, + "grad_norm": 0.9639356600731173, + "language_loss": 0.6702522, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69248617, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.02380371, + "step": 8155, + "time_per_iteration": 3.2887957096099854 + }, + { + "auxiliary_loss_clip": 0.01384491, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.26399183, + "balance_loss_mlp": 1.01895034, + "epoch": 0.4903652487599579, + "flos": 19796589787320.0, + "grad_norm": 1.7214548979700008, + "language_loss": 0.7712034, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79536545, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.12774658, + "step": 8156, + "time_per_iteration": 2.727759838104248 + }, + { + "auxiliary_loss_clip": 0.01378699, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.25820994, + "balance_loss_mlp": 1.01835001, + "epoch": 0.49042537201262587, + "flos": 21767086855440.0, + "grad_norm": 2.0198851149626984, + "language_loss": 0.84305394, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.86716157, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13708496, + "step": 8157, + "time_per_iteration": 2.788395643234253 + }, + { + "auxiliary_loss_clip": 0.01379915, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.25931549, + "balance_loss_mlp": 1.01653123, + "epoch": 0.49048549526529384, + "flos": 18957727378800.0, + "grad_norm": 1.8641276938057447, + "language_loss": 0.80009711, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8242029, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.14135742, + "step": 8158, + "time_per_iteration": 2.9610724449157715 + }, + { + "auxiliary_loss_clip": 0.01385209, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.26207447, + "balance_loss_mlp": 1.01744175, + "epoch": 0.4905456185179618, + "flos": 26912045786400.0, + "grad_norm": 1.4759896956438194, + "language_loss": 0.6935519, + "learning_rate": 2.158150890381454e-06, + "loss": 0.7177211, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.14257812, + "step": 8159, + "time_per_iteration": 4.428290128707886 + }, + { + "auxiliary_loss_clip": 0.01378774, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.25973284, + "balance_loss_mlp": 1.02029395, + "epoch": 0.49060574177062977, + "flos": 20417202931320.0, + "grad_norm": 1.7568461751328168, + "language_loss": 0.73463994, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75876409, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.13342285, + "step": 8160, + "time_per_iteration": 2.897710084915161 + }, + { + "auxiliary_loss_clip": 0.01385727, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.26190662, + "balance_loss_mlp": 1.02266526, + "epoch": 0.4906658650232978, + "flos": 17498211217920.0, + "grad_norm": 1.8827713001662645, + "language_loss": 0.71660829, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.74083078, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.13861084, + "step": 8161, + "time_per_iteration": 2.9623260498046875 + }, + { + "auxiliary_loss_clip": 0.01380736, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.26091957, + "balance_loss_mlp": 1.0194329, + "epoch": 0.49072598827596575, + "flos": 26620045094160.0, + "grad_norm": 1.6299764946785584, + "language_loss": 0.69267273, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.71680892, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.13452148, + "step": 8162, + "time_per_iteration": 2.821530342102051 + }, + { + "auxiliary_loss_clip": 0.01389585, + "auxiliary_loss_mlp": 0.01034506, + "balance_loss_clip": 1.26539218, + "balance_loss_mlp": 1.01951516, + "epoch": 0.4907861115286337, + "flos": 20417487189840.0, + "grad_norm": 1.830208697913832, + "language_loss": 0.64127338, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.66551423, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14978027, + "step": 8163, + "time_per_iteration": 2.843392848968506 + }, + { + "auxiliary_loss_clip": 0.0137026, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.25364208, + "balance_loss_mlp": 1.01487231, + "epoch": 0.4908462347813017, + "flos": 14068360422360.0, + "grad_norm": 3.4816217465886794, + "language_loss": 0.77826858, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.80225271, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13287354, + "step": 8164, + "time_per_iteration": 2.841566324234009 + }, + { + "auxiliary_loss_clip": 0.01382091, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.25942504, + "balance_loss_mlp": 1.01557851, + "epoch": 0.49090635803396965, + "flos": 18739884197880.0, + "grad_norm": 2.052902888097403, + "language_loss": 0.77252275, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.7966373, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.13787842, + "step": 8165, + "time_per_iteration": 2.9021942615509033 + }, + { + "auxiliary_loss_clip": 0.01376283, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.25829911, + "balance_loss_mlp": 1.02268338, + "epoch": 0.4909664812866376, + "flos": 20563406319240.0, + "grad_norm": 1.5887909148652635, + "language_loss": 0.78027558, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.80440485, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13964844, + "step": 8166, + "time_per_iteration": 2.8543496131896973 + }, + { + "auxiliary_loss_clip": 0.01201699, + "auxiliary_loss_mlp": 0.0101393, + "balance_loss_clip": 1.15295815, + "balance_loss_mlp": 1.01178443, + "epoch": 0.4910266045393056, + "flos": 54699623330400.0, + "grad_norm": 0.792844783844482, + "language_loss": 0.54220253, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56435883, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.02148438, + "step": 8167, + "time_per_iteration": 3.2838685512542725 + }, + { + "auxiliary_loss_clip": 0.01372611, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.25399292, + "balance_loss_mlp": 1.02078772, + "epoch": 0.49108672779197354, + "flos": 16249919075280.0, + "grad_norm": 1.7778700647558507, + "language_loss": 0.86522442, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.88929397, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.13543701, + "step": 8168, + "time_per_iteration": 2.879390001296997 + }, + { + "auxiliary_loss_clip": 0.01374648, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.25620699, + "balance_loss_mlp": 1.02100945, + "epoch": 0.4911468510446415, + "flos": 19829790444600.0, + "grad_norm": 1.7761566350686844, + "language_loss": 0.73460317, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75869316, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13336182, + "step": 8169, + "time_per_iteration": 2.804459571838379 + }, + { + "auxiliary_loss_clip": 0.0137157, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.25292909, + "balance_loss_mlp": 1.01541078, + "epoch": 0.4912069742973095, + "flos": 21217341945600.0, + "grad_norm": 1.4647269151195998, + "language_loss": 0.78174168, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80573881, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.12731934, + "step": 8170, + "time_per_iteration": 2.8259565830230713 + }, + { + "auxiliary_loss_clip": 0.01375846, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.25382876, + "balance_loss_mlp": 1.01964366, + "epoch": 0.49126709754997744, + "flos": 19541931805080.0, + "grad_norm": 2.3191833520811396, + "language_loss": 0.76247329, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78656769, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.1394043, + "step": 8171, + "time_per_iteration": 2.8461995124816895 + }, + { + "auxiliary_loss_clip": 0.01383515, + "auxiliary_loss_mlp": 0.01033479, + "balance_loss_clip": 1.26000941, + "balance_loss_mlp": 1.0202055, + "epoch": 0.4913272208026454, + "flos": 12243498225120.0, + "grad_norm": 2.5765117787268665, + "language_loss": 0.81309563, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83726561, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.13262939, + "step": 8172, + "time_per_iteration": 2.8398070335388184 + }, + { + "auxiliary_loss_clip": 0.01193203, + "auxiliary_loss_mlp": 0.01009566, + "balance_loss_clip": 1.14474607, + "balance_loss_mlp": 1.00749135, + "epoch": 0.49138734405531337, + "flos": 65480542786440.0, + "grad_norm": 0.7108295614485957, + "language_loss": 0.53344727, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.555475, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02075195, + "step": 8173, + "time_per_iteration": 3.298438787460327 + }, + { + "auxiliary_loss_clip": 0.01379008, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.25685596, + "balance_loss_mlp": 1.0187695, + "epoch": 0.4914474673079814, + "flos": 18443254152600.0, + "grad_norm": 2.2849707736058873, + "language_loss": 0.62999249, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65411007, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.13977051, + "step": 8174, + "time_per_iteration": 2.799222469329834 + }, + { + "auxiliary_loss_clip": 0.01379813, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.25945103, + "balance_loss_mlp": 1.01912332, + "epoch": 0.49150759056064935, + "flos": 21689518241880.0, + "grad_norm": 1.6744149245846582, + "language_loss": 0.69218028, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.7163108, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14117432, + "step": 8175, + "time_per_iteration": 2.834972381591797 + }, + { + "auxiliary_loss_clip": 0.01372941, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.25323319, + "balance_loss_mlp": 1.01612258, + "epoch": 0.4915677138133173, + "flos": 22387537566000.0, + "grad_norm": 1.6022676014570292, + "language_loss": 0.7455157, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76953721, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.1307373, + "step": 8176, + "time_per_iteration": 2.760756254196167 + }, + { + "auxiliary_loss_clip": 0.01376021, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.25419104, + "balance_loss_mlp": 1.01974535, + "epoch": 0.4916278370659853, + "flos": 18407129693400.0, + "grad_norm": 1.657989323015648, + "language_loss": 0.69989586, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72398782, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13446045, + "step": 8177, + "time_per_iteration": 2.7943472862243652 + }, + { + "auxiliary_loss_clip": 0.01185649, + "auxiliary_loss_mlp": 0.01002586, + "balance_loss_clip": 1.13766825, + "balance_loss_mlp": 1.00038099, + "epoch": 0.49168796031865325, + "flos": 66625537596480.0, + "grad_norm": 0.6866053254013478, + "language_loss": 0.46243492, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48431724, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02209473, + "step": 8178, + "time_per_iteration": 3.2806930541992188 + }, + { + "auxiliary_loss_clip": 0.01384259, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.2598356, + "balance_loss_mlp": 1.01922476, + "epoch": 0.4917480835713212, + "flos": 20964227081040.0, + "grad_norm": 2.3040909240521597, + "language_loss": 0.66262752, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68680733, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.1451416, + "step": 8179, + "time_per_iteration": 2.776395797729492 + }, + { + "auxiliary_loss_clip": 0.01382673, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.25648606, + "balance_loss_mlp": 1.02237153, + "epoch": 0.4918082068239892, + "flos": 15776402703120.0, + "grad_norm": 1.8130752580798435, + "language_loss": 0.70231283, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72651076, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.14758301, + "step": 8180, + "time_per_iteration": 2.7438769340515137 + }, + { + "auxiliary_loss_clip": 0.01372744, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.25339842, + "balance_loss_mlp": 1.01658106, + "epoch": 0.49186833007665715, + "flos": 24609768814440.0, + "grad_norm": 1.5718234755524467, + "language_loss": 0.84286159, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86689585, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.14105225, + "step": 8181, + "time_per_iteration": 2.7700507640838623 + }, + { + "auxiliary_loss_clip": 0.01367526, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.24990022, + "balance_loss_mlp": 1.0196538, + "epoch": 0.4919284533293251, + "flos": 22095618090480.0, + "grad_norm": 1.9764073817003294, + "language_loss": 0.73152399, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.75552452, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.12872314, + "step": 8182, + "time_per_iteration": 2.7455592155456543 + }, + { + "auxiliary_loss_clip": 0.01374866, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.25336766, + "balance_loss_mlp": 1.02222586, + "epoch": 0.4919885765819931, + "flos": 23373334312920.0, + "grad_norm": 2.1162464010596276, + "language_loss": 0.72802848, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.75214267, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14343262, + "step": 8183, + "time_per_iteration": 4.220401048660278 + }, + { + "auxiliary_loss_clip": 0.01387617, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.2621423, + "balance_loss_mlp": 1.01350403, + "epoch": 0.49204869983466104, + "flos": 21365413318080.0, + "grad_norm": 1.795798132857799, + "language_loss": 0.77588511, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.80003548, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.13928223, + "step": 8184, + "time_per_iteration": 2.77827525138855 + }, + { + "auxiliary_loss_clip": 0.01369985, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.2498374, + "balance_loss_mlp": 1.01977134, + "epoch": 0.492108823087329, + "flos": 21147732570600.0, + "grad_norm": 1.487987150626708, + "language_loss": 0.70346963, + "learning_rate": 2.148054610995789e-06, + "loss": 0.7275061, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.13879395, + "step": 8185, + "time_per_iteration": 2.798480749130249 + }, + { + "auxiliary_loss_clip": 0.01383968, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.26178312, + "balance_loss_mlp": 1.01928031, + "epoch": 0.49216894633999697, + "flos": 25121764930680.0, + "grad_norm": 2.735187263552495, + "language_loss": 0.75029606, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77447563, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.14709473, + "step": 8186, + "time_per_iteration": 2.7897937297821045 + }, + { + "auxiliary_loss_clip": 0.0137547, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.25525784, + "balance_loss_mlp": 1.02619481, + "epoch": 0.49222906959266494, + "flos": 22643088932160.0, + "grad_norm": 2.4898380456621974, + "language_loss": 0.68597674, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.71013832, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14489746, + "step": 8187, + "time_per_iteration": 4.243904113769531 + }, + { + "auxiliary_loss_clip": 0.01376163, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.25566566, + "balance_loss_mlp": 1.02342939, + "epoch": 0.49228919284533296, + "flos": 20415172513320.0, + "grad_norm": 1.569705010119125, + "language_loss": 0.66839629, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.6925298, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13769531, + "step": 8188, + "time_per_iteration": 2.892667531967163 + }, + { + "auxiliary_loss_clip": 0.01372474, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.2521553, + "balance_loss_mlp": 1.02023816, + "epoch": 0.4923493160980009, + "flos": 27127655507520.0, + "grad_norm": 2.0854042214732686, + "language_loss": 0.75148499, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.77554131, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.12927246, + "step": 8189, + "time_per_iteration": 2.908926010131836 + }, + { + "auxiliary_loss_clip": 0.01372777, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.25347972, + "balance_loss_mlp": 1.01603997, + "epoch": 0.4924094393506689, + "flos": 35743584521520.0, + "grad_norm": 1.5884914743408165, + "language_loss": 0.64530283, + "learning_rate": 2.146112575713104e-06, + "loss": 0.6693244, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.13336182, + "step": 8190, + "time_per_iteration": 4.433629751205444 + }, + { + "auxiliary_loss_clip": 0.01380026, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.2609036, + "balance_loss_mlp": 1.0245651, + "epoch": 0.49246956260333685, + "flos": 20417365364760.0, + "grad_norm": 1.8800539450127416, + "language_loss": 0.72508836, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.74927527, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14099121, + "step": 8191, + "time_per_iteration": 2.799497604370117 + }, + { + "auxiliary_loss_clip": 0.01376422, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.25614917, + "balance_loss_mlp": 1.01958489, + "epoch": 0.4925296858560048, + "flos": 38983351273200.0, + "grad_norm": 1.5390509431038075, + "language_loss": 0.71824872, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.74233973, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.13092041, + "step": 8192, + "time_per_iteration": 2.914513111114502 + }, + { + "auxiliary_loss_clip": 0.01186333, + "auxiliary_loss_mlp": 0.01004845, + "balance_loss_clip": 1.13907433, + "balance_loss_mlp": 1.00279415, + "epoch": 0.4925898091086728, + "flos": 64295663920920.0, + "grad_norm": 0.7296388381900883, + "language_loss": 0.52173042, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54364222, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.02050781, + "step": 8193, + "time_per_iteration": 3.3030803203582764 + }, + { + "auxiliary_loss_clip": 0.01367983, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.25064778, + "balance_loss_mlp": 1.02483535, + "epoch": 0.49264993236134075, + "flos": 23041188933840.0, + "grad_norm": 1.3908854751108017, + "language_loss": 0.77336776, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79743111, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.1350708, + "step": 8194, + "time_per_iteration": 2.853316068649292 + }, + { + "auxiliary_loss_clip": 0.01373084, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.25410843, + "balance_loss_mlp": 1.01546025, + "epoch": 0.4927100556140087, + "flos": 24723705537360.0, + "grad_norm": 1.9373480474271663, + "language_loss": 0.70964515, + "learning_rate": 2.144170401915341e-06, + "loss": 0.73366022, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.12963867, + "step": 8195, + "time_per_iteration": 2.772221326828003 + }, + { + "auxiliary_loss_clip": 0.01373891, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.25428617, + "balance_loss_mlp": 1.01665461, + "epoch": 0.4927701788666767, + "flos": 23509751086080.0, + "grad_norm": 1.841046143641662, + "language_loss": 0.81539083, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83942556, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.12915039, + "step": 8196, + "time_per_iteration": 2.7935078144073486 + }, + { + "auxiliary_loss_clip": 0.01378532, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.25630331, + "balance_loss_mlp": 1.0186547, + "epoch": 0.49283030211934464, + "flos": 22933952590320.0, + "grad_norm": 1.7675889174378625, + "language_loss": 0.70537961, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72948658, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.13500977, + "step": 8197, + "time_per_iteration": 4.339395999908447 + }, + { + "auxiliary_loss_clip": 0.01368218, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.24998856, + "balance_loss_mlp": 1.01736927, + "epoch": 0.4928904253720126, + "flos": 16877395032120.0, + "grad_norm": 1.8924284782574932, + "language_loss": 0.84244227, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86642027, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.12225342, + "step": 8198, + "time_per_iteration": 2.7790510654449463 + }, + { + "auxiliary_loss_clip": 0.01386372, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.26270294, + "balance_loss_mlp": 1.02596891, + "epoch": 0.4929505486246806, + "flos": 14870692288080.0, + "grad_norm": 1.8628732010414164, + "language_loss": 0.76351649, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78778005, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.14013672, + "step": 8199, + "time_per_iteration": 2.739222526550293 + }, + { + "auxiliary_loss_clip": 0.01382563, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.26079774, + "balance_loss_mlp": 1.01761603, + "epoch": 0.49301067187734854, + "flos": 23847703460640.0, + "grad_norm": 1.414917590210767, + "language_loss": 0.60526627, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62940931, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14129639, + "step": 8200, + "time_per_iteration": 2.941453456878662 + }, + { + "auxiliary_loss_clip": 0.01362547, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.24793422, + "balance_loss_mlp": 1.02372789, + "epoch": 0.49307079513001656, + "flos": 22496317027200.0, + "grad_norm": 1.5209960893165506, + "language_loss": 0.79282105, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81680828, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.12432861, + "step": 8201, + "time_per_iteration": 2.8629074096679688 + }, + { + "auxiliary_loss_clip": 0.01386288, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.26067722, + "balance_loss_mlp": 1.01778483, + "epoch": 0.4931309183826845, + "flos": 15928859778480.0, + "grad_norm": 1.8179785837096387, + "language_loss": 0.66864705, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69283438, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14672852, + "step": 8202, + "time_per_iteration": 2.7901535034179688 + }, + { + "auxiliary_loss_clip": 0.01372523, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.25347066, + "balance_loss_mlp": 1.01813889, + "epoch": 0.4931910416353525, + "flos": 27314937574560.0, + "grad_norm": 1.8890035112363, + "language_loss": 0.7584821, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.78251672, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.12792969, + "step": 8203, + "time_per_iteration": 2.8711469173431396 + }, + { + "auxiliary_loss_clip": 0.01377093, + "auxiliary_loss_mlp": 0.01036713, + "balance_loss_clip": 1.25701571, + "balance_loss_mlp": 1.02279544, + "epoch": 0.49325116488802045, + "flos": 20810064454560.0, + "grad_norm": 2.2447720528062933, + "language_loss": 0.80836821, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.8325063, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.13916016, + "step": 8204, + "time_per_iteration": 2.926966428756714 + }, + { + "auxiliary_loss_clip": 0.01376089, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.25817978, + "balance_loss_mlp": 1.02101719, + "epoch": 0.4933112881406884, + "flos": 19870869123720.0, + "grad_norm": 2.602629476714446, + "language_loss": 0.66102624, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68512571, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.12835693, + "step": 8205, + "time_per_iteration": 2.7852394580841064 + }, + { + "auxiliary_loss_clip": 0.01386268, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.26072955, + "balance_loss_mlp": 1.02080274, + "epoch": 0.4933714113933564, + "flos": 21832066877400.0, + "grad_norm": 5.066025146841527, + "language_loss": 0.66555113, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68977195, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.15002441, + "step": 8206, + "time_per_iteration": 2.844867706298828 + }, + { + "auxiliary_loss_clip": 0.01381375, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.26004004, + "balance_loss_mlp": 1.01805747, + "epoch": 0.49343153464602435, + "flos": 27895730898600.0, + "grad_norm": 1.5817811145067444, + "language_loss": 0.76807928, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.79219651, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.12286377, + "step": 8207, + "time_per_iteration": 2.8487374782562256 + }, + { + "auxiliary_loss_clip": 0.01375378, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.25532424, + "balance_loss_mlp": 1.01959205, + "epoch": 0.4934916578986923, + "flos": 24686484652440.0, + "grad_norm": 2.0244184550480964, + "language_loss": 0.61534035, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.63943106, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14099121, + "step": 8208, + "time_per_iteration": 2.83888840675354 + }, + { + "auxiliary_loss_clip": 0.01374812, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.25440323, + "balance_loss_mlp": 1.01685929, + "epoch": 0.4935517811513603, + "flos": 23410514589480.0, + "grad_norm": 1.8763492606389973, + "language_loss": 0.78513372, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.8091898, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13928223, + "step": 8209, + "time_per_iteration": 2.8459413051605225 + }, + { + "auxiliary_loss_clip": 0.01370604, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.25045753, + "balance_loss_mlp": 1.01798129, + "epoch": 0.49361190440402825, + "flos": 21949536527640.0, + "grad_norm": 1.9362382711824, + "language_loss": 0.79486501, + "learning_rate": 2.138343067844089e-06, + "loss": 0.81888485, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.1338501, + "step": 8210, + "time_per_iteration": 2.851316213607788 + }, + { + "auxiliary_loss_clip": 0.01379575, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.25535238, + "balance_loss_mlp": 1.01969993, + "epoch": 0.4936720276566962, + "flos": 25120709113320.0, + "grad_norm": 1.8398697421649501, + "language_loss": 0.81364322, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83778191, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.14581299, + "step": 8211, + "time_per_iteration": 2.8635168075561523 + }, + { + "auxiliary_loss_clip": 0.01383221, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.26186967, + "balance_loss_mlp": 1.01728678, + "epoch": 0.4937321509093642, + "flos": 26364493728000.0, + "grad_norm": 2.131372598266278, + "language_loss": 0.91738701, + "learning_rate": 2.137565999700933e-06, + "loss": 0.94153321, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14123535, + "step": 8212, + "time_per_iteration": 2.840399742126465 + }, + { + "auxiliary_loss_clip": 0.01379392, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.25759149, + "balance_loss_mlp": 1.02107036, + "epoch": 0.49379227416203214, + "flos": 22965934996800.0, + "grad_norm": 1.6350668293924366, + "language_loss": 0.66091877, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.68505502, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.13165283, + "step": 8213, + "time_per_iteration": 2.822157382965088 + }, + { + "auxiliary_loss_clip": 0.01376731, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.2570163, + "balance_loss_mlp": 1.01594019, + "epoch": 0.49385239741470016, + "flos": 32495533664400.0, + "grad_norm": 3.3323745606629047, + "language_loss": 0.76140344, + "learning_rate": 2.136788910691711e-06, + "loss": 0.78546536, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.13525391, + "step": 8214, + "time_per_iteration": 2.9008500576019287 + }, + { + "auxiliary_loss_clip": 0.01381683, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.26159477, + "balance_loss_mlp": 1.02349615, + "epoch": 0.4939125206673681, + "flos": 22498103795040.0, + "grad_norm": 1.6298319585214318, + "language_loss": 0.84350199, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86768329, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.12945557, + "step": 8215, + "time_per_iteration": 2.78406023979187 + }, + { + "auxiliary_loss_clip": 0.01363542, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.25008202, + "balance_loss_mlp": 1.0174253, + "epoch": 0.4939726439200361, + "flos": 31182708191760.0, + "grad_norm": 1.646847592621424, + "language_loss": 0.83759797, + "learning_rate": 2.136011800934292e-06, + "loss": 0.86152864, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.12097168, + "step": 8216, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.01372775, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.25446379, + "balance_loss_mlp": 1.01542807, + "epoch": 0.49403276717270406, + "flos": 22679335216440.0, + "grad_norm": 1.3695737605396914, + "language_loss": 0.74580449, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76982939, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14282227, + "step": 8217, + "time_per_iteration": 2.8073229789733887 + }, + { + "auxiliary_loss_clip": 0.01368583, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.25271869, + "balance_loss_mlp": 1.02046275, + "epoch": 0.494092890425372, + "flos": 20746018424880.0, + "grad_norm": 1.7860755346089672, + "language_loss": 0.79127687, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.81530559, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13818359, + "step": 8218, + "time_per_iteration": 2.8521552085876465 + }, + { + "auxiliary_loss_clip": 0.01366218, + "auxiliary_loss_mlp": 0.01035324, + "balance_loss_clip": 1.25016797, + "balance_loss_mlp": 1.02314126, + "epoch": 0.49415301367804, + "flos": 18373604169240.0, + "grad_norm": 4.616331402514458, + "language_loss": 0.76722658, + "learning_rate": 2.134846097653142e-06, + "loss": 0.791242, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.12188721, + "step": 8219, + "time_per_iteration": 2.717374801635742 + }, + { + "auxiliary_loss_clip": 0.01372982, + "auxiliary_loss_mlp": 0.01034794, + "balance_loss_clip": 1.25314891, + "balance_loss_mlp": 1.02109063, + "epoch": 0.49421313693070795, + "flos": 17535188452680.0, + "grad_norm": 1.5726523221901105, + "language_loss": 0.62836117, + "learning_rate": 2.134457519646357e-06, + "loss": 0.65243894, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.13708496, + "step": 8220, + "time_per_iteration": 4.254241704940796 + }, + { + "auxiliary_loss_clip": 0.0137414, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.25474739, + "balance_loss_mlp": 1.02155924, + "epoch": 0.4942732601833759, + "flos": 20816886659040.0, + "grad_norm": 2.171439865405227, + "language_loss": 0.72297662, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74707001, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13653564, + "step": 8221, + "time_per_iteration": 2.7586045265197754 + }, + { + "auxiliary_loss_clip": 0.01371536, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.25578237, + "balance_loss_mlp": 1.02140093, + "epoch": 0.4943333834360439, + "flos": 15053101351920.0, + "grad_norm": 1.8723682364910714, + "language_loss": 0.79667908, + "learning_rate": 2.133680348351595e-06, + "loss": 0.82073438, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.12591553, + "step": 8222, + "time_per_iteration": 2.7410879135131836 + }, + { + "auxiliary_loss_clip": 0.01374266, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.25597167, + "balance_loss_mlp": 1.02508867, + "epoch": 0.49439350668871185, + "flos": 16074697691160.0, + "grad_norm": 2.0546117909959905, + "language_loss": 0.72141194, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74553967, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13397217, + "step": 8223, + "time_per_iteration": 2.751335620880127 + }, + { + "auxiliary_loss_clip": 0.01380419, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.2587446, + "balance_loss_mlp": 1.0233984, + "epoch": 0.4944536299413798, + "flos": 20884140749160.0, + "grad_norm": 1.745255042857142, + "language_loss": 0.76017374, + "learning_rate": 2.132903156780144e-06, + "loss": 0.7843504, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13830566, + "step": 8224, + "time_per_iteration": 2.7571003437042236 + }, + { + "auxiliary_loss_clip": 0.01374208, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.25364637, + "balance_loss_mlp": 1.01769507, + "epoch": 0.4945137531940478, + "flos": 26613791406720.0, + "grad_norm": 1.9370886297364656, + "language_loss": 0.63888681, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66294074, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.1348877, + "step": 8225, + "time_per_iteration": 4.245421886444092 + }, + { + "auxiliary_loss_clip": 0.01372969, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.25201094, + "balance_loss_mlp": 1.01780915, + "epoch": 0.49457387644671574, + "flos": 23993500764960.0, + "grad_norm": 1.8042342217055263, + "language_loss": 0.77322817, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.79726458, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.12860107, + "step": 8226, + "time_per_iteration": 2.7856955528259277 + }, + { + "auxiliary_loss_clip": 0.0137748, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.25610471, + "balance_loss_mlp": 1.02131724, + "epoch": 0.49463399969938376, + "flos": 26981898811560.0, + "grad_norm": 1.6127411071128224, + "language_loss": 0.71171832, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73585379, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14770508, + "step": 8227, + "time_per_iteration": 2.776121139526367 + }, + { + "auxiliary_loss_clip": 0.01389575, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.2657305, + "balance_loss_mlp": 1.02148843, + "epoch": 0.49469412295205173, + "flos": 29687839130520.0, + "grad_norm": 1.7866713710581859, + "language_loss": 0.71705484, + "learning_rate": 2.131348713278718e-06, + "loss": 0.7413078, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.14233398, + "step": 8228, + "time_per_iteration": 2.8217995166778564 + }, + { + "auxiliary_loss_clip": 0.01369372, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.25324845, + "balance_loss_mlp": 1.01637506, + "epoch": 0.4947542462047197, + "flos": 24136861567680.0, + "grad_norm": 1.3537713528336446, + "language_loss": 0.83922535, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.8632127, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13000488, + "step": 8229, + "time_per_iteration": 4.357402086257935 + }, + { + "auxiliary_loss_clip": 0.01380996, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.25912392, + "balance_loss_mlp": 1.022048, + "epoch": 0.49481436945738766, + "flos": 20049542218440.0, + "grad_norm": 1.774122244751403, + "language_loss": 0.75357711, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.7777546, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14697266, + "step": 8230, + "time_per_iteration": 2.7880706787109375 + }, + { + "auxiliary_loss_clip": 0.01378253, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.25773799, + "balance_loss_mlp": 1.01623023, + "epoch": 0.4948744927100556, + "flos": 15673836321000.0, + "grad_norm": 2.3851768400218725, + "language_loss": 0.80202836, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.82610321, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13018799, + "step": 8231, + "time_per_iteration": 2.754992961883545 + }, + { + "auxiliary_loss_clip": 0.01195887, + "auxiliary_loss_mlp": 0.01018034, + "balance_loss_clip": 1.14799953, + "balance_loss_mlp": 1.01620996, + "epoch": 0.4949346159627236, + "flos": 68888888132400.0, + "grad_norm": 0.7527966371392671, + "language_loss": 0.60261869, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62475789, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.01818848, + "step": 8232, + "time_per_iteration": 3.3714404106140137 + }, + { + "auxiliary_loss_clip": 0.01381661, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.25608563, + "balance_loss_mlp": 1.0196805, + "epoch": 0.49499473921539155, + "flos": 24795670197240.0, + "grad_norm": 1.5549074448254807, + "language_loss": 0.69431424, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71846879, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14105225, + "step": 8233, + "time_per_iteration": 2.8657379150390625 + }, + { + "auxiliary_loss_clip": 0.01368188, + "auxiliary_loss_mlp": 0.01030914, + "balance_loss_clip": 1.25033021, + "balance_loss_mlp": 1.01603699, + "epoch": 0.4950548624680595, + "flos": 32714351445960.0, + "grad_norm": 2.245202428316498, + "language_loss": 0.6679486, + "learning_rate": 2.129016898898633e-06, + "loss": 0.69193971, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14886475, + "step": 8234, + "time_per_iteration": 2.862217664718628 + }, + { + "auxiliary_loss_clip": 0.01199436, + "auxiliary_loss_mlp": 0.0100113, + "balance_loss_clip": 1.15152144, + "balance_loss_mlp": 0.99907982, + "epoch": 0.4951149857207275, + "flos": 50096571895800.0, + "grad_norm": 0.9598599875855477, + "language_loss": 0.5801959, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60220158, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.02050781, + "step": 8235, + "time_per_iteration": 3.1430108547210693 + }, + { + "auxiliary_loss_clip": 0.01381707, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.25935698, + "balance_loss_mlp": 1.02084184, + "epoch": 0.49517510897339545, + "flos": 22241902695120.0, + "grad_norm": 1.5456575697278083, + "language_loss": 0.77546328, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.79962873, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14007568, + "step": 8236, + "time_per_iteration": 2.781139373779297 + }, + { + "auxiliary_loss_clip": 0.0137322, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.2553184, + "balance_loss_mlp": 1.01588058, + "epoch": 0.4952352322260634, + "flos": 25379590365000.0, + "grad_norm": 1.8221529312973272, + "language_loss": 0.73085248, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.75487381, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13024902, + "step": 8237, + "time_per_iteration": 4.292924404144287 + }, + { + "auxiliary_loss_clip": 0.01369107, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.25262022, + "balance_loss_mlp": 1.01953578, + "epoch": 0.4952953554787314, + "flos": 24614195125680.0, + "grad_norm": 1.6384082570722744, + "language_loss": 0.75776267, + "learning_rate": 2.127462257935406e-06, + "loss": 0.78177905, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13000488, + "step": 8238, + "time_per_iteration": 2.7416141033172607 + }, + { + "auxiliary_loss_clip": 0.01379116, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.25741982, + "balance_loss_mlp": 1.01837599, + "epoch": 0.49535547873139935, + "flos": 17315761545720.0, + "grad_norm": 2.233258338947841, + "language_loss": 0.74824595, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.77236128, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14056396, + "step": 8239, + "time_per_iteration": 2.7500321865081787 + }, + { + "auxiliary_loss_clip": 0.01382707, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.25945258, + "balance_loss_mlp": 1.01923931, + "epoch": 0.4954156019840673, + "flos": 20745165649320.0, + "grad_norm": 2.157451655271669, + "language_loss": 0.79987144, + "learning_rate": 2.126684908394552e-06, + "loss": 0.82404137, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.15063477, + "step": 8240, + "time_per_iteration": 2.7256245613098145 + }, + { + "auxiliary_loss_clip": 0.01370768, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_clip": 1.25376081, + "balance_loss_mlp": 1.02883732, + "epoch": 0.49547572523673533, + "flos": 12824210332440.0, + "grad_norm": 2.0136163937656564, + "language_loss": 0.86102676, + "learning_rate": 2.126296226410898e-06, + "loss": 0.88515353, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13067627, + "step": 8241, + "time_per_iteration": 2.755228281021118 + }, + { + "auxiliary_loss_clip": 0.01366616, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.25112176, + "balance_loss_mlp": 1.02707636, + "epoch": 0.4955358484894033, + "flos": 15601384360800.0, + "grad_norm": 1.736882340494664, + "language_loss": 0.77808499, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.80214524, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.12341309, + "step": 8242, + "time_per_iteration": 2.7703778743743896 + }, + { + "auxiliary_loss_clip": 0.01375921, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.25727916, + "balance_loss_mlp": 1.02178133, + "epoch": 0.49559597174207126, + "flos": 26469415395000.0, + "grad_norm": 1.6646170034359202, + "language_loss": 0.67630655, + "learning_rate": 2.125518848090833e-06, + "loss": 0.70041728, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.1338501, + "step": 8243, + "time_per_iteration": 2.8589603900909424 + }, + { + "auxiliary_loss_clip": 0.01375328, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.25684178, + "balance_loss_mlp": 1.02133369, + "epoch": 0.4956560949947392, + "flos": 23153379497280.0, + "grad_norm": 2.0251903325442453, + "language_loss": 0.68554866, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70964372, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.128479, + "step": 8244, + "time_per_iteration": 2.848741054534912 + }, + { + "auxiliary_loss_clip": 0.01381059, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_clip": 1.25974751, + "balance_loss_mlp": 1.02752328, + "epoch": 0.4957162182474072, + "flos": 20777879006280.0, + "grad_norm": 1.7392254262683704, + "language_loss": 0.75407737, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77830517, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14190674, + "step": 8245, + "time_per_iteration": 2.8553831577301025 + }, + { + "auxiliary_loss_clip": 0.01374144, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.25798023, + "balance_loss_mlp": 1.02224755, + "epoch": 0.49577634150007516, + "flos": 18738868988880.0, + "grad_norm": 2.1240820235857254, + "language_loss": 0.82078385, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.8448773, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.1295166, + "step": 8246, + "time_per_iteration": 2.817845582962036 + }, + { + "auxiliary_loss_clip": 0.01383866, + "auxiliary_loss_mlp": 0.01041917, + "balance_loss_clip": 1.2633481, + "balance_loss_mlp": 1.02820826, + "epoch": 0.4958364647527431, + "flos": 25559359885440.0, + "grad_norm": 1.6171585557302615, + "language_loss": 0.84090441, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.86516225, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13720703, + "step": 8247, + "time_per_iteration": 2.839350461959839 + }, + { + "auxiliary_loss_clip": 0.01380073, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.25939381, + "balance_loss_mlp": 1.02165866, + "epoch": 0.4958965880054111, + "flos": 24430649027760.0, + "grad_norm": 2.159189021869541, + "language_loss": 0.84008229, + "learning_rate": 2.123575319254087e-06, + "loss": 0.86422241, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.12286377, + "step": 8248, + "time_per_iteration": 2.993384599685669 + }, + { + "auxiliary_loss_clip": 0.01380554, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.25891316, + "balance_loss_mlp": 1.0225904, + "epoch": 0.49595671125807905, + "flos": 25089051573720.0, + "grad_norm": 1.6730112870749259, + "language_loss": 0.73952806, + "learning_rate": 2.123186599369812e-06, + "loss": 0.76370031, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14080811, + "step": 8249, + "time_per_iteration": 2.86148738861084 + }, + { + "auxiliary_loss_clip": 0.01387251, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.26425481, + "balance_loss_mlp": 1.0294857, + "epoch": 0.496016834510747, + "flos": 16440287377680.0, + "grad_norm": 2.7613796824294106, + "language_loss": 0.76469672, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78900242, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.1383667, + "step": 8250, + "time_per_iteration": 2.826512098312378 + }, + { + "auxiliary_loss_clip": 0.01386247, + "auxiliary_loss_mlp": 0.0103841, + "balance_loss_clip": 1.26477551, + "balance_loss_mlp": 1.02517164, + "epoch": 0.496076957763415, + "flos": 23442821862840.0, + "grad_norm": 1.691309162099938, + "language_loss": 0.70038092, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72462749, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.13256836, + "step": 8251, + "time_per_iteration": 2.7835655212402344 + }, + { + "auxiliary_loss_clip": 0.01375579, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.25696254, + "balance_loss_mlp": 1.02367628, + "epoch": 0.49613708101608295, + "flos": 16913884966560.0, + "grad_norm": 1.722144206209322, + "language_loss": 0.80504394, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82916653, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.13000488, + "step": 8252, + "time_per_iteration": 2.778698682785034 + }, + { + "auxiliary_loss_clip": 0.01377717, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.25962484, + "balance_loss_mlp": 1.01782084, + "epoch": 0.4961972042687509, + "flos": 16622493399720.0, + "grad_norm": 1.676110119132482, + "language_loss": 0.81860936, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.8427161, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.15136719, + "step": 8253, + "time_per_iteration": 2.728389024734497 + }, + { + "auxiliary_loss_clip": 0.01373438, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.25601506, + "balance_loss_mlp": 1.01884699, + "epoch": 0.49625732752141893, + "flos": 28963603787040.0, + "grad_norm": 1.473963455440481, + "language_loss": 0.67586654, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69991559, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.12634277, + "step": 8254, + "time_per_iteration": 2.8846445083618164 + }, + { + "auxiliary_loss_clip": 0.01380479, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.25964701, + "balance_loss_mlp": 1.02356172, + "epoch": 0.4963174507740869, + "flos": 23117579904960.0, + "grad_norm": 1.678617658585909, + "language_loss": 0.74447417, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76864582, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13122559, + "step": 8255, + "time_per_iteration": 2.7912585735321045 + }, + { + "auxiliary_loss_clip": 0.01369838, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.25164676, + "balance_loss_mlp": 1.02108693, + "epoch": 0.49637757402675486, + "flos": 13921872775920.0, + "grad_norm": 2.1377862471391245, + "language_loss": 0.81852841, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.84257686, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13897705, + "step": 8256, + "time_per_iteration": 2.674686908721924 + }, + { + "auxiliary_loss_clip": 0.0137409, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.25713611, + "balance_loss_mlp": 1.01660895, + "epoch": 0.49643769727942283, + "flos": 22314029788440.0, + "grad_norm": 1.4633339719997671, + "language_loss": 0.8107388, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83477366, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.12799072, + "step": 8257, + "time_per_iteration": 2.8162310123443604 + }, + { + "auxiliary_loss_clip": 0.01389259, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.26493359, + "balance_loss_mlp": 1.02341855, + "epoch": 0.4964978205320908, + "flos": 19504995178680.0, + "grad_norm": 1.7609651044343446, + "language_loss": 0.66294336, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68720961, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.13946533, + "step": 8258, + "time_per_iteration": 2.71112322807312 + }, + { + "auxiliary_loss_clip": 0.01372861, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.2555958, + "balance_loss_mlp": 1.0226357, + "epoch": 0.49655794378475876, + "flos": 23441238136800.0, + "grad_norm": 1.4308680474848392, + "language_loss": 0.77891839, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.80299544, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.12200928, + "step": 8259, + "time_per_iteration": 2.80438494682312 + }, + { + "auxiliary_loss_clip": 0.01381209, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.26321876, + "balance_loss_mlp": 1.02004647, + "epoch": 0.4966180670374267, + "flos": 26836466982480.0, + "grad_norm": 1.4105868089447455, + "language_loss": 0.78528583, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80942923, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13092041, + "step": 8260, + "time_per_iteration": 4.275435209274292 + }, + { + "auxiliary_loss_clip": 0.01377624, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.25663507, + "balance_loss_mlp": 1.0196954, + "epoch": 0.4966781902900947, + "flos": 22013298298800.0, + "grad_norm": 2.5230269872707916, + "language_loss": 0.76461196, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78872108, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.13604736, + "step": 8261, + "time_per_iteration": 2.7596383094787598 + }, + { + "auxiliary_loss_clip": 0.01367711, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.2511549, + "balance_loss_mlp": 1.01732492, + "epoch": 0.49673831354276266, + "flos": 26218940073840.0, + "grad_norm": 4.290370336186464, + "language_loss": 0.89911956, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.923096, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.1262207, + "step": 8262, + "time_per_iteration": 2.8218798637390137 + }, + { + "auxiliary_loss_clip": 0.01374097, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.25725663, + "balance_loss_mlp": 1.01643896, + "epoch": 0.4967984367954306, + "flos": 23187108063240.0, + "grad_norm": 1.4035831049881324, + "language_loss": 0.73926842, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76329941, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.12542725, + "step": 8263, + "time_per_iteration": 2.7422516345977783 + }, + { + "auxiliary_loss_clip": 0.01384154, + "auxiliary_loss_mlp": 0.01035466, + "balance_loss_clip": 1.26246142, + "balance_loss_mlp": 1.02020109, + "epoch": 0.4968585600480986, + "flos": 19286177397120.0, + "grad_norm": 1.9450219688315993, + "language_loss": 0.69614011, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.72033632, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.15252686, + "step": 8264, + "time_per_iteration": 4.1252663135528564 + }, + { + "auxiliary_loss_clip": 0.01380857, + "auxiliary_loss_mlp": 0.01028672, + "balance_loss_clip": 1.25803375, + "balance_loss_mlp": 1.01475382, + "epoch": 0.49691868330076655, + "flos": 22533862779000.0, + "grad_norm": 1.5922199488308115, + "language_loss": 0.65066069, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67475599, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.13928223, + "step": 8265, + "time_per_iteration": 2.7676827907562256 + }, + { + "auxiliary_loss_clip": 0.01219751, + "auxiliary_loss_mlp": 0.01004186, + "balance_loss_clip": 1.17078018, + "balance_loss_mlp": 1.00227916, + "epoch": 0.4969788065534345, + "flos": 66594123707040.0, + "grad_norm": 1.24571867934933, + "language_loss": 0.53546786, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55770719, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.01904297, + "step": 8266, + "time_per_iteration": 3.297668218612671 + }, + { + "auxiliary_loss_clip": 0.01362598, + "auxiliary_loss_mlp": 0.01027275, + "balance_loss_clip": 1.24875677, + "balance_loss_mlp": 1.01442444, + "epoch": 0.49703892980610254, + "flos": 24064653257640.0, + "grad_norm": 1.5876481020247133, + "language_loss": 0.79381418, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.8177129, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12854004, + "step": 8267, + "time_per_iteration": 2.7985856533050537 + }, + { + "auxiliary_loss_clip": 0.01374248, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.2542932, + "balance_loss_mlp": 1.02476501, + "epoch": 0.4970990530587705, + "flos": 29131353232920.0, + "grad_norm": 1.913049280636193, + "language_loss": 0.75578177, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.77991807, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14605713, + "step": 8268, + "time_per_iteration": 4.232545614242554 + }, + { + "auxiliary_loss_clip": 0.01373644, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.25392461, + "balance_loss_mlp": 1.01536679, + "epoch": 0.49715917631143847, + "flos": 46034476984080.0, + "grad_norm": 1.5314472389896434, + "language_loss": 0.68218583, + "learning_rate": 2.115411240328073e-06, + "loss": 0.70622516, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14929199, + "step": 8269, + "time_per_iteration": 2.9649343490600586 + }, + { + "auxiliary_loss_clip": 0.01365166, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.25013566, + "balance_loss_mlp": 1.01367998, + "epoch": 0.49721929956410643, + "flos": 20195826823080.0, + "grad_norm": 1.5521516594541744, + "language_loss": 0.85926986, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.88319075, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13250732, + "step": 8270, + "time_per_iteration": 2.8278427124023438 + }, + { + "auxiliary_loss_clip": 0.01372394, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.25248373, + "balance_loss_mlp": 1.01889133, + "epoch": 0.4972794228167744, + "flos": 21658672869480.0, + "grad_norm": 1.7486936857432758, + "language_loss": 0.71174586, + "learning_rate": 2.114633606196899e-06, + "loss": 0.7357794, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.12078857, + "step": 8271, + "time_per_iteration": 2.8179562091827393 + }, + { + "auxiliary_loss_clip": 0.01376074, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.2560966, + "balance_loss_mlp": 1.0163641, + "epoch": 0.49733954606944236, + "flos": 24285095373600.0, + "grad_norm": 1.365726308884367, + "language_loss": 0.78589344, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80995256, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.1348877, + "step": 8272, + "time_per_iteration": 2.9092600345611572 + }, + { + "auxiliary_loss_clip": 0.01372734, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.25178492, + "balance_loss_mlp": 1.02285767, + "epoch": 0.4973996693221103, + "flos": 37859716460520.0, + "grad_norm": 1.9584464452050334, + "language_loss": 0.66618109, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.69027412, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.137146, + "step": 8273, + "time_per_iteration": 3.010932445526123 + }, + { + "auxiliary_loss_clip": 0.01372118, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.25485563, + "balance_loss_mlp": 1.02116799, + "epoch": 0.4974597925747783, + "flos": 21366469135440.0, + "grad_norm": 1.7061499811619818, + "language_loss": 0.78272367, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80678666, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.13018799, + "step": 8274, + "time_per_iteration": 2.8003480434417725 + }, + { + "auxiliary_loss_clip": 0.0137076, + "auxiliary_loss_mlp": 0.01033439, + "balance_loss_clip": 1.24919653, + "balance_loss_mlp": 1.01930118, + "epoch": 0.49751991582744626, + "flos": 30744382286520.0, + "grad_norm": 1.7228672364458921, + "language_loss": 0.75903225, + "learning_rate": 2.113078285889493e-06, + "loss": 0.7830742, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.14147949, + "step": 8275, + "time_per_iteration": 2.890099048614502 + }, + { + "auxiliary_loss_clip": 0.01372973, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.25104034, + "balance_loss_mlp": 1.01817322, + "epoch": 0.4975800390801142, + "flos": 14104606706640.0, + "grad_norm": 2.0935189127533183, + "language_loss": 0.8394047, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.863469, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.152771, + "step": 8276, + "time_per_iteration": 4.249659538269043 + }, + { + "auxiliary_loss_clip": 0.01357249, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.24309301, + "balance_loss_mlp": 1.01596999, + "epoch": 0.4976401623327822, + "flos": 24212684021760.0, + "grad_norm": 1.3597971774937967, + "language_loss": 0.70399028, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72784394, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.121521, + "step": 8277, + "time_per_iteration": 2.880977153778076 + }, + { + "auxiliary_loss_clip": 0.01358355, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.24256682, + "balance_loss_mlp": 1.02256036, + "epoch": 0.49770028558545015, + "flos": 21141032191200.0, + "grad_norm": 1.9151493319825144, + "language_loss": 0.82414669, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84808409, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.12823486, + "step": 8278, + "time_per_iteration": 2.789308547973633 + }, + { + "auxiliary_loss_clip": 0.01368096, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.24781775, + "balance_loss_mlp": 1.01906943, + "epoch": 0.4977604088381181, + "flos": 16768331312400.0, + "grad_norm": 2.2007085065939282, + "language_loss": 0.67976588, + "learning_rate": 2.111522896975052e-06, + "loss": 0.70377111, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.13366699, + "step": 8279, + "time_per_iteration": 2.7231431007385254 + }, + { + "auxiliary_loss_clip": 0.01368424, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.24935925, + "balance_loss_mlp": 1.01826906, + "epoch": 0.49782053209078614, + "flos": 15707483670240.0, + "grad_norm": 2.0363169659614333, + "language_loss": 0.70590538, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72991288, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.140625, + "step": 8280, + "time_per_iteration": 2.7227466106414795 + }, + { + "auxiliary_loss_clip": 0.01366339, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.24870145, + "balance_loss_mlp": 1.02020085, + "epoch": 0.4978806553434541, + "flos": 24759058437720.0, + "grad_norm": 1.528235175987304, + "language_loss": 0.64763951, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67163563, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.1307373, + "step": 8281, + "time_per_iteration": 2.7769412994384766 + }, + { + "auxiliary_loss_clip": 0.01370442, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.2502625, + "balance_loss_mlp": 1.02303362, + "epoch": 0.49794077859612207, + "flos": 13119703343640.0, + "grad_norm": 1.9837588070457903, + "language_loss": 0.73952585, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.76360196, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14141846, + "step": 8282, + "time_per_iteration": 2.693718433380127 + }, + { + "auxiliary_loss_clip": 0.01362967, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.24675953, + "balance_loss_mlp": 1.02384377, + "epoch": 0.49800090184879003, + "flos": 27530669120760.0, + "grad_norm": 1.441780332939531, + "language_loss": 0.7321046, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75609654, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.12402344, + "step": 8283, + "time_per_iteration": 2.7875869274139404 + }, + { + "auxiliary_loss_clip": 0.0136118, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.24431396, + "balance_loss_mlp": 1.02323639, + "epoch": 0.498061025101458, + "flos": 19797320737800.0, + "grad_norm": 1.5289311557950882, + "language_loss": 0.78794092, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81191504, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.13006592, + "step": 8284, + "time_per_iteration": 2.744234800338745 + }, + { + "auxiliary_loss_clip": 0.01374446, + "auxiliary_loss_mlp": 0.01036973, + "balance_loss_clip": 1.25144958, + "balance_loss_mlp": 1.02256083, + "epoch": 0.49812114835412596, + "flos": 29899388015640.0, + "grad_norm": 1.5466680513923838, + "language_loss": 0.73350525, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75761938, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.144104, + "step": 8285, + "time_per_iteration": 2.8636796474456787 + }, + { + "auxiliary_loss_clip": 0.01368955, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.25102961, + "balance_loss_mlp": 1.01491618, + "epoch": 0.49818127160679393, + "flos": 23152039421400.0, + "grad_norm": 1.6142868673134152, + "language_loss": 0.74441743, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76839936, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14312744, + "step": 8286, + "time_per_iteration": 2.759986400604248 + }, + { + "auxiliary_loss_clip": 0.01368835, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.24991596, + "balance_loss_mlp": 1.01928759, + "epoch": 0.4982413948594619, + "flos": 21657820093920.0, + "grad_norm": 1.8966078439846488, + "language_loss": 0.85795099, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.88197088, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.13867188, + "step": 8287, + "time_per_iteration": 2.7807581424713135 + }, + { + "auxiliary_loss_clip": 0.01366849, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.24743032, + "balance_loss_mlp": 1.01180506, + "epoch": 0.49830151811212986, + "flos": 32493137771160.0, + "grad_norm": 1.5464608359073029, + "language_loss": 0.72394502, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74787128, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13964844, + "step": 8288, + "time_per_iteration": 2.830383539199829 + }, + { + "auxiliary_loss_clip": 0.01370886, + "auxiliary_loss_mlp": 0.01031659, + "balance_loss_clip": 1.24982762, + "balance_loss_mlp": 1.01705563, + "epoch": 0.4983616413647978, + "flos": 18145893156840.0, + "grad_norm": 3.282118744167059, + "language_loss": 0.80327678, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82730222, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.14599609, + "step": 8289, + "time_per_iteration": 2.7539806365966797 + }, + { + "auxiliary_loss_clip": 0.01364868, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.24668205, + "balance_loss_mlp": 1.0215199, + "epoch": 0.4984217646174658, + "flos": 19723569310080.0, + "grad_norm": 2.6401755364981176, + "language_loss": 0.74175942, + "learning_rate": 2.107245231409784e-06, + "loss": 0.76576257, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13922119, + "step": 8290, + "time_per_iteration": 2.7599241733551025 + }, + { + "auxiliary_loss_clip": 0.01372007, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.25160098, + "balance_loss_mlp": 1.0209285, + "epoch": 0.49848188787013376, + "flos": 24941995410240.0, + "grad_norm": 1.4877443422319652, + "language_loss": 0.84361058, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86770022, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.16046143, + "step": 8291, + "time_per_iteration": 2.8066673278808594 + }, + { + "auxiliary_loss_clip": 0.01377017, + "auxiliary_loss_mlp": 0.0103507, + "balance_loss_clip": 1.25267279, + "balance_loss_mlp": 1.02092004, + "epoch": 0.4985420111228017, + "flos": 22387537566000.0, + "grad_norm": 1.6531492377240384, + "language_loss": 0.67573643, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69985735, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.14154053, + "step": 8292, + "time_per_iteration": 3.081214189529419 + }, + { + "auxiliary_loss_clip": 0.0136519, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.2463454, + "balance_loss_mlp": 1.01851988, + "epoch": 0.4986021343754697, + "flos": 16220779254000.0, + "grad_norm": 1.8010318579169222, + "language_loss": 0.67607492, + "learning_rate": 2.106078509118965e-06, + "loss": 0.70004416, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.13208008, + "step": 8293, + "time_per_iteration": 2.752209186553955 + }, + { + "auxiliary_loss_clip": 0.01370385, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.25149429, + "balance_loss_mlp": 1.01481724, + "epoch": 0.4986622576281377, + "flos": 23408768430000.0, + "grad_norm": 3.9788613117894136, + "language_loss": 0.82777739, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.85175765, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.1282959, + "step": 8294, + "time_per_iteration": 2.7844746112823486 + }, + { + "auxiliary_loss_clip": 0.01369251, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.25083041, + "balance_loss_mlp": 1.01906073, + "epoch": 0.49872238088080567, + "flos": 19979364326400.0, + "grad_norm": 1.5707403778604918, + "language_loss": 0.73589373, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.75991642, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13964844, + "step": 8295, + "time_per_iteration": 2.745718002319336 + }, + { + "auxiliary_loss_clip": 0.01362864, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.24708557, + "balance_loss_mlp": 1.02170277, + "epoch": 0.49878250413347364, + "flos": 22898152998000.0, + "grad_norm": 1.7951890282232863, + "language_loss": 0.68286884, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.70684373, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.12921143, + "step": 8296, + "time_per_iteration": 2.7617781162261963 + }, + { + "auxiliary_loss_clip": 0.01373286, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.25178695, + "balance_loss_mlp": 1.01519227, + "epoch": 0.4988426273861416, + "flos": 32604881642640.0, + "grad_norm": 1.788741644745848, + "language_loss": 0.64697433, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67100799, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14874268, + "step": 8297, + "time_per_iteration": 2.830575466156006 + }, + { + "auxiliary_loss_clip": 0.01355052, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.23979759, + "balance_loss_mlp": 1.02031636, + "epoch": 0.49890275063880957, + "flos": 20928589922160.0, + "grad_norm": 2.071458662285757, + "language_loss": 0.69797063, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.72184682, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.12249756, + "step": 8298, + "time_per_iteration": 4.247102737426758 + }, + { + "auxiliary_loss_clip": 0.01364245, + "auxiliary_loss_mlp": 0.01026898, + "balance_loss_clip": 1.24740529, + "balance_loss_mlp": 1.01411819, + "epoch": 0.49896287389147753, + "flos": 18629074318680.0, + "grad_norm": 1.7000812841485808, + "language_loss": 0.85471576, + "learning_rate": 2.103744956327814e-06, + "loss": 0.87862724, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.12786865, + "step": 8299, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.01369867, + "auxiliary_loss_mlp": 0.01038406, + "balance_loss_clip": 1.24980283, + "balance_loss_mlp": 1.02440464, + "epoch": 0.4990229971441455, + "flos": 24831916481520.0, + "grad_norm": 2.707306211783908, + "language_loss": 0.68984759, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71393037, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14001465, + "step": 8300, + "time_per_iteration": 2.756629228591919 + }, + { + "auxiliary_loss_clip": 0.0122614, + "auxiliary_loss_mlp": 0.01011949, + "balance_loss_clip": 1.1771102, + "balance_loss_mlp": 1.00962448, + "epoch": 0.49908312039681346, + "flos": 71401008438360.0, + "grad_norm": 0.7598777227973218, + "language_loss": 0.51158559, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53396642, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.02319336, + "step": 8301, + "time_per_iteration": 3.358628273010254 + }, + { + "auxiliary_loss_clip": 0.0135927, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.24458802, + "balance_loss_mlp": 1.027812, + "epoch": 0.4991432436494814, + "flos": 19833445197000.0, + "grad_norm": 1.5890616528459718, + "language_loss": 0.85065722, + "learning_rate": 2.102578126623879e-06, + "loss": 0.87466133, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13323975, + "step": 8302, + "time_per_iteration": 4.117407321929932 + }, + { + "auxiliary_loss_clip": 0.01360277, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.24573874, + "balance_loss_mlp": 1.02172828, + "epoch": 0.4992033669021494, + "flos": 15126081220800.0, + "grad_norm": 1.6128242278451037, + "language_loss": 0.69298476, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71692657, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.1217041, + "step": 8303, + "time_per_iteration": 2.7642126083374023 + }, + { + "auxiliary_loss_clip": 0.0137405, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.25393128, + "balance_loss_mlp": 1.02339709, + "epoch": 0.49926349015481736, + "flos": 31213675389240.0, + "grad_norm": 1.7297439415510463, + "language_loss": 0.72822523, + "learning_rate": 2.101800220681144e-06, + "loss": 0.75233471, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.13513184, + "step": 8304, + "time_per_iteration": 2.8386905193328857 + }, + { + "auxiliary_loss_clip": 0.01366749, + "auxiliary_loss_mlp": 0.01038391, + "balance_loss_clip": 1.24997127, + "balance_loss_mlp": 1.0258863, + "epoch": 0.4993236134074853, + "flos": 24905667909240.0, + "grad_norm": 2.4642268127572446, + "language_loss": 0.81051463, + "learning_rate": 2.10141126191199e-06, + "loss": 0.834566, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.12518311, + "step": 8305, + "time_per_iteration": 2.8115227222442627 + }, + { + "auxiliary_loss_clip": 0.01224739, + "auxiliary_loss_mlp": 0.01009176, + "balance_loss_clip": 1.17532086, + "balance_loss_mlp": 1.0066967, + "epoch": 0.4993837366601533, + "flos": 70434930373200.0, + "grad_norm": 0.7340738421759682, + "language_loss": 0.56935453, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.59169364, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.02478027, + "step": 8306, + "time_per_iteration": 3.3496484756469727 + }, + { + "auxiliary_loss_clip": 0.01369698, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.25295889, + "balance_loss_mlp": 1.02218556, + "epoch": 0.4994438599128213, + "flos": 15965512146360.0, + "grad_norm": 1.6537838186433278, + "language_loss": 0.82942188, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.85347879, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13812256, + "step": 8307, + "time_per_iteration": 4.175692081451416 + }, + { + "auxiliary_loss_clip": 0.01363839, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.24879766, + "balance_loss_mlp": 1.01878119, + "epoch": 0.4995039831654893, + "flos": 27934129425960.0, + "grad_norm": 1.55347299275432, + "language_loss": 0.60813886, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63209963, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13452148, + "step": 8308, + "time_per_iteration": 2.820490837097168 + }, + { + "auxiliary_loss_clip": 0.01359665, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.24487925, + "balance_loss_mlp": 1.01570392, + "epoch": 0.49956410641815724, + "flos": 24209882044920.0, + "grad_norm": 1.6077935811527244, + "language_loss": 0.74700719, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.77088666, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.12585449, + "step": 8309, + "time_per_iteration": 2.773515462875366 + }, + { + "auxiliary_loss_clip": 0.01367909, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.2506243, + "balance_loss_mlp": 1.02274299, + "epoch": 0.4996242296708252, + "flos": 16184451753000.0, + "grad_norm": 2.0941794405362364, + "language_loss": 0.79899907, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82302946, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.1237793, + "step": 8310, + "time_per_iteration": 2.781611680984497 + }, + { + "auxiliary_loss_clip": 0.01373362, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.2524941, + "balance_loss_mlp": 1.02124321, + "epoch": 0.49968435292349317, + "flos": 16877963549160.0, + "grad_norm": 1.5142313604322095, + "language_loss": 0.70940101, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73347783, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.1307373, + "step": 8311, + "time_per_iteration": 2.7232236862182617 + }, + { + "auxiliary_loss_clip": 0.01367162, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.25163436, + "balance_loss_mlp": 1.01702666, + "epoch": 0.49974447617616113, + "flos": 14943672156960.0, + "grad_norm": 2.458739122176387, + "language_loss": 0.77636188, + "learning_rate": 2.098688443679187e-06, + "loss": 0.80032384, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.11993408, + "step": 8312, + "time_per_iteration": 2.8051486015319824 + }, + { + "auxiliary_loss_clip": 0.01369943, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.25180507, + "balance_loss_mlp": 1.02198362, + "epoch": 0.4998045994288291, + "flos": 26657225370720.0, + "grad_norm": 2.2358315946561276, + "language_loss": 0.84827471, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.87231517, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.12121582, + "step": 8313, + "time_per_iteration": 2.758920192718506 + }, + { + "auxiliary_loss_clip": 0.01365924, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.24791121, + "balance_loss_mlp": 1.01540732, + "epoch": 0.49986472268149706, + "flos": 20957932785240.0, + "grad_norm": 1.6979726274239026, + "language_loss": 0.80775881, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83170307, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13110352, + "step": 8314, + "time_per_iteration": 4.25855565071106 + }, + { + "auxiliary_loss_clip": 0.01364518, + "auxiliary_loss_mlp": 0.01039469, + "balance_loss_clip": 1.24666524, + "balance_loss_mlp": 1.02551544, + "epoch": 0.49992484593416503, + "flos": 22789048669920.0, + "grad_norm": 1.979083086267428, + "language_loss": 0.7998898, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.82392967, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.1395874, + "step": 8315, + "time_per_iteration": 2.787813901901245 + }, + { + "auxiliary_loss_clip": 0.0136376, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.24697983, + "balance_loss_mlp": 1.02066112, + "epoch": 0.499984969186833, + "flos": 46794674353320.0, + "grad_norm": 2.1126529687189795, + "language_loss": 0.74530572, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76927698, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.12701416, + "step": 8316, + "time_per_iteration": 2.9817276000976562 + }, + { + "auxiliary_loss_clip": 0.01359515, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.24527287, + "balance_loss_mlp": 1.01826203, + "epoch": 0.500045092439501, + "flos": 25562527337520.0, + "grad_norm": 1.5488974172026564, + "language_loss": 0.81627178, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.8401708, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.12127686, + "step": 8317, + "time_per_iteration": 2.8316564559936523 + }, + { + "auxiliary_loss_clip": 0.01363075, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.24525809, + "balance_loss_mlp": 1.01888967, + "epoch": 0.5001052156921689, + "flos": 20709731532240.0, + "grad_norm": 1.7110227022135143, + "language_loss": 0.83617532, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.86013234, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.13745117, + "step": 8318, + "time_per_iteration": 2.7770581245422363 + }, + { + "auxiliary_loss_clip": 0.01359748, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.24300504, + "balance_loss_mlp": 1.01534057, + "epoch": 0.500165338944837, + "flos": 21256024731480.0, + "grad_norm": 1.8633356420430518, + "language_loss": 0.8221494, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.846035, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13470459, + "step": 8319, + "time_per_iteration": 2.7111589908599854 + }, + { + "auxiliary_loss_clip": 0.0136335, + "auxiliary_loss_mlp": 0.01024962, + "balance_loss_clip": 1.24682689, + "balance_loss_mlp": 1.01275456, + "epoch": 0.5002254621975049, + "flos": 27860093739720.0, + "grad_norm": 1.5260870928385382, + "language_loss": 0.72173172, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74561477, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.12219238, + "step": 8320, + "time_per_iteration": 2.812173366546631 + }, + { + "auxiliary_loss_clip": 0.01382473, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.25689626, + "balance_loss_mlp": 1.02485132, + "epoch": 0.5002855854501729, + "flos": 15555148419960.0, + "grad_norm": 3.252244131818532, + "language_loss": 0.77296889, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79718941, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14746094, + "step": 8321, + "time_per_iteration": 2.8972129821777344 + }, + { + "auxiliary_loss_clip": 0.01360019, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.24272931, + "balance_loss_mlp": 1.01928353, + "epoch": 0.5003457087028408, + "flos": 16111756142640.0, + "grad_norm": 1.6645488555920631, + "language_loss": 0.83258641, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85651135, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13208008, + "step": 8322, + "time_per_iteration": 2.8498106002807617 + }, + { + "auxiliary_loss_clip": 0.01369685, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.25227427, + "balance_loss_mlp": 1.01916015, + "epoch": 0.5004058319555088, + "flos": 22715622109080.0, + "grad_norm": 2.333661053993105, + "language_loss": 0.74120784, + "learning_rate": 2.094409360775228e-06, + "loss": 0.76522529, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.12902832, + "step": 8323, + "time_per_iteration": 2.7561731338500977 + }, + { + "auxiliary_loss_clip": 0.01367257, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.25016284, + "balance_loss_mlp": 1.02342379, + "epoch": 0.5004659552081767, + "flos": 30124134617760.0, + "grad_norm": 1.452952295261529, + "language_loss": 0.69507021, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71911156, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.13458252, + "step": 8324, + "time_per_iteration": 2.8357863426208496 + }, + { + "auxiliary_loss_clip": 0.01366697, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.2494278, + "balance_loss_mlp": 1.01923192, + "epoch": 0.5005260784608447, + "flos": 18629358577200.0, + "grad_norm": 1.7677721895776024, + "language_loss": 0.72548771, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.7494837, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13659668, + "step": 8325, + "time_per_iteration": 2.790043830871582 + }, + { + "auxiliary_loss_clip": 0.01367743, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.2485888, + "balance_loss_mlp": 1.02287269, + "epoch": 0.5005862017135126, + "flos": 24864954705360.0, + "grad_norm": 2.863282291603402, + "language_loss": 0.73200822, + "learning_rate": 2.093242262158709e-06, + "loss": 0.75606084, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14660645, + "step": 8326, + "time_per_iteration": 2.852613925933838 + }, + { + "auxiliary_loss_clip": 0.01358327, + "auxiliary_loss_mlp": 0.01029298, + "balance_loss_clip": 1.24103403, + "balance_loss_mlp": 1.01699519, + "epoch": 0.5006463249661807, + "flos": 18739072030680.0, + "grad_norm": 1.5304669449772954, + "language_loss": 0.78111959, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80499589, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.12304688, + "step": 8327, + "time_per_iteration": 2.7685296535491943 + }, + { + "auxiliary_loss_clip": 0.01371077, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.25201023, + "balance_loss_mlp": 1.02400661, + "epoch": 0.5007064482188487, + "flos": 13046033132640.0, + "grad_norm": 2.281856575744674, + "language_loss": 0.8771286, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90121347, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.13415527, + "step": 8328, + "time_per_iteration": 2.7646450996398926 + }, + { + "auxiliary_loss_clip": 0.0137306, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.25076652, + "balance_loss_mlp": 1.01829493, + "epoch": 0.5007665714715166, + "flos": 21293732916720.0, + "grad_norm": 2.2105852146560117, + "language_loss": 0.74738443, + "learning_rate": 2.092075131720388e-06, + "loss": 0.77143633, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.13861084, + "step": 8329, + "time_per_iteration": 2.8107306957244873 + }, + { + "auxiliary_loss_clip": 0.01368076, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.25233006, + "balance_loss_mlp": 1.02125609, + "epoch": 0.5008266947241846, + "flos": 29760900216120.0, + "grad_norm": 1.5623061613506877, + "language_loss": 0.79712021, + "learning_rate": 2.091686081238281e-06, + "loss": 0.82114375, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13024902, + "step": 8330, + "time_per_iteration": 2.829768657684326 + }, + { + "auxiliary_loss_clip": 0.01219257, + "auxiliary_loss_mlp": 0.01007068, + "balance_loss_clip": 1.16888511, + "balance_loss_mlp": 1.00491047, + "epoch": 0.5008868179768525, + "flos": 63570843079200.0, + "grad_norm": 0.7303100305761654, + "language_loss": 0.5617311, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58399427, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.02160645, + "step": 8331, + "time_per_iteration": 3.1133227348327637 + }, + { + "auxiliary_loss_clip": 0.01361538, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.24781275, + "balance_loss_mlp": 1.01735353, + "epoch": 0.5009469412295205, + "flos": 27380689155360.0, + "grad_norm": 2.556746815863223, + "language_loss": 0.65150547, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.6754179, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.12341309, + "step": 8332, + "time_per_iteration": 2.7825870513916016 + }, + { + "auxiliary_loss_clip": 0.01362201, + "auxiliary_loss_mlp": 0.01028596, + "balance_loss_clip": 1.24611759, + "balance_loss_mlp": 1.015733, + "epoch": 0.5010070644821885, + "flos": 27384303299400.0, + "grad_norm": 1.723471313843655, + "language_loss": 0.75000644, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77391446, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.12878418, + "step": 8333, + "time_per_iteration": 2.9136955738067627 + }, + { + "auxiliary_loss_clip": 0.01371327, + "auxiliary_loss_mlp": 0.01032678, + "balance_loss_clip": 1.25189912, + "balance_loss_mlp": 1.01996994, + "epoch": 0.5010671877348565, + "flos": 20666987910360.0, + "grad_norm": 1.8216774162588711, + "language_loss": 0.80630577, + "learning_rate": 2.090129844689929e-06, + "loss": 0.83034581, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.12713623, + "step": 8334, + "time_per_iteration": 2.790250301361084 + }, + { + "auxiliary_loss_clip": 0.01214774, + "auxiliary_loss_mlp": 0.01007568, + "balance_loss_clip": 1.16449356, + "balance_loss_mlp": 1.00500536, + "epoch": 0.5011273109875244, + "flos": 59143964002200.0, + "grad_norm": 0.8920122228581658, + "language_loss": 0.62690878, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64913225, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.02563477, + "step": 8335, + "time_per_iteration": 3.193756103515625 + }, + { + "auxiliary_loss_clip": 0.01360434, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.2459321, + "balance_loss_mlp": 1.01665664, + "epoch": 0.5011874342401924, + "flos": 25341638529600.0, + "grad_norm": 1.476040289429262, + "language_loss": 0.7968502, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.82074392, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.12286377, + "step": 8336, + "time_per_iteration": 4.243555307388306 + }, + { + "auxiliary_loss_clip": 0.01363945, + "auxiliary_loss_mlp": 0.01028366, + "balance_loss_clip": 1.24655724, + "balance_loss_mlp": 1.01476383, + "epoch": 0.5012475574928603, + "flos": 20234793867480.0, + "grad_norm": 1.8534713357266839, + "language_loss": 0.80321848, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82714158, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.13598633, + "step": 8337, + "time_per_iteration": 2.7984650135040283 + }, + { + "auxiliary_loss_clip": 0.013788, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.25540042, + "balance_loss_mlp": 1.01914406, + "epoch": 0.5013076807455283, + "flos": 22715175417120.0, + "grad_norm": 1.8750941735861775, + "language_loss": 0.79052103, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81463903, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.1385498, + "step": 8338, + "time_per_iteration": 2.710340738296509 + }, + { + "auxiliary_loss_clip": 0.01373542, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.2539537, + "balance_loss_mlp": 1.01419592, + "epoch": 0.5013678039981962, + "flos": 24250757682240.0, + "grad_norm": 1.5299953681403617, + "language_loss": 0.85420901, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87822604, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.13952637, + "step": 8339, + "time_per_iteration": 2.8145761489868164 + }, + { + "auxiliary_loss_clip": 0.01364073, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.24697244, + "balance_loss_mlp": 1.01847267, + "epoch": 0.5014279272508643, + "flos": 26182247097600.0, + "grad_norm": 1.3717674861243505, + "language_loss": 0.71034718, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.73430014, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.12756348, + "step": 8340, + "time_per_iteration": 4.335921287536621 + }, + { + "auxiliary_loss_clip": 0.01375689, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.25434041, + "balance_loss_mlp": 1.01801705, + "epoch": 0.5014880505035323, + "flos": 21434982084720.0, + "grad_norm": 2.3341579793890066, + "language_loss": 0.78235841, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80644184, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.1461792, + "step": 8341, + "time_per_iteration": 2.751551628112793 + }, + { + "auxiliary_loss_clip": 0.01378511, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.25669742, + "balance_loss_mlp": 1.01990604, + "epoch": 0.5015481737562002, + "flos": 15773966201520.0, + "grad_norm": 3.4101161762493835, + "language_loss": 0.89457697, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91870815, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.14703369, + "step": 8342, + "time_per_iteration": 2.834409713745117 + }, + { + "auxiliary_loss_clip": 0.01365515, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.24702537, + "balance_loss_mlp": 1.01801014, + "epoch": 0.5016082970088682, + "flos": 26836101507240.0, + "grad_norm": 2.27234488337523, + "language_loss": 0.76951933, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.7934978, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14337158, + "step": 8343, + "time_per_iteration": 2.813093662261963 + }, + { + "auxiliary_loss_clip": 0.0136575, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.25016403, + "balance_loss_mlp": 1.01353312, + "epoch": 0.5016684202615361, + "flos": 21475614071880.0, + "grad_norm": 1.7762252793982134, + "language_loss": 0.67402613, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69794327, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.12445068, + "step": 8344, + "time_per_iteration": 2.7988319396972656 + }, + { + "auxiliary_loss_clip": 0.01371719, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.25260854, + "balance_loss_mlp": 1.02324295, + "epoch": 0.5017285435142042, + "flos": 26252018906040.0, + "grad_norm": 1.89306967104058, + "language_loss": 0.7585485, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.782637, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13879395, + "step": 8345, + "time_per_iteration": 4.3038108348846436 + }, + { + "auxiliary_loss_clip": 0.01370842, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.25298858, + "balance_loss_mlp": 1.0183394, + "epoch": 0.5017886667668721, + "flos": 20782508359320.0, + "grad_norm": 1.9193916106734874, + "language_loss": 0.79740411, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.82145023, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.1541748, + "step": 8346, + "time_per_iteration": 2.7687103748321533 + }, + { + "auxiliary_loss_clip": 0.01368008, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.24878502, + "balance_loss_mlp": 1.02080941, + "epoch": 0.5018487900195401, + "flos": 20161164264840.0, + "grad_norm": 2.972630608073152, + "language_loss": 0.6959312, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71995628, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13702393, + "step": 8347, + "time_per_iteration": 2.7826292514801025 + }, + { + "auxiliary_loss_clip": 0.01375774, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.2543565, + "balance_loss_mlp": 1.02148521, + "epoch": 0.501908913272208, + "flos": 18155314296360.0, + "grad_norm": 2.0456257390013666, + "language_loss": 0.71997094, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.74408937, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.14581299, + "step": 8348, + "time_per_iteration": 2.733558177947998 + }, + { + "auxiliary_loss_clip": 0.01363752, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.24984694, + "balance_loss_mlp": 1.0146898, + "epoch": 0.501969036524876, + "flos": 23117498688240.0, + "grad_norm": 1.5008828513982304, + "language_loss": 0.74647647, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.77039361, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13269043, + "step": 8349, + "time_per_iteration": 2.7914879322052 + }, + { + "auxiliary_loss_clip": 0.01372125, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.25045633, + "balance_loss_mlp": 1.01811898, + "epoch": 0.5020291597775439, + "flos": 11366805806280.0, + "grad_norm": 2.6005273747443436, + "language_loss": 0.64559984, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.66965067, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.14831543, + "step": 8350, + "time_per_iteration": 2.753075361251831 + }, + { + "auxiliary_loss_clip": 0.01212612, + "auxiliary_loss_mlp": 0.01002597, + "balance_loss_clip": 1.1619252, + "balance_loss_mlp": 1.00027263, + "epoch": 0.5020892830302119, + "flos": 64026183887280.0, + "grad_norm": 0.7866123579925328, + "language_loss": 0.598508, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.62066007, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02319336, + "step": 8351, + "time_per_iteration": 3.362549304962158 + }, + { + "auxiliary_loss_clip": 0.01377093, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.25547397, + "balance_loss_mlp": 1.02089584, + "epoch": 0.5021494062828799, + "flos": 23738193048960.0, + "grad_norm": 1.699960825891711, + "language_loss": 0.75459677, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77871495, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.13830566, + "step": 8352, + "time_per_iteration": 2.803884983062744 + }, + { + "auxiliary_loss_clip": 0.01376147, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.25719035, + "balance_loss_mlp": 1.0189296, + "epoch": 0.5022095295355479, + "flos": 21581347906080.0, + "grad_norm": 1.6945198252687792, + "language_loss": 0.72076917, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74486804, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14807129, + "step": 8353, + "time_per_iteration": 4.331586837768555 + }, + { + "auxiliary_loss_clip": 0.01376296, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.25809658, + "balance_loss_mlp": 1.01583695, + "epoch": 0.5022696527882159, + "flos": 21402187511040.0, + "grad_norm": 1.6966995579775055, + "language_loss": 0.74222094, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76629263, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.15045166, + "step": 8354, + "time_per_iteration": 2.763254165649414 + }, + { + "auxiliary_loss_clip": 0.01369929, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.25176561, + "balance_loss_mlp": 1.01945901, + "epoch": 0.5023297760408838, + "flos": 27166257076680.0, + "grad_norm": 1.5161188998075463, + "language_loss": 0.73016495, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.75420046, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14178467, + "step": 8355, + "time_per_iteration": 2.8401522636413574 + }, + { + "auxiliary_loss_clip": 0.01382658, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.25925994, + "balance_loss_mlp": 1.02199972, + "epoch": 0.5023898992935518, + "flos": 26219467982520.0, + "grad_norm": 2.563730809671426, + "language_loss": 0.81629604, + "learning_rate": 2.081569591520548e-06, + "loss": 0.84049773, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.15490723, + "step": 8356, + "time_per_iteration": 2.8443856239318848 + }, + { + "auxiliary_loss_clip": 0.0138768, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.26123571, + "balance_loss_mlp": 1.0236541, + "epoch": 0.5024500225462197, + "flos": 13443848875800.0, + "grad_norm": 2.014486043481185, + "language_loss": 0.76937258, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.79364085, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.15478516, + "step": 8357, + "time_per_iteration": 2.810293197631836 + }, + { + "auxiliary_loss_clip": 0.01381696, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.25993657, + "balance_loss_mlp": 1.01797998, + "epoch": 0.5025101457988878, + "flos": 21584555966520.0, + "grad_norm": 1.8220609140619153, + "language_loss": 0.76502633, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78917331, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.15039062, + "step": 8358, + "time_per_iteration": 2.7459652423858643 + }, + { + "auxiliary_loss_clip": 0.01376116, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.25635934, + "balance_loss_mlp": 1.01993704, + "epoch": 0.5025702690515557, + "flos": 24650603843400.0, + "grad_norm": 2.437418304082589, + "language_loss": 0.73073661, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.7548427, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.14575195, + "step": 8359, + "time_per_iteration": 2.7805259227752686 + }, + { + "auxiliary_loss_clip": 0.01371617, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.25327921, + "balance_loss_mlp": 1.02241731, + "epoch": 0.5026303923042237, + "flos": 22095171398520.0, + "grad_norm": 1.573861749760649, + "language_loss": 0.77211928, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79619908, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13964844, + "step": 8360, + "time_per_iteration": 2.760735034942627 + }, + { + "auxiliary_loss_clip": 0.01376081, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.25772357, + "balance_loss_mlp": 1.01979673, + "epoch": 0.5026905155568916, + "flos": 23702637106800.0, + "grad_norm": 1.4858631490897427, + "language_loss": 0.76956606, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.7936635, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13873291, + "step": 8361, + "time_per_iteration": 2.7702929973602295 + }, + { + "auxiliary_loss_clip": 0.01384151, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.26180243, + "balance_loss_mlp": 1.02009964, + "epoch": 0.5027506388095596, + "flos": 25817997486960.0, + "grad_norm": 1.8197345140487606, + "language_loss": 0.85770029, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.88189793, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.15515137, + "step": 8362, + "time_per_iteration": 2.7962002754211426 + }, + { + "auxiliary_loss_clip": 0.01379869, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.258829, + "balance_loss_mlp": 1.01645064, + "epoch": 0.5028107620622275, + "flos": 27532334063520.0, + "grad_norm": 1.482896581204756, + "language_loss": 0.78623319, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.81032848, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.13220215, + "step": 8363, + "time_per_iteration": 2.8350682258605957 + }, + { + "auxiliary_loss_clip": 0.01372582, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.25837886, + "balance_loss_mlp": 1.01983404, + "epoch": 0.5028708853148955, + "flos": 24539550314040.0, + "grad_norm": 2.0768420197151953, + "language_loss": 0.75989932, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.78397214, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14855957, + "step": 8364, + "time_per_iteration": 2.9044578075408936 + }, + { + "auxiliary_loss_clip": 0.01376703, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.25924945, + "balance_loss_mlp": 1.01649547, + "epoch": 0.5029310085675635, + "flos": 20818510993440.0, + "grad_norm": 1.5854134118695904, + "language_loss": 0.69599134, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.72005993, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.13659668, + "step": 8365, + "time_per_iteration": 2.8451004028320312 + }, + { + "auxiliary_loss_clip": 0.01384649, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.26192689, + "balance_loss_mlp": 1.01954532, + "epoch": 0.5029911318202315, + "flos": 22347108620640.0, + "grad_norm": 1.4976998027829374, + "language_loss": 0.73669696, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.76088721, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.14831543, + "step": 8366, + "time_per_iteration": 2.910443067550659 + }, + { + "auxiliary_loss_clip": 0.0137857, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.26041698, + "balance_loss_mlp": 1.01815176, + "epoch": 0.5030512550728995, + "flos": 24358075242480.0, + "grad_norm": 1.3675553268081135, + "language_loss": 0.78560489, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80971205, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13989258, + "step": 8367, + "time_per_iteration": 2.86952543258667 + }, + { + "auxiliary_loss_clip": 0.01377732, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.25967622, + "balance_loss_mlp": 1.01559615, + "epoch": 0.5031113783255674, + "flos": 18264540449520.0, + "grad_norm": 1.7264824350938752, + "language_loss": 0.6991843, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72325236, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.1348877, + "step": 8368, + "time_per_iteration": 2.8981285095214844 + }, + { + "auxiliary_loss_clip": 0.0121042, + "auxiliary_loss_mlp": 0.01001504, + "balance_loss_clip": 1.16170382, + "balance_loss_mlp": 0.99926299, + "epoch": 0.5031715015782354, + "flos": 57266327918160.0, + "grad_norm": 0.865860090447531, + "language_loss": 0.63427466, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65639395, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.02246094, + "step": 8369, + "time_per_iteration": 3.2617642879486084 + }, + { + "auxiliary_loss_clip": 0.0137243, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.2559936, + "balance_loss_mlp": 1.02044201, + "epoch": 0.5032316248309033, + "flos": 27533186839080.0, + "grad_norm": 2.197423880458334, + "language_loss": 0.60535461, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62941289, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.12957764, + "step": 8370, + "time_per_iteration": 2.8445346355438232 + }, + { + "auxiliary_loss_clip": 0.01381677, + "auxiliary_loss_mlp": 0.0103639, + "balance_loss_clip": 1.26022387, + "balance_loss_mlp": 1.02138197, + "epoch": 0.5032917480835714, + "flos": 34502520666960.0, + "grad_norm": 1.6227272557360628, + "language_loss": 0.68732184, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.71150255, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.15002441, + "step": 8371, + "time_per_iteration": 2.8832826614379883 + }, + { + "auxiliary_loss_clip": 0.0137462, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.25597775, + "balance_loss_mlp": 1.01331615, + "epoch": 0.5033518713362393, + "flos": 33663373999920.0, + "grad_norm": 2.3920082872940696, + "language_loss": 0.67950046, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.70353222, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.15234375, + "step": 8372, + "time_per_iteration": 2.8400344848632812 + }, + { + "auxiliary_loss_clip": 0.01373751, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.25467217, + "balance_loss_mlp": 1.01949549, + "epoch": 0.5034119945889073, + "flos": 28191589385040.0, + "grad_norm": 1.8710259616240519, + "language_loss": 0.66545081, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68953311, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14971924, + "step": 8373, + "time_per_iteration": 2.84383225440979 + }, + { + "auxiliary_loss_clip": 0.01376681, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.25721049, + "balance_loss_mlp": 1.02078223, + "epoch": 0.5034721178415752, + "flos": 21363382900080.0, + "grad_norm": 1.6596833248722371, + "language_loss": 0.75219291, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.77630579, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13830566, + "step": 8374, + "time_per_iteration": 2.818470001220703 + }, + { + "auxiliary_loss_clip": 0.01384215, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.26230502, + "balance_loss_mlp": 1.02154517, + "epoch": 0.5035322410942432, + "flos": 22680025558560.0, + "grad_norm": 1.6180304257120204, + "language_loss": 0.6887176, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.71292651, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.15130615, + "step": 8375, + "time_per_iteration": 2.74359393119812 + }, + { + "auxiliary_loss_clip": 0.01387249, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.26534319, + "balance_loss_mlp": 1.02316689, + "epoch": 0.5035923643469111, + "flos": 19833526413720.0, + "grad_norm": 1.7568231587679488, + "language_loss": 0.79004884, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.81430352, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.15063477, + "step": 8376, + "time_per_iteration": 4.160673141479492 + }, + { + "auxiliary_loss_clip": 0.0138665, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.26377416, + "balance_loss_mlp": 1.02042556, + "epoch": 0.5036524875995791, + "flos": 30520244809800.0, + "grad_norm": 2.087096197180259, + "language_loss": 0.5985918, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.62280357, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.14111328, + "step": 8377, + "time_per_iteration": 2.843085765838623 + }, + { + "auxiliary_loss_clip": 0.01377236, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.25823116, + "balance_loss_mlp": 1.02164316, + "epoch": 0.5037126108522471, + "flos": 14724691941960.0, + "grad_norm": 1.8387275468166198, + "language_loss": 0.76690751, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.79104257, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14630127, + "step": 8378, + "time_per_iteration": 2.741304397583008 + }, + { + "auxiliary_loss_clip": 0.01379616, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.26119423, + "balance_loss_mlp": 1.01858401, + "epoch": 0.5037727341049151, + "flos": 25302712093560.0, + "grad_norm": 2.4849689303179217, + "language_loss": 0.75057471, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.77469552, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13873291, + "step": 8379, + "time_per_iteration": 4.344856023788452 + }, + { + "auxiliary_loss_clip": 0.01372185, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.25579047, + "balance_loss_mlp": 1.0211494, + "epoch": 0.5038328573575831, + "flos": 28546783331400.0, + "grad_norm": 2.563823471949171, + "language_loss": 0.66848743, + "learning_rate": 2.072229431544548e-06, + "loss": 0.69255692, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13616943, + "step": 8380, + "time_per_iteration": 2.948622226715088 + }, + { + "auxiliary_loss_clip": 0.01366296, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.25045371, + "balance_loss_mlp": 1.02534747, + "epoch": 0.503892980610251, + "flos": 31656183955560.0, + "grad_norm": 1.9078047924347936, + "language_loss": 0.63745713, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66150445, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.13079834, + "step": 8381, + "time_per_iteration": 2.9293220043182373 + }, + { + "auxiliary_loss_clip": 0.01369226, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.25174093, + "balance_loss_mlp": 1.02200198, + "epoch": 0.503953103862919, + "flos": 27095104584000.0, + "grad_norm": 1.4012300982576829, + "language_loss": 0.67969871, + "learning_rate": 2.071451010853365e-06, + "loss": 0.70374477, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.1338501, + "step": 8382, + "time_per_iteration": 2.8542346954345703 + }, + { + "auxiliary_loss_clip": 0.01395221, + "auxiliary_loss_mlp": 0.01047904, + "balance_loss_clip": 1.26871645, + "balance_loss_mlp": 1.03231144, + "epoch": 0.5040132271155869, + "flos": 15637224561480.0, + "grad_norm": 1.6532783595779201, + "language_loss": 0.62453538, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64896667, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.15576172, + "step": 8383, + "time_per_iteration": 2.73282790184021 + }, + { + "auxiliary_loss_clip": 0.01367722, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.25289083, + "balance_loss_mlp": 1.02256703, + "epoch": 0.504073350368255, + "flos": 13594153708080.0, + "grad_norm": 2.053701728125394, + "language_loss": 0.68131793, + "learning_rate": 2.070672579324465e-06, + "loss": 0.70535779, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13677979, + "step": 8384, + "time_per_iteration": 4.32347297668457 + }, + { + "auxiliary_loss_clip": 0.01369703, + "auxiliary_loss_mlp": 0.01042485, + "balance_loss_clip": 1.25208867, + "balance_loss_mlp": 1.02913928, + "epoch": 0.5041334736209229, + "flos": 29063977317720.0, + "grad_norm": 1.6452910318059244, + "language_loss": 0.71749008, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.74161196, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.13336182, + "step": 8385, + "time_per_iteration": 2.838317632675171 + }, + { + "auxiliary_loss_clip": 0.01360489, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.24629593, + "balance_loss_mlp": 1.01880217, + "epoch": 0.5041935968735909, + "flos": 24613789042080.0, + "grad_norm": 1.7741235406920357, + "language_loss": 0.83830476, + "learning_rate": 2.069894137075919e-06, + "loss": 0.86222959, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13214111, + "step": 8386, + "time_per_iteration": 2.762399911880493 + }, + { + "auxiliary_loss_clip": 0.01376582, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.25763011, + "balance_loss_mlp": 1.02615213, + "epoch": 0.5042537201262588, + "flos": 26292610284840.0, + "grad_norm": 1.6284018631923711, + "language_loss": 0.6705839, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.69475377, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14245605, + "step": 8387, + "time_per_iteration": 2.840714693069458 + }, + { + "auxiliary_loss_clip": 0.01368937, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.25224602, + "balance_loss_mlp": 1.02174056, + "epoch": 0.5043138433789268, + "flos": 22022394571440.0, + "grad_norm": 1.339299557936489, + "language_loss": 0.80270207, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.8267411, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13214111, + "step": 8388, + "time_per_iteration": 2.795048475265503 + }, + { + "auxiliary_loss_clip": 0.01368227, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.2512145, + "balance_loss_mlp": 1.01985717, + "epoch": 0.5043739666315947, + "flos": 28772991834480.0, + "grad_norm": 2.2051660310330665, + "language_loss": 0.70461315, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72862393, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.12994385, + "step": 8389, + "time_per_iteration": 2.8083975315093994 + }, + { + "auxiliary_loss_clip": 0.01374379, + "auxiliary_loss_mlp": 0.0104462, + "balance_loss_clip": 1.25462604, + "balance_loss_mlp": 1.03113139, + "epoch": 0.5044340898842627, + "flos": 27604948457160.0, + "grad_norm": 1.574192388761028, + "language_loss": 0.7015028, + "learning_rate": 2.068337220892191e-06, + "loss": 0.72569281, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.13500977, + "step": 8390, + "time_per_iteration": 2.8151721954345703 + }, + { + "auxiliary_loss_clip": 0.01210923, + "auxiliary_loss_mlp": 0.01007784, + "balance_loss_clip": 1.16253304, + "balance_loss_mlp": 1.0056262, + "epoch": 0.5044942131369307, + "flos": 67471083403200.0, + "grad_norm": 0.8089703472204433, + "language_loss": 0.53018236, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55236942, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02160645, + "step": 8391, + "time_per_iteration": 3.0855765342712402 + }, + { + "auxiliary_loss_clip": 0.01208945, + "auxiliary_loss_mlp": 0.01014096, + "balance_loss_clip": 1.16024303, + "balance_loss_mlp": 1.01209307, + "epoch": 0.5045543363895987, + "flos": 58643036987040.0, + "grad_norm": 0.8778704676756063, + "language_loss": 0.6081726, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.63040298, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.02001953, + "step": 8392, + "time_per_iteration": 4.493319034576416 + }, + { + "auxiliary_loss_clip": 0.01361577, + "auxiliary_loss_mlp": 0.01037425, + "balance_loss_clip": 1.24707532, + "balance_loss_mlp": 1.02425194, + "epoch": 0.5046144596422667, + "flos": 22531507494120.0, + "grad_norm": 1.5601280620500506, + "language_loss": 0.84782988, + "learning_rate": 2.067169506493517e-06, + "loss": 0.87181991, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13189697, + "step": 8393, + "time_per_iteration": 2.7752273082733154 + }, + { + "auxiliary_loss_clip": 0.01368099, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.25037432, + "balance_loss_mlp": 1.02081978, + "epoch": 0.5046745828949346, + "flos": 27460044536760.0, + "grad_norm": 1.9597174477694193, + "language_loss": 0.51029301, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.5343138, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13171387, + "step": 8394, + "time_per_iteration": 2.817096710205078 + }, + { + "auxiliary_loss_clip": 0.01373688, + "auxiliary_loss_mlp": 0.01037549, + "balance_loss_clip": 1.25512171, + "balance_loss_mlp": 1.02243352, + "epoch": 0.5047347061476026, + "flos": 17278987352760.0, + "grad_norm": 1.5386160160526956, + "language_loss": 0.75555336, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77966571, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.15100098, + "step": 8395, + "time_per_iteration": 2.78235125541687 + }, + { + "auxiliary_loss_clip": 0.01373631, + "auxiliary_loss_mlp": 0.01039772, + "balance_loss_clip": 1.25624895, + "balance_loss_mlp": 1.02615225, + "epoch": 0.5047948294002705, + "flos": 16653257555400.0, + "grad_norm": 1.658757703344004, + "language_loss": 0.68197644, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70611048, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.1361084, + "step": 8396, + "time_per_iteration": 2.7136051654815674 + }, + { + "auxiliary_loss_clip": 0.0136751, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.25157833, + "balance_loss_mlp": 1.02306235, + "epoch": 0.5048549526529386, + "flos": 26870804673840.0, + "grad_norm": 1.5931198373619992, + "language_loss": 0.78828144, + "learning_rate": 2.065612518371792e-06, + "loss": 0.8123157, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.12866211, + "step": 8397, + "time_per_iteration": 2.8495800495147705 + }, + { + "auxiliary_loss_clip": 0.01364465, + "auxiliary_loss_mlp": 0.01039688, + "balance_loss_clip": 1.2487191, + "balance_loss_mlp": 1.02715874, + "epoch": 0.5049150759056065, + "flos": 21838807865160.0, + "grad_norm": 1.5433621560824797, + "language_loss": 0.66279876, + "learning_rate": 2.065223265084376e-06, + "loss": 0.6868403, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.12530518, + "step": 8398, + "time_per_iteration": 2.810027599334717 + }, + { + "auxiliary_loss_clip": 0.01367567, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.25139999, + "balance_loss_mlp": 1.02081227, + "epoch": 0.5049751991582745, + "flos": 21690371017440.0, + "grad_norm": 1.8047473638794944, + "language_loss": 0.71784496, + "learning_rate": 2.064834009323688e-06, + "loss": 0.74187231, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14331055, + "step": 8399, + "time_per_iteration": 2.793222188949585 + }, + { + "auxiliary_loss_clip": 0.01369216, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.24892712, + "balance_loss_mlp": 1.03890514, + "epoch": 0.5050353224109424, + "flos": 21364154458920.0, + "grad_norm": 2.3155428495071972, + "language_loss": 0.81565952, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83987612, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.13549805, + "step": 8400, + "time_per_iteration": 2.7621243000030518 + }, + { + "auxiliary_loss_clip": 0.01365115, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.24827695, + "balance_loss_mlp": 1.02397692, + "epoch": 0.5050954456636104, + "flos": 22825213737480.0, + "grad_norm": 1.8046096218587035, + "language_loss": 0.79182243, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81585252, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13903809, + "step": 8401, + "time_per_iteration": 2.7668862342834473 + }, + { + "auxiliary_loss_clip": 0.01374087, + "auxiliary_loss_mlp": 0.01040939, + "balance_loss_clip": 1.25338256, + "balance_loss_mlp": 1.02689016, + "epoch": 0.5051555689162783, + "flos": 30454696270800.0, + "grad_norm": 1.505789022850195, + "language_loss": 0.70160222, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72575247, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.14050293, + "step": 8402, + "time_per_iteration": 2.8596484661102295 + }, + { + "auxiliary_loss_clip": 0.01365787, + "auxiliary_loss_mlp": 0.01039843, + "balance_loss_clip": 1.24771285, + "balance_loss_mlp": 1.02636051, + "epoch": 0.5052156921689464, + "flos": 21292920749520.0, + "grad_norm": 1.715883417240996, + "language_loss": 0.69188392, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71594024, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13494873, + "step": 8403, + "time_per_iteration": 2.834017753601074 + }, + { + "auxiliary_loss_clip": 0.0136222, + "auxiliary_loss_mlp": 0.01047295, + "balance_loss_clip": 1.24713922, + "balance_loss_mlp": 1.03371119, + "epoch": 0.5052758154216143, + "flos": 25086615072120.0, + "grad_norm": 1.4385481641817954, + "language_loss": 0.86221296, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88630807, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13574219, + "step": 8404, + "time_per_iteration": 2.8437490463256836 + }, + { + "auxiliary_loss_clip": 0.0136502, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.24883687, + "balance_loss_mlp": 1.02403414, + "epoch": 0.5053359386742823, + "flos": 20890272611520.0, + "grad_norm": 1.5053387543731618, + "language_loss": 0.7574535, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.78147638, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13238525, + "step": 8405, + "time_per_iteration": 2.795872926712036 + }, + { + "auxiliary_loss_clip": 0.01371875, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.25223875, + "balance_loss_mlp": 1.01903772, + "epoch": 0.5053960619269503, + "flos": 37750490307360.0, + "grad_norm": 1.6520482514718362, + "language_loss": 0.73407316, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75813389, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.15148926, + "step": 8406, + "time_per_iteration": 2.900028705596924 + }, + { + "auxiliary_loss_clip": 0.01358771, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.24439311, + "balance_loss_mlp": 1.02443695, + "epoch": 0.5054561851796182, + "flos": 23519131617240.0, + "grad_norm": 1.7374406946943053, + "language_loss": 0.76801091, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79197913, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.1361084, + "step": 8407, + "time_per_iteration": 2.749924421310425 + }, + { + "auxiliary_loss_clip": 0.01366725, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.24684465, + "balance_loss_mlp": 1.0217346, + "epoch": 0.5055163084322862, + "flos": 30416500785240.0, + "grad_norm": 2.0330069376432665, + "language_loss": 0.63330257, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65731603, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.12872314, + "step": 8408, + "time_per_iteration": 2.8510494232177734 + }, + { + "auxiliary_loss_clip": 0.01367754, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.25115359, + "balance_loss_mlp": 1.02387094, + "epoch": 0.5055764316849541, + "flos": 20263608821880.0, + "grad_norm": 2.8179809325330303, + "language_loss": 0.63935614, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.66341853, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14593506, + "step": 8409, + "time_per_iteration": 2.837367057800293 + }, + { + "auxiliary_loss_clip": 0.01363795, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.2484796, + "balance_loss_mlp": 1.01466441, + "epoch": 0.5056365549376222, + "flos": 26076919347000.0, + "grad_norm": 1.452388782866279, + "language_loss": 0.70984638, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73375654, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.12548828, + "step": 8410, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.01372017, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.25402677, + "balance_loss_mlp": 1.02292204, + "epoch": 0.5056966781902901, + "flos": 19283497245360.0, + "grad_norm": 1.5071216630418922, + "language_loss": 0.79237497, + "learning_rate": 2.060162752653113e-06, + "loss": 0.8164655, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.14117432, + "step": 8411, + "time_per_iteration": 2.7425758838653564 + }, + { + "auxiliary_loss_clip": 0.01373582, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.25421453, + "balance_loss_mlp": 1.02247047, + "epoch": 0.5057568014429581, + "flos": 21328151824800.0, + "grad_norm": 1.713388222572379, + "language_loss": 0.81834137, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.84245837, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.15655518, + "step": 8412, + "time_per_iteration": 2.753706693649292 + }, + { + "auxiliary_loss_clip": 0.01367504, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.25039721, + "balance_loss_mlp": 1.0214566, + "epoch": 0.505816924695626, + "flos": 17498089392840.0, + "grad_norm": 1.7313090161699938, + "language_loss": 0.80634511, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.8303737, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13885498, + "step": 8413, + "time_per_iteration": 2.8776843547821045 + }, + { + "auxiliary_loss_clip": 0.01372557, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.25290835, + "balance_loss_mlp": 1.0193969, + "epoch": 0.505877047948294, + "flos": 21147042228480.0, + "grad_norm": 1.753157457267205, + "language_loss": 0.80634564, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.83041537, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.15020752, + "step": 8414, + "time_per_iteration": 2.789781093597412 + }, + { + "auxiliary_loss_clip": 0.01368879, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.25032961, + "balance_loss_mlp": 1.01660728, + "epoch": 0.5059371712009619, + "flos": 36356116601880.0, + "grad_norm": 2.128594855014646, + "language_loss": 0.62657136, + "learning_rate": 2.058605592832528e-06, + "loss": 0.650563, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13659668, + "step": 8415, + "time_per_iteration": 4.24825119972229 + }, + { + "auxiliary_loss_clip": 0.01368341, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.25037336, + "balance_loss_mlp": 1.01658845, + "epoch": 0.50599729445363, + "flos": 22678401224160.0, + "grad_norm": 1.501798656793511, + "language_loss": 0.82591248, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84989595, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13421631, + "step": 8416, + "time_per_iteration": 2.801400661468506 + }, + { + "auxiliary_loss_clip": 0.01363575, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.25016212, + "balance_loss_mlp": 1.02168274, + "epoch": 0.5060574177062979, + "flos": 22753086644160.0, + "grad_norm": 2.1764309832807918, + "language_loss": 0.79202431, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.8160072, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13024902, + "step": 8417, + "time_per_iteration": 2.7274487018585205 + }, + { + "auxiliary_loss_clip": 0.01357734, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.24477124, + "balance_loss_mlp": 1.01650286, + "epoch": 0.5061175409589659, + "flos": 21658266785880.0, + "grad_norm": 1.686981529591491, + "language_loss": 0.63340944, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65728223, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13037109, + "step": 8418, + "time_per_iteration": 4.175144672393799 + }, + { + "auxiliary_loss_clip": 0.01369137, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.25063896, + "balance_loss_mlp": 1.02071631, + "epoch": 0.5061776642116339, + "flos": 21621248942760.0, + "grad_norm": 1.8355332085775977, + "language_loss": 0.77484894, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79888278, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13537598, + "step": 8419, + "time_per_iteration": 2.787182092666626 + }, + { + "auxiliary_loss_clip": 0.0137719, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.25801671, + "balance_loss_mlp": 1.02088785, + "epoch": 0.5062377874643018, + "flos": 24431745453480.0, + "grad_norm": 1.9904881424142264, + "language_loss": 0.77276832, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79689586, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14660645, + "step": 8420, + "time_per_iteration": 2.7536706924438477 + }, + { + "auxiliary_loss_clip": 0.0137513, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.25638866, + "balance_loss_mlp": 1.01884902, + "epoch": 0.5062979107169698, + "flos": 22529030384160.0, + "grad_norm": 2.0098619020383888, + "language_loss": 0.77231675, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79639578, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.13928223, + "step": 8421, + "time_per_iteration": 2.7688934803009033 + }, + { + "auxiliary_loss_clip": 0.01372394, + "auxiliary_loss_mlp": 0.01026128, + "balance_loss_clip": 1.25530338, + "balance_loss_mlp": 1.01323605, + "epoch": 0.5063580339696377, + "flos": 24577420932720.0, + "grad_norm": 1.5419962143290884, + "language_loss": 0.67179394, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.6957792, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.12890625, + "step": 8422, + "time_per_iteration": 2.7673206329345703 + }, + { + "auxiliary_loss_clip": 0.01374024, + "auxiliary_loss_mlp": 0.01035187, + "balance_loss_clip": 1.25850475, + "balance_loss_mlp": 1.02148938, + "epoch": 0.5064181572223058, + "flos": 22600588960440.0, + "grad_norm": 1.5447280952189897, + "language_loss": 0.81800634, + "learning_rate": 2.05549116746431e-06, + "loss": 0.84209841, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 1.15673828, + "router_z_loss_mlp": 0.13708496, + "step": 8423, + "time_per_iteration": 4.238006114959717 + }, + { + "auxiliary_loss_clip": 0.0137262, + "auxiliary_loss_mlp": 0.01036648, + "balance_loss_clip": 1.25383902, + "balance_loss_mlp": 1.02264035, + "epoch": 0.5064782804749737, + "flos": 26000568984240.0, + "grad_norm": 1.712290291467857, + "language_loss": 0.74730504, + "learning_rate": 2.055101854669237e-06, + "loss": 0.77139771, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14001465, + "step": 8424, + "time_per_iteration": 2.8799164295196533 + }, + { + "auxiliary_loss_clip": 0.01369385, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.25445592, + "balance_loss_mlp": 1.01733792, + "epoch": 0.5065384037276417, + "flos": 28559534356440.0, + "grad_norm": 1.33263371988701, + "language_loss": 0.71938622, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.74339598, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14263916, + "step": 8425, + "time_per_iteration": 2.90547251701355 + }, + { + "auxiliary_loss_clip": 0.01377141, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.25876594, + "balance_loss_mlp": 1.02335095, + "epoch": 0.5065985269803096, + "flos": 22971173475240.0, + "grad_norm": 1.6702303254880295, + "language_loss": 0.79205751, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.81619883, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13641357, + "step": 8426, + "time_per_iteration": 2.7656452655792236 + }, + { + "auxiliary_loss_clip": 0.01376852, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.25947905, + "balance_loss_mlp": 1.02140093, + "epoch": 0.5066586502329776, + "flos": 21612558753720.0, + "grad_norm": 1.8721741617343606, + "language_loss": 0.78492337, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80903971, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13366699, + "step": 8427, + "time_per_iteration": 2.784621000289917 + }, + { + "auxiliary_loss_clip": 0.01368653, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.25351822, + "balance_loss_mlp": 1.01405644, + "epoch": 0.5067187734856455, + "flos": 20344832187840.0, + "grad_norm": 2.059064328226239, + "language_loss": 0.71583068, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73979104, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13330078, + "step": 8428, + "time_per_iteration": 2.759659767150879 + }, + { + "auxiliary_loss_clip": 0.01370066, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.25380301, + "balance_loss_mlp": 1.02043056, + "epoch": 0.5067788967383136, + "flos": 28847108737440.0, + "grad_norm": 1.5143510118515493, + "language_loss": 0.83236885, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.85639942, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.12567139, + "step": 8429, + "time_per_iteration": 2.8377017974853516 + }, + { + "auxiliary_loss_clip": 0.01385483, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.26346362, + "balance_loss_mlp": 1.01921988, + "epoch": 0.5068390199909815, + "flos": 32456282361480.0, + "grad_norm": 1.8278990100304116, + "language_loss": 0.73869938, + "learning_rate": 2.052765934536682e-06, + "loss": 0.76289904, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.15270996, + "step": 8430, + "time_per_iteration": 4.306357383728027 + }, + { + "auxiliary_loss_clip": 0.01373862, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.2556535, + "balance_loss_mlp": 1.02158046, + "epoch": 0.5068991432436495, + "flos": 23151511512720.0, + "grad_norm": 1.6334460642426003, + "language_loss": 0.76790792, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.79199612, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13391113, + "step": 8431, + "time_per_iteration": 2.7692642211914062 + }, + { + "auxiliary_loss_clip": 0.01372357, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.25557101, + "balance_loss_mlp": 1.02249098, + "epoch": 0.5069592664963174, + "flos": 19940640932160.0, + "grad_norm": 1.4302533344958512, + "language_loss": 0.72618389, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.75026608, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13366699, + "step": 8432, + "time_per_iteration": 2.7599098682403564 + }, + { + "auxiliary_loss_clip": 0.01214794, + "auxiliary_loss_mlp": 0.01009637, + "balance_loss_clip": 1.16629744, + "balance_loss_mlp": 1.00771797, + "epoch": 0.5070193897489854, + "flos": 65808424287720.0, + "grad_norm": 0.7671325946662783, + "language_loss": 0.63723099, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65947533, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.01916504, + "step": 8433, + "time_per_iteration": 3.2950732707977295 + }, + { + "auxiliary_loss_clip": 0.01375006, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.25719309, + "balance_loss_mlp": 1.02205253, + "epoch": 0.5070795130016534, + "flos": 17279880736680.0, + "grad_norm": 1.8560228935988492, + "language_loss": 0.77606708, + "learning_rate": 2.051208614233681e-06, + "loss": 0.80017269, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13500977, + "step": 8434, + "time_per_iteration": 2.767754077911377 + }, + { + "auxiliary_loss_clip": 0.01379735, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.25900519, + "balance_loss_mlp": 1.02075422, + "epoch": 0.5071396362543213, + "flos": 21074996351880.0, + "grad_norm": 2.019141835242583, + "language_loss": 0.71636951, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.74050951, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.13500977, + "step": 8435, + "time_per_iteration": 2.7913076877593994 + }, + { + "auxiliary_loss_clip": 0.01376616, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.25772655, + "balance_loss_mlp": 1.019086, + "epoch": 0.5071997595069894, + "flos": 23149521703080.0, + "grad_norm": 1.9946148216231554, + "language_loss": 0.73007047, + "learning_rate": 2.050429942372112e-06, + "loss": 0.75417173, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14422607, + "step": 8436, + "time_per_iteration": 2.8004024028778076 + }, + { + "auxiliary_loss_clip": 0.01374036, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.2563796, + "balance_loss_mlp": 1.01847529, + "epoch": 0.5072598827596573, + "flos": 22752315085320.0, + "grad_norm": 1.488227422642701, + "language_loss": 0.83875066, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86281919, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14343262, + "step": 8437, + "time_per_iteration": 2.748995065689087 + }, + { + "auxiliary_loss_clip": 0.01368489, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.25275946, + "balance_loss_mlp": 1.01957607, + "epoch": 0.5073200060123253, + "flos": 22571611572600.0, + "grad_norm": 1.6006018226601104, + "language_loss": 0.810395, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83440632, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.13085938, + "step": 8438, + "time_per_iteration": 2.8277337551116943 + }, + { + "auxiliary_loss_clip": 0.01379866, + "auxiliary_loss_mlp": 0.01035206, + "balance_loss_clip": 1.25999641, + "balance_loss_mlp": 1.02068663, + "epoch": 0.5073801292649932, + "flos": 25811134674120.0, + "grad_norm": 1.469325678025473, + "language_loss": 0.79808366, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.82223439, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.1451416, + "step": 8439, + "time_per_iteration": 2.833045482635498 + }, + { + "auxiliary_loss_clip": 0.01374069, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.25700974, + "balance_loss_mlp": 1.02488422, + "epoch": 0.5074402525176612, + "flos": 25379387323200.0, + "grad_norm": 1.5061466306101048, + "language_loss": 0.71581912, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73994118, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13250732, + "step": 8440, + "time_per_iteration": 2.7948668003082275 + }, + { + "auxiliary_loss_clip": 0.01374584, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.25636053, + "balance_loss_mlp": 1.02141452, + "epoch": 0.5075003757703291, + "flos": 26069609842200.0, + "grad_norm": 1.8244555273853462, + "language_loss": 0.7120477, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73613799, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13043213, + "step": 8441, + "time_per_iteration": 2.818115234375 + }, + { + "auxiliary_loss_clip": 0.01387721, + "auxiliary_loss_mlp": 0.010401, + "balance_loss_clip": 1.26651692, + "balance_loss_mlp": 1.02565217, + "epoch": 0.5075604990229972, + "flos": 21840513416280.0, + "grad_norm": 1.7016781588673204, + "language_loss": 0.63550246, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65978062, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14440918, + "step": 8442, + "time_per_iteration": 2.7826290130615234 + }, + { + "auxiliary_loss_clip": 0.01370588, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.25635266, + "balance_loss_mlp": 1.02024961, + "epoch": 0.5076206222756651, + "flos": 31985121274200.0, + "grad_norm": 1.4854271010630908, + "language_loss": 0.7160331, + "learning_rate": 2.047704531394006e-06, + "loss": 0.74006951, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.12799072, + "step": 8443, + "time_per_iteration": 2.867360830307007 + }, + { + "auxiliary_loss_clip": 0.01384641, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.26357567, + "balance_loss_mlp": 1.02272558, + "epoch": 0.5076807455283331, + "flos": 36911059381800.0, + "grad_norm": 1.255715240538378, + "language_loss": 0.62192297, + "learning_rate": 2.047315179614607e-06, + "loss": 0.646137, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.14019775, + "step": 8444, + "time_per_iteration": 2.8605382442474365 + }, + { + "auxiliary_loss_clip": 0.01377264, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.25918293, + "balance_loss_mlp": 1.015342, + "epoch": 0.507740868781001, + "flos": 29868542643240.0, + "grad_norm": 1.8481565663656714, + "language_loss": 0.6409027, + "learning_rate": 2.046925826041012e-06, + "loss": 0.66495973, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13110352, + "step": 8445, + "time_per_iteration": 2.8399579524993896 + }, + { + "auxiliary_loss_clip": 0.01212254, + "auxiliary_loss_mlp": 0.01005027, + "balance_loss_clip": 1.16527474, + "balance_loss_mlp": 1.00294101, + "epoch": 0.507800992033669, + "flos": 61932166523280.0, + "grad_norm": 0.8382633524027264, + "language_loss": 0.61932158, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.64149439, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02087402, + "step": 8446, + "time_per_iteration": 3.289334535598755 + }, + { + "auxiliary_loss_clip": 0.01375222, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.25838757, + "balance_loss_mlp": 1.02043724, + "epoch": 0.507861115286337, + "flos": 20704899137400.0, + "grad_norm": 1.6088741633347825, + "language_loss": 0.81000227, + "learning_rate": 2.04614711357029e-06, + "loss": 0.83409095, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13208008, + "step": 8447, + "time_per_iteration": 2.74163556098938 + }, + { + "auxiliary_loss_clip": 0.0137525, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.25956321, + "balance_loss_mlp": 1.01631868, + "epoch": 0.507921238539005, + "flos": 30853242964440.0, + "grad_norm": 1.325161110711039, + "language_loss": 0.70810127, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.73214853, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13165283, + "step": 8448, + "time_per_iteration": 2.8638575077056885 + }, + { + "auxiliary_loss_clip": 0.01371158, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.25573409, + "balance_loss_mlp": 1.01893115, + "epoch": 0.507981361791673, + "flos": 35707094587080.0, + "grad_norm": 1.4210783621060514, + "language_loss": 0.72366023, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74768794, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.12677002, + "step": 8449, + "time_per_iteration": 2.8686206340789795 + }, + { + "auxiliary_loss_clip": 0.01372986, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.25714958, + "balance_loss_mlp": 1.01824832, + "epoch": 0.5080414850443409, + "flos": 27167109852240.0, + "grad_norm": 1.3796489173432935, + "language_loss": 0.72741258, + "learning_rate": 2.044979031776844e-06, + "loss": 0.75145245, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.12756348, + "step": 8450, + "time_per_iteration": 2.9203553199768066 + }, + { + "auxiliary_loss_clip": 0.01376167, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.25840282, + "balance_loss_mlp": 1.01893163, + "epoch": 0.5081016082970089, + "flos": 27090394014240.0, + "grad_norm": 1.6442451011315804, + "language_loss": 0.77248609, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79657352, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13659668, + "step": 8451, + "time_per_iteration": 2.783757448196411 + }, + { + "auxiliary_loss_clip": 0.01373469, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.2549355, + "balance_loss_mlp": 1.01910579, + "epoch": 0.5081617315496768, + "flos": 22861663063560.0, + "grad_norm": 2.3549611465062092, + "language_loss": 0.85515094, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87921304, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13641357, + "step": 8452, + "time_per_iteration": 2.753523349761963 + }, + { + "auxiliary_loss_clip": 0.01380965, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.2603364, + "balance_loss_mlp": 1.01811194, + "epoch": 0.5082218548023448, + "flos": 16285231367280.0, + "grad_norm": 3.973919642276083, + "language_loss": 0.78359938, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80773592, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.14581299, + "step": 8453, + "time_per_iteration": 4.089768409729004 + }, + { + "auxiliary_loss_clip": 0.01365545, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.2523855, + "balance_loss_mlp": 1.02201486, + "epoch": 0.5082819780550127, + "flos": 24465961319760.0, + "grad_norm": 1.7056293942361915, + "language_loss": 0.77193522, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.79594058, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12982178, + "step": 8454, + "time_per_iteration": 2.7622056007385254 + }, + { + "auxiliary_loss_clip": 0.01376974, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.2600342, + "balance_loss_mlp": 1.01929331, + "epoch": 0.5083421013076808, + "flos": 23408402954760.0, + "grad_norm": 1.6404629415061194, + "language_loss": 0.89375508, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91786361, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.14587402, + "step": 8455, + "time_per_iteration": 2.8091981410980225 + }, + { + "auxiliary_loss_clip": 0.0138004, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.25769079, + "balance_loss_mlp": 1.02127051, + "epoch": 0.5084022245603487, + "flos": 23877249365520.0, + "grad_norm": 1.7027711066202462, + "language_loss": 0.62488353, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64905173, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.15509033, + "step": 8456, + "time_per_iteration": 4.242663860321045 + }, + { + "auxiliary_loss_clip": 0.01211127, + "auxiliary_loss_mlp": 0.01007941, + "balance_loss_clip": 1.16397071, + "balance_loss_mlp": 1.00564003, + "epoch": 0.5084623478130167, + "flos": 62887402156320.0, + "grad_norm": 0.8269249576451205, + "language_loss": 0.62554711, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64773774, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.02294922, + "step": 8457, + "time_per_iteration": 3.1296138763427734 + }, + { + "auxiliary_loss_clip": 0.01380295, + "auxiliary_loss_mlp": 0.01031414, + "balance_loss_clip": 1.26176667, + "balance_loss_mlp": 1.01714516, + "epoch": 0.5085224710656846, + "flos": 22351250673360.0, + "grad_norm": 1.4227854495498145, + "language_loss": 0.67245251, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69656962, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14263916, + "step": 8458, + "time_per_iteration": 2.785945177078247 + }, + { + "auxiliary_loss_clip": 0.01376338, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.25612271, + "balance_loss_mlp": 1.01799619, + "epoch": 0.5085825943183526, + "flos": 26071680868560.0, + "grad_norm": 1.743498535940138, + "language_loss": 0.78023803, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80432189, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.140625, + "step": 8459, + "time_per_iteration": 2.815460681915283 + }, + { + "auxiliary_loss_clip": 0.01382758, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.26301563, + "balance_loss_mlp": 1.01960754, + "epoch": 0.5086427175710206, + "flos": 17425231349040.0, + "grad_norm": 1.9869013577383356, + "language_loss": 0.80362231, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82779324, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.14709473, + "step": 8460, + "time_per_iteration": 2.7275331020355225 + }, + { + "auxiliary_loss_clip": 0.01375091, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.25696158, + "balance_loss_mlp": 1.02052355, + "epoch": 0.5087028408236886, + "flos": 20636711055000.0, + "grad_norm": 1.5047284657795605, + "language_loss": 0.69022453, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71431941, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13879395, + "step": 8461, + "time_per_iteration": 2.761644124984741 + }, + { + "auxiliary_loss_clip": 0.01361294, + "auxiliary_loss_mlp": 0.01026741, + "balance_loss_clip": 1.24806392, + "balance_loss_mlp": 1.01271033, + "epoch": 0.5087629640763566, + "flos": 25599423355560.0, + "grad_norm": 1.6130153635310582, + "language_loss": 0.76559925, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78947961, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14013672, + "step": 8462, + "time_per_iteration": 4.274950265884399 + }, + { + "auxiliary_loss_clip": 0.01369256, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.25304115, + "balance_loss_mlp": 1.01712668, + "epoch": 0.5088230873290245, + "flos": 13265622473040.0, + "grad_norm": 2.0096280436496037, + "language_loss": 0.81950295, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.84350741, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14068604, + "step": 8463, + "time_per_iteration": 2.813917636871338 + }, + { + "auxiliary_loss_clip": 0.01371726, + "auxiliary_loss_mlp": 0.01025978, + "balance_loss_clip": 1.25596178, + "balance_loss_mlp": 1.01267433, + "epoch": 0.5088832105816925, + "flos": 20047755450600.0, + "grad_norm": 1.628128359143618, + "language_loss": 0.76873606, + "learning_rate": 2.039527786882341e-06, + "loss": 0.79271317, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.13323975, + "step": 8464, + "time_per_iteration": 2.7520105838775635 + }, + { + "auxiliary_loss_clip": 0.01209625, + "auxiliary_loss_mlp": 0.01000194, + "balance_loss_clip": 1.16147757, + "balance_loss_mlp": 0.99803585, + "epoch": 0.5089433338343604, + "flos": 67440051970200.0, + "grad_norm": 0.6872711513239075, + "language_loss": 0.59416175, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61625993, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.02160645, + "step": 8465, + "time_per_iteration": 3.38521409034729 + }, + { + "auxiliary_loss_clip": 0.01366494, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.25085902, + "balance_loss_mlp": 1.01675224, + "epoch": 0.5090034570870284, + "flos": 22715419067280.0, + "grad_norm": 2.1292918115524095, + "language_loss": 0.80514556, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82911527, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13739014, + "step": 8466, + "time_per_iteration": 2.8777430057525635 + }, + { + "auxiliary_loss_clip": 0.01363449, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.24860597, + "balance_loss_mlp": 1.0154264, + "epoch": 0.5090635803396963, + "flos": 20450281763520.0, + "grad_norm": 1.6859135401983327, + "language_loss": 0.79027796, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.81420642, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13977051, + "step": 8467, + "time_per_iteration": 2.7517964839935303 + }, + { + "auxiliary_loss_clip": 0.01359922, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.24872601, + "balance_loss_mlp": 1.01751947, + "epoch": 0.5091237035923644, + "flos": 23774155074720.0, + "grad_norm": 1.6717169058799433, + "language_loss": 0.75247157, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.77637196, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.1260376, + "step": 8468, + "time_per_iteration": 2.8363330364227295 + }, + { + "auxiliary_loss_clip": 0.01367202, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.25198376, + "balance_loss_mlp": 1.01891303, + "epoch": 0.5091838268450323, + "flos": 18331875756360.0, + "grad_norm": 2.013997204722028, + "language_loss": 0.78155529, + "learning_rate": 2.03758084040404e-06, + "loss": 0.8055473, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13116455, + "step": 8469, + "time_per_iteration": 4.26777458190918 + }, + { + "auxiliary_loss_clip": 0.01364933, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.25005364, + "balance_loss_mlp": 1.02067161, + "epoch": 0.5092439500977003, + "flos": 29063368192320.0, + "grad_norm": 1.4813814308398998, + "language_loss": 0.6963141, + "learning_rate": 2.037191446774109e-06, + "loss": 0.72031116, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14111328, + "step": 8470, + "time_per_iteration": 2.791295289993286 + }, + { + "auxiliary_loss_clip": 0.01368823, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.25056922, + "balance_loss_mlp": 1.02417445, + "epoch": 0.5093040733503682, + "flos": 13557988640520.0, + "grad_norm": 1.8340531818498602, + "language_loss": 0.7392866, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.76336032, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.1439209, + "step": 8471, + "time_per_iteration": 2.7488040924072266 + }, + { + "auxiliary_loss_clip": 0.01210152, + "auxiliary_loss_mlp": 0.01005353, + "balance_loss_clip": 1.16286039, + "balance_loss_mlp": 1.00264704, + "epoch": 0.5093641966030362, + "flos": 68923185215400.0, + "grad_norm": 0.7653527884733554, + "language_loss": 0.58156598, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60372102, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.02709961, + "step": 8472, + "time_per_iteration": 3.2970340251922607 + }, + { + "auxiliary_loss_clip": 0.01369811, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.25162959, + "balance_loss_mlp": 1.02232933, + "epoch": 0.5094243198557042, + "flos": 21586099084200.0, + "grad_norm": 1.754720969575802, + "language_loss": 0.69524872, + "learning_rate": 2.03602325748156e-06, + "loss": 0.7192958, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.12579346, + "step": 8473, + "time_per_iteration": 2.844874620437622 + }, + { + "auxiliary_loss_clip": 0.01362232, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.24741864, + "balance_loss_mlp": 1.02231276, + "epoch": 0.5094844431083722, + "flos": 28846702653840.0, + "grad_norm": 2.2415930439101297, + "language_loss": 0.85454929, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87852824, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13354492, + "step": 8474, + "time_per_iteration": 2.9319097995758057 + }, + { + "auxiliary_loss_clip": 0.01368122, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.25055695, + "balance_loss_mlp": 1.02354515, + "epoch": 0.5095445663610402, + "flos": 14980283916480.0, + "grad_norm": 1.8173321891390135, + "language_loss": 0.65499192, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67904472, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.1361084, + "step": 8475, + "time_per_iteration": 2.7345571517944336 + }, + { + "auxiliary_loss_clip": 0.01370313, + "auxiliary_loss_mlp": 0.01040706, + "balance_loss_clip": 1.24975562, + "balance_loss_mlp": 1.02625823, + "epoch": 0.5096046896137081, + "flos": 20781980450640.0, + "grad_norm": 2.3187389749583045, + "language_loss": 0.82123315, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84534335, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14447021, + "step": 8476, + "time_per_iteration": 2.8204667568206787 + }, + { + "auxiliary_loss_clip": 0.01370562, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.25038373, + "balance_loss_mlp": 1.0243609, + "epoch": 0.5096648128663761, + "flos": 23190275515320.0, + "grad_norm": 1.8517338333189848, + "language_loss": 0.81144488, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83555067, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.15637207, + "step": 8477, + "time_per_iteration": 2.7491321563720703 + }, + { + "auxiliary_loss_clip": 0.01368578, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.25064683, + "balance_loss_mlp": 1.0153141, + "epoch": 0.509724936119044, + "flos": 22314517088760.0, + "grad_norm": 1.7126863486972328, + "language_loss": 0.61982965, + "learning_rate": 2.034076248204082e-06, + "loss": 0.64381182, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.14324951, + "step": 8478, + "time_per_iteration": 2.766505002975464 + }, + { + "auxiliary_loss_clip": 0.01360685, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.24558234, + "balance_loss_mlp": 1.03149617, + "epoch": 0.509785059371712, + "flos": 26292772718280.0, + "grad_norm": 1.6424153968795654, + "language_loss": 0.66542709, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.68947887, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.12994385, + "step": 8479, + "time_per_iteration": 2.793715476989746 + }, + { + "auxiliary_loss_clip": 0.01361006, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.2456938, + "balance_loss_mlp": 1.0206008, + "epoch": 0.50984518262438, + "flos": 22969549140840.0, + "grad_norm": 1.4254575417754276, + "language_loss": 0.69335806, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71730804, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13397217, + "step": 8480, + "time_per_iteration": 2.7202084064483643 + }, + { + "auxiliary_loss_clip": 0.01364245, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.24520278, + "balance_loss_mlp": 1.01644325, + "epoch": 0.509905305877048, + "flos": 26218980682200.0, + "grad_norm": 1.734721128683524, + "language_loss": 0.79420114, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81814903, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.14099121, + "step": 8481, + "time_per_iteration": 2.7966017723083496 + }, + { + "auxiliary_loss_clip": 0.01355554, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.24141383, + "balance_loss_mlp": 1.02557731, + "epoch": 0.5099654291297159, + "flos": 20344994621280.0, + "grad_norm": 1.460179732030616, + "language_loss": 0.83705002, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.86099607, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13482666, + "step": 8482, + "time_per_iteration": 2.7399916648864746 + }, + { + "auxiliary_loss_clip": 0.01367457, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.24594331, + "balance_loss_mlp": 1.02512622, + "epoch": 0.5100255523823839, + "flos": 29060200740240.0, + "grad_norm": 1.6824391865995296, + "language_loss": 0.85453212, + "learning_rate": 2.032129206622238e-06, + "loss": 0.8785969, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.13897705, + "step": 8483, + "time_per_iteration": 2.839412212371826 + }, + { + "auxiliary_loss_clip": 0.0136339, + "auxiliary_loss_mlp": 0.01035908, + "balance_loss_clip": 1.24445045, + "balance_loss_mlp": 1.02231824, + "epoch": 0.5100856756350518, + "flos": 22461085951920.0, + "grad_norm": 1.8991801575381653, + "language_loss": 0.83700562, + "learning_rate": 2.031739794591775e-06, + "loss": 0.86099863, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.13580322, + "step": 8484, + "time_per_iteration": 2.7276761531829834 + }, + { + "auxiliary_loss_clip": 0.01358465, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.24167836, + "balance_loss_mlp": 1.01689386, + "epoch": 0.5101457988877198, + "flos": 19175651776440.0, + "grad_norm": 1.9634301145097506, + "language_loss": 0.81854868, + "learning_rate": 2.031350381357736e-06, + "loss": 0.84244466, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14239502, + "step": 8485, + "time_per_iteration": 2.7390925884246826 + }, + { + "auxiliary_loss_clip": 0.01354618, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.24138737, + "balance_loss_mlp": 1.0176518, + "epoch": 0.5102059221403878, + "flos": 14870489246280.0, + "grad_norm": 1.943227411910692, + "language_loss": 0.73958027, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76343548, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13262939, + "step": 8486, + "time_per_iteration": 2.719202995300293 + }, + { + "auxiliary_loss_clip": 0.01368489, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.24838018, + "balance_loss_mlp": 1.01914001, + "epoch": 0.5102660453930558, + "flos": 22965610129920.0, + "grad_norm": 1.7107613679093865, + "language_loss": 0.70377743, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72779578, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.14208984, + "step": 8487, + "time_per_iteration": 2.8716318607330322 + }, + { + "auxiliary_loss_clip": 0.01358348, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.24240613, + "balance_loss_mlp": 1.02138972, + "epoch": 0.5103261686457238, + "flos": 23154597748080.0, + "grad_norm": 2.0690845668472155, + "language_loss": 0.73013604, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75407743, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.14404297, + "step": 8488, + "time_per_iteration": 2.801107883453369 + }, + { + "auxiliary_loss_clip": 0.01367273, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.24806964, + "balance_loss_mlp": 1.02019453, + "epoch": 0.5103862918983917, + "flos": 14323302663120.0, + "grad_norm": 1.6211536721563586, + "language_loss": 0.7019912, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.7260049, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13909912, + "step": 8489, + "time_per_iteration": 2.716005325317383 + }, + { + "auxiliary_loss_clip": 0.01360378, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.24251568, + "balance_loss_mlp": 1.01714563, + "epoch": 0.5104464151510597, + "flos": 25854081337800.0, + "grad_norm": 1.7270728025051145, + "language_loss": 0.72214532, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.74605501, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13446045, + "step": 8490, + "time_per_iteration": 2.7930407524108887 + }, + { + "auxiliary_loss_clip": 0.0135267, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.23825991, + "balance_loss_mlp": 1.01846862, + "epoch": 0.5105065384037276, + "flos": 21657941919000.0, + "grad_norm": 1.5397117315065094, + "language_loss": 0.81037319, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.83421332, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.12890625, + "step": 8491, + "time_per_iteration": 4.22855281829834 + }, + { + "auxiliary_loss_clip": 0.01347533, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.23565316, + "balance_loss_mlp": 1.01939249, + "epoch": 0.5105666616563956, + "flos": 22496641894080.0, + "grad_norm": 2.3565568299301174, + "language_loss": 0.79678231, + "learning_rate": 2.028624456259728e-06, + "loss": 0.82057941, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12792969, + "step": 8492, + "time_per_iteration": 2.7530767917633057 + }, + { + "auxiliary_loss_clip": 0.01367091, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_clip": 1.24742198, + "balance_loss_mlp": 1.0282886, + "epoch": 0.5106267849090635, + "flos": 22460923518480.0, + "grad_norm": 1.8415992796119711, + "language_loss": 0.78374255, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.80784225, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14587402, + "step": 8493, + "time_per_iteration": 2.7598795890808105 + }, + { + "auxiliary_loss_clip": 0.01356206, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.24035621, + "balance_loss_mlp": 1.01981115, + "epoch": 0.5106869081617316, + "flos": 23551966799280.0, + "grad_norm": 1.9269173412115326, + "language_loss": 0.84402347, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.8679235, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13995361, + "step": 8494, + "time_per_iteration": 2.7411587238311768 + }, + { + "auxiliary_loss_clip": 0.01361972, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.24257541, + "balance_loss_mlp": 1.02023804, + "epoch": 0.5107470314143995, + "flos": 26798109063480.0, + "grad_norm": 1.8863952576538028, + "language_loss": 0.79412436, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81807888, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13232422, + "step": 8495, + "time_per_iteration": 4.261289834976196 + }, + { + "auxiliary_loss_clip": 0.01357698, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.24160063, + "balance_loss_mlp": 1.01820874, + "epoch": 0.5108071546670675, + "flos": 25745301876600.0, + "grad_norm": 1.4502518438199654, + "language_loss": 0.78594565, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80984151, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.13684082, + "step": 8496, + "time_per_iteration": 2.861020803451538 + }, + { + "auxiliary_loss_clip": 0.01353627, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.24044228, + "balance_loss_mlp": 1.01883972, + "epoch": 0.5108672779197354, + "flos": 18702176012640.0, + "grad_norm": 1.9350279175679708, + "language_loss": 0.79376161, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.81761432, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.12805176, + "step": 8497, + "time_per_iteration": 2.7565383911132812 + }, + { + "auxiliary_loss_clip": 0.01354493, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.23859489, + "balance_loss_mlp": 1.01597655, + "epoch": 0.5109274011724034, + "flos": 26693877738600.0, + "grad_norm": 1.6661721957853477, + "language_loss": 0.81968349, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84351856, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13043213, + "step": 8498, + "time_per_iteration": 2.83261775970459 + }, + { + "auxiliary_loss_clip": 0.01350832, + "auxiliary_loss_mlp": 0.01035271, + "balance_loss_clip": 1.23736572, + "balance_loss_mlp": 1.0217824, + "epoch": 0.5109875244250714, + "flos": 22789211103360.0, + "grad_norm": 1.764133541158174, + "language_loss": 0.71033621, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73419726, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13476562, + "step": 8499, + "time_per_iteration": 4.2368385791778564 + }, + { + "auxiliary_loss_clip": 0.0135907, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.24248624, + "balance_loss_mlp": 1.0217886, + "epoch": 0.5110476476777394, + "flos": 35595107065440.0, + "grad_norm": 1.486426115539843, + "language_loss": 0.72172928, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74567038, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13244629, + "step": 8500, + "time_per_iteration": 2.854343891143799 + }, + { + "auxiliary_loss_clip": 0.01371488, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.24940133, + "balance_loss_mlp": 1.01962042, + "epoch": 0.5111077709304074, + "flos": 19285446446640.0, + "grad_norm": 3.9196514719328954, + "language_loss": 0.63622785, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.66028905, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.15002441, + "step": 8501, + "time_per_iteration": 2.732902765274048 + }, + { + "auxiliary_loss_clip": 0.01366213, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.2450788, + "balance_loss_mlp": 1.02339101, + "epoch": 0.5111678941830753, + "flos": 20673160381080.0, + "grad_norm": 1.8191242970979664, + "language_loss": 0.87899023, + "learning_rate": 2.024730186540907e-06, + "loss": 0.90302765, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14123535, + "step": 8502, + "time_per_iteration": 2.726475954055786 + }, + { + "auxiliary_loss_clip": 0.01356415, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.2400403, + "balance_loss_mlp": 1.01964271, + "epoch": 0.5112280174357433, + "flos": 26293706710560.0, + "grad_norm": 1.41020296108566, + "language_loss": 0.82998192, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.85387444, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13208008, + "step": 8503, + "time_per_iteration": 2.8380398750305176 + }, + { + "auxiliary_loss_clip": 0.012212, + "auxiliary_loss_mlp": 0.01004463, + "balance_loss_clip": 1.17284441, + "balance_loss_mlp": 1.0021857, + "epoch": 0.5112881406884112, + "flos": 59487097265640.0, + "grad_norm": 0.8622245746752659, + "language_loss": 0.63931769, + "learning_rate": 2.023951320871339e-06, + "loss": 0.6615743, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02282715, + "step": 8504, + "time_per_iteration": 3.2805356979370117 + }, + { + "auxiliary_loss_clip": 0.01354925, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.23940539, + "balance_loss_mlp": 1.01586819, + "epoch": 0.5113482639410792, + "flos": 26474491440000.0, + "grad_norm": 1.5677326559544893, + "language_loss": 0.83957362, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86342102, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.13934326, + "step": 8505, + "time_per_iteration": 2.8082759380340576 + }, + { + "auxiliary_loss_clip": 0.01349, + "auxiliary_loss_mlp": 0.01027205, + "balance_loss_clip": 1.23486733, + "balance_loss_mlp": 1.01373422, + "epoch": 0.5114083871937471, + "flos": 29901580867080.0, + "grad_norm": 5.736677361602048, + "language_loss": 0.75572896, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77949095, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13464355, + "step": 8506, + "time_per_iteration": 2.8096537590026855 + }, + { + "auxiliary_loss_clip": 0.01357688, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.24177814, + "balance_loss_mlp": 1.01847911, + "epoch": 0.5114685104464152, + "flos": 24319717323480.0, + "grad_norm": 1.8028176872232928, + "language_loss": 0.58693933, + "learning_rate": 2.022783015592131e-06, + "loss": 0.6108439, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14306641, + "step": 8507, + "time_per_iteration": 4.263874053955078 + }, + { + "auxiliary_loss_clip": 0.01359114, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.24333286, + "balance_loss_mlp": 1.02539551, + "epoch": 0.5115286336990831, + "flos": 17023720245120.0, + "grad_norm": 1.8461086478051687, + "language_loss": 0.8602922, + "learning_rate": 2.022393578751503e-06, + "loss": 0.88427871, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.14117432, + "step": 8508, + "time_per_iteration": 2.7645976543426514 + }, + { + "auxiliary_loss_clip": 0.01357248, + "auxiliary_loss_mlp": 0.01039386, + "balance_loss_clip": 1.24070024, + "balance_loss_mlp": 1.02504468, + "epoch": 0.5115887569517511, + "flos": 23664644663040.0, + "grad_norm": 2.154467274328438, + "language_loss": 0.72421026, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74817663, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14337158, + "step": 8509, + "time_per_iteration": 2.7688136100769043 + }, + { + "auxiliary_loss_clip": 0.01352455, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.23910618, + "balance_loss_mlp": 1.01534724, + "epoch": 0.511648880204419, + "flos": 16111512492480.0, + "grad_norm": 1.6304546761182868, + "language_loss": 0.76899767, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.79280126, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.12554932, + "step": 8510, + "time_per_iteration": 2.7323427200317383 + }, + { + "auxiliary_loss_clip": 0.01354655, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.24147677, + "balance_loss_mlp": 1.01890779, + "epoch": 0.511709003457087, + "flos": 32641412185440.0, + "grad_norm": 1.497646738155972, + "language_loss": 0.71416306, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.7380268, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12811279, + "step": 8511, + "time_per_iteration": 2.892206907272339 + }, + { + "auxiliary_loss_clip": 0.01360393, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.2474699, + "balance_loss_mlp": 1.01702964, + "epoch": 0.511769126709755, + "flos": 21767086855440.0, + "grad_norm": 2.0133331472860982, + "language_loss": 0.66563928, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68954456, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13098145, + "step": 8512, + "time_per_iteration": 2.7718255519866943 + }, + { + "auxiliary_loss_clip": 0.01358576, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.2411803, + "balance_loss_mlp": 1.02106404, + "epoch": 0.511829249962423, + "flos": 23920723937880.0, + "grad_norm": 1.7609922070925503, + "language_loss": 0.67373729, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.69767725, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14355469, + "step": 8513, + "time_per_iteration": 2.838862180709839 + }, + { + "auxiliary_loss_clip": 0.01352565, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.23911798, + "balance_loss_mlp": 1.02262568, + "epoch": 0.511889373215091, + "flos": 23731573886280.0, + "grad_norm": 2.001711518263354, + "language_loss": 0.69427228, + "learning_rate": 2.0200569403921e-06, + "loss": 0.7181651, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14093018, + "step": 8514, + "time_per_iteration": 2.7688732147216797 + }, + { + "auxiliary_loss_clip": 0.0135068, + "auxiliary_loss_mlp": 0.01026746, + "balance_loss_clip": 1.23575568, + "balance_loss_mlp": 1.01418185, + "epoch": 0.5119494964677589, + "flos": 28117472482080.0, + "grad_norm": 1.7024565668498455, + "language_loss": 0.66484535, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68861961, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.12573242, + "step": 8515, + "time_per_iteration": 2.812896490097046 + }, + { + "auxiliary_loss_clip": 0.01347881, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.23517513, + "balance_loss_mlp": 1.01757002, + "epoch": 0.5120096197204269, + "flos": 24978688386480.0, + "grad_norm": 1.9853945281421166, + "language_loss": 0.7561422, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77992284, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.12615967, + "step": 8516, + "time_per_iteration": 2.767055034637451 + }, + { + "auxiliary_loss_clip": 0.01359618, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.24422932, + "balance_loss_mlp": 1.01750326, + "epoch": 0.5120697429730948, + "flos": 17972580365640.0, + "grad_norm": 1.7968383219724964, + "language_loss": 0.77717984, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80108523, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13415527, + "step": 8517, + "time_per_iteration": 2.758328676223755 + }, + { + "auxiliary_loss_clip": 0.01362074, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.24274516, + "balance_loss_mlp": 1.01561117, + "epoch": 0.5121298662257628, + "flos": 23297268208680.0, + "grad_norm": 1.7300397285181885, + "language_loss": 0.74030888, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76422513, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.13922119, + "step": 8518, + "time_per_iteration": 2.867873191833496 + }, + { + "auxiliary_loss_clip": 0.01350414, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.2349031, + "balance_loss_mlp": 1.02103066, + "epoch": 0.5121899894784308, + "flos": 17315517895560.0, + "grad_norm": 1.6351273724227713, + "language_loss": 0.78665721, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.81051171, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14007568, + "step": 8519, + "time_per_iteration": 2.7458276748657227 + }, + { + "auxiliary_loss_clip": 0.01353117, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.23915839, + "balance_loss_mlp": 1.01896155, + "epoch": 0.5122501127310988, + "flos": 24934685905440.0, + "grad_norm": 1.607739838666969, + "language_loss": 0.79477298, + "learning_rate": 2.017720274652497e-06, + "loss": 0.8186245, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13079834, + "step": 8520, + "time_per_iteration": 2.8084206581115723 + }, + { + "auxiliary_loss_clip": 0.0136297, + "auxiliary_loss_mlp": 0.0103911, + "balance_loss_clip": 1.24177814, + "balance_loss_mlp": 1.0243634, + "epoch": 0.5123102359837667, + "flos": 18447558638760.0, + "grad_norm": 1.836326822490432, + "language_loss": 0.8204689, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.84448975, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14758301, + "step": 8521, + "time_per_iteration": 2.7166385650634766 + }, + { + "auxiliary_loss_clip": 0.0135219, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.23548841, + "balance_loss_mlp": 1.02007091, + "epoch": 0.5123703592364347, + "flos": 26690141769480.0, + "grad_norm": 1.7437107594356875, + "language_loss": 0.68758082, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.71144032, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13684082, + "step": 8522, + "time_per_iteration": 2.8091320991516113 + }, + { + "auxiliary_loss_clip": 0.01363995, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.24374235, + "balance_loss_mlp": 1.02347946, + "epoch": 0.5124304824891026, + "flos": 28809887852520.0, + "grad_norm": 1.604964600891611, + "language_loss": 0.62140995, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.64543134, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14660645, + "step": 8523, + "time_per_iteration": 2.8181545734405518 + }, + { + "auxiliary_loss_clip": 0.01356149, + "auxiliary_loss_mlp": 0.01040422, + "balance_loss_clip": 1.24029362, + "balance_loss_mlp": 1.02777362, + "epoch": 0.5124906057417706, + "flos": 21766802596920.0, + "grad_norm": 2.0812965903205245, + "language_loss": 0.78474784, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80871356, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.12652588, + "step": 8524, + "time_per_iteration": 2.7745306491851807 + }, + { + "auxiliary_loss_clip": 0.01359073, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.24407339, + "balance_loss_mlp": 1.02236795, + "epoch": 0.5125507289944387, + "flos": 18885275418600.0, + "grad_norm": 1.791797549031983, + "language_loss": 0.75611174, + "learning_rate": 2.015773034588706e-06, + "loss": 0.7800501, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.1239624, + "step": 8525, + "time_per_iteration": 2.73274302482605 + }, + { + "auxiliary_loss_clip": 0.01360086, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.241575, + "balance_loss_mlp": 1.02375066, + "epoch": 0.5126108522471066, + "flos": 35634805060320.0, + "grad_norm": 1.469139842759508, + "language_loss": 0.74639499, + "learning_rate": 2.015383584722531e-06, + "loss": 0.77037281, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.1394043, + "step": 8526, + "time_per_iteration": 2.923511028289795 + }, + { + "auxiliary_loss_clip": 0.01364146, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.24728346, + "balance_loss_mlp": 1.02099288, + "epoch": 0.5126709754997746, + "flos": 20195420739480.0, + "grad_norm": 1.6018263775781107, + "language_loss": 0.65701437, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.68100405, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.13818359, + "step": 8527, + "time_per_iteration": 2.7999136447906494 + }, + { + "auxiliary_loss_clip": 0.01341913, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.23224831, + "balance_loss_mlp": 1.02264214, + "epoch": 0.5127310987524425, + "flos": 18593234118000.0, + "grad_norm": 1.6584306522995893, + "language_loss": 0.7393434, + "learning_rate": 2.014604683254908e-06, + "loss": 0.7631076, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.11859131, + "step": 8528, + "time_per_iteration": 2.7325501441955566 + }, + { + "auxiliary_loss_clip": 0.01352075, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.23605037, + "balance_loss_mlp": 1.02183008, + "epoch": 0.5127912220051105, + "flos": 22459705267680.0, + "grad_norm": 1.5887624555461979, + "language_loss": 0.83419824, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85806799, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13061523, + "step": 8529, + "time_per_iteration": 2.8063580989837646 + }, + { + "auxiliary_loss_clip": 0.01346945, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.23262954, + "balance_loss_mlp": 1.01837873, + "epoch": 0.5128513452577784, + "flos": 19097717687640.0, + "grad_norm": 1.7243248603297925, + "language_loss": 0.7375282, + "learning_rate": 2.01382577957204e-06, + "loss": 0.76131213, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13092041, + "step": 8530, + "time_per_iteration": 4.244077920913696 + }, + { + "auxiliary_loss_clip": 0.01220615, + "auxiliary_loss_mlp": 0.01017707, + "balance_loss_clip": 1.17060792, + "balance_loss_mlp": 1.01550186, + "epoch": 0.5129114685104464, + "flos": 67909669939800.0, + "grad_norm": 0.7393776366027928, + "language_loss": 0.60831165, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.63069487, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.02209473, + "step": 8531, + "time_per_iteration": 3.3415346145629883 + }, + { + "auxiliary_loss_clip": 0.0136013, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.24167001, + "balance_loss_mlp": 1.02018094, + "epoch": 0.5129715917631144, + "flos": 20454058341000.0, + "grad_norm": 1.9399621605482305, + "language_loss": 0.77090454, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79484022, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13250732, + "step": 8532, + "time_per_iteration": 2.780428171157837 + }, + { + "auxiliary_loss_clip": 0.01358029, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.24197507, + "balance_loss_mlp": 1.01975739, + "epoch": 0.5130317150157824, + "flos": 35122930769160.0, + "grad_norm": 2.0789615738202474, + "language_loss": 0.6760937, + "learning_rate": 2.012657420152597e-06, + "loss": 0.70000792, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.13635254, + "step": 8533, + "time_per_iteration": 2.8634634017944336 + }, + { + "auxiliary_loss_clip": 0.01362182, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.24388242, + "balance_loss_mlp": 1.02428269, + "epoch": 0.5130918382684503, + "flos": 19796346137160.0, + "grad_norm": 1.8494919522009894, + "language_loss": 0.82373476, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84774739, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14794922, + "step": 8534, + "time_per_iteration": 4.194810152053833 + }, + { + "auxiliary_loss_clip": 0.01356727, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.23973441, + "balance_loss_mlp": 1.02108479, + "epoch": 0.5131519615211183, + "flos": 26328734744040.0, + "grad_norm": 1.3548687739310037, + "language_loss": 0.63978571, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.66370428, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14044189, + "step": 8535, + "time_per_iteration": 2.8684844970703125 + }, + { + "auxiliary_loss_clip": 0.01352157, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.23700094, + "balance_loss_mlp": 1.01608729, + "epoch": 0.5132120847737862, + "flos": 19176626377080.0, + "grad_norm": 1.499300476932551, + "language_loss": 0.69490469, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71871626, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.12902832, + "step": 8536, + "time_per_iteration": 2.8129425048828125 + }, + { + "auxiliary_loss_clip": 0.01360656, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.24028921, + "balance_loss_mlp": 1.01637542, + "epoch": 0.5132722080264542, + "flos": 20235321776160.0, + "grad_norm": 1.9372970124857334, + "language_loss": 0.7160058, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73992884, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.15270996, + "step": 8537, + "time_per_iteration": 2.847362995147705 + }, + { + "auxiliary_loss_clip": 0.01360013, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.24059868, + "balance_loss_mlp": 1.01660097, + "epoch": 0.5133323312791223, + "flos": 16473488034960.0, + "grad_norm": 1.8163880669309107, + "language_loss": 0.80718249, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.83109581, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.14697266, + "step": 8538, + "time_per_iteration": 4.260464429855347 + }, + { + "auxiliary_loss_clip": 0.01353687, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.23697817, + "balance_loss_mlp": 1.01471663, + "epoch": 0.5133924545317902, + "flos": 26073751894920.0, + "grad_norm": 2.5526389158693035, + "language_loss": 0.78607833, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80989826, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13580322, + "step": 8539, + "time_per_iteration": 2.789992570877075 + }, + { + "auxiliary_loss_clip": 0.01356417, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.23911035, + "balance_loss_mlp": 1.01995409, + "epoch": 0.5134525777844582, + "flos": 29136835361520.0, + "grad_norm": 1.5376358084382271, + "language_loss": 0.76741165, + "learning_rate": 2.009931232064105e-06, + "loss": 0.79131788, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.14263916, + "step": 8540, + "time_per_iteration": 2.8418374061584473 + }, + { + "auxiliary_loss_clip": 0.01369128, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.24799252, + "balance_loss_mlp": 1.01580977, + "epoch": 0.5135127010371261, + "flos": 17458959915000.0, + "grad_norm": 1.5384555105037154, + "language_loss": 0.74656367, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.77056575, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.15258789, + "step": 8541, + "time_per_iteration": 2.729995012283325 + }, + { + "auxiliary_loss_clip": 0.01355547, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.23907924, + "balance_loss_mlp": 1.02364326, + "epoch": 0.5135728242897941, + "flos": 21950064436320.0, + "grad_norm": 1.5641439269765838, + "language_loss": 0.70800483, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.73193508, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.1383667, + "step": 8542, + "time_per_iteration": 2.822843313217163 + }, + { + "auxiliary_loss_clip": 0.01355791, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.23903358, + "balance_loss_mlp": 1.01558542, + "epoch": 0.513632947542462, + "flos": 22680066166920.0, + "grad_norm": 1.8180590822374751, + "language_loss": 0.79783309, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.82168359, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13696289, + "step": 8543, + "time_per_iteration": 2.78717303276062 + }, + { + "auxiliary_loss_clip": 0.01358557, + "auxiliary_loss_mlp": 0.01035833, + "balance_loss_clip": 1.24298632, + "balance_loss_mlp": 1.02146816, + "epoch": 0.51369307079513, + "flos": 29462727053160.0, + "grad_norm": 1.829632562377682, + "language_loss": 0.68229711, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70624101, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14349365, + "step": 8544, + "time_per_iteration": 2.794896125793457 + }, + { + "auxiliary_loss_clip": 0.01355982, + "auxiliary_loss_mlp": 0.01034943, + "balance_loss_clip": 1.23622632, + "balance_loss_mlp": 1.02156734, + "epoch": 0.513753194047798, + "flos": 18994014271440.0, + "grad_norm": 1.9391443232501844, + "language_loss": 0.72531796, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74922723, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.13372803, + "step": 8545, + "time_per_iteration": 2.7632625102996826 + }, + { + "auxiliary_loss_clip": 0.0136002, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.24090862, + "balance_loss_mlp": 1.01920664, + "epoch": 0.513813317300466, + "flos": 17826376977720.0, + "grad_norm": 1.989620575985546, + "language_loss": 0.8217752, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84571517, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14782715, + "step": 8546, + "time_per_iteration": 4.345507621765137 + }, + { + "auxiliary_loss_clip": 0.01359756, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.24145508, + "balance_loss_mlp": 1.01809859, + "epoch": 0.5138734405531339, + "flos": 24066561850560.0, + "grad_norm": 1.6367710908072373, + "language_loss": 0.73671627, + "learning_rate": 2.007205025522544e-06, + "loss": 0.76064491, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.15014648, + "step": 8547, + "time_per_iteration": 2.776090383529663 + }, + { + "auxiliary_loss_clip": 0.01348509, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.23241282, + "balance_loss_mlp": 1.02308297, + "epoch": 0.5139335638058019, + "flos": 26101957723920.0, + "grad_norm": 1.5977053826292187, + "language_loss": 0.73862946, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.76248538, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.14007568, + "step": 8548, + "time_per_iteration": 2.821165084838867 + }, + { + "auxiliary_loss_clip": 0.01358241, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.24116707, + "balance_loss_mlp": 1.01438475, + "epoch": 0.5139936870584698, + "flos": 18921846569760.0, + "grad_norm": 1.6877738335322539, + "language_loss": 0.82505512, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84892517, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14379883, + "step": 8549, + "time_per_iteration": 2.7826504707336426 + }, + { + "auxiliary_loss_clip": 0.01349773, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.23485279, + "balance_loss_mlp": 1.0149579, + "epoch": 0.5140538103111378, + "flos": 16148367902160.0, + "grad_norm": 1.81062359204467, + "language_loss": 0.72437233, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74815524, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13549805, + "step": 8550, + "time_per_iteration": 2.7550644874572754 + }, + { + "auxiliary_loss_clip": 0.01366097, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.24496412, + "balance_loss_mlp": 1.02062583, + "epoch": 0.5141139335638057, + "flos": 22425408184680.0, + "grad_norm": 1.501656188696358, + "language_loss": 0.75820369, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.78221571, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14483643, + "step": 8551, + "time_per_iteration": 2.7663605213165283 + }, + { + "auxiliary_loss_clip": 0.01348364, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.23586226, + "balance_loss_mlp": 1.0137527, + "epoch": 0.5141740568164738, + "flos": 27095429450880.0, + "grad_norm": 1.5748270598631011, + "language_loss": 0.69220537, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71595883, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13226318, + "step": 8552, + "time_per_iteration": 2.80167818069458 + }, + { + "auxiliary_loss_clip": 0.01362173, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.24451137, + "balance_loss_mlp": 1.01432395, + "epoch": 0.5142341800691418, + "flos": 24978728994840.0, + "grad_norm": 1.812630072885497, + "language_loss": 0.7524811, + "learning_rate": 2.004868266210965e-06, + "loss": 0.77638996, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.144104, + "step": 8553, + "time_per_iteration": 2.7731645107269287 + }, + { + "auxiliary_loss_clip": 0.01358573, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.24314225, + "balance_loss_mlp": 1.0203855, + "epoch": 0.5142943033218097, + "flos": 20709447273720.0, + "grad_norm": 1.598100093910891, + "language_loss": 0.68206763, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70599198, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13470459, + "step": 8554, + "time_per_iteration": 2.8124163150787354 + }, + { + "auxiliary_loss_clip": 0.0136426, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.24296749, + "balance_loss_mlp": 1.01773906, + "epoch": 0.5143544265744777, + "flos": 22930094796120.0, + "grad_norm": 1.7498099940843888, + "language_loss": 0.73787266, + "learning_rate": 2.004089344806068e-06, + "loss": 0.76185459, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.16186523, + "step": 8555, + "time_per_iteration": 2.7925078868865967 + }, + { + "auxiliary_loss_clip": 0.01360214, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.24182117, + "balance_loss_mlp": 1.0178808, + "epoch": 0.5144145498271456, + "flos": 15925245634440.0, + "grad_norm": 2.921256122008567, + "language_loss": 0.75300729, + "learning_rate": 2.003699883863633e-06, + "loss": 0.77692831, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.13995361, + "step": 8556, + "time_per_iteration": 2.701327085494995 + }, + { + "auxiliary_loss_clip": 0.0134656, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.23326659, + "balance_loss_mlp": 1.01621115, + "epoch": 0.5144746730798136, + "flos": 19685820516480.0, + "grad_norm": 1.7453347028058408, + "language_loss": 0.86390102, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88765955, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13079834, + "step": 8557, + "time_per_iteration": 2.904773473739624 + }, + { + "auxiliary_loss_clip": 0.01349037, + "auxiliary_loss_mlp": 0.01025338, + "balance_loss_clip": 1.23518932, + "balance_loss_mlp": 1.01247478, + "epoch": 0.5145347963324816, + "flos": 23919992987400.0, + "grad_norm": 1.593918858385201, + "language_loss": 0.89293683, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91668057, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12872314, + "step": 8558, + "time_per_iteration": 2.784550666809082 + }, + { + "auxiliary_loss_clip": 0.01349025, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.23757851, + "balance_loss_mlp": 1.01520967, + "epoch": 0.5145949195851496, + "flos": 18264621666240.0, + "grad_norm": 2.6287592084283333, + "language_loss": 0.65916586, + "learning_rate": 2.002531500253602e-06, + "loss": 0.68294817, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14007568, + "step": 8559, + "time_per_iteration": 2.7625911235809326 + }, + { + "auxiliary_loss_clip": 0.01356335, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.24217248, + "balance_loss_mlp": 1.01337695, + "epoch": 0.5146550428378175, + "flos": 26219021290560.0, + "grad_norm": 1.530179157773058, + "language_loss": 0.63973153, + "learning_rate": 2.002142038838577e-06, + "loss": 0.6635614, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13262939, + "step": 8560, + "time_per_iteration": 2.811291456222534 + }, + { + "auxiliary_loss_clip": 0.01358697, + "auxiliary_loss_mlp": 0.01028699, + "balance_loss_clip": 1.24266684, + "balance_loss_mlp": 1.01555586, + "epoch": 0.5147151660904855, + "flos": 22679294608080.0, + "grad_norm": 1.7828851748868466, + "language_loss": 0.70837998, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.73225391, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.1315918, + "step": 8561, + "time_per_iteration": 2.785125732421875 + }, + { + "auxiliary_loss_clip": 0.01356638, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.24037182, + "balance_loss_mlp": 1.01659274, + "epoch": 0.5147752893431534, + "flos": 24977754394200.0, + "grad_norm": 1.5252661683048465, + "language_loss": 0.66657519, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.69043905, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.13146973, + "step": 8562, + "time_per_iteration": 2.8221075534820557 + }, + { + "auxiliary_loss_clip": 0.01368685, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.251086, + "balance_loss_mlp": 1.01621473, + "epoch": 0.5148354125958214, + "flos": 22749472500120.0, + "grad_norm": 2.0643748279242513, + "language_loss": 0.78070843, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.80469155, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13409424, + "step": 8563, + "time_per_iteration": 2.8236682415008545 + }, + { + "auxiliary_loss_clip": 0.01370384, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.2484107, + "balance_loss_mlp": 1.0175817, + "epoch": 0.5148955358484893, + "flos": 23073049515240.0, + "grad_norm": 1.9101644871978, + "language_loss": 0.83025324, + "learning_rate": 2.0005841925139e-06, + "loss": 0.8542881, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.1552124, + "step": 8564, + "time_per_iteration": 2.7803516387939453 + }, + { + "auxiliary_loss_clip": 0.01375, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.25547624, + "balance_loss_mlp": 1.0202831, + "epoch": 0.5149556591011574, + "flos": 20345075838000.0, + "grad_norm": 1.629045040332506, + "language_loss": 0.73113042, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75522435, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14111328, + "step": 8565, + "time_per_iteration": 2.7537856101989746 + }, + { + "auxiliary_loss_clip": 0.01374828, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.2535311, + "balance_loss_mlp": 1.01517296, + "epoch": 0.5150157823538254, + "flos": 22643413799040.0, + "grad_norm": 2.417265686009181, + "language_loss": 0.685431, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70947689, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14599609, + "step": 8566, + "time_per_iteration": 2.7874958515167236 + }, + { + "auxiliary_loss_clip": 0.01375193, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.25325048, + "balance_loss_mlp": 1.01652789, + "epoch": 0.5150759056064933, + "flos": 26073223986240.0, + "grad_norm": 1.6626503931189893, + "language_loss": 0.78299725, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80705631, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.14196777, + "step": 8567, + "time_per_iteration": 2.811345338821411 + }, + { + "auxiliary_loss_clip": 0.01370669, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.25275612, + "balance_loss_mlp": 1.01683831, + "epoch": 0.5151360288591613, + "flos": 25957500495480.0, + "grad_norm": 2.1676448379074964, + "language_loss": 0.7933507, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81736434, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13861084, + "step": 8568, + "time_per_iteration": 2.7965047359466553 + }, + { + "auxiliary_loss_clip": 0.01356161, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.24127471, + "balance_loss_mlp": 1.0230186, + "epoch": 0.5151961521118292, + "flos": 18510833109600.0, + "grad_norm": 2.073080291798002, + "language_loss": 0.90884674, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93277311, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13439941, + "step": 8569, + "time_per_iteration": 2.7220048904418945 + }, + { + "auxiliary_loss_clip": 0.0137501, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.25616193, + "balance_loss_mlp": 1.0283699, + "epoch": 0.5152562753644973, + "flos": 22238410376160.0, + "grad_norm": 1.5083047440423605, + "language_loss": 0.7668978, + "learning_rate": 1.998247422657674e-06, + "loss": 0.79107261, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.14111328, + "step": 8570, + "time_per_iteration": 4.244941473007202 + }, + { + "auxiliary_loss_clip": 0.0136148, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.2447902, + "balance_loss_mlp": 1.02969027, + "epoch": 0.5153163986171652, + "flos": 38443799061720.0, + "grad_norm": 1.486289461121965, + "language_loss": 0.73935205, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.76341188, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14819336, + "step": 8571, + "time_per_iteration": 2.9744975566864014 + }, + { + "auxiliary_loss_clip": 0.01218603, + "auxiliary_loss_mlp": 0.01015273, + "balance_loss_clip": 1.16872811, + "balance_loss_mlp": 1.01301992, + "epoch": 0.5153765218698332, + "flos": 66400141260600.0, + "grad_norm": 0.9163536312775292, + "language_loss": 0.52943504, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.55177385, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.02258301, + "step": 8572, + "time_per_iteration": 3.3482751846313477 + }, + { + "auxiliary_loss_clip": 0.01361661, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.24950075, + "balance_loss_mlp": 1.01857734, + "epoch": 0.5154366451225011, + "flos": 24030031307760.0, + "grad_norm": 1.582014693595067, + "language_loss": 0.75861901, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78254771, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.12628174, + "step": 8573, + "time_per_iteration": 4.222577333450317 + }, + { + "auxiliary_loss_clip": 0.01362023, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.24739206, + "balance_loss_mlp": 1.01837003, + "epoch": 0.5154967683751691, + "flos": 23473017501480.0, + "grad_norm": 1.995770282511251, + "language_loss": 0.77429521, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79823476, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13568115, + "step": 8574, + "time_per_iteration": 2.8032779693603516 + }, + { + "auxiliary_loss_clip": 0.01356159, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.24153185, + "balance_loss_mlp": 1.02568877, + "epoch": 0.515556891627837, + "flos": 23810969876040.0, + "grad_norm": 1.652617242053816, + "language_loss": 0.85897893, + "learning_rate": 1.996300116136367e-06, + "loss": 0.88292432, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.12689209, + "step": 8575, + "time_per_iteration": 2.76350736618042 + }, + { + "auxiliary_loss_clip": 0.01367425, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.24966311, + "balance_loss_mlp": 1.02434111, + "epoch": 0.515617014880505, + "flos": 19833323371920.0, + "grad_norm": 1.4128555957226534, + "language_loss": 0.77310902, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79716158, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13482666, + "step": 8576, + "time_per_iteration": 4.271980047225952 + }, + { + "auxiliary_loss_clip": 0.01380868, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.25729859, + "balance_loss_mlp": 1.02049208, + "epoch": 0.515677138133173, + "flos": 14249916710640.0, + "grad_norm": 2.2848380383064066, + "language_loss": 0.75926685, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.78342831, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.14776611, + "step": 8577, + "time_per_iteration": 2.8102214336395264 + }, + { + "auxiliary_loss_clip": 0.01368963, + "auxiliary_loss_mlp": 0.01039997, + "balance_loss_clip": 1.24963152, + "balance_loss_mlp": 1.02466059, + "epoch": 0.515737261385841, + "flos": 28295130367800.0, + "grad_norm": 1.808934696324195, + "language_loss": 0.81465942, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83874905, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.15307617, + "step": 8578, + "time_per_iteration": 2.84617280960083 + }, + { + "auxiliary_loss_clip": 0.01359381, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.24295688, + "balance_loss_mlp": 1.02044892, + "epoch": 0.515797384638509, + "flos": 27897761316600.0, + "grad_norm": 1.6580585542243893, + "language_loss": 0.76468515, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78861511, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.1315918, + "step": 8579, + "time_per_iteration": 2.859649181365967 + }, + { + "auxiliary_loss_clip": 0.01366276, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.2482419, + "balance_loss_mlp": 1.02014375, + "epoch": 0.5158575078911769, + "flos": 23045371594920.0, + "grad_norm": 1.6427073985078016, + "language_loss": 0.79654402, + "learning_rate": 1.994352813122559e-06, + "loss": 0.82054234, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.1340332, + "step": 8580, + "time_per_iteration": 3.003847360610962 + }, + { + "auxiliary_loss_clip": 0.01375845, + "auxiliary_loss_mlp": 0.01040844, + "balance_loss_clip": 1.25676894, + "balance_loss_mlp": 1.02622938, + "epoch": 0.5159176311438449, + "flos": 12645252979200.0, + "grad_norm": 2.69800711827504, + "language_loss": 0.73253888, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75670576, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.1461792, + "step": 8581, + "time_per_iteration": 2.733076572418213 + }, + { + "auxiliary_loss_clip": 0.01358763, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.24456692, + "balance_loss_mlp": 1.02055538, + "epoch": 0.5159777543965128, + "flos": 15562173666240.0, + "grad_norm": 2.1520572940379172, + "language_loss": 0.75520825, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.77913523, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1338501, + "step": 8582, + "time_per_iteration": 2.816026449203491 + }, + { + "auxiliary_loss_clip": 0.01365267, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.2502476, + "balance_loss_mlp": 1.01882994, + "epoch": 0.5160378776491809, + "flos": 23226887274840.0, + "grad_norm": 2.2480376724734543, + "language_loss": 0.66527271, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68923926, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.12567139, + "step": 8583, + "time_per_iteration": 2.784761667251587 + }, + { + "auxiliary_loss_clip": 0.01374986, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.25834799, + "balance_loss_mlp": 1.01913249, + "epoch": 0.5160980009018488, + "flos": 21949658352720.0, + "grad_norm": 1.3977431916128467, + "language_loss": 0.76193607, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78602082, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14355469, + "step": 8584, + "time_per_iteration": 4.326828479766846 + }, + { + "auxiliary_loss_clip": 0.01371632, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.252316, + "balance_loss_mlp": 1.02880192, + "epoch": 0.5161581241545168, + "flos": 22789251711720.0, + "grad_norm": 2.0897182499390317, + "language_loss": 0.7912311, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.81537598, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.14068604, + "step": 8585, + "time_per_iteration": 2.7997395992279053 + }, + { + "auxiliary_loss_clip": 0.01353261, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.24120426, + "balance_loss_mlp": 1.01965904, + "epoch": 0.5162182474071847, + "flos": 19679201353800.0, + "grad_norm": 1.9657266783306686, + "language_loss": 0.80837983, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.8322379, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12884521, + "step": 8586, + "time_per_iteration": 2.7308084964752197 + }, + { + "auxiliary_loss_clip": 0.01360603, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.24565136, + "balance_loss_mlp": 1.01638508, + "epoch": 0.5162783706598527, + "flos": 20050841685960.0, + "grad_norm": 1.9339599067804192, + "language_loss": 0.72198606, + "learning_rate": 1.991626598310701e-06, + "loss": 0.74589419, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13830566, + "step": 8587, + "time_per_iteration": 2.8125948905944824 + }, + { + "auxiliary_loss_clip": 0.01218278, + "auxiliary_loss_mlp": 0.01009295, + "balance_loss_clip": 1.16888022, + "balance_loss_mlp": 1.00681555, + "epoch": 0.5163384939125206, + "flos": 69975586318680.0, + "grad_norm": 0.7306246730076105, + "language_loss": 0.57907796, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.6013537, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.02478027, + "step": 8588, + "time_per_iteration": 3.2644338607788086 + }, + { + "auxiliary_loss_clip": 0.01371594, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.2538799, + "balance_loss_mlp": 1.02239418, + "epoch": 0.5163986171651886, + "flos": 17421048687960.0, + "grad_norm": 2.1878534273344825, + "language_loss": 0.75744128, + "learning_rate": 1.990847682429185e-06, + "loss": 0.78152549, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14428711, + "step": 8589, + "time_per_iteration": 2.7517826557159424 + }, + { + "auxiliary_loss_clip": 0.0137111, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.25351727, + "balance_loss_mlp": 1.02111423, + "epoch": 0.5164587404178566, + "flos": 21327420874320.0, + "grad_norm": 1.5761561206049135, + "language_loss": 0.67860323, + "learning_rate": 1.990458225001627e-06, + "loss": 0.70265448, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.12915039, + "step": 8590, + "time_per_iteration": 2.872390031814575 + }, + { + "auxiliary_loss_clip": 0.01213332, + "auxiliary_loss_mlp": 0.0100365, + "balance_loss_clip": 1.16413045, + "balance_loss_mlp": 1.00165927, + "epoch": 0.5165188636705246, + "flos": 68072262123960.0, + "grad_norm": 0.795236258360989, + "language_loss": 0.55896831, + "learning_rate": 1.990068767935895e-06, + "loss": 0.58113813, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.01989746, + "step": 8591, + "time_per_iteration": 3.1672279834747314 + }, + { + "auxiliary_loss_clip": 0.01349448, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.24045682, + "balance_loss_mlp": 1.01632071, + "epoch": 0.5165789869231926, + "flos": 19389718379880.0, + "grad_norm": 1.5544361406283254, + "language_loss": 0.81742972, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.84121192, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12432861, + "step": 8592, + "time_per_iteration": 2.7988765239715576 + }, + { + "auxiliary_loss_clip": 0.01359647, + "auxiliary_loss_mlp": 0.01028325, + "balance_loss_clip": 1.24782181, + "balance_loss_mlp": 1.01586211, + "epoch": 0.5166391101758605, + "flos": 20965201681680.0, + "grad_norm": 1.7916239701075796, + "language_loss": 0.83308864, + "learning_rate": 1.989289854948979e-06, + "loss": 0.8569684, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12475586, + "step": 8593, + "time_per_iteration": 2.7722835540771484 + }, + { + "auxiliary_loss_clip": 0.01358055, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.24375296, + "balance_loss_mlp": 1.01798487, + "epoch": 0.5166992334285285, + "flos": 29468655873720.0, + "grad_norm": 1.6341143898755313, + "language_loss": 0.69497257, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71886504, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13201904, + "step": 8594, + "time_per_iteration": 2.891399621963501 + }, + { + "auxiliary_loss_clip": 0.01359375, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.24616921, + "balance_loss_mlp": 1.01933825, + "epoch": 0.5167593566811964, + "flos": 20309398070760.0, + "grad_norm": 1.5064495436352847, + "language_loss": 0.7751863, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79910576, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13220215, + "step": 8595, + "time_per_iteration": 2.8432047367095947 + }, + { + "auxiliary_loss_clip": 0.01370969, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.25513935, + "balance_loss_mlp": 1.01919651, + "epoch": 0.5168194799338645, + "flos": 14615506397160.0, + "grad_norm": 4.917437830922116, + "language_loss": 0.66078877, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.68482566, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.13525391, + "step": 8596, + "time_per_iteration": 2.8043901920318604 + }, + { + "auxiliary_loss_clip": 0.01362759, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.24904144, + "balance_loss_mlp": 1.01716387, + "epoch": 0.5168796031865324, + "flos": 25012416952440.0, + "grad_norm": 1.4608246523079909, + "language_loss": 0.75739646, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.78134543, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14971924, + "step": 8597, + "time_per_iteration": 2.8549628257751465 + }, + { + "auxiliary_loss_clip": 0.01366875, + "auxiliary_loss_mlp": 0.01024488, + "balance_loss_clip": 1.25206113, + "balance_loss_mlp": 1.01151824, + "epoch": 0.5169397264392004, + "flos": 26945246443680.0, + "grad_norm": 1.6580321739158361, + "language_loss": 0.81049663, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83441025, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.12963867, + "step": 8598, + "time_per_iteration": 2.800049304962158 + }, + { + "auxiliary_loss_clip": 0.01354402, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.24121821, + "balance_loss_mlp": 1.0165019, + "epoch": 0.5169998496918683, + "flos": 25412912847360.0, + "grad_norm": 1.4405675698985798, + "language_loss": 0.75633538, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.78018034, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13586426, + "step": 8599, + "time_per_iteration": 2.8144969940185547 + }, + { + "auxiliary_loss_clip": 0.01356252, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.2452116, + "balance_loss_mlp": 1.01881337, + "epoch": 0.5170599729445363, + "flos": 24686159785560.0, + "grad_norm": 2.6397470023906164, + "language_loss": 0.72805989, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.75193638, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12585449, + "step": 8600, + "time_per_iteration": 2.7764949798583984 + }, + { + "auxiliary_loss_clip": 0.01363946, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.25035083, + "balance_loss_mlp": 1.01689935, + "epoch": 0.5171200961972042, + "flos": 20999173897800.0, + "grad_norm": 1.489149550654975, + "language_loss": 0.74797261, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.77191061, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.1295166, + "step": 8601, + "time_per_iteration": 2.9296634197235107 + }, + { + "auxiliary_loss_clip": 0.01363849, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.24915433, + "balance_loss_mlp": 1.02637482, + "epoch": 0.5171802194498722, + "flos": 22750568925840.0, + "grad_norm": 1.8927470935932114, + "language_loss": 0.83910251, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.86314487, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14019775, + "step": 8602, + "time_per_iteration": 2.753523826599121 + }, + { + "auxiliary_loss_clip": 0.01367291, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.25165236, + "balance_loss_mlp": 1.01951861, + "epoch": 0.5172403427025402, + "flos": 28181680945200.0, + "grad_norm": 1.7185390220985586, + "language_loss": 0.75043726, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.77444571, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.14031982, + "step": 8603, + "time_per_iteration": 2.817250967025757 + }, + { + "auxiliary_loss_clip": 0.01369769, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.25436366, + "balance_loss_mlp": 1.02136922, + "epoch": 0.5173004659552082, + "flos": 20342476902960.0, + "grad_norm": 2.1755999921773004, + "language_loss": 0.73114377, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.75518107, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.12591553, + "step": 8604, + "time_per_iteration": 2.7644240856170654 + }, + { + "auxiliary_loss_clip": 0.01372735, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.25055122, + "balance_loss_mlp": 1.0170424, + "epoch": 0.5173605892078762, + "flos": 19068131174400.0, + "grad_norm": 1.974602782777484, + "language_loss": 0.85141444, + "learning_rate": 1.984616415277469e-06, + "loss": 0.875462, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.14978027, + "step": 8605, + "time_per_iteration": 2.7273664474487305 + }, + { + "auxiliary_loss_clip": 0.01358753, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.24437082, + "balance_loss_mlp": 1.0168004, + "epoch": 0.5174207124605441, + "flos": 28000205873640.0, + "grad_norm": 1.3766942312532158, + "language_loss": 0.6480962, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67197597, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.12420654, + "step": 8606, + "time_per_iteration": 2.8548104763031006 + }, + { + "auxiliary_loss_clip": 0.01363961, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.25127459, + "balance_loss_mlp": 1.01767683, + "epoch": 0.5174808357132121, + "flos": 19500934342680.0, + "grad_norm": 1.5310342763566676, + "language_loss": 0.77665263, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80059624, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.1272583, + "step": 8607, + "time_per_iteration": 2.7820491790771484 + }, + { + "auxiliary_loss_clip": 0.01359703, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.24436831, + "balance_loss_mlp": 1.01990008, + "epoch": 0.51754095896588, + "flos": 22789373536800.0, + "grad_norm": 1.7635812243700912, + "language_loss": 0.72057575, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74451017, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13842773, + "step": 8608, + "time_per_iteration": 2.786134719848633 + }, + { + "auxiliary_loss_clip": 0.01375394, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.25528193, + "balance_loss_mlp": 1.01593566, + "epoch": 0.5176010822185481, + "flos": 22673731262760.0, + "grad_norm": 1.9298491626037944, + "language_loss": 0.86840427, + "learning_rate": 1.983058619460531e-06, + "loss": 0.89245504, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.13751221, + "step": 8609, + "time_per_iteration": 4.24108624458313 + }, + { + "auxiliary_loss_clip": 0.01357411, + "auxiliary_loss_mlp": 0.0102595, + "balance_loss_clip": 1.24361968, + "balance_loss_mlp": 1.01389205, + "epoch": 0.517661205471216, + "flos": 23956482921840.0, + "grad_norm": 1.7827472145428787, + "language_loss": 0.73766083, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.76149446, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1206665, + "step": 8610, + "time_per_iteration": 4.293105840682983 + }, + { + "auxiliary_loss_clip": 0.01371482, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.25256085, + "balance_loss_mlp": 1.0164969, + "epoch": 0.517721328723884, + "flos": 15600287935080.0, + "grad_norm": 1.9897400251721231, + "language_loss": 0.67745399, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.70147586, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.14215088, + "step": 8611, + "time_per_iteration": 2.731595754623413 + }, + { + "auxiliary_loss_clip": 0.01359555, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.2445507, + "balance_loss_mlp": 1.02097428, + "epoch": 0.5177814519765519, + "flos": 20965201681680.0, + "grad_norm": 1.8320325407284612, + "language_loss": 0.77230394, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79624283, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13366699, + "step": 8612, + "time_per_iteration": 2.7860355377197266 + }, + { + "auxiliary_loss_clip": 0.0136378, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.247015, + "balance_loss_mlp": 1.02371311, + "epoch": 0.5178415752292199, + "flos": 17972377323840.0, + "grad_norm": 2.0018795285579585, + "language_loss": 0.82458586, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84861016, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14929199, + "step": 8613, + "time_per_iteration": 2.709258794784546 + }, + { + "auxiliary_loss_clip": 0.01366111, + "auxiliary_loss_mlp": 0.01029123, + "balance_loss_clip": 1.25130177, + "balance_loss_mlp": 1.01530671, + "epoch": 0.5179016984818878, + "flos": 17825970894120.0, + "grad_norm": 2.710190867939499, + "language_loss": 0.66573048, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68968284, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13812256, + "step": 8614, + "time_per_iteration": 2.7485525608062744 + }, + { + "auxiliary_loss_clip": 0.01366648, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.2492826, + "balance_loss_mlp": 1.02069354, + "epoch": 0.5179618217345558, + "flos": 17824793251680.0, + "grad_norm": 1.8552650110276443, + "language_loss": 0.86817878, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.89219213, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.13989258, + "step": 8615, + "time_per_iteration": 4.25158429145813 + }, + { + "auxiliary_loss_clip": 0.01359441, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.24550509, + "balance_loss_mlp": 1.02471673, + "epoch": 0.5180219449872238, + "flos": 22526431449120.0, + "grad_norm": 1.49488334659493, + "language_loss": 0.81102276, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83498919, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.12487793, + "step": 8616, + "time_per_iteration": 2.747915267944336 + }, + { + "auxiliary_loss_clip": 0.01372109, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.25408745, + "balance_loss_mlp": 1.02367496, + "epoch": 0.5180820682398918, + "flos": 23920845762960.0, + "grad_norm": 2.3288486157263417, + "language_loss": 0.75425601, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77835453, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.14074707, + "step": 8617, + "time_per_iteration": 2.7928037643432617 + }, + { + "auxiliary_loss_clip": 0.01364446, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.24752498, + "balance_loss_mlp": 1.02204037, + "epoch": 0.5181421914925598, + "flos": 16983981641880.0, + "grad_norm": 2.4209124724800146, + "language_loss": 0.69831985, + "learning_rate": 1.979553617893785e-06, + "loss": 0.7223295, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.14477539, + "step": 8618, + "time_per_iteration": 2.740419387817383 + }, + { + "auxiliary_loss_clip": 0.01205081, + "auxiliary_loss_mlp": 0.01003717, + "balance_loss_clip": 1.15763903, + "balance_loss_mlp": 1.00138056, + "epoch": 0.5182023147452277, + "flos": 66075955120080.0, + "grad_norm": 0.9339277843799347, + "language_loss": 0.67310023, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69518822, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.02331543, + "step": 8619, + "time_per_iteration": 3.2618017196655273 + }, + { + "auxiliary_loss_clip": 0.01355727, + "auxiliary_loss_mlp": 0.01024642, + "balance_loss_clip": 1.24343896, + "balance_loss_mlp": 1.01184535, + "epoch": 0.5182624379978957, + "flos": 18192453964560.0, + "grad_norm": 2.1040505791535216, + "language_loss": 0.79856002, + "learning_rate": 1.97877473680631e-06, + "loss": 0.82236373, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.12811279, + "step": 8620, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.01351849, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.24044466, + "balance_loss_mlp": 1.01644421, + "epoch": 0.5183225612505636, + "flos": 14030652237120.0, + "grad_norm": 2.016142695630021, + "language_loss": 0.82318091, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84699976, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13580322, + "step": 8621, + "time_per_iteration": 2.8285629749298096 + }, + { + "auxiliary_loss_clip": 0.01359575, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.24586642, + "balance_loss_mlp": 1.02215052, + "epoch": 0.5183826845032317, + "flos": 23665091355000.0, + "grad_norm": 1.9389257600638443, + "language_loss": 0.66124594, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68519032, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.1270752, + "step": 8622, + "time_per_iteration": 2.8179619312286377 + }, + { + "auxiliary_loss_clip": 0.01377644, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.25756598, + "balance_loss_mlp": 1.01968575, + "epoch": 0.5184428077558996, + "flos": 15892938361080.0, + "grad_norm": 1.7819686669756147, + "language_loss": 0.61287934, + "learning_rate": 1.977606421248497e-06, + "loss": 0.63699692, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14428711, + "step": 8623, + "time_per_iteration": 4.241920709609985 + }, + { + "auxiliary_loss_clip": 0.01359616, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.24473596, + "balance_loss_mlp": 1.01794982, + "epoch": 0.5185029310085676, + "flos": 21035542007160.0, + "grad_norm": 2.363727098078764, + "language_loss": 0.76144969, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78535569, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13049316, + "step": 8624, + "time_per_iteration": 2.783912420272827 + }, + { + "auxiliary_loss_clip": 0.01364503, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.24820566, + "balance_loss_mlp": 1.01817751, + "epoch": 0.5185630542612355, + "flos": 26548445909520.0, + "grad_norm": 2.292464161073209, + "language_loss": 0.7112152, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73517466, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.13275146, + "step": 8625, + "time_per_iteration": 2.823650598526001 + }, + { + "auxiliary_loss_clip": 0.01360608, + "auxiliary_loss_mlp": 0.01031232, + "balance_loss_clip": 1.24641275, + "balance_loss_mlp": 1.01801801, + "epoch": 0.5186231775139035, + "flos": 20673404031240.0, + "grad_norm": 1.7929461794862653, + "language_loss": 0.67771047, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70162886, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13214111, + "step": 8626, + "time_per_iteration": 2.7486751079559326 + }, + { + "auxiliary_loss_clip": 0.01361256, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.24734426, + "balance_loss_mlp": 1.01505184, + "epoch": 0.5186833007665714, + "flos": 20890272611520.0, + "grad_norm": 1.864135858437817, + "language_loss": 0.70833564, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.73222971, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13098145, + "step": 8627, + "time_per_iteration": 2.7541656494140625 + }, + { + "auxiliary_loss_clip": 0.01373739, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.25551558, + "balance_loss_mlp": 1.01687515, + "epoch": 0.5187434240192395, + "flos": 20891856337560.0, + "grad_norm": 5.21094599035731, + "language_loss": 0.7336331, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.7576735, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13409424, + "step": 8628, + "time_per_iteration": 2.729128122329712 + }, + { + "auxiliary_loss_clip": 0.0136526, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.25158942, + "balance_loss_mlp": 1.01972985, + "epoch": 0.5188035472719074, + "flos": 19864046919240.0, + "grad_norm": 2.1350609371510414, + "language_loss": 0.77815562, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.80213904, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13360596, + "step": 8629, + "time_per_iteration": 2.7450973987579346 + }, + { + "auxiliary_loss_clip": 0.01372786, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.25624847, + "balance_loss_mlp": 1.019943, + "epoch": 0.5188636705245754, + "flos": 21142778350680.0, + "grad_norm": 1.9881744131498285, + "language_loss": 0.75699222, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.78105664, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13720703, + "step": 8630, + "time_per_iteration": 2.7419276237487793 + }, + { + "auxiliary_loss_clip": 0.01369159, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.25259805, + "balance_loss_mlp": 1.02538514, + "epoch": 0.5189237937772434, + "flos": 22425002101080.0, + "grad_norm": 1.6448827823697956, + "language_loss": 0.80590838, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82999361, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13970947, + "step": 8631, + "time_per_iteration": 2.7783868312835693 + }, + { + "auxiliary_loss_clip": 0.01373944, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.25690675, + "balance_loss_mlp": 1.01698971, + "epoch": 0.5189839170299113, + "flos": 25452123541920.0, + "grad_norm": 1.4761770953003068, + "language_loss": 0.75023639, + "learning_rate": 1.974101522024942e-06, + "loss": 0.77428353, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.13787842, + "step": 8632, + "time_per_iteration": 2.835550308227539 + }, + { + "auxiliary_loss_clip": 0.01355758, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.2445271, + "balance_loss_mlp": 1.01890278, + "epoch": 0.5190440402825793, + "flos": 18592300125720.0, + "grad_norm": 1.7445974002924536, + "language_loss": 0.78791082, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.81178969, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13226318, + "step": 8633, + "time_per_iteration": 2.733135461807251 + }, + { + "auxiliary_loss_clip": 0.01364684, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.24941468, + "balance_loss_mlp": 1.01854372, + "epoch": 0.5191041635352472, + "flos": 21913736935320.0, + "grad_norm": 2.294425354000321, + "language_loss": 0.80692708, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.83089042, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 1.15185547, + "router_z_loss_mlp": 0.13104248, + "step": 8634, + "time_per_iteration": 2.748735189437866 + }, + { + "auxiliary_loss_clip": 0.01365654, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.25246119, + "balance_loss_mlp": 1.023314, + "epoch": 0.5191642867879153, + "flos": 27533877181200.0, + "grad_norm": 1.6380263310503316, + "language_loss": 0.69039607, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.71441704, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13128662, + "step": 8635, + "time_per_iteration": 2.8231489658355713 + }, + { + "auxiliary_loss_clip": 0.01366215, + "auxiliary_loss_mlp": 0.01038439, + "balance_loss_clip": 1.24882901, + "balance_loss_mlp": 1.02536726, + "epoch": 0.5192244100405832, + "flos": 15710285647080.0, + "grad_norm": 1.6513703420094352, + "language_loss": 0.77610028, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.80014676, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13079834, + "step": 8636, + "time_per_iteration": 2.789517402648926 + }, + { + "auxiliary_loss_clip": 0.01375297, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.25780964, + "balance_loss_mlp": 1.02115548, + "epoch": 0.5192845332932512, + "flos": 12060845511120.0, + "grad_norm": 1.9676225156264107, + "language_loss": 0.71666223, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.7407639, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.13702393, + "step": 8637, + "time_per_iteration": 2.7546138763427734 + }, + { + "auxiliary_loss_clip": 0.01360354, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.24682188, + "balance_loss_mlp": 1.0202626, + "epoch": 0.5193446565459191, + "flos": 18957889812240.0, + "grad_norm": 2.233983553348511, + "language_loss": 0.76480079, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78874564, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13867188, + "step": 8638, + "time_per_iteration": 2.7633719444274902 + }, + { + "auxiliary_loss_clip": 0.01361252, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.24766445, + "balance_loss_mlp": 1.0176878, + "epoch": 0.5194047797985871, + "flos": 20379697787880.0, + "grad_norm": 1.780323834229315, + "language_loss": 0.75259483, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77651858, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13452148, + "step": 8639, + "time_per_iteration": 2.7379262447357178 + }, + { + "auxiliary_loss_clip": 0.01363741, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.24984026, + "balance_loss_mlp": 1.01474547, + "epoch": 0.519464903051255, + "flos": 24358400109360.0, + "grad_norm": 1.560498337137938, + "language_loss": 0.78114372, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.80507141, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14282227, + "step": 8640, + "time_per_iteration": 2.8146767616271973 + }, + { + "auxiliary_loss_clip": 0.01357908, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.24611008, + "balance_loss_mlp": 1.01960874, + "epoch": 0.519525026303923, + "flos": 14065599053880.0, + "grad_norm": 2.2840101780892432, + "language_loss": 0.66645706, + "learning_rate": 1.97059670234927e-06, + "loss": 0.69036305, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13067627, + "step": 8641, + "time_per_iteration": 2.708118438720703 + }, + { + "auxiliary_loss_clip": 0.01361698, + "auxiliary_loss_mlp": 0.01028204, + "balance_loss_clip": 1.24991, + "balance_loss_mlp": 1.01591969, + "epoch": 0.519585149556591, + "flos": 28841220525240.0, + "grad_norm": 1.8647881365578334, + "language_loss": 0.76837075, + "learning_rate": 1.97020728331885e-06, + "loss": 0.79226971, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.1229248, + "step": 8642, + "time_per_iteration": 2.88173770904541 + }, + { + "auxiliary_loss_clip": 0.01353701, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.24130392, + "balance_loss_mlp": 1.01631308, + "epoch": 0.519645272809259, + "flos": 25378331505840.0, + "grad_norm": 1.5044619120054368, + "language_loss": 0.83478731, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85861492, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12762451, + "step": 8643, + "time_per_iteration": 2.873241424560547 + }, + { + "auxiliary_loss_clip": 0.01367969, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.24949551, + "balance_loss_mlp": 1.02635109, + "epoch": 0.519705396061927, + "flos": 25378169072400.0, + "grad_norm": 2.448144552271079, + "language_loss": 0.70427322, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72835732, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14086914, + "step": 8644, + "time_per_iteration": 2.8268558979034424 + }, + { + "auxiliary_loss_clip": 0.0136396, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.2487092, + "balance_loss_mlp": 1.0191195, + "epoch": 0.5197655193145949, + "flos": 28482209393040.0, + "grad_norm": 1.4771043047997447, + "language_loss": 0.80238807, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82634711, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.12811279, + "step": 8645, + "time_per_iteration": 2.8537299633026123 + }, + { + "auxiliary_loss_clip": 0.0135607, + "auxiliary_loss_mlp": 0.01027738, + "balance_loss_clip": 1.24081302, + "balance_loss_mlp": 1.01417792, + "epoch": 0.5198256425672629, + "flos": 20013702017760.0, + "grad_norm": 1.7756529230367935, + "language_loss": 0.78497922, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80881727, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13568115, + "step": 8646, + "time_per_iteration": 2.7933473587036133 + }, + { + "auxiliary_loss_clip": 0.01364291, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.2495333, + "balance_loss_mlp": 1.02222586, + "epoch": 0.5198857658199308, + "flos": 19833688847160.0, + "grad_norm": 1.9212664414005651, + "language_loss": 0.66849035, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.69248867, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13317871, + "step": 8647, + "time_per_iteration": 2.8840065002441406 + }, + { + "auxiliary_loss_clip": 0.01365737, + "auxiliary_loss_mlp": 0.01038244, + "balance_loss_clip": 1.247509, + "balance_loss_mlp": 1.02312195, + "epoch": 0.5199458890725989, + "flos": 24467260787280.0, + "grad_norm": 1.7026627200343585, + "language_loss": 0.71797729, + "learning_rate": 1.967870793377763e-06, + "loss": 0.74201703, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.15118408, + "step": 8648, + "time_per_iteration": 4.311499357223511 + }, + { + "auxiliary_loss_clip": 0.01362348, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.24695516, + "balance_loss_mlp": 1.01594901, + "epoch": 0.5200060123252668, + "flos": 23410108505880.0, + "grad_norm": 1.7085300995733415, + "language_loss": 0.64591753, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66983974, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13916016, + "step": 8649, + "time_per_iteration": 4.27158260345459 + }, + { + "auxiliary_loss_clip": 0.01376267, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.25547576, + "balance_loss_mlp": 1.01994085, + "epoch": 0.5200661355779348, + "flos": 17205763833720.0, + "grad_norm": 1.7464230990990646, + "language_loss": 0.70454955, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72866166, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.15002441, + "step": 8650, + "time_per_iteration": 2.7698147296905518 + }, + { + "auxiliary_loss_clip": 0.0136061, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.24575233, + "balance_loss_mlp": 1.01959062, + "epoch": 0.5201262588306027, + "flos": 18519685732080.0, + "grad_norm": 1.8055242547184016, + "language_loss": 0.77765357, + "learning_rate": 1.966702564655496e-06, + "loss": 0.80159283, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13726807, + "step": 8651, + "time_per_iteration": 2.8023622035980225 + }, + { + "auxiliary_loss_clip": 0.01367146, + "auxiliary_loss_mlp": 0.01039796, + "balance_loss_clip": 1.25045431, + "balance_loss_mlp": 1.02515697, + "epoch": 0.5201863820832707, + "flos": 18623348539920.0, + "grad_norm": 1.6597074411187578, + "language_loss": 0.78754878, + "learning_rate": 1.966313157587003e-06, + "loss": 0.81161821, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14648438, + "step": 8652, + "time_per_iteration": 2.7288010120391846 + }, + { + "auxiliary_loss_clip": 0.0136406, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.2489624, + "balance_loss_mlp": 1.025828, + "epoch": 0.5202465053359386, + "flos": 22862353405680.0, + "grad_norm": 1.805831724098376, + "language_loss": 0.69992578, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.7239728, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14813232, + "step": 8653, + "time_per_iteration": 2.829169988632202 + }, + { + "auxiliary_loss_clip": 0.01369228, + "auxiliary_loss_mlp": 0.01045211, + "balance_loss_clip": 1.25063908, + "balance_loss_mlp": 1.03115618, + "epoch": 0.5203066285886067, + "flos": 21986635587480.0, + "grad_norm": 1.5127106693030359, + "language_loss": 0.78900158, + "learning_rate": 1.965534347297008e-06, + "loss": 0.813146, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14050293, + "step": 8654, + "time_per_iteration": 4.212416410446167 + }, + { + "auxiliary_loss_clip": 0.01377633, + "auxiliary_loss_mlp": 0.01039616, + "balance_loss_clip": 1.25681317, + "balance_loss_mlp": 1.02456021, + "epoch": 0.5203667518412746, + "flos": 20238611053320.0, + "grad_norm": 1.7039114205913763, + "language_loss": 0.84113759, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86531001, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.1505127, + "step": 8655, + "time_per_iteration": 2.767848491668701 + }, + { + "auxiliary_loss_clip": 0.01359107, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.24671984, + "balance_loss_mlp": 1.01995897, + "epoch": 0.5204268750939426, + "flos": 15709676521680.0, + "grad_norm": 3.391231157928573, + "language_loss": 0.65920234, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68312055, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 1.12451172, + "router_z_loss_mlp": 0.12768555, + "step": 8656, + "time_per_iteration": 2.755868673324585 + }, + { + "auxiliary_loss_clip": 0.01368305, + "auxiliary_loss_mlp": 0.01039925, + "balance_loss_clip": 1.25201857, + "balance_loss_mlp": 1.02620387, + "epoch": 0.5204869983466105, + "flos": 27454643624880.0, + "grad_norm": 1.9767327756565685, + "language_loss": 0.73417497, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75825727, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13726807, + "step": 8657, + "time_per_iteration": 2.8115780353546143 + }, + { + "auxiliary_loss_clip": 0.013645, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.25031471, + "balance_loss_mlp": 1.02242291, + "epoch": 0.5205471215992785, + "flos": 20600424162360.0, + "grad_norm": 1.6974159081813838, + "language_loss": 0.71568263, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73969918, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1472168, + "step": 8658, + "time_per_iteration": 2.8046247959136963 + }, + { + "auxiliary_loss_clip": 0.01365094, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.24972463, + "balance_loss_mlp": 1.02208638, + "epoch": 0.5206072448519465, + "flos": 22133082625560.0, + "grad_norm": 1.8923411998512099, + "language_loss": 0.83717871, + "learning_rate": 1.963587344701897e-06, + "loss": 0.86119723, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14678955, + "step": 8659, + "time_per_iteration": 2.7433652877807617 + }, + { + "auxiliary_loss_clip": 0.01380643, + "auxiliary_loss_mlp": 0.01038856, + "balance_loss_clip": 1.25784564, + "balance_loss_mlp": 1.02260137, + "epoch": 0.5206673681046144, + "flos": 18334718341560.0, + "grad_norm": 1.9846311820875513, + "language_loss": 0.76001137, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.78420633, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.16241455, + "step": 8660, + "time_per_iteration": 2.7831995487213135 + }, + { + "auxiliary_loss_clip": 0.01362949, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.24739099, + "balance_loss_mlp": 1.02076268, + "epoch": 0.5207274913572825, + "flos": 20235118734360.0, + "grad_norm": 1.7010537960775003, + "language_loss": 0.78697848, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.81094784, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13232422, + "step": 8661, + "time_per_iteration": 4.291597604751587 + }, + { + "auxiliary_loss_clip": 0.01366506, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.24872804, + "balance_loss_mlp": 1.01805568, + "epoch": 0.5207876146099504, + "flos": 22132108024920.0, + "grad_norm": 7.5192607761169965, + "language_loss": 0.70066649, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72465086, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13897705, + "step": 8662, + "time_per_iteration": 2.7984955310821533 + }, + { + "auxiliary_loss_clip": 0.01360817, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.24554157, + "balance_loss_mlp": 1.0180068, + "epoch": 0.5208477378626184, + "flos": 23884274611800.0, + "grad_norm": 1.637027124472041, + "language_loss": 0.69797766, + "learning_rate": 1.962029767391098e-06, + "loss": 0.72190875, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.1427002, + "step": 8663, + "time_per_iteration": 2.831601142883301 + }, + { + "auxiliary_loss_clip": 0.0136843, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.25100207, + "balance_loss_mlp": 1.0183897, + "epoch": 0.5209078611152863, + "flos": 20966704191000.0, + "grad_norm": 1.4618817782454154, + "language_loss": 0.770239, + "learning_rate": 1.961640376626072e-06, + "loss": 0.79425454, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14752197, + "step": 8664, + "time_per_iteration": 2.7686939239501953 + }, + { + "auxiliary_loss_clip": 0.0136593, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.24927533, + "balance_loss_mlp": 1.0168817, + "epoch": 0.5209679843679543, + "flos": 20672632472400.0, + "grad_norm": 2.1111804597759294, + "language_loss": 0.76894283, + "learning_rate": 1.961250987315646e-06, + "loss": 0.79292232, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.15124512, + "step": 8665, + "time_per_iteration": 2.780712366104126 + }, + { + "auxiliary_loss_clip": 0.01366173, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.25114799, + "balance_loss_mlp": 1.01615596, + "epoch": 0.5210281076206222, + "flos": 20231788848840.0, + "grad_norm": 1.578885490287741, + "language_loss": 0.72433209, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74828982, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13458252, + "step": 8666, + "time_per_iteration": 2.777067184448242 + }, + { + "auxiliary_loss_clip": 0.01384281, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.25850725, + "balance_loss_mlp": 1.01810789, + "epoch": 0.5210882308732903, + "flos": 16074210390840.0, + "grad_norm": 2.6319823526992066, + "language_loss": 0.68766302, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.71184862, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.1619873, + "step": 8667, + "time_per_iteration": 2.726713180541992 + }, + { + "auxiliary_loss_clip": 0.01359429, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.24647558, + "balance_loss_mlp": 1.0166316, + "epoch": 0.5211483541259582, + "flos": 24830495188920.0, + "grad_norm": 1.3929281042019432, + "language_loss": 0.81145954, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83535576, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13574219, + "step": 8668, + "time_per_iteration": 2.7877562046051025 + }, + { + "auxiliary_loss_clip": 0.01372169, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.2552557, + "balance_loss_mlp": 1.01950788, + "epoch": 0.5212084773786262, + "flos": 20374824784680.0, + "grad_norm": 1.8017429005946515, + "language_loss": 0.64179653, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.66585457, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14135742, + "step": 8669, + "time_per_iteration": 2.7352325916290283 + }, + { + "auxiliary_loss_clip": 0.0136884, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.25287151, + "balance_loss_mlp": 1.02436185, + "epoch": 0.5212686006312941, + "flos": 23150252653560.0, + "grad_norm": 1.567607234156882, + "language_loss": 0.66821158, + "learning_rate": 1.959304063099325e-06, + "loss": 0.69228363, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.14019775, + "step": 8670, + "time_per_iteration": 2.8091683387756348 + }, + { + "auxiliary_loss_clip": 0.01360774, + "auxiliary_loss_mlp": 0.0103041, + "balance_loss_clip": 1.24760151, + "balance_loss_mlp": 1.01686811, + "epoch": 0.5213287238839621, + "flos": 27778748548680.0, + "grad_norm": 2.505116192351518, + "language_loss": 0.76845396, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.79236585, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 1.13232422, + "router_z_loss_mlp": 0.13549805, + "step": 8671, + "time_per_iteration": 2.7397520542144775 + }, + { + "auxiliary_loss_clip": 0.01371806, + "auxiliary_loss_mlp": 0.01038942, + "balance_loss_clip": 1.25253201, + "balance_loss_mlp": 1.02391028, + "epoch": 0.5213888471366301, + "flos": 19942427700000.0, + "grad_norm": 1.7279177984901253, + "language_loss": 0.78527713, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80938458, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.15026855, + "step": 8672, + "time_per_iteration": 2.776372194290161 + }, + { + "auxiliary_loss_clip": 0.01358132, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.24229002, + "balance_loss_mlp": 1.01769435, + "epoch": 0.521448970389298, + "flos": 16987149093960.0, + "grad_norm": 1.9015563091088525, + "language_loss": 0.72534698, + "learning_rate": 1.958135926969736e-06, + "loss": 0.7492336, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.128479, + "step": 8673, + "time_per_iteration": 2.7337260246276855 + }, + { + "auxiliary_loss_clip": 0.01362658, + "auxiliary_loss_mlp": 0.01036129, + "balance_loss_clip": 1.24649549, + "balance_loss_mlp": 1.02190709, + "epoch": 0.5215090936419661, + "flos": 18994420355040.0, + "grad_norm": 1.5720786241337847, + "language_loss": 0.75287503, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77686292, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14221191, + "step": 8674, + "time_per_iteration": 2.85819411277771 + }, + { + "auxiliary_loss_clip": 0.01373922, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.25474, + "balance_loss_mlp": 1.02076578, + "epoch": 0.521569216894634, + "flos": 16147718168400.0, + "grad_norm": 2.1719783813475955, + "language_loss": 0.86352414, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88762027, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.14929199, + "step": 8675, + "time_per_iteration": 2.7906718254089355 + }, + { + "auxiliary_loss_clip": 0.01200482, + "auxiliary_loss_mlp": 0.01002723, + "balance_loss_clip": 1.15344453, + "balance_loss_mlp": 0.99995703, + "epoch": 0.521629340147302, + "flos": 57592422651600.0, + "grad_norm": 0.8728416466038573, + "language_loss": 0.63135707, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65338916, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.02770996, + "step": 8676, + "time_per_iteration": 3.2396974563598633 + }, + { + "auxiliary_loss_clip": 0.01367186, + "auxiliary_loss_mlp": 0.01038809, + "balance_loss_clip": 1.25063145, + "balance_loss_mlp": 1.02521896, + "epoch": 0.5216894633999699, + "flos": 26802535374720.0, + "grad_norm": 1.5930442567980636, + "language_loss": 0.6922608, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71632075, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13574219, + "step": 8677, + "time_per_iteration": 2.926501750946045 + }, + { + "auxiliary_loss_clip": 0.01361793, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.24667847, + "balance_loss_mlp": 1.02203417, + "epoch": 0.5217495866526379, + "flos": 26364250077840.0, + "grad_norm": 1.5914969600375832, + "language_loss": 0.65338355, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67736304, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14129639, + "step": 8678, + "time_per_iteration": 2.975377321243286 + }, + { + "auxiliary_loss_clip": 0.01374282, + "auxiliary_loss_mlp": 0.01038951, + "balance_loss_clip": 1.25292325, + "balance_loss_mlp": 1.0235014, + "epoch": 0.5218097099053058, + "flos": 23588944034040.0, + "grad_norm": 1.9584262545177575, + "language_loss": 0.68341243, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70754474, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.15454102, + "step": 8679, + "time_per_iteration": 2.8663854598999023 + }, + { + "auxiliary_loss_clip": 0.01374242, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.25593734, + "balance_loss_mlp": 1.01922417, + "epoch": 0.5218698331579739, + "flos": 18081968952240.0, + "grad_norm": 1.651621176854904, + "language_loss": 0.67366964, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69775105, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14672852, + "step": 8680, + "time_per_iteration": 2.8535990715026855 + }, + { + "auxiliary_loss_clip": 0.01372344, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.25471425, + "balance_loss_mlp": 1.02042925, + "epoch": 0.5219299564106418, + "flos": 19285811921880.0, + "grad_norm": 1.7632863656650846, + "language_loss": 0.83292633, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85699248, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.1383667, + "step": 8681, + "time_per_iteration": 2.836235761642456 + }, + { + "auxiliary_loss_clip": 0.01366777, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.2506299, + "balance_loss_mlp": 1.01899874, + "epoch": 0.5219900796633098, + "flos": 26657022328920.0, + "grad_norm": 1.7355112005886182, + "language_loss": 0.78176606, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.80575848, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.13470459, + "step": 8682, + "time_per_iteration": 2.950693368911743 + }, + { + "auxiliary_loss_clip": 0.01366458, + "auxiliary_loss_mlp": 0.0104257, + "balance_loss_clip": 1.25112522, + "balance_loss_mlp": 1.02912295, + "epoch": 0.5220502029159777, + "flos": 34319299435920.0, + "grad_norm": 1.5128306277523553, + "language_loss": 0.69218779, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71627808, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13452148, + "step": 8683, + "time_per_iteration": 2.9658989906311035 + }, + { + "auxiliary_loss_clip": 0.01372857, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.25352335, + "balance_loss_mlp": 1.02378368, + "epoch": 0.5221103261686457, + "flos": 22160638720800.0, + "grad_norm": 1.6041151059482888, + "language_loss": 0.76407617, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78819084, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14819336, + "step": 8684, + "time_per_iteration": 2.9116711616516113 + }, + { + "auxiliary_loss_clip": 0.01362845, + "auxiliary_loss_mlp": 0.01026011, + "balance_loss_clip": 1.24883652, + "balance_loss_mlp": 1.01215851, + "epoch": 0.5221704494213137, + "flos": 19213116311520.0, + "grad_norm": 1.6007596301563993, + "language_loss": 0.76511663, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.78900516, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13867188, + "step": 8685, + "time_per_iteration": 2.783200979232788 + }, + { + "auxiliary_loss_clip": 0.01374263, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.25528049, + "balance_loss_mlp": 1.02311432, + "epoch": 0.5222305726739817, + "flos": 19358751182400.0, + "grad_norm": 1.6675332638740028, + "language_loss": 0.80863839, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83275747, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14532471, + "step": 8686, + "time_per_iteration": 2.7779674530029297 + }, + { + "auxiliary_loss_clip": 0.01355482, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.24327111, + "balance_loss_mlp": 1.02050173, + "epoch": 0.5222906959266497, + "flos": 27820070877960.0, + "grad_norm": 1.8238848592724364, + "language_loss": 0.70288867, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72678304, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13464355, + "step": 8687, + "time_per_iteration": 4.262355089187622 + }, + { + "auxiliary_loss_clip": 0.01357567, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.24394381, + "balance_loss_mlp": 1.02199554, + "epoch": 0.5223508191793176, + "flos": 12716730338760.0, + "grad_norm": 1.9103076088592061, + "language_loss": 0.83038741, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85432255, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13928223, + "step": 8688, + "time_per_iteration": 4.199662923812866 + }, + { + "auxiliary_loss_clip": 0.01364679, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.24908984, + "balance_loss_mlp": 1.02137971, + "epoch": 0.5224109424319856, + "flos": 15636412394280.0, + "grad_norm": 3.0559671717239154, + "language_loss": 0.73487937, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75888121, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.14117432, + "step": 8689, + "time_per_iteration": 2.7926669120788574 + }, + { + "auxiliary_loss_clip": 0.01361843, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.24796009, + "balance_loss_mlp": 1.01994944, + "epoch": 0.5224710656846535, + "flos": 15746491323000.0, + "grad_norm": 2.28259569499267, + "language_loss": 0.83059871, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85455358, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13690186, + "step": 8690, + "time_per_iteration": 2.9127535820007324 + }, + { + "auxiliary_loss_clip": 0.01368746, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.25201523, + "balance_loss_mlp": 1.02567077, + "epoch": 0.5225311889373215, + "flos": 26036571618360.0, + "grad_norm": 1.9199434479727138, + "language_loss": 0.79367995, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81777227, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14819336, + "step": 8691, + "time_per_iteration": 2.7787656784057617 + }, + { + "auxiliary_loss_clip": 0.01378799, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.25777793, + "balance_loss_mlp": 1.03045106, + "epoch": 0.5225913121899894, + "flos": 18373888427760.0, + "grad_norm": 2.929973527725764, + "language_loss": 0.76883376, + "learning_rate": 1.950738079725646e-06, + "loss": 0.79308206, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.15582275, + "step": 8692, + "time_per_iteration": 2.755328893661499 + }, + { + "auxiliary_loss_clip": 0.01360901, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.24847984, + "balance_loss_mlp": 1.01683521, + "epoch": 0.5226514354426575, + "flos": 29278896696720.0, + "grad_norm": 2.1953406615215956, + "language_loss": 0.72981453, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75372124, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.1293335, + "step": 8693, + "time_per_iteration": 4.393530368804932 + }, + { + "auxiliary_loss_clip": 0.01383394, + "auxiliary_loss_mlp": 0.01038598, + "balance_loss_clip": 1.26159215, + "balance_loss_mlp": 1.02285039, + "epoch": 0.5227115586953254, + "flos": 22858008311160.0, + "grad_norm": 2.8145743056349812, + "language_loss": 0.82263267, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84685266, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.15765381, + "step": 8694, + "time_per_iteration": 2.847693681716919 + }, + { + "auxiliary_loss_clip": 0.01197817, + "auxiliary_loss_mlp": 0.01015216, + "balance_loss_clip": 1.15174699, + "balance_loss_mlp": 1.01194942, + "epoch": 0.5227716819479934, + "flos": 57487907068200.0, + "grad_norm": 0.7674746349124953, + "language_loss": 0.55665624, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57878655, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.03271484, + "step": 8695, + "time_per_iteration": 3.3203766345977783 + }, + { + "auxiliary_loss_clip": 0.01377079, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.2580514, + "balance_loss_mlp": 1.02246559, + "epoch": 0.5228318052006613, + "flos": 13812240539160.0, + "grad_norm": 1.6479105338051698, + "language_loss": 0.73415977, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75830215, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14697266, + "step": 8696, + "time_per_iteration": 2.7442336082458496 + }, + { + "auxiliary_loss_clip": 0.01378785, + "auxiliary_loss_mlp": 0.01041053, + "balance_loss_clip": 1.25991774, + "balance_loss_mlp": 1.02590179, + "epoch": 0.5228919284533293, + "flos": 15600409760160.0, + "grad_norm": 1.506124650694799, + "language_loss": 0.71239388, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73659223, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.15148926, + "step": 8697, + "time_per_iteration": 2.7992539405822754 + }, + { + "auxiliary_loss_clip": 0.01371617, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.25673449, + "balance_loss_mlp": 1.02250028, + "epoch": 0.5229520517059973, + "flos": 22496560677360.0, + "grad_norm": 1.6878565203682587, + "language_loss": 0.8094846, + "learning_rate": 1.948402052740906e-06, + "loss": 0.83355677, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13110352, + "step": 8698, + "time_per_iteration": 2.8275961875915527 + }, + { + "auxiliary_loss_clip": 0.01371384, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.25409615, + "balance_loss_mlp": 1.02512789, + "epoch": 0.5230121749586653, + "flos": 22095983565720.0, + "grad_norm": 1.5647526363468733, + "language_loss": 0.74352539, + "learning_rate": 1.948012721672093e-06, + "loss": 0.7676264, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13574219, + "step": 8699, + "time_per_iteration": 2.780571699142456 + }, + { + "auxiliary_loss_clip": 0.01396441, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.27227092, + "balance_loss_mlp": 1.02301669, + "epoch": 0.5230722982113333, + "flos": 22132229850000.0, + "grad_norm": 1.602818270764496, + "language_loss": 0.73814666, + "learning_rate": 1.947623392574642e-06, + "loss": 0.76249897, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.1574707, + "step": 8700, + "time_per_iteration": 4.333719253540039 + }, + { + "auxiliary_loss_clip": 0.01390914, + "auxiliary_loss_mlp": 0.0104758, + "balance_loss_clip": 1.26929641, + "balance_loss_mlp": 1.031654, + "epoch": 0.5231324214640012, + "flos": 25014691020600.0, + "grad_norm": 1.6249002712862326, + "language_loss": 0.67453229, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69891727, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.15917969, + "step": 8701, + "time_per_iteration": 2.9086520671844482 + }, + { + "auxiliary_loss_clip": 0.01381487, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.2646544, + "balance_loss_mlp": 1.02576184, + "epoch": 0.5231925447166692, + "flos": 25746235868880.0, + "grad_norm": 1.706228421595984, + "language_loss": 0.66689521, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.69111526, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14758301, + "step": 8702, + "time_per_iteration": 2.817070722579956 + }, + { + "auxiliary_loss_clip": 0.0137887, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.26140475, + "balance_loss_mlp": 1.0221132, + "epoch": 0.5232526679693371, + "flos": 21438921095640.0, + "grad_norm": 1.8403399155812177, + "language_loss": 0.76643038, + "learning_rate": 1.946455417258101e-06, + "loss": 0.79059088, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.15063477, + "step": 8703, + "time_per_iteration": 2.79095196723938 + }, + { + "auxiliary_loss_clip": 0.01391041, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.26691508, + "balance_loss_mlp": 1.02432299, + "epoch": 0.5233127912220051, + "flos": 35305177399560.0, + "grad_norm": 2.0007706842622466, + "language_loss": 0.77149034, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79581356, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.16931152, + "step": 8704, + "time_per_iteration": 2.9249637126922607 + }, + { + "auxiliary_loss_clip": 0.01382094, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.26578856, + "balance_loss_mlp": 1.02918482, + "epoch": 0.523372914474673, + "flos": 17055174742920.0, + "grad_norm": 1.76045939304201, + "language_loss": 0.78201443, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80627179, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14447021, + "step": 8705, + "time_per_iteration": 2.759214401245117 + }, + { + "auxiliary_loss_clip": 0.01397267, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.27517676, + "balance_loss_mlp": 1.02353418, + "epoch": 0.5234330377273411, + "flos": 18410987487600.0, + "grad_norm": 2.255899642835908, + "language_loss": 0.69747472, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72183496, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.15222168, + "step": 8706, + "time_per_iteration": 2.786623954772949 + }, + { + "auxiliary_loss_clip": 0.01200868, + "auxiliary_loss_mlp": 0.01009378, + "balance_loss_clip": 1.15384245, + "balance_loss_mlp": 1.00613582, + "epoch": 0.523493160980009, + "flos": 65867573687040.0, + "grad_norm": 0.681707452246365, + "language_loss": 0.52560306, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54770553, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.0324707, + "step": 8707, + "time_per_iteration": 3.3099300861358643 + }, + { + "auxiliary_loss_clip": 0.01379904, + "auxiliary_loss_mlp": 0.01043608, + "balance_loss_clip": 1.26152229, + "balance_loss_mlp": 1.02951765, + "epoch": 0.523553284232677, + "flos": 21877165784160.0, + "grad_norm": 1.6655253722270194, + "language_loss": 0.75421947, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.7784546, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14111328, + "step": 8708, + "time_per_iteration": 2.8209128379821777 + }, + { + "auxiliary_loss_clip": 0.01375907, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.25956404, + "balance_loss_mlp": 1.01987243, + "epoch": 0.5236134074853449, + "flos": 20852848684800.0, + "grad_norm": 1.543402053805509, + "language_loss": 0.77735251, + "learning_rate": 1.944119521844849e-06, + "loss": 0.80145645, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.1463623, + "step": 8709, + "time_per_iteration": 2.7665374279022217 + }, + { + "auxiliary_loss_clip": 0.01393722, + "auxiliary_loss_mlp": 0.01043431, + "balance_loss_clip": 1.26980126, + "balance_loss_mlp": 1.02714682, + "epoch": 0.5236735307380129, + "flos": 25525956186360.0, + "grad_norm": 1.9268936131308283, + "language_loss": 0.83664334, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.86101478, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.1628418, + "step": 8710, + "time_per_iteration": 2.789015531539917 + }, + { + "auxiliary_loss_clip": 0.01382754, + "auxiliary_loss_mlp": 0.01038467, + "balance_loss_clip": 1.26692057, + "balance_loss_mlp": 1.02364898, + "epoch": 0.523733653990681, + "flos": 23587928825040.0, + "grad_norm": 1.615129340070551, + "language_loss": 0.69268531, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71689749, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.14819336, + "step": 8711, + "time_per_iteration": 2.728229284286499 + }, + { + "auxiliary_loss_clip": 0.01384868, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.26619697, + "balance_loss_mlp": 1.0235157, + "epoch": 0.5237937772433489, + "flos": 21111283244520.0, + "grad_norm": 1.9046004301259993, + "language_loss": 0.83897567, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.86320961, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.15020752, + "step": 8712, + "time_per_iteration": 2.777799367904663 + }, + { + "auxiliary_loss_clip": 0.01383255, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.26208091, + "balance_loss_mlp": 1.02809417, + "epoch": 0.5238539004960169, + "flos": 19177722802800.0, + "grad_norm": 1.626569594670794, + "language_loss": 0.69717962, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.7214458, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.15258789, + "step": 8713, + "time_per_iteration": 2.8027355670928955 + }, + { + "auxiliary_loss_clip": 0.01392121, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.26847696, + "balance_loss_mlp": 1.02198458, + "epoch": 0.5239140237486848, + "flos": 17891884908360.0, + "grad_norm": 2.638261260003963, + "language_loss": 0.76673532, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79104722, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.1706543, + "step": 8714, + "time_per_iteration": 2.7937705516815186 + }, + { + "auxiliary_loss_clip": 0.01389677, + "auxiliary_loss_mlp": 0.01039057, + "balance_loss_clip": 1.26904702, + "balance_loss_mlp": 1.02341652, + "epoch": 0.5239741470013528, + "flos": 17934831572040.0, + "grad_norm": 1.7245717831387166, + "language_loss": 0.76046056, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78474784, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.15649414, + "step": 8715, + "time_per_iteration": 2.861701488494873 + }, + { + "auxiliary_loss_clip": 0.01382975, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.26583481, + "balance_loss_mlp": 1.02129292, + "epoch": 0.5240342702540207, + "flos": 30999649394160.0, + "grad_norm": 1.4459145490979772, + "language_loss": 0.71242964, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73661792, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.14562988, + "step": 8716, + "time_per_iteration": 2.817725896835327 + }, + { + "auxiliary_loss_clip": 0.01387805, + "auxiliary_loss_mlp": 0.01039075, + "balance_loss_clip": 1.26927996, + "balance_loss_mlp": 1.02547288, + "epoch": 0.5240943935066887, + "flos": 25009939842480.0, + "grad_norm": 1.8378993164466666, + "language_loss": 0.87196714, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89623594, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.13598633, + "step": 8717, + "time_per_iteration": 2.7706077098846436 + }, + { + "auxiliary_loss_clip": 0.01392804, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.27144217, + "balance_loss_mlp": 1.02048707, + "epoch": 0.5241545167593566, + "flos": 23664319796160.0, + "grad_norm": 1.9124604714672198, + "language_loss": 0.6200465, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.64433312, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.15380859, + "step": 8718, + "time_per_iteration": 2.7782247066497803 + }, + { + "auxiliary_loss_clip": 0.01394177, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.27249515, + "balance_loss_mlp": 1.02136552, + "epoch": 0.5242146400120247, + "flos": 23405154285960.0, + "grad_norm": 1.7906752154317087, + "language_loss": 0.72204852, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74635899, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.15496826, + "step": 8719, + "time_per_iteration": 2.810396909713745 + }, + { + "auxiliary_loss_clip": 0.01389558, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.2725141, + "balance_loss_mlp": 1.02029824, + "epoch": 0.5242747632646926, + "flos": 17753600150640.0, + "grad_norm": 1.9070178596846485, + "language_loss": 0.73262274, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75685769, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13653564, + "step": 8720, + "time_per_iteration": 2.7299203872680664 + }, + { + "auxiliary_loss_clip": 0.01388882, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.27102637, + "balance_loss_mlp": 1.01861405, + "epoch": 0.5243348865173606, + "flos": 32604069475440.0, + "grad_norm": 1.6374616441565366, + "language_loss": 0.70871848, + "learning_rate": 1.939447963058281e-06, + "loss": 0.73295212, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.15887451, + "step": 8721, + "time_per_iteration": 2.8625895977020264 + }, + { + "auxiliary_loss_clip": 0.01385998, + "auxiliary_loss_mlp": 0.01044562, + "balance_loss_clip": 1.26739752, + "balance_loss_mlp": 1.02939868, + "epoch": 0.5243950097700285, + "flos": 25489669293720.0, + "grad_norm": 1.6715783600340794, + "language_loss": 0.86724281, + "learning_rate": 1.939058681065813e-06, + "loss": 0.8915484, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.15161133, + "step": 8722, + "time_per_iteration": 2.826214551925659 + }, + { + "auxiliary_loss_clip": 0.01386811, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.26927495, + "balance_loss_mlp": 1.02060032, + "epoch": 0.5244551330226965, + "flos": 15272650083960.0, + "grad_norm": 1.5991108017415894, + "language_loss": 0.80054784, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82477081, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14880371, + "step": 8723, + "time_per_iteration": 2.708960771560669 + }, + { + "auxiliary_loss_clip": 0.01395099, + "auxiliary_loss_mlp": 0.01041716, + "balance_loss_clip": 1.27411628, + "balance_loss_mlp": 1.02614737, + "epoch": 0.5245152562753645, + "flos": 22242268170360.0, + "grad_norm": 2.0441210970709207, + "language_loss": 0.75373173, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77809978, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.15563965, + "step": 8724, + "time_per_iteration": 2.776153564453125 + }, + { + "auxiliary_loss_clip": 0.01402866, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.27670109, + "balance_loss_mlp": 1.02278483, + "epoch": 0.5245753795280325, + "flos": 29432775064680.0, + "grad_norm": 1.5952904666291097, + "language_loss": 0.70580065, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.73021412, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.15661621, + "step": 8725, + "time_per_iteration": 4.26025390625 + }, + { + "auxiliary_loss_clip": 0.01199358, + "auxiliary_loss_mlp": 0.0100227, + "balance_loss_clip": 1.15277648, + "balance_loss_mlp": 0.99928933, + "epoch": 0.5246355027807005, + "flos": 58849364374920.0, + "grad_norm": 0.7561900331978577, + "language_loss": 0.55663675, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57865304, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.02978516, + "step": 8726, + "time_per_iteration": 3.2658329010009766 + }, + { + "auxiliary_loss_clip": 0.01197825, + "auxiliary_loss_mlp": 0.01002234, + "balance_loss_clip": 1.15093017, + "balance_loss_mlp": 0.99926567, + "epoch": 0.5246956260333684, + "flos": 64542362664600.0, + "grad_norm": 0.7999895936124563, + "language_loss": 0.58383024, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60583079, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02966309, + "step": 8727, + "time_per_iteration": 4.763864755630493 + }, + { + "auxiliary_loss_clip": 0.01400181, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.27571249, + "balance_loss_mlp": 1.02342796, + "epoch": 0.5247557492860364, + "flos": 24538819363560.0, + "grad_norm": 1.3642631711049937, + "language_loss": 0.70766693, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.7320658, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.1628418, + "step": 8728, + "time_per_iteration": 2.7806239128112793 + }, + { + "auxiliary_loss_clip": 0.01394842, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.27513731, + "balance_loss_mlp": 1.01871705, + "epoch": 0.5248158725387043, + "flos": 18810671215320.0, + "grad_norm": 1.4096173143986201, + "language_loss": 0.69847691, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.72275728, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.14477539, + "step": 8729, + "time_per_iteration": 2.713136672973633 + }, + { + "auxiliary_loss_clip": 0.01396822, + "auxiliary_loss_mlp": 0.01040067, + "balance_loss_clip": 1.27509665, + "balance_loss_mlp": 1.02479625, + "epoch": 0.5248759957913723, + "flos": 20960288070120.0, + "grad_norm": 1.6758280621952388, + "language_loss": 0.83687031, + "learning_rate": 1.935944509558464e-06, + "loss": 0.86123919, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.15258789, + "step": 8730, + "time_per_iteration": 2.768198013305664 + }, + { + "auxiliary_loss_clip": 0.01401753, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.28220129, + "balance_loss_mlp": 1.028543, + "epoch": 0.5249361190440403, + "flos": 18665604861480.0, + "grad_norm": 2.0605323820165435, + "language_loss": 0.79698008, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.82143086, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.14794922, + "step": 8731, + "time_per_iteration": 4.257337808609009 + }, + { + "auxiliary_loss_clip": 0.01386898, + "auxiliary_loss_mlp": 0.01038622, + "balance_loss_clip": 1.27110517, + "balance_loss_mlp": 1.02369153, + "epoch": 0.5249962422967083, + "flos": 24868731282840.0, + "grad_norm": 1.5335896433499854, + "language_loss": 0.83256602, + "learning_rate": 1.935165990676312e-06, + "loss": 0.8568213, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14904785, + "step": 8732, + "time_per_iteration": 2.7808942794799805 + }, + { + "auxiliary_loss_clip": 0.01388877, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.27017736, + "balance_loss_mlp": 1.02462375, + "epoch": 0.5250563655493762, + "flos": 15266193354720.0, + "grad_norm": 1.4838444919563214, + "language_loss": 0.7778793, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80215371, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.13952637, + "step": 8733, + "time_per_iteration": 2.731771469116211 + }, + { + "auxiliary_loss_clip": 0.01404731, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.2815845, + "balance_loss_mlp": 1.02353323, + "epoch": 0.5251164888020442, + "flos": 18629886485880.0, + "grad_norm": 1.8163062428018868, + "language_loss": 0.81681371, + "learning_rate": 1.934387481628208e-06, + "loss": 0.8412497, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.15332031, + "step": 8734, + "time_per_iteration": 2.8914456367492676 + }, + { + "auxiliary_loss_clip": 0.01392364, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.27516484, + "balance_loss_mlp": 1.01912999, + "epoch": 0.5251766120547121, + "flos": 29716126176240.0, + "grad_norm": 1.3587193446898995, + "language_loss": 0.76987022, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79413426, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14916992, + "step": 8735, + "time_per_iteration": 2.8578615188598633 + }, + { + "auxiliary_loss_clip": 0.01392588, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_clip": 1.27352428, + "balance_loss_mlp": 1.02911305, + "epoch": 0.5252367353073801, + "flos": 23445502014600.0, + "grad_norm": 1.5978378500913415, + "language_loss": 0.80236554, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82671696, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13446045, + "step": 8736, + "time_per_iteration": 2.7903456687927246 + }, + { + "auxiliary_loss_clip": 0.0139923, + "auxiliary_loss_mlp": 0.01044424, + "balance_loss_clip": 1.27950478, + "balance_loss_mlp": 1.02924871, + "epoch": 0.5252968585600482, + "flos": 30816915463440.0, + "grad_norm": 2.2582272471840112, + "language_loss": 0.70120758, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72564411, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.15185547, + "step": 8737, + "time_per_iteration": 2.8837459087371826 + }, + { + "auxiliary_loss_clip": 0.0139659, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.27628303, + "balance_loss_mlp": 1.02261448, + "epoch": 0.5253569818127161, + "flos": 20632772044080.0, + "grad_norm": 1.417068757949013, + "language_loss": 0.77333462, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79767084, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.144104, + "step": 8738, + "time_per_iteration": 2.923569679260254 + }, + { + "auxiliary_loss_clip": 0.01195232, + "auxiliary_loss_mlp": 0.0100616, + "balance_loss_clip": 1.14801967, + "balance_loss_mlp": 1.00314379, + "epoch": 0.5254171050653841, + "flos": 63443888053920.0, + "grad_norm": 0.7570648368352422, + "language_loss": 0.54522538, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56723928, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.03015137, + "step": 8739, + "time_per_iteration": 4.737139463424683 + }, + { + "auxiliary_loss_clip": 0.01396724, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_clip": 1.27732587, + "balance_loss_mlp": 1.03267384, + "epoch": 0.525477228318052, + "flos": 34676523800280.0, + "grad_norm": 1.5712810161680142, + "language_loss": 0.84840572, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.87284076, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14111328, + "step": 8740, + "time_per_iteration": 2.8914754390716553 + }, + { + "auxiliary_loss_clip": 0.01393738, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.27510691, + "balance_loss_mlp": 1.03187823, + "epoch": 0.52553735157072, + "flos": 17935318872360.0, + "grad_norm": 1.9610935522896606, + "language_loss": 0.69368875, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71808684, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14208984, + "step": 8741, + "time_per_iteration": 2.722982168197632 + }, + { + "auxiliary_loss_clip": 0.01407103, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.28468001, + "balance_loss_mlp": 1.02151489, + "epoch": 0.5255974748233879, + "flos": 9944348096880.0, + "grad_norm": 1.7831381892005707, + "language_loss": 0.65883112, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68326789, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.15063477, + "step": 8742, + "time_per_iteration": 2.741337299346924 + }, + { + "auxiliary_loss_clip": 0.01409394, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.28565025, + "balance_loss_mlp": 1.03200197, + "epoch": 0.5256575980760559, + "flos": 16872197162040.0, + "grad_norm": 2.41402244025449, + "language_loss": 0.63620996, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.66078132, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.15734863, + "step": 8743, + "time_per_iteration": 2.7149269580841064 + }, + { + "auxiliary_loss_clip": 0.01199096, + "auxiliary_loss_mlp": 0.01010776, + "balance_loss_clip": 1.15238953, + "balance_loss_mlp": 1.00793886, + "epoch": 0.5257177213287239, + "flos": 62403303983400.0, + "grad_norm": 0.7737743386788003, + "language_loss": 0.54179877, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56389749, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02832031, + "step": 8744, + "time_per_iteration": 3.361366033554077 + }, + { + "auxiliary_loss_clip": 0.01409616, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.28490376, + "balance_loss_mlp": 1.02873588, + "epoch": 0.5257778445813919, + "flos": 20781777408840.0, + "grad_norm": 2.5666478986600914, + "language_loss": 0.76020515, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.78475308, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.16442871, + "step": 8745, + "time_per_iteration": 2.7948644161224365 + }, + { + "auxiliary_loss_clip": 0.01404158, + "auxiliary_loss_mlp": 0.01043863, + "balance_loss_clip": 1.28472662, + "balance_loss_mlp": 1.02952206, + "epoch": 0.5258379678340598, + "flos": 17022948686280.0, + "grad_norm": 1.7886333012437525, + "language_loss": 0.80999923, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83447945, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.14343262, + "step": 8746, + "time_per_iteration": 2.7473185062408447 + }, + { + "auxiliary_loss_clip": 0.01392991, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_clip": 1.27526617, + "balance_loss_mlp": 1.02290428, + "epoch": 0.5258980910867278, + "flos": 21073656276000.0, + "grad_norm": 1.7320967289843376, + "language_loss": 0.75418651, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77849638, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.15100098, + "step": 8747, + "time_per_iteration": 2.7324154376983643 + }, + { + "auxiliary_loss_clip": 0.01391234, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.27612162, + "balance_loss_mlp": 1.02143693, + "epoch": 0.5259582143393957, + "flos": 18009029691720.0, + "grad_norm": 1.825635601339857, + "language_loss": 0.83071661, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85498548, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14227295, + "step": 8748, + "time_per_iteration": 2.774386405944824 + }, + { + "auxiliary_loss_clip": 0.01398476, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.2769835, + "balance_loss_mlp": 1.02641177, + "epoch": 0.5260183375920637, + "flos": 22789008061560.0, + "grad_norm": 1.870617209073821, + "language_loss": 0.81060123, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.8350029, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.15283203, + "step": 8749, + "time_per_iteration": 2.7604053020477295 + }, + { + "auxiliary_loss_clip": 0.01401327, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.28260612, + "balance_loss_mlp": 1.023669, + "epoch": 0.5260784608447318, + "flos": 27058086740880.0, + "grad_norm": 1.701377077048242, + "language_loss": 0.73073745, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.75513989, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.15234375, + "step": 8750, + "time_per_iteration": 2.8203368186950684 + }, + { + "auxiliary_loss_clip": 0.01400132, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.28059769, + "balance_loss_mlp": 1.02299893, + "epoch": 0.5261385840973997, + "flos": 20667069127080.0, + "grad_norm": 1.3154039704410272, + "language_loss": 0.76485097, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78922653, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14428711, + "step": 8751, + "time_per_iteration": 2.7267751693725586 + }, + { + "auxiliary_loss_clip": 0.01398871, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.28150666, + "balance_loss_mlp": 1.02330303, + "epoch": 0.5261987073500677, + "flos": 23627870470080.0, + "grad_norm": 1.3369345405981787, + "language_loss": 0.76180816, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78617227, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14227295, + "step": 8752, + "time_per_iteration": 2.80678129196167 + }, + { + "auxiliary_loss_clip": 0.01407472, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.285537, + "balance_loss_mlp": 1.02044165, + "epoch": 0.5262588306027356, + "flos": 27642006908640.0, + "grad_norm": 1.6333069963587776, + "language_loss": 0.68014294, + "learning_rate": 1.926992158720058e-06, + "loss": 0.7045781, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.15600586, + "step": 8753, + "time_per_iteration": 2.7923977375030518 + }, + { + "auxiliary_loss_clip": 0.0139407, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.27724695, + "balance_loss_mlp": 1.02158368, + "epoch": 0.5263189538554036, + "flos": 21764406703680.0, + "grad_norm": 1.6262725501378135, + "language_loss": 0.84007525, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.8643806, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14886475, + "step": 8754, + "time_per_iteration": 2.7903647422790527 + }, + { + "auxiliary_loss_clip": 0.01408579, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.28598237, + "balance_loss_mlp": 1.01982641, + "epoch": 0.5263790771080715, + "flos": 14279584440600.0, + "grad_norm": 1.965289216901273, + "language_loss": 0.87820542, + "learning_rate": 1.926213760058522e-06, + "loss": 0.90264285, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.15332031, + "step": 8755, + "time_per_iteration": 2.7658112049102783 + }, + { + "auxiliary_loss_clip": 0.01193638, + "auxiliary_loss_mlp": 0.01002524, + "balance_loss_clip": 1.14736748, + "balance_loss_mlp": 0.99977058, + "epoch": 0.5264392003607395, + "flos": 65822718430440.0, + "grad_norm": 0.7151366313770642, + "language_loss": 0.58870029, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.61066192, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02758789, + "step": 8756, + "time_per_iteration": 3.3666093349456787 + }, + { + "auxiliary_loss_clip": 0.0141655, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.29220486, + "balance_loss_mlp": 1.02144766, + "epoch": 0.5264993236134075, + "flos": 21037328775000.0, + "grad_norm": 1.6182636211384216, + "language_loss": 0.70846474, + "learning_rate": 1.925435372588913e-06, + "loss": 0.73299581, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.15124512, + "step": 8757, + "time_per_iteration": 2.8564364910125732 + }, + { + "auxiliary_loss_clip": 0.01408559, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.28756356, + "balance_loss_mlp": 1.02410495, + "epoch": 0.5265594468660755, + "flos": 16622615224800.0, + "grad_norm": 1.678400094883988, + "language_loss": 0.88143122, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90590829, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.1505127, + "step": 8758, + "time_per_iteration": 2.757878541946411 + }, + { + "auxiliary_loss_clip": 0.01406781, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.28454137, + "balance_loss_mlp": 1.02488685, + "epoch": 0.5266195701187434, + "flos": 24139501111080.0, + "grad_norm": 1.4338478137508004, + "language_loss": 0.76170027, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78617483, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.15783691, + "step": 8759, + "time_per_iteration": 2.821770429611206 + }, + { + "auxiliary_loss_clip": 0.01394199, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.27766383, + "balance_loss_mlp": 1.0195415, + "epoch": 0.5266796933714114, + "flos": 15847392762360.0, + "grad_norm": 2.201788558635312, + "language_loss": 0.72201431, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.74629581, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14416504, + "step": 8760, + "time_per_iteration": 2.826720714569092 + }, + { + "auxiliary_loss_clip": 0.01412113, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.28803885, + "balance_loss_mlp": 1.02168989, + "epoch": 0.5267398166240793, + "flos": 20955293241840.0, + "grad_norm": 2.4576755892084217, + "language_loss": 0.76228696, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78677505, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.15014648, + "step": 8761, + "time_per_iteration": 2.763354539871216 + }, + { + "auxiliary_loss_clip": 0.01404721, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.28371954, + "balance_loss_mlp": 1.01600718, + "epoch": 0.5267999398767473, + "flos": 21001366749240.0, + "grad_norm": 1.6580665334342237, + "language_loss": 0.71040487, + "learning_rate": 1.923489453654373e-06, + "loss": 0.73475051, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.13842773, + "step": 8762, + "time_per_iteration": 2.7733278274536133 + }, + { + "auxiliary_loss_clip": 0.01189516, + "auxiliary_loss_mlp": 0.01007419, + "balance_loss_clip": 1.14341331, + "balance_loss_mlp": 1.00465333, + "epoch": 0.5268600631294152, + "flos": 66863871018000.0, + "grad_norm": 0.9299290541598669, + "language_loss": 0.65555537, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67752469, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02770996, + "step": 8763, + "time_per_iteration": 4.649145126342773 + }, + { + "auxiliary_loss_clip": 0.01403542, + "auxiliary_loss_mlp": 0.01030582, + "balance_loss_clip": 1.2850008, + "balance_loss_mlp": 1.01618743, + "epoch": 0.5269201863820833, + "flos": 17170045458120.0, + "grad_norm": 1.9943045595584747, + "language_loss": 0.71225828, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73659956, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.1439209, + "step": 8764, + "time_per_iteration": 2.7092747688293457 + }, + { + "auxiliary_loss_clip": 0.01406791, + "auxiliary_loss_mlp": 0.01039373, + "balance_loss_clip": 1.28471434, + "balance_loss_mlp": 1.02321959, + "epoch": 0.5269803096347513, + "flos": 20527566118560.0, + "grad_norm": 1.655292292407506, + "language_loss": 0.74444252, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76890415, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.16149902, + "step": 8765, + "time_per_iteration": 2.7329516410827637 + }, + { + "auxiliary_loss_clip": 0.01405726, + "auxiliary_loss_mlp": 0.01038294, + "balance_loss_clip": 1.28147423, + "balance_loss_mlp": 1.02314246, + "epoch": 0.5270404328874192, + "flos": 27236191318560.0, + "grad_norm": 1.4066389561895456, + "language_loss": 0.85573423, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.8801744, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.15148926, + "step": 8766, + "time_per_iteration": 4.255889654159546 + }, + { + "auxiliary_loss_clip": 0.01404347, + "auxiliary_loss_mlp": 0.01038914, + "balance_loss_clip": 1.28347325, + "balance_loss_mlp": 1.02369142, + "epoch": 0.5271005561400872, + "flos": 23115468270240.0, + "grad_norm": 1.7505647443647716, + "language_loss": 0.79209495, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81652749, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.15209961, + "step": 8767, + "time_per_iteration": 2.7449862957000732 + }, + { + "auxiliary_loss_clip": 0.01406197, + "auxiliary_loss_mlp": 0.01035396, + "balance_loss_clip": 1.28453541, + "balance_loss_mlp": 1.0194813, + "epoch": 0.5271606793927551, + "flos": 22569906021480.0, + "grad_norm": 2.286593206576029, + "language_loss": 0.73823339, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.7626493, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.15917969, + "step": 8768, + "time_per_iteration": 2.782353162765503 + }, + { + "auxiliary_loss_clip": 0.01403781, + "auxiliary_loss_mlp": 0.01039084, + "balance_loss_clip": 1.28490233, + "balance_loss_mlp": 1.02580416, + "epoch": 0.5272208026454231, + "flos": 18768090026880.0, + "grad_norm": 1.776261501637173, + "language_loss": 0.74436408, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76879275, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13293457, + "step": 8769, + "time_per_iteration": 2.710425615310669 + }, + { + "auxiliary_loss_clip": 0.01399892, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.28175378, + "balance_loss_mlp": 1.02313232, + "epoch": 0.5272809258980911, + "flos": 20416999889520.0, + "grad_norm": 1.777856407044778, + "language_loss": 0.73834693, + "learning_rate": 1.920376134993436e-06, + "loss": 0.7627207, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.14355469, + "step": 8770, + "time_per_iteration": 4.313755989074707 + }, + { + "auxiliary_loss_clip": 0.01405131, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.2857852, + "balance_loss_mlp": 1.02228165, + "epoch": 0.5273410491507591, + "flos": 28262620052640.0, + "grad_norm": 1.6881629962894333, + "language_loss": 0.68035078, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70477277, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14807129, + "step": 8771, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.01398287, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.28066444, + "balance_loss_mlp": 1.02370787, + "epoch": 0.527401172403427, + "flos": 22460233176360.0, + "grad_norm": 1.7112950016195971, + "language_loss": 0.76666033, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.79102796, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14770508, + "step": 8772, + "time_per_iteration": 2.820371627807617 + }, + { + "auxiliary_loss_clip": 0.0139589, + "auxiliary_loss_mlp": 0.01045391, + "balance_loss_clip": 1.27457595, + "balance_loss_mlp": 1.02989423, + "epoch": 0.527461295656095, + "flos": 21035826265680.0, + "grad_norm": 2.0160441931873896, + "language_loss": 0.66250831, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68692112, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.1550293, + "step": 8773, + "time_per_iteration": 2.826406955718994 + }, + { + "auxiliary_loss_clip": 0.01398772, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.27716541, + "balance_loss_mlp": 1.0221405, + "epoch": 0.5275214189087629, + "flos": 26327516493240.0, + "grad_norm": 1.5345123650527304, + "language_loss": 0.86364871, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88799626, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.13861084, + "step": 8774, + "time_per_iteration": 2.7856740951538086 + }, + { + "auxiliary_loss_clip": 0.01395642, + "auxiliary_loss_mlp": 0.01030128, + "balance_loss_clip": 1.2769289, + "balance_loss_mlp": 1.01654959, + "epoch": 0.5275815421614309, + "flos": 20051775678240.0, + "grad_norm": 1.427059178901779, + "language_loss": 0.80298394, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82724166, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.13574219, + "step": 8775, + "time_per_iteration": 2.841663360595703 + }, + { + "auxiliary_loss_clip": 0.01386285, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.27005124, + "balance_loss_mlp": 1.02239823, + "epoch": 0.5276416654140988, + "flos": 21436971894360.0, + "grad_norm": 1.5988808601683218, + "language_loss": 0.83894956, + "learning_rate": 1.918041272397012e-06, + "loss": 0.86318403, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14776611, + "step": 8776, + "time_per_iteration": 4.336820125579834 + }, + { + "auxiliary_loss_clip": 0.01392295, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.27216554, + "balance_loss_mlp": 1.01827264, + "epoch": 0.5277017886667669, + "flos": 17169517549440.0, + "grad_norm": 1.614573935489875, + "language_loss": 0.68198645, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70623696, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14477539, + "step": 8777, + "time_per_iteration": 2.83972430229187 + }, + { + "auxiliary_loss_clip": 0.01395386, + "auxiliary_loss_mlp": 0.01038396, + "balance_loss_clip": 1.2786783, + "balance_loss_mlp": 1.02430522, + "epoch": 0.5277619119194349, + "flos": 20452921306920.0, + "grad_norm": 1.4610377735205604, + "language_loss": 0.82729423, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.85163212, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14099121, + "step": 8778, + "time_per_iteration": 2.892730474472046 + }, + { + "auxiliary_loss_clip": 0.01401462, + "auxiliary_loss_mlp": 0.010398, + "balance_loss_clip": 1.28056872, + "balance_loss_mlp": 1.02447522, + "epoch": 0.5278220351721028, + "flos": 24066440025480.0, + "grad_norm": 1.9872117254214041, + "language_loss": 0.79475152, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81916416, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.15332031, + "step": 8779, + "time_per_iteration": 2.773062229156494 + }, + { + "auxiliary_loss_clip": 0.01389842, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.27217007, + "balance_loss_mlp": 1.02024126, + "epoch": 0.5278821584247708, + "flos": 24648086125080.0, + "grad_norm": 2.2032826830823846, + "language_loss": 0.76787162, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.79211563, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14331055, + "step": 8780, + "time_per_iteration": 2.7684242725372314 + }, + { + "auxiliary_loss_clip": 0.01401445, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.27948165, + "balance_loss_mlp": 1.01730382, + "epoch": 0.5279422816774387, + "flos": 35415093894840.0, + "grad_norm": 1.5364412090706905, + "language_loss": 0.69404489, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71837884, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.14660645, + "step": 8781, + "time_per_iteration": 2.89052152633667 + }, + { + "auxiliary_loss_clip": 0.01391281, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.27635932, + "balance_loss_mlp": 1.02267241, + "epoch": 0.5280024049301068, + "flos": 22972310509320.0, + "grad_norm": 1.5174037957116808, + "language_loss": 0.72503686, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.749313, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13684082, + "step": 8782, + "time_per_iteration": 2.7754604816436768 + }, + { + "auxiliary_loss_clip": 0.01393003, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.27648449, + "balance_loss_mlp": 1.01822996, + "epoch": 0.5280625281827747, + "flos": 21512713131720.0, + "grad_norm": 1.7063031663341044, + "language_loss": 0.68791163, + "learning_rate": 1.915317407666982e-06, + "loss": 0.71216261, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13879395, + "step": 8783, + "time_per_iteration": 2.790987968444824 + }, + { + "auxiliary_loss_clip": 0.01416872, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_clip": 1.29122591, + "balance_loss_mlp": 1.02504981, + "epoch": 0.5281226514354427, + "flos": 31213919039400.0, + "grad_norm": 1.9533133482885907, + "language_loss": 0.69410396, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71869451, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.17114258, + "step": 8784, + "time_per_iteration": 2.8193602561950684 + }, + { + "auxiliary_loss_clip": 0.01413602, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.28841734, + "balance_loss_mlp": 1.01803446, + "epoch": 0.5281827746881106, + "flos": 25082676061200.0, + "grad_norm": 2.239516779751235, + "language_loss": 0.75288534, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77736694, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.16516113, + "step": 8785, + "time_per_iteration": 2.806082248687744 + }, + { + "auxiliary_loss_clip": 0.01397444, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.27797949, + "balance_loss_mlp": 1.0216217, + "epoch": 0.5282428979407786, + "flos": 20636142537960.0, + "grad_norm": 1.4973956544969615, + "language_loss": 0.83295137, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.8572982, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.15625, + "step": 8786, + "time_per_iteration": 2.7762324810028076 + }, + { + "auxiliary_loss_clip": 0.01386423, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.27130342, + "balance_loss_mlp": 1.01534104, + "epoch": 0.5283030211934465, + "flos": 22424555409120.0, + "grad_norm": 2.0408978334503125, + "language_loss": 0.8337996, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85794568, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.128479, + "step": 8787, + "time_per_iteration": 2.760305404663086 + }, + { + "auxiliary_loss_clip": 0.0139724, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.28086138, + "balance_loss_mlp": 1.01847363, + "epoch": 0.5283631444461145, + "flos": 23619748798080.0, + "grad_norm": 1.7226254465497601, + "language_loss": 0.83747321, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.86176765, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13745117, + "step": 8788, + "time_per_iteration": 2.739060401916504 + }, + { + "auxiliary_loss_clip": 0.01390708, + "auxiliary_loss_mlp": 0.01037944, + "balance_loss_clip": 1.2738924, + "balance_loss_mlp": 1.02231574, + "epoch": 0.5284232676987825, + "flos": 32678511245280.0, + "grad_norm": 1.433483981265985, + "language_loss": 0.75335395, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77764046, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.15625, + "step": 8789, + "time_per_iteration": 2.8553402423858643 + }, + { + "auxiliary_loss_clip": 0.01405542, + "auxiliary_loss_mlp": 0.01037438, + "balance_loss_clip": 1.28522086, + "balance_loss_mlp": 1.02203631, + "epoch": 0.5284833909514505, + "flos": 26766613957320.0, + "grad_norm": 1.7020841826170616, + "language_loss": 0.70141935, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.72584915, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.15405273, + "step": 8790, + "time_per_iteration": 2.834608316421509 + }, + { + "auxiliary_loss_clip": 0.01395058, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.27943099, + "balance_loss_mlp": 1.01625288, + "epoch": 0.5285435142041185, + "flos": 22095780523920.0, + "grad_norm": 3.5760276695684134, + "language_loss": 0.79441071, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81865609, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.13214111, + "step": 8791, + "time_per_iteration": 2.753319501876831 + }, + { + "auxiliary_loss_clip": 0.01391354, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.27523434, + "balance_loss_mlp": 1.01901388, + "epoch": 0.5286036374567864, + "flos": 20380266304920.0, + "grad_norm": 2.178439702830439, + "language_loss": 0.66281223, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68705618, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14025879, + "step": 8792, + "time_per_iteration": 2.7565271854400635 + }, + { + "auxiliary_loss_clip": 0.01396831, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.2790246, + "balance_loss_mlp": 1.0217104, + "epoch": 0.5286637607094544, + "flos": 24357466117080.0, + "grad_norm": 2.139782793369099, + "language_loss": 0.7976675, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82198763, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13476562, + "step": 8793, + "time_per_iteration": 2.7503247261047363 + }, + { + "auxiliary_loss_clip": 0.01399741, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.2817173, + "balance_loss_mlp": 1.02567077, + "epoch": 0.5287238839621223, + "flos": 17275170166920.0, + "grad_norm": 2.0496057099660057, + "language_loss": 0.84514767, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86954778, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.14611816, + "step": 8794, + "time_per_iteration": 2.775290012359619 + }, + { + "auxiliary_loss_clip": 0.01410545, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.28550541, + "balance_loss_mlp": 1.02465713, + "epoch": 0.5287840072147904, + "flos": 17571434736960.0, + "grad_norm": 1.893699510300115, + "language_loss": 0.68419808, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.70870775, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.15771484, + "step": 8795, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.01402209, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.28304195, + "balance_loss_mlp": 1.02269471, + "epoch": 0.5288441304674583, + "flos": 18556784791920.0, + "grad_norm": 1.8947883459838437, + "language_loss": 0.81330985, + "learning_rate": 1.910259223028374e-06, + "loss": 0.83770752, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14849854, + "step": 8796, + "time_per_iteration": 2.8086228370666504 + }, + { + "auxiliary_loss_clip": 0.01402965, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.28381693, + "balance_loss_mlp": 1.02110434, + "epoch": 0.5289042537201263, + "flos": 20819526202440.0, + "grad_norm": 1.490623613682461, + "language_loss": 0.69359815, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71799159, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.15264893, + "step": 8797, + "time_per_iteration": 2.8490211963653564 + }, + { + "auxiliary_loss_clip": 0.01396481, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.28205347, + "balance_loss_mlp": 1.02101994, + "epoch": 0.5289643769727942, + "flos": 15738613301160.0, + "grad_norm": 1.8379653259999102, + "language_loss": 0.82330728, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84762216, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13977051, + "step": 8798, + "time_per_iteration": 2.761121988296509 + }, + { + "auxiliary_loss_clip": 0.01409443, + "auxiliary_loss_mlp": 0.01044492, + "balance_loss_clip": 1.28598022, + "balance_loss_mlp": 1.02893543, + "epoch": 0.5290245002254622, + "flos": 19542134846880.0, + "grad_norm": 1.7105104079038425, + "language_loss": 0.70631647, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.73085582, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.15545654, + "step": 8799, + "time_per_iteration": 2.8509669303894043 + }, + { + "auxiliary_loss_clip": 0.01387721, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.27499557, + "balance_loss_mlp": 1.02308309, + "epoch": 0.5290846234781301, + "flos": 15819389975160.0, + "grad_norm": 1.8948585678965104, + "language_loss": 0.68802619, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71226859, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13446045, + "step": 8800, + "time_per_iteration": 2.72416090965271 + }, + { + "auxiliary_loss_clip": 0.01190495, + "auxiliary_loss_mlp": 0.01006812, + "balance_loss_clip": 1.14448178, + "balance_loss_mlp": 1.00440371, + "epoch": 0.5291447467307981, + "flos": 70072183271880.0, + "grad_norm": 0.9411809917930009, + "language_loss": 0.57004553, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.5920186, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02404785, + "step": 8801, + "time_per_iteration": 3.1618154048919678 + }, + { + "auxiliary_loss_clip": 0.01400359, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.28009462, + "balance_loss_mlp": 1.02255821, + "epoch": 0.529204869983466, + "flos": 28369815787800.0, + "grad_norm": 1.5286871559299173, + "language_loss": 0.64309001, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.6674602, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14105225, + "step": 8802, + "time_per_iteration": 4.37914514541626 + }, + { + "auxiliary_loss_clip": 0.01398996, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.28147197, + "balance_loss_mlp": 1.01734066, + "epoch": 0.5292649932361341, + "flos": 33764478481080.0, + "grad_norm": 1.6150466960692116, + "language_loss": 0.69011015, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71442205, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.1484375, + "step": 8803, + "time_per_iteration": 2.8528242111206055 + }, + { + "auxiliary_loss_clip": 0.01400471, + "auxiliary_loss_mlp": 0.01038026, + "balance_loss_clip": 1.28472066, + "balance_loss_mlp": 1.02372098, + "epoch": 0.5293251164888021, + "flos": 20452393398240.0, + "grad_norm": 1.5543402567170448, + "language_loss": 0.76263142, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78701639, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14306641, + "step": 8804, + "time_per_iteration": 2.7798967361450195 + }, + { + "auxiliary_loss_clip": 0.01190528, + "auxiliary_loss_mlp": 0.01004703, + "balance_loss_clip": 1.1450069, + "balance_loss_mlp": 1.0022831, + "epoch": 0.52938523974147, + "flos": 66562912859400.0, + "grad_norm": 0.7588169350439912, + "language_loss": 0.53046775, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55242002, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.02416992, + "step": 8805, + "time_per_iteration": 4.7421815395355225 + }, + { + "auxiliary_loss_clip": 0.01189904, + "auxiliary_loss_mlp": 0.0100785, + "balance_loss_clip": 1.14387751, + "balance_loss_mlp": 1.0054425, + "epoch": 0.529445362994138, + "flos": 67167607526280.0, + "grad_norm": 0.7459654874902125, + "language_loss": 0.63848281, + "learning_rate": 1.906368701413693e-06, + "loss": 0.66046035, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02404785, + "step": 8806, + "time_per_iteration": 3.1895511150360107 + }, + { + "auxiliary_loss_clip": 0.01416393, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.29069281, + "balance_loss_mlp": 1.02499688, + "epoch": 0.5295054862468059, + "flos": 17753884409160.0, + "grad_norm": 3.0381761107806224, + "language_loss": 0.72122395, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74578989, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.15209961, + "step": 8807, + "time_per_iteration": 2.761461019515991 + }, + { + "auxiliary_loss_clip": 0.01397345, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.2806251, + "balance_loss_mlp": 1.0240258, + "epoch": 0.529565609499474, + "flos": 11400696805680.0, + "grad_norm": 2.070393982489467, + "language_loss": 0.69917166, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.72351694, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.1315918, + "step": 8808, + "time_per_iteration": 4.128919839859009 + }, + { + "auxiliary_loss_clip": 0.01403094, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.28507674, + "balance_loss_mlp": 1.02164876, + "epoch": 0.5296257327521419, + "flos": 17200119271680.0, + "grad_norm": 1.6798070022939384, + "language_loss": 0.86913353, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.89351958, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13861084, + "step": 8809, + "time_per_iteration": 2.6981356143951416 + }, + { + "auxiliary_loss_clip": 0.01417771, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.29229629, + "balance_loss_mlp": 1.0238117, + "epoch": 0.5296858560048099, + "flos": 39970894179600.0, + "grad_norm": 1.7950588434907486, + "language_loss": 0.64841199, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.67299342, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.16577148, + "step": 8810, + "time_per_iteration": 2.970458984375 + }, + { + "auxiliary_loss_clip": 0.01392258, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.27592385, + "balance_loss_mlp": 1.02624297, + "epoch": 0.5297459792574778, + "flos": 20966582365920.0, + "grad_norm": 1.47876890809565, + "language_loss": 0.68141133, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70573723, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14105225, + "step": 8811, + "time_per_iteration": 2.7778754234313965 + }, + { + "auxiliary_loss_clip": 0.01186989, + "auxiliary_loss_mlp": 0.01004434, + "balance_loss_clip": 1.14142871, + "balance_loss_mlp": 1.00169206, + "epoch": 0.5298061025101458, + "flos": 66538385409960.0, + "grad_norm": 0.6667173850610929, + "language_loss": 0.5339995, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55591375, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02746582, + "step": 8812, + "time_per_iteration": 3.3731577396392822 + }, + { + "auxiliary_loss_clip": 0.01190205, + "auxiliary_loss_mlp": 0.01003611, + "balance_loss_clip": 1.14493132, + "balance_loss_mlp": 1.00094116, + "epoch": 0.5298662257628137, + "flos": 67678994517120.0, + "grad_norm": 0.7303538724909138, + "language_loss": 0.56414723, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58608532, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.0267334, + "step": 8813, + "time_per_iteration": 3.2975661754608154 + }, + { + "auxiliary_loss_clip": 0.01388755, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.27497053, + "balance_loss_mlp": 1.01807523, + "epoch": 0.5299263490154817, + "flos": 19651198566600.0, + "grad_norm": 1.5631103457100282, + "language_loss": 0.81976593, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.84397686, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14263916, + "step": 8814, + "time_per_iteration": 2.7862346172332764 + }, + { + "auxiliary_loss_clip": 0.01413949, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.29125524, + "balance_loss_mlp": 1.02177739, + "epoch": 0.5299864722681497, + "flos": 22060183973400.0, + "grad_norm": 1.579478606045096, + "language_loss": 0.85036302, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87486106, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.140625, + "step": 8815, + "time_per_iteration": 4.35550856590271 + }, + { + "auxiliary_loss_clip": 0.0139685, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.28151965, + "balance_loss_mlp": 1.02091968, + "epoch": 0.5300465955208177, + "flos": 21768914231640.0, + "grad_norm": 1.9711135192236378, + "language_loss": 0.66664892, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.69096249, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13592529, + "step": 8816, + "time_per_iteration": 2.926809787750244 + }, + { + "auxiliary_loss_clip": 0.01400753, + "auxiliary_loss_mlp": 0.01041157, + "balance_loss_clip": 1.2833364, + "balance_loss_mlp": 1.02678061, + "epoch": 0.5301067187734857, + "flos": 43004228699520.0, + "grad_norm": 1.5930701693807632, + "language_loss": 0.73173368, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.75615275, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.14385986, + "step": 8817, + "time_per_iteration": 2.9783706665039062 + }, + { + "auxiliary_loss_clip": 0.01399652, + "auxiliary_loss_mlp": 0.01038495, + "balance_loss_clip": 1.27961457, + "balance_loss_mlp": 1.02408862, + "epoch": 0.5301668420261536, + "flos": 20558655141120.0, + "grad_norm": 1.575289629927545, + "language_loss": 0.65135992, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67574137, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.144104, + "step": 8818, + "time_per_iteration": 2.8038811683654785 + }, + { + "auxiliary_loss_clip": 0.01401282, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.28159869, + "balance_loss_mlp": 1.01872706, + "epoch": 0.5302269652788216, + "flos": 17490251979360.0, + "grad_norm": 1.8921835702591507, + "language_loss": 0.75264055, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77699029, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14978027, + "step": 8819, + "time_per_iteration": 2.881563663482666 + }, + { + "auxiliary_loss_clip": 0.01413915, + "auxiliary_loss_mlp": 0.01037559, + "balance_loss_clip": 1.29142261, + "balance_loss_mlp": 1.0230577, + "epoch": 0.5302870885314895, + "flos": 14578244903880.0, + "grad_norm": 1.909700639295425, + "language_loss": 0.82256711, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84708184, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.14508057, + "step": 8820, + "time_per_iteration": 2.720158100128174 + }, + { + "auxiliary_loss_clip": 0.01401626, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_clip": 1.2817595, + "balance_loss_mlp": 1.02922034, + "epoch": 0.5303472117841576, + "flos": 23442943687920.0, + "grad_norm": 1.3600969177316886, + "language_loss": 0.72830927, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.752756, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.13818359, + "step": 8821, + "time_per_iteration": 2.8610377311706543 + }, + { + "auxiliary_loss_clip": 0.01397563, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.28195989, + "balance_loss_mlp": 1.02849889, + "epoch": 0.5304073350368255, + "flos": 22713916557960.0, + "grad_norm": 1.4467708755233342, + "language_loss": 0.74685025, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.7712431, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13226318, + "step": 8822, + "time_per_iteration": 2.7256007194519043 + }, + { + "auxiliary_loss_clip": 0.01401243, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.28159428, + "balance_loss_mlp": 1.02369571, + "epoch": 0.5304674582894935, + "flos": 27934413684480.0, + "grad_norm": 1.6024384822558442, + "language_loss": 0.6758728, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.70027715, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.15478516, + "step": 8823, + "time_per_iteration": 2.7938129901885986 + }, + { + "auxiliary_loss_clip": 0.01408344, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.28621054, + "balance_loss_mlp": 1.02535319, + "epoch": 0.5305275815421614, + "flos": 21255496822800.0, + "grad_norm": 1.542427927061086, + "language_loss": 0.69590831, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.72040248, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.15722656, + "step": 8824, + "time_per_iteration": 2.7198429107666016 + }, + { + "auxiliary_loss_clip": 0.01390079, + "auxiliary_loss_mlp": 0.01036327, + "balance_loss_clip": 1.2745378, + "balance_loss_mlp": 1.02251065, + "epoch": 0.5305877047948294, + "flos": 17607599804520.0, + "grad_norm": 2.0475879579889655, + "language_loss": 0.76411355, + "learning_rate": 1.898977700702689e-06, + "loss": 0.78837764, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13818359, + "step": 8825, + "time_per_iteration": 2.8081343173980713 + }, + { + "auxiliary_loss_clip": 0.01394227, + "auxiliary_loss_mlp": 0.01043745, + "balance_loss_clip": 1.27811503, + "balance_loss_mlp": 1.0299468, + "epoch": 0.5306478280474973, + "flos": 15199751431800.0, + "grad_norm": 1.7664521653201344, + "language_loss": 0.86041915, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.88479888, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13824463, + "step": 8826, + "time_per_iteration": 2.757009983062744 + }, + { + "auxiliary_loss_clip": 0.01393144, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.27642417, + "balance_loss_mlp": 1.0245384, + "epoch": 0.5307079513001653, + "flos": 15345467519400.0, + "grad_norm": 1.455203920238347, + "language_loss": 0.64581072, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.67013192, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14440918, + "step": 8827, + "time_per_iteration": 2.762686252593994 + }, + { + "auxiliary_loss_clip": 0.01399921, + "auxiliary_loss_mlp": 0.01044994, + "balance_loss_clip": 1.28009987, + "balance_loss_mlp": 1.02996182, + "epoch": 0.5307680745528333, + "flos": 43552349274960.0, + "grad_norm": 1.6896469077195557, + "language_loss": 0.60000455, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62445366, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.15026855, + "step": 8828, + "time_per_iteration": 2.9764564037323 + }, + { + "auxiliary_loss_clip": 0.01399565, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.27938104, + "balance_loss_mlp": 1.02374935, + "epoch": 0.5308281978055013, + "flos": 20053927921320.0, + "grad_norm": 2.07675313239185, + "language_loss": 0.81321108, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83759463, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.15039062, + "step": 8829, + "time_per_iteration": 2.741806983947754 + }, + { + "auxiliary_loss_clip": 0.01395101, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.27860451, + "balance_loss_mlp": 1.02403772, + "epoch": 0.5308883210581693, + "flos": 20708797539960.0, + "grad_norm": 1.3477789868009364, + "language_loss": 0.78829682, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.81262946, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14135742, + "step": 8830, + "time_per_iteration": 2.779463768005371 + }, + { + "auxiliary_loss_clip": 0.01395644, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.27755642, + "balance_loss_mlp": 1.01628518, + "epoch": 0.5309484443108372, + "flos": 14359548947400.0, + "grad_norm": 1.9334980665554296, + "language_loss": 0.80581093, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83007056, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.14044189, + "step": 8831, + "time_per_iteration": 2.733402967453003 + }, + { + "auxiliary_loss_clip": 0.01390703, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.27401483, + "balance_loss_mlp": 1.0187856, + "epoch": 0.5310085675635052, + "flos": 20015245135440.0, + "grad_norm": 2.0299513582105626, + "language_loss": 0.73488146, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75911975, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14343262, + "step": 8832, + "time_per_iteration": 2.856295108795166 + }, + { + "auxiliary_loss_clip": 0.01407347, + "auxiliary_loss_mlp": 0.01041716, + "balance_loss_clip": 1.28593016, + "balance_loss_mlp": 1.02650499, + "epoch": 0.5310686908161731, + "flos": 22132392283440.0, + "grad_norm": 1.8605730310549653, + "language_loss": 0.75916636, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.78365701, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.15209961, + "step": 8833, + "time_per_iteration": 2.8321115970611572 + }, + { + "auxiliary_loss_clip": 0.01403135, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.28195322, + "balance_loss_mlp": 1.02130961, + "epoch": 0.5311288140688412, + "flos": 24723340062120.0, + "grad_norm": 1.9365238603592996, + "language_loss": 0.74086154, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.76525021, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.14428711, + "step": 8834, + "time_per_iteration": 2.7839126586914062 + }, + { + "auxiliary_loss_clip": 0.01412246, + "auxiliary_loss_mlp": 0.01045779, + "balance_loss_clip": 1.28658676, + "balance_loss_mlp": 1.03025794, + "epoch": 0.5311889373215091, + "flos": 24103092393360.0, + "grad_norm": 1.7176830511024181, + "language_loss": 0.77878779, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.80336803, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.15527344, + "step": 8835, + "time_per_iteration": 2.7737619876861572 + }, + { + "auxiliary_loss_clip": 0.01397814, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.27788901, + "balance_loss_mlp": 1.02513242, + "epoch": 0.5312490605741771, + "flos": 22021988487840.0, + "grad_norm": 1.6080872611769512, + "language_loss": 0.72601235, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.75039816, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.15631104, + "step": 8836, + "time_per_iteration": 2.787903308868408 + }, + { + "auxiliary_loss_clip": 0.01402329, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.28055429, + "balance_loss_mlp": 1.02785707, + "epoch": 0.531309183826845, + "flos": 19395241116840.0, + "grad_norm": 1.6200744565775484, + "language_loss": 0.80969548, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83415061, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.15332031, + "step": 8837, + "time_per_iteration": 2.850820779800415 + }, + { + "auxiliary_loss_clip": 0.01394294, + "auxiliary_loss_mlp": 0.01033271, + "balance_loss_clip": 1.2781899, + "balance_loss_mlp": 1.01819062, + "epoch": 0.531369307079513, + "flos": 20194446138840.0, + "grad_norm": 1.7160896725644776, + "language_loss": 0.8611877, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88546336, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.15081787, + "step": 8838, + "time_per_iteration": 2.877741575241089 + }, + { + "auxiliary_loss_clip": 0.0139064, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.27291036, + "balance_loss_mlp": 1.02314019, + "epoch": 0.5314294303321809, + "flos": 18884828726640.0, + "grad_norm": 1.5628022397560803, + "language_loss": 0.72715843, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75143528, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.13928223, + "step": 8839, + "time_per_iteration": 2.7533092498779297 + }, + { + "auxiliary_loss_clip": 0.01401475, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.27993441, + "balance_loss_mlp": 1.02993345, + "epoch": 0.531489553584849, + "flos": 23045330986560.0, + "grad_norm": 1.7489391751527126, + "language_loss": 0.7669397, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79139614, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14245605, + "step": 8840, + "time_per_iteration": 2.7161004543304443 + }, + { + "auxiliary_loss_clip": 0.01399536, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.27797103, + "balance_loss_mlp": 1.02379048, + "epoch": 0.5315496768375169, + "flos": 19795046669640.0, + "grad_norm": 2.048873665353626, + "language_loss": 0.77406085, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79845166, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.15740967, + "step": 8841, + "time_per_iteration": 2.7281463146209717 + }, + { + "auxiliary_loss_clip": 0.01187202, + "auxiliary_loss_mlp": 0.0100391, + "balance_loss_clip": 1.14188576, + "balance_loss_mlp": 1.00134683, + "epoch": 0.5316098000901849, + "flos": 71040129321600.0, + "grad_norm": 0.6922794736923731, + "language_loss": 0.56805205, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.5899632, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02563477, + "step": 8842, + "time_per_iteration": 6.200224161148071 + }, + { + "auxiliary_loss_clip": 0.01403606, + "auxiliary_loss_mlp": 0.01039384, + "balance_loss_clip": 1.28258979, + "balance_loss_mlp": 1.02367246, + "epoch": 0.5316699233428529, + "flos": 16439921902440.0, + "grad_norm": 1.6584774814272176, + "language_loss": 0.73519802, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75962794, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.15722656, + "step": 8843, + "time_per_iteration": 2.7465457916259766 + }, + { + "auxiliary_loss_clip": 0.01187369, + "auxiliary_loss_mlp": 0.01004779, + "balance_loss_clip": 1.14212811, + "balance_loss_mlp": 1.00182235, + "epoch": 0.5317300465955208, + "flos": 67438508852520.0, + "grad_norm": 0.8606621782491757, + "language_loss": 0.61089635, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63281786, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02954102, + "step": 8844, + "time_per_iteration": 3.2807936668395996 + }, + { + "auxiliary_loss_clip": 0.01186593, + "auxiliary_loss_mlp": 0.01006735, + "balance_loss_clip": 1.14101839, + "balance_loss_mlp": 1.00373065, + "epoch": 0.5317901698481888, + "flos": 59520582181440.0, + "grad_norm": 0.8335975305269316, + "language_loss": 0.62228185, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64421505, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.0300293, + "step": 8845, + "time_per_iteration": 3.2108652591705322 + }, + { + "auxiliary_loss_clip": 0.01391758, + "auxiliary_loss_mlp": 0.01045531, + "balance_loss_clip": 1.2739352, + "balance_loss_mlp": 1.02965808, + "epoch": 0.5318502931008567, + "flos": 19133111196360.0, + "grad_norm": 1.8432230036310637, + "language_loss": 0.76015055, + "learning_rate": 1.890810312970474e-06, + "loss": 0.78452343, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.15869141, + "step": 8846, + "time_per_iteration": 2.705129861831665 + }, + { + "auxiliary_loss_clip": 0.01403682, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.2833693, + "balance_loss_mlp": 1.02555227, + "epoch": 0.5319104163535248, + "flos": 24686444044080.0, + "grad_norm": 1.6275983305522956, + "language_loss": 0.75433189, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.7787596, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.13531494, + "step": 8847, + "time_per_iteration": 4.310090065002441 + }, + { + "auxiliary_loss_clip": 0.01390932, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.27318597, + "balance_loss_mlp": 1.02301168, + "epoch": 0.5319705396061927, + "flos": 19389555946440.0, + "grad_norm": 1.4966300303659261, + "language_loss": 0.88016033, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.90444434, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.14471436, + "step": 8848, + "time_per_iteration": 2.7566633224487305 + }, + { + "auxiliary_loss_clip": 0.01399292, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_clip": 1.27891576, + "balance_loss_mlp": 1.02889884, + "epoch": 0.5320306628588607, + "flos": 18263850107400.0, + "grad_norm": 2.2010010735878534, + "language_loss": 0.75026941, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.77470851, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.1572876, + "step": 8849, + "time_per_iteration": 2.707859754562378 + }, + { + "auxiliary_loss_clip": 0.0140517, + "auxiliary_loss_mlp": 0.0103489, + "balance_loss_clip": 1.28173053, + "balance_loss_mlp": 1.01948786, + "epoch": 0.5320907861115286, + "flos": 23737583923560.0, + "grad_norm": 1.7189076419048868, + "language_loss": 0.80002737, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.8244279, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.15405273, + "step": 8850, + "time_per_iteration": 2.7717950344085693 + }, + { + "auxiliary_loss_clip": 0.01397284, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.27740014, + "balance_loss_mlp": 1.02461922, + "epoch": 0.5321509093641966, + "flos": 34501870933200.0, + "grad_norm": 1.3311273127041343, + "language_loss": 0.55131006, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57566845, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.13946533, + "step": 8851, + "time_per_iteration": 2.871825695037842 + }, + { + "auxiliary_loss_clip": 0.01396987, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.27567315, + "balance_loss_mlp": 1.02112424, + "epoch": 0.5322110326168645, + "flos": 20015732435760.0, + "grad_norm": 1.5449163315436776, + "language_loss": 0.68672395, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.71104628, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.14135742, + "step": 8852, + "time_per_iteration": 2.7258191108703613 + }, + { + "auxiliary_loss_clip": 0.01185318, + "auxiliary_loss_mlp": 0.01001515, + "balance_loss_clip": 1.13912666, + "balance_loss_mlp": 0.99910659, + "epoch": 0.5322711558695326, + "flos": 64646309730960.0, + "grad_norm": 0.803028840899896, + "language_loss": 0.62940973, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.65127808, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02404785, + "step": 8853, + "time_per_iteration": 4.740699768066406 + }, + { + "auxiliary_loss_clip": 0.01407359, + "auxiliary_loss_mlp": 0.01040975, + "balance_loss_clip": 1.28423452, + "balance_loss_mlp": 1.02587152, + "epoch": 0.5323312791222005, + "flos": 14943469115160.0, + "grad_norm": 5.447191211501722, + "language_loss": 0.8001855, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.82466882, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.15106201, + "step": 8854, + "time_per_iteration": 2.780768871307373 + }, + { + "auxiliary_loss_clip": 0.01380235, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.26569486, + "balance_loss_mlp": 1.02706861, + "epoch": 0.5323914023748685, + "flos": 23446111140000.0, + "grad_norm": 1.568893799536918, + "language_loss": 0.73811257, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.76231593, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13031006, + "step": 8855, + "time_per_iteration": 2.7880899906158447 + }, + { + "auxiliary_loss_clip": 0.01386007, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.26908708, + "balance_loss_mlp": 1.02199078, + "epoch": 0.5324515256275365, + "flos": 26291757509280.0, + "grad_norm": 1.9412956558802001, + "language_loss": 0.65122253, + "learning_rate": 1.886921714110507e-06, + "loss": 0.6754297, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.12713623, + "step": 8856, + "time_per_iteration": 2.8586061000823975 + }, + { + "auxiliary_loss_clip": 0.01401215, + "auxiliary_loss_mlp": 0.01037772, + "balance_loss_clip": 1.27776599, + "balance_loss_mlp": 1.02258492, + "epoch": 0.5325116488802044, + "flos": 26876692886040.0, + "grad_norm": 1.8465705154833354, + "language_loss": 0.77741134, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.80180126, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.15185547, + "step": 8857, + "time_per_iteration": 2.859833002090454 + }, + { + "auxiliary_loss_clip": 0.01387163, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.26932979, + "balance_loss_mlp": 1.02065086, + "epoch": 0.5325717721328724, + "flos": 25890246405360.0, + "grad_norm": 1.8321702470204133, + "language_loss": 0.71056628, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73478878, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14440918, + "step": 8858, + "time_per_iteration": 2.81394100189209 + }, + { + "auxiliary_loss_clip": 0.0139267, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.27307653, + "balance_loss_mlp": 1.02532983, + "epoch": 0.5326318953855403, + "flos": 21804429565440.0, + "grad_norm": 2.297862597945247, + "language_loss": 0.69756466, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.72189808, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.15338135, + "step": 8859, + "time_per_iteration": 2.76706600189209 + }, + { + "auxiliary_loss_clip": 0.01373264, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.2607528, + "balance_loss_mlp": 1.0184319, + "epoch": 0.5326920186382084, + "flos": 20927818363320.0, + "grad_norm": 1.8592506769555432, + "language_loss": 0.69199556, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71603644, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.1239624, + "step": 8860, + "time_per_iteration": 2.74245548248291 + }, + { + "auxiliary_loss_clip": 0.01386059, + "auxiliary_loss_mlp": 0.01033439, + "balance_loss_clip": 1.26960611, + "balance_loss_mlp": 1.01902044, + "epoch": 0.5327521418908763, + "flos": 21438352578600.0, + "grad_norm": 4.099753295929296, + "language_loss": 0.77884114, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80303609, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14416504, + "step": 8861, + "time_per_iteration": 2.756835460662842 + }, + { + "auxiliary_loss_clip": 0.01384242, + "auxiliary_loss_mlp": 0.01040912, + "balance_loss_clip": 1.26640618, + "balance_loss_mlp": 1.02540886, + "epoch": 0.5328122651435443, + "flos": 21765015829080.0, + "grad_norm": 1.8705221327689308, + "language_loss": 0.85724664, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.88149816, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.15496826, + "step": 8862, + "time_per_iteration": 2.7844889163970947 + }, + { + "auxiliary_loss_clip": 0.01395401, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.27456307, + "balance_loss_mlp": 1.02646697, + "epoch": 0.5328723883962122, + "flos": 18301111600680.0, + "grad_norm": 2.1578285260190104, + "language_loss": 0.62414074, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.64851487, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.15533447, + "step": 8863, + "time_per_iteration": 2.7366831302642822 + }, + { + "auxiliary_loss_clip": 0.01380253, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.26691973, + "balance_loss_mlp": 1.02193296, + "epoch": 0.5329325116488802, + "flos": 25380483748920.0, + "grad_norm": 1.7969146532104991, + "language_loss": 0.74199218, + "learning_rate": 1.883811143046377e-06, + "loss": 0.76614827, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13421631, + "step": 8864, + "time_per_iteration": 2.761169672012329 + }, + { + "auxiliary_loss_clip": 0.01385993, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.26852608, + "balance_loss_mlp": 1.02406073, + "epoch": 0.5329926349015481, + "flos": 25597474154280.0, + "grad_norm": 1.916388799728745, + "language_loss": 0.64676738, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.67100185, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13397217, + "step": 8865, + "time_per_iteration": 2.804872512817383 + }, + { + "auxiliary_loss_clip": 0.01389834, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.27139556, + "balance_loss_mlp": 1.01705909, + "epoch": 0.5330527581542162, + "flos": 22894295203800.0, + "grad_norm": 1.7589388531891246, + "language_loss": 0.78983521, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.81405205, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14794922, + "step": 8866, + "time_per_iteration": 2.8500423431396484 + }, + { + "auxiliary_loss_clip": 0.01386749, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.26886916, + "balance_loss_mlp": 1.01971459, + "epoch": 0.5331128814068841, + "flos": 16030086084720.0, + "grad_norm": 3.112498749017555, + "language_loss": 0.73737937, + "learning_rate": 1.882644751189108e-06, + "loss": 0.76159406, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.15008545, + "step": 8867, + "time_per_iteration": 2.7900285720825195 + }, + { + "auxiliary_loss_clip": 0.0139036, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.27231932, + "balance_loss_mlp": 1.02192903, + "epoch": 0.5331730046595521, + "flos": 39351539894760.0, + "grad_norm": 1.4660817782185998, + "language_loss": 0.72166812, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74594903, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.15783691, + "step": 8868, + "time_per_iteration": 3.004465103149414 + }, + { + "auxiliary_loss_clip": 0.01387607, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.27003598, + "balance_loss_mlp": 1.0185802, + "epoch": 0.5332331279122201, + "flos": 24029706440880.0, + "grad_norm": 1.785356486858301, + "language_loss": 0.78830075, + "learning_rate": 1.881867178843637e-06, + "loss": 0.81250137, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13879395, + "step": 8869, + "time_per_iteration": 2.8093340396881104 + }, + { + "auxiliary_loss_clip": 0.01399741, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.27609968, + "balance_loss_mlp": 1.01942754, + "epoch": 0.533293251164888, + "flos": 17133839782200.0, + "grad_norm": 1.6831787816449757, + "language_loss": 0.7605114, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.78485173, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.14868164, + "step": 8870, + "time_per_iteration": 2.7702901363372803 + }, + { + "auxiliary_loss_clip": 0.01399442, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.27794886, + "balance_loss_mlp": 1.02305174, + "epoch": 0.533353374417556, + "flos": 22131336466080.0, + "grad_norm": 2.004395199581255, + "language_loss": 0.75868279, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.78306699, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.15930176, + "step": 8871, + "time_per_iteration": 2.7515339851379395 + }, + { + "auxiliary_loss_clip": 0.01388801, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.27027559, + "balance_loss_mlp": 1.02120626, + "epoch": 0.533413497670224, + "flos": 15015068299800.0, + "grad_norm": 1.7185901815269065, + "language_loss": 0.71943647, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.7436775, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14105225, + "step": 8872, + "time_per_iteration": 2.7455270290374756 + }, + { + "auxiliary_loss_clip": 0.01385726, + "auxiliary_loss_mlp": 0.0104341, + "balance_loss_clip": 1.26886773, + "balance_loss_mlp": 1.02881336, + "epoch": 0.533473620922892, + "flos": 19614749240520.0, + "grad_norm": 1.6498723512387936, + "language_loss": 0.65043175, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67472315, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.14599609, + "step": 8873, + "time_per_iteration": 2.706531524658203 + }, + { + "auxiliary_loss_clip": 0.01384785, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.26708949, + "balance_loss_mlp": 1.02052295, + "epoch": 0.5335337441755599, + "flos": 14286690903600.0, + "grad_norm": 3.521185524207756, + "language_loss": 0.81114751, + "learning_rate": 1.879923326631099e-06, + "loss": 0.83534527, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14477539, + "step": 8874, + "time_per_iteration": 2.7592568397521973 + }, + { + "auxiliary_loss_clip": 0.01384293, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.26723993, + "balance_loss_mlp": 1.01415217, + "epoch": 0.5335938674282279, + "flos": 20819891677680.0, + "grad_norm": 1.5932695613017736, + "language_loss": 0.69779313, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72192103, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14343262, + "step": 8875, + "time_per_iteration": 2.7156145572662354 + }, + { + "auxiliary_loss_clip": 0.01183216, + "auxiliary_loss_mlp": 0.01002063, + "balance_loss_clip": 1.13778555, + "balance_loss_mlp": 0.99934477, + "epoch": 0.5336539906808958, + "flos": 71414490413880.0, + "grad_norm": 0.7263372863882791, + "language_loss": 0.59708893, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61894172, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02722168, + "step": 8876, + "time_per_iteration": 3.3649237155914307 + }, + { + "auxiliary_loss_clip": 0.01385327, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.26725245, + "balance_loss_mlp": 1.01991808, + "epoch": 0.5337141139335638, + "flos": 20157306470640.0, + "grad_norm": 1.6305449024652814, + "language_loss": 0.7543664, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77855521, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13647461, + "step": 8877, + "time_per_iteration": 2.732123374938965 + }, + { + "auxiliary_loss_clip": 0.01181737, + "auxiliary_loss_mlp": 0.01001371, + "balance_loss_clip": 1.1369102, + "balance_loss_mlp": 0.9989627, + "epoch": 0.5337742371862317, + "flos": 67743406022040.0, + "grad_norm": 0.7566219544739243, + "language_loss": 0.5727042, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59453523, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02404785, + "step": 8878, + "time_per_iteration": 3.109389305114746 + }, + { + "auxiliary_loss_clip": 0.01393252, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.27182817, + "balance_loss_mlp": 1.01826572, + "epoch": 0.5338343604388998, + "flos": 25014284937000.0, + "grad_norm": 1.5123333601392008, + "language_loss": 0.72495079, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74921352, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14752197, + "step": 8879, + "time_per_iteration": 2.8071420192718506 + }, + { + "auxiliary_loss_clip": 0.0139403, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.27388763, + "balance_loss_mlp": 1.02038407, + "epoch": 0.5338944836915677, + "flos": 17605528778160.0, + "grad_norm": 3.4287353788128883, + "language_loss": 0.83690846, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.86119783, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14526367, + "step": 8880, + "time_per_iteration": 2.7016050815582275 + }, + { + "auxiliary_loss_clip": 0.01381551, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.26644015, + "balance_loss_mlp": 1.0190134, + "epoch": 0.5339546069442357, + "flos": 21728931978240.0, + "grad_norm": 1.4276901305970238, + "language_loss": 0.7947349, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.8188737, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13330078, + "step": 8881, + "time_per_iteration": 4.321067571640015 + }, + { + "auxiliary_loss_clip": 0.01182153, + "auxiliary_loss_mlp": 0.01004259, + "balance_loss_clip": 1.13647306, + "balance_loss_mlp": 1.00168371, + "epoch": 0.5340147301969036, + "flos": 69737357741040.0, + "grad_norm": 0.7835361846370842, + "language_loss": 0.59287453, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61473864, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02575684, + "step": 8882, + "time_per_iteration": 3.1585559844970703 + }, + { + "auxiliary_loss_clip": 0.01183028, + "auxiliary_loss_mlp": 0.01000611, + "balance_loss_clip": 1.13744664, + "balance_loss_mlp": 0.99801177, + "epoch": 0.5340748534495716, + "flos": 63893039410080.0, + "grad_norm": 0.8653147761338654, + "language_loss": 0.63735569, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65919209, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02600098, + "step": 8883, + "time_per_iteration": 2.9949939250946045 + }, + { + "auxiliary_loss_clip": 0.01391456, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.27030635, + "balance_loss_mlp": 1.02171981, + "epoch": 0.5341349767022396, + "flos": 28700215007400.0, + "grad_norm": 2.416086681636536, + "language_loss": 0.82332551, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84760296, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.14550781, + "step": 8884, + "time_per_iteration": 2.783198833465576 + }, + { + "auxiliary_loss_clip": 0.01373843, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.26042318, + "balance_loss_mlp": 1.02351451, + "epoch": 0.5341950999549075, + "flos": 16294043381400.0, + "grad_norm": 1.6834053878009945, + "language_loss": 0.72170001, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74580884, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13519287, + "step": 8885, + "time_per_iteration": 2.70430064201355 + }, + { + "auxiliary_loss_clip": 0.01390734, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.26815176, + "balance_loss_mlp": 1.01759148, + "epoch": 0.5342552232075756, + "flos": 14359711380840.0, + "grad_norm": 1.9852036858215965, + "language_loss": 0.79144925, + "learning_rate": 1.87525854926798e-06, + "loss": 0.81567979, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.14733887, + "step": 8886, + "time_per_iteration": 4.1838250160217285 + }, + { + "auxiliary_loss_clip": 0.01386786, + "auxiliary_loss_mlp": 0.01041164, + "balance_loss_clip": 1.26807332, + "balance_loss_mlp": 1.0255239, + "epoch": 0.5343153464602435, + "flos": 30303863529840.0, + "grad_norm": 1.6767723465681394, + "language_loss": 0.74797922, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.77225876, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.15637207, + "step": 8887, + "time_per_iteration": 2.885850191116333 + }, + { + "auxiliary_loss_clip": 0.01376189, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.25912237, + "balance_loss_mlp": 1.01956749, + "epoch": 0.5343754697129115, + "flos": 15600531585240.0, + "grad_norm": 2.3691435766812248, + "language_loss": 0.69780153, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.72189224, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.13305664, + "step": 8888, + "time_per_iteration": 2.7056260108947754 + }, + { + "auxiliary_loss_clip": 0.0139644, + "auxiliary_loss_mlp": 0.01046894, + "balance_loss_clip": 1.27089882, + "balance_loss_mlp": 1.03227305, + "epoch": 0.5344355929655794, + "flos": 16914006791640.0, + "grad_norm": 2.629346426229505, + "language_loss": 0.77707744, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.80151081, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.14630127, + "step": 8889, + "time_per_iteration": 2.7796151638031006 + }, + { + "auxiliary_loss_clip": 0.01375226, + "auxiliary_loss_mlp": 0.01044058, + "balance_loss_clip": 1.26006126, + "balance_loss_mlp": 1.02995014, + "epoch": 0.5344957162182474, + "flos": 16802303528520.0, + "grad_norm": 1.8324950251039458, + "language_loss": 0.68917441, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71336722, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14111328, + "step": 8890, + "time_per_iteration": 2.708392381668091 + }, + { + "auxiliary_loss_clip": 0.0138473, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.26384878, + "balance_loss_mlp": 1.02504587, + "epoch": 0.5345558394709153, + "flos": 12708365016600.0, + "grad_norm": 2.033160959883156, + "language_loss": 0.77159071, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79584098, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.15258789, + "step": 8891, + "time_per_iteration": 2.782470703125 + }, + { + "auxiliary_loss_clip": 0.01369745, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.25557709, + "balance_loss_mlp": 1.02220535, + "epoch": 0.5346159627235834, + "flos": 22460111351280.0, + "grad_norm": 1.4122841976449318, + "language_loss": 0.7431426, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76719296, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13085938, + "step": 8892, + "time_per_iteration": 4.277246952056885 + }, + { + "auxiliary_loss_clip": 0.0138211, + "auxiliary_loss_mlp": 0.01038651, + "balance_loss_clip": 1.26452208, + "balance_loss_mlp": 1.02504373, + "epoch": 0.5346760859762513, + "flos": 22420453964760.0, + "grad_norm": 1.633781706703354, + "language_loss": 0.87726384, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90147144, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13598633, + "step": 8893, + "time_per_iteration": 2.7435073852539062 + }, + { + "auxiliary_loss_clip": 0.01374329, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.25931287, + "balance_loss_mlp": 1.02266181, + "epoch": 0.5347362092289193, + "flos": 22820746817880.0, + "grad_norm": 1.5653432870579584, + "language_loss": 0.73316556, + "learning_rate": 1.872149074536869e-06, + "loss": 0.75726795, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13238525, + "step": 8894, + "time_per_iteration": 2.750840425491333 + }, + { + "auxiliary_loss_clip": 0.0137238, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.25721037, + "balance_loss_mlp": 1.02346134, + "epoch": 0.5347963324815872, + "flos": 23224410164880.0, + "grad_norm": 1.5290894590092015, + "language_loss": 0.74894339, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77304351, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.1416626, + "step": 8895, + "time_per_iteration": 2.791688919067383 + }, + { + "auxiliary_loss_clip": 0.01379852, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.26229119, + "balance_loss_mlp": 1.02100968, + "epoch": 0.5348564557342552, + "flos": 22606477172640.0, + "grad_norm": 1.5637200569018297, + "language_loss": 0.76858902, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79272902, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13134766, + "step": 8896, + "time_per_iteration": 2.7377610206604004 + }, + { + "auxiliary_loss_clip": 0.0137182, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.256742, + "balance_loss_mlp": 1.01730847, + "epoch": 0.5349165789869232, + "flos": 18006674406840.0, + "grad_norm": 1.5754558431842736, + "language_loss": 0.78738189, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.81141955, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14624023, + "step": 8897, + "time_per_iteration": 2.729597330093384 + }, + { + "auxiliary_loss_clip": 0.01374892, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.25677037, + "balance_loss_mlp": 1.01654077, + "epoch": 0.5349767022395912, + "flos": 17163507512160.0, + "grad_norm": 1.817936334887345, + "language_loss": 0.75898844, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.78304207, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13922119, + "step": 8898, + "time_per_iteration": 2.7095754146575928 + }, + { + "auxiliary_loss_clip": 0.01181832, + "auxiliary_loss_mlp": 0.01013074, + "balance_loss_clip": 1.13627982, + "balance_loss_mlp": 1.01088035, + "epoch": 0.5350368254922592, + "flos": 71009649424440.0, + "grad_norm": 0.8293836955026092, + "language_loss": 0.58061731, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60256642, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.02197266, + "step": 8899, + "time_per_iteration": 3.4575459957122803 + }, + { + "auxiliary_loss_clip": 0.01369374, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.25501347, + "balance_loss_mlp": 1.01799011, + "epoch": 0.5350969487449271, + "flos": 27423635819040.0, + "grad_norm": 1.4729675849002217, + "language_loss": 0.70000589, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72401381, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13439941, + "step": 8900, + "time_per_iteration": 2.78446102142334 + }, + { + "auxiliary_loss_clip": 0.01385661, + "auxiliary_loss_mlp": 0.01033087, + "balance_loss_clip": 1.26626348, + "balance_loss_mlp": 1.01856709, + "epoch": 0.5351570719975951, + "flos": 19320636913560.0, + "grad_norm": 1.5818036908785087, + "language_loss": 0.71649784, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.74068534, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14526367, + "step": 8901, + "time_per_iteration": 2.7574462890625 + }, + { + "auxiliary_loss_clip": 0.01385921, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.26739848, + "balance_loss_mlp": 1.02177787, + "epoch": 0.535217195250263, + "flos": 19833363980280.0, + "grad_norm": 1.7644833581915258, + "language_loss": 0.78137589, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.80559105, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13824463, + "step": 8902, + "time_per_iteration": 2.757338047027588 + }, + { + "auxiliary_loss_clip": 0.0136836, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.25617886, + "balance_loss_mlp": 1.02001905, + "epoch": 0.535277318502931, + "flos": 22133042017200.0, + "grad_norm": 1.375454678518712, + "language_loss": 0.705594, + "learning_rate": 1.868651286721281e-06, + "loss": 0.7296102, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13232422, + "step": 8903, + "time_per_iteration": 2.7600343227386475 + }, + { + "auxiliary_loss_clip": 0.01382639, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.26292479, + "balance_loss_mlp": 1.01878142, + "epoch": 0.5353374417555989, + "flos": 25051343388480.0, + "grad_norm": 1.4686053838653164, + "language_loss": 0.72498524, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74914342, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14398193, + "step": 8904, + "time_per_iteration": 2.814652442932129 + }, + { + "auxiliary_loss_clip": 0.01381386, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.26365995, + "balance_loss_mlp": 1.02312183, + "epoch": 0.535397565008267, + "flos": 19393170090480.0, + "grad_norm": 2.0895868601413183, + "language_loss": 0.73904997, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.76324236, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.1472168, + "step": 8905, + "time_per_iteration": 2.7360453605651855 + }, + { + "auxiliary_loss_clip": 0.01366068, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.25398874, + "balance_loss_mlp": 1.02154291, + "epoch": 0.5354576882609349, + "flos": 21476304414000.0, + "grad_norm": 1.4252287444330585, + "language_loss": 0.83578587, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85978824, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12628174, + "step": 8906, + "time_per_iteration": 2.7702724933624268 + }, + { + "auxiliary_loss_clip": 0.01382878, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.2644453, + "balance_loss_mlp": 1.02086544, + "epoch": 0.5355178115136029, + "flos": 20782467750960.0, + "grad_norm": 1.7166905494242106, + "language_loss": 0.741193, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76537955, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.14898682, + "step": 8907, + "time_per_iteration": 2.745253562927246 + }, + { + "auxiliary_loss_clip": 0.01377799, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.26139498, + "balance_loss_mlp": 1.0197103, + "epoch": 0.5355779347662708, + "flos": 23519334659040.0, + "grad_norm": 1.9230105539342444, + "language_loss": 0.7713722, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79548192, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13452148, + "step": 8908, + "time_per_iteration": 2.7684640884399414 + }, + { + "auxiliary_loss_clip": 0.01385424, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.26627505, + "balance_loss_mlp": 1.02758121, + "epoch": 0.5356380580189388, + "flos": 20308098603240.0, + "grad_norm": 1.806707143159616, + "language_loss": 0.74645579, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.77073538, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.14941406, + "step": 8909, + "time_per_iteration": 2.738691806793213 + }, + { + "auxiliary_loss_clip": 0.01379043, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.2631917, + "balance_loss_mlp": 1.02069855, + "epoch": 0.5356981812716068, + "flos": 21366712785600.0, + "grad_norm": 2.0012761057147177, + "language_loss": 0.84026289, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86439747, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13708496, + "step": 8910, + "time_per_iteration": 2.8015570640563965 + }, + { + "auxiliary_loss_clip": 0.01377986, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.26061153, + "balance_loss_mlp": 1.01945579, + "epoch": 0.5357583045242748, + "flos": 23116321045800.0, + "grad_norm": 1.454637837122256, + "language_loss": 0.82115346, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84527236, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.14453125, + "step": 8911, + "time_per_iteration": 2.962440252304077 + }, + { + "auxiliary_loss_clip": 0.01377167, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.26162243, + "balance_loss_mlp": 1.02451253, + "epoch": 0.5358184277769428, + "flos": 21146676753240.0, + "grad_norm": 1.8713730464408034, + "language_loss": 0.68980837, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71395594, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13079834, + "step": 8912, + "time_per_iteration": 2.772841691970825 + }, + { + "auxiliary_loss_clip": 0.01371073, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.25664747, + "balance_loss_mlp": 1.0227201, + "epoch": 0.5358785510296107, + "flos": 16285799884320.0, + "grad_norm": 1.776020954307172, + "language_loss": 0.71837217, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.7424472, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13708496, + "step": 8913, + "time_per_iteration": 2.744736909866333 + }, + { + "auxiliary_loss_clip": 0.01385829, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.26408827, + "balance_loss_mlp": 1.02357435, + "epoch": 0.5359386742822787, + "flos": 16980529931280.0, + "grad_norm": 1.6396825644694464, + "language_loss": 0.71911347, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74334836, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.14093018, + "step": 8914, + "time_per_iteration": 2.7525534629821777 + }, + { + "auxiliary_loss_clip": 0.01388367, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.26756024, + "balance_loss_mlp": 1.02501845, + "epoch": 0.5359987975349466, + "flos": 20817739434600.0, + "grad_norm": 1.629427476395291, + "language_loss": 0.70475709, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72904223, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.15136719, + "step": 8915, + "time_per_iteration": 2.9291341304779053 + }, + { + "auxiliary_loss_clip": 0.0136994, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.25478733, + "balance_loss_mlp": 1.02500606, + "epoch": 0.5360589207876146, + "flos": 22205006677080.0, + "grad_norm": 1.6605329297871425, + "language_loss": 0.75720423, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.78129852, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14477539, + "step": 8916, + "time_per_iteration": 2.7881226539611816 + }, + { + "auxiliary_loss_clip": 0.01386132, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.26734567, + "balance_loss_mlp": 1.02688694, + "epoch": 0.5361190440402825, + "flos": 31400632589400.0, + "grad_norm": 2.0324047342074056, + "language_loss": 0.7303142, + "learning_rate": 1.863211089308289e-06, + "loss": 0.75458229, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.13781738, + "step": 8917, + "time_per_iteration": 2.8528432846069336 + }, + { + "auxiliary_loss_clip": 0.01377678, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.26105964, + "balance_loss_mlp": 1.02825904, + "epoch": 0.5361791672929506, + "flos": 16074088565760.0, + "grad_norm": 1.7267898783131184, + "language_loss": 0.71709275, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.74129546, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14337158, + "step": 8918, + "time_per_iteration": 2.7240099906921387 + }, + { + "auxiliary_loss_clip": 0.01374262, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.25896049, + "balance_loss_mlp": 1.02899218, + "epoch": 0.5362392905456185, + "flos": 20745855991440.0, + "grad_norm": 1.537096192344501, + "language_loss": 0.752774, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77693915, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13262939, + "step": 8919, + "time_per_iteration": 4.154537916183472 + }, + { + "auxiliary_loss_clip": 0.01384728, + "auxiliary_loss_mlp": 0.01041475, + "balance_loss_clip": 1.26618648, + "balance_loss_mlp": 1.02787936, + "epoch": 0.5362994137982865, + "flos": 17343561291120.0, + "grad_norm": 2.1307558860524427, + "language_loss": 0.71825469, + "learning_rate": 1.862045463611864e-06, + "loss": 0.7425167, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13586426, + "step": 8920, + "time_per_iteration": 4.250119686126709 + }, + { + "auxiliary_loss_clip": 0.01376736, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.26002169, + "balance_loss_mlp": 1.02525759, + "epoch": 0.5363595370509544, + "flos": 42821657202240.0, + "grad_norm": 1.4230847479153912, + "language_loss": 0.69059944, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.71475315, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13366699, + "step": 8921, + "time_per_iteration": 2.9710569381713867 + }, + { + "auxiliary_loss_clip": 0.01376126, + "auxiliary_loss_mlp": 0.01039413, + "balance_loss_clip": 1.2602644, + "balance_loss_mlp": 1.02598381, + "epoch": 0.5364196603036224, + "flos": 19176666985440.0, + "grad_norm": 2.044772278297457, + "language_loss": 0.81752837, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84168375, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.13427734, + "step": 8922, + "time_per_iteration": 2.7321557998657227 + }, + { + "auxiliary_loss_clip": 0.0137703, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.2592876, + "balance_loss_mlp": 1.01892555, + "epoch": 0.5364797835562904, + "flos": 17935562522520.0, + "grad_norm": 2.0542414400636457, + "language_loss": 0.76719409, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79128551, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13195801, + "step": 8923, + "time_per_iteration": 2.745055675506592 + }, + { + "auxiliary_loss_clip": 0.01378512, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.25962687, + "balance_loss_mlp": 1.02758014, + "epoch": 0.5365399068089584, + "flos": 30234335371560.0, + "grad_norm": 1.7174340736521283, + "language_loss": 0.70945042, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.73364669, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.13525391, + "step": 8924, + "time_per_iteration": 2.8181121349334717 + }, + { + "auxiliary_loss_clip": 0.01385601, + "auxiliary_loss_mlp": 0.0104711, + "balance_loss_clip": 1.26566386, + "balance_loss_mlp": 1.03154755, + "epoch": 0.5366000300616264, + "flos": 24896043727920.0, + "grad_norm": 1.7538870061815168, + "language_loss": 0.87325078, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89757794, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.15545654, + "step": 8925, + "time_per_iteration": 4.232579231262207 + }, + { + "auxiliary_loss_clip": 0.01379206, + "auxiliary_loss_mlp": 0.01035667, + "balance_loss_clip": 1.25919604, + "balance_loss_mlp": 1.02229178, + "epoch": 0.5366601533142943, + "flos": 29832864876000.0, + "grad_norm": 1.497271100482957, + "language_loss": 0.78220171, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80635041, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.1338501, + "step": 8926, + "time_per_iteration": 2.8585872650146484 + }, + { + "auxiliary_loss_clip": 0.01369249, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.25634181, + "balance_loss_mlp": 1.02313733, + "epoch": 0.5367202765669623, + "flos": 27205183512720.0, + "grad_norm": 1.4196465920962997, + "language_loss": 0.66963333, + "learning_rate": 1.85932585410148e-06, + "loss": 0.6936841, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.12689209, + "step": 8927, + "time_per_iteration": 2.844478130340576 + }, + { + "auxiliary_loss_clip": 0.0138083, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.26132345, + "balance_loss_mlp": 1.02011657, + "epoch": 0.5367803998196302, + "flos": 20234793867480.0, + "grad_norm": 1.72926554600616, + "language_loss": 0.73753071, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.76167768, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.13763428, + "step": 8928, + "time_per_iteration": 2.7852282524108887 + }, + { + "auxiliary_loss_clip": 0.01376777, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.25913227, + "balance_loss_mlp": 1.01806474, + "epoch": 0.5368405230722982, + "flos": 32160220833240.0, + "grad_norm": 1.6752757744526003, + "language_loss": 0.62746537, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65154541, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.1317749, + "step": 8929, + "time_per_iteration": 2.8665225505828857 + }, + { + "auxiliary_loss_clip": 0.01375262, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.25684524, + "balance_loss_mlp": 1.02187085, + "epoch": 0.5369006463249661, + "flos": 26253277765200.0, + "grad_norm": 1.6124357978598374, + "language_loss": 0.66245711, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68656766, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13922119, + "step": 8930, + "time_per_iteration": 4.311391353607178 + }, + { + "auxiliary_loss_clip": 0.01367002, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.25275791, + "balance_loss_mlp": 1.02115607, + "epoch": 0.5369607695776342, + "flos": 26216625397320.0, + "grad_norm": 1.431282598432658, + "language_loss": 0.67396283, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69797981, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13531494, + "step": 8931, + "time_per_iteration": 2.804589033126831 + }, + { + "auxiliary_loss_clip": 0.01376634, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.26124477, + "balance_loss_mlp": 1.01950502, + "epoch": 0.5370208928303021, + "flos": 25014406762080.0, + "grad_norm": 1.6948038686162565, + "language_loss": 0.762573, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.78667951, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14526367, + "step": 8932, + "time_per_iteration": 2.8051066398620605 + }, + { + "auxiliary_loss_clip": 0.01374442, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.25917268, + "balance_loss_mlp": 1.01794863, + "epoch": 0.5370810160829701, + "flos": 31798001640600.0, + "grad_norm": 1.7065803637729426, + "language_loss": 0.66477942, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68884271, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.1394043, + "step": 8933, + "time_per_iteration": 2.803986072540283 + }, + { + "auxiliary_loss_clip": 0.01368704, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.25489998, + "balance_loss_mlp": 1.02357197, + "epoch": 0.537141139335638, + "flos": 23847987719160.0, + "grad_norm": 1.4902071969805553, + "language_loss": 0.83395243, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85801327, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13818359, + "step": 8934, + "time_per_iteration": 2.762026309967041 + }, + { + "auxiliary_loss_clip": 0.01363091, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.24924517, + "balance_loss_mlp": 1.0258615, + "epoch": 0.537201262588306, + "flos": 18512619877440.0, + "grad_norm": 1.7991868273832488, + "language_loss": 0.80022293, + "learning_rate": 1.856218049303999e-06, + "loss": 0.8242501, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13757324, + "step": 8935, + "time_per_iteration": 2.7217605113983154 + }, + { + "auxiliary_loss_clip": 0.01370985, + "auxiliary_loss_mlp": 0.01040866, + "balance_loss_clip": 1.25416374, + "balance_loss_mlp": 1.02691829, + "epoch": 0.537261385840974, + "flos": 25668058129920.0, + "grad_norm": 1.6472721390861498, + "language_loss": 0.84107423, + "learning_rate": 1.855829598084659e-06, + "loss": 0.86519271, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13946533, + "step": 8936, + "time_per_iteration": 2.8630764484405518 + }, + { + "auxiliary_loss_clip": 0.01369631, + "auxiliary_loss_mlp": 0.01033574, + "balance_loss_clip": 1.25504994, + "balance_loss_mlp": 1.020419, + "epoch": 0.537321509093642, + "flos": 40742543106360.0, + "grad_norm": 1.2820163403825273, + "language_loss": 0.7294153, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.75344729, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13165283, + "step": 8937, + "time_per_iteration": 2.8961291313171387 + }, + { + "auxiliary_loss_clip": 0.01373279, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.2543788, + "balance_loss_mlp": 1.01604366, + "epoch": 0.53738163234631, + "flos": 17242984718640.0, + "grad_norm": 2.0907029391372682, + "language_loss": 0.81828827, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.84232455, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.14306641, + "step": 8938, + "time_per_iteration": 2.709540367126465 + }, + { + "auxiliary_loss_clip": 0.01380845, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.25848055, + "balance_loss_mlp": 1.0182538, + "epoch": 0.5374417555989779, + "flos": 12825672233400.0, + "grad_norm": 2.317050348991334, + "language_loss": 0.80537069, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82949764, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.13598633, + "step": 8939, + "time_per_iteration": 2.7028887271881104 + }, + { + "auxiliary_loss_clip": 0.01181831, + "auxiliary_loss_mlp": 0.01007548, + "balance_loss_clip": 1.1357671, + "balance_loss_mlp": 1.00455618, + "epoch": 0.5375018788516459, + "flos": 67271229725760.0, + "grad_norm": 0.7110977500226623, + "language_loss": 0.5246563, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.5465501, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02990723, + "step": 8940, + "time_per_iteration": 3.270137071609497 + }, + { + "auxiliary_loss_clip": 0.0136632, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.2519933, + "balance_loss_mlp": 1.01459384, + "epoch": 0.5375620021043138, + "flos": 18118783753560.0, + "grad_norm": 1.8149409800400846, + "language_loss": 0.71867168, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.74261665, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1361084, + "step": 8941, + "time_per_iteration": 2.7481894493103027 + }, + { + "auxiliary_loss_clip": 0.01361035, + "auxiliary_loss_mlp": 0.01029044, + "balance_loss_clip": 1.24779725, + "balance_loss_mlp": 1.0160383, + "epoch": 0.5376221253569818, + "flos": 23154678964800.0, + "grad_norm": 1.6949084845006497, + "language_loss": 0.79651642, + "learning_rate": 1.853499006090237e-06, + "loss": 0.82041723, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13012695, + "step": 8942, + "time_per_iteration": 2.7517054080963135 + }, + { + "auxiliary_loss_clip": 0.01375155, + "auxiliary_loss_mlp": 0.01040226, + "balance_loss_clip": 1.25557756, + "balance_loss_mlp": 1.02635026, + "epoch": 0.5376822486096497, + "flos": 29978499746880.0, + "grad_norm": 3.392952383751041, + "language_loss": 0.70649624, + "learning_rate": 1.853110593448911e-06, + "loss": 0.73065001, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.13879395, + "step": 8943, + "time_per_iteration": 2.82507586479187 + }, + { + "auxiliary_loss_clip": 0.01181004, + "auxiliary_loss_mlp": 0.01003237, + "balance_loss_clip": 1.13632917, + "balance_loss_mlp": 1.00050688, + "epoch": 0.5377423718623178, + "flos": 54183931853400.0, + "grad_norm": 1.9489359431763056, + "language_loss": 0.59629834, + "learning_rate": 1.852722186377645e-06, + "loss": 0.6181407, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02734375, + "step": 8944, + "time_per_iteration": 3.205744743347168 + }, + { + "auxiliary_loss_clip": 0.01388147, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.2641468, + "balance_loss_mlp": 1.02130556, + "epoch": 0.5378024951149857, + "flos": 23262158958480.0, + "grad_norm": 2.17518253726404, + "language_loss": 0.77694166, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80118918, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.15289307, + "step": 8945, + "time_per_iteration": 2.7834157943725586 + }, + { + "auxiliary_loss_clip": 0.01366655, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.24852073, + "balance_loss_mlp": 1.02416778, + "epoch": 0.5378626183676537, + "flos": 24029381574000.0, + "grad_norm": 1.5886456543795717, + "language_loss": 0.68630236, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.71034133, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13067627, + "step": 8946, + "time_per_iteration": 2.769911289215088 + }, + { + "auxiliary_loss_clip": 0.01358005, + "auxiliary_loss_mlp": 0.01042571, + "balance_loss_clip": 1.24431419, + "balance_loss_mlp": 1.0286231, + "epoch": 0.5379227416203216, + "flos": 27167475327480.0, + "grad_norm": 1.6905713121905137, + "language_loss": 0.77004242, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79404819, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13934326, + "step": 8947, + "time_per_iteration": 2.816232442855835 + }, + { + "auxiliary_loss_clip": 0.01365679, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.25023675, + "balance_loss_mlp": 1.01693869, + "epoch": 0.5379828648729896, + "flos": 24687093777840.0, + "grad_norm": 3.434795271379195, + "language_loss": 0.60003638, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62399107, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.128479, + "step": 8948, + "time_per_iteration": 2.8450329303741455 + }, + { + "auxiliary_loss_clip": 0.0136791, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.25184238, + "balance_loss_mlp": 1.01907468, + "epoch": 0.5380429881256577, + "flos": 22527730916640.0, + "grad_norm": 1.5596993540986765, + "language_loss": 0.79913545, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.8231349, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.12957764, + "step": 8949, + "time_per_iteration": 2.745164394378662 + }, + { + "auxiliary_loss_clip": 0.0136415, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.25075829, + "balance_loss_mlp": 1.02359378, + "epoch": 0.5381031113783256, + "flos": 26985228697080.0, + "grad_norm": 1.635961792546934, + "language_loss": 0.78277522, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80678666, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.1340332, + "step": 8950, + "time_per_iteration": 2.805615186691284 + }, + { + "auxiliary_loss_clip": 0.0135504, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.24371743, + "balance_loss_mlp": 1.01775491, + "epoch": 0.5381632346309936, + "flos": 24759505129680.0, + "grad_norm": 1.5128457980196626, + "language_loss": 0.73371381, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.75756764, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12591553, + "step": 8951, + "time_per_iteration": 2.7939789295196533 + }, + { + "auxiliary_loss_clip": 0.0136864, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.25076175, + "balance_loss_mlp": 1.01569819, + "epoch": 0.5382233578836615, + "flos": 15564325909320.0, + "grad_norm": 1.7234927328796557, + "language_loss": 0.75431156, + "learning_rate": 1.849615132097085e-06, + "loss": 0.77829015, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13525391, + "step": 8952, + "time_per_iteration": 2.827515125274658 + }, + { + "auxiliary_loss_clip": 0.01365471, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.25061262, + "balance_loss_mlp": 1.01735234, + "epoch": 0.5382834811363295, + "flos": 25090229216160.0, + "grad_norm": 1.4622657710132216, + "language_loss": 0.79712355, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.82109648, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14465332, + "step": 8953, + "time_per_iteration": 2.838754177093506 + }, + { + "auxiliary_loss_clip": 0.01359776, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.24813032, + "balance_loss_mlp": 1.0158788, + "epoch": 0.5383436043889974, + "flos": 13301868757320.0, + "grad_norm": 1.7371837427056653, + "language_loss": 0.80758315, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.83148128, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14154053, + "step": 8954, + "time_per_iteration": 2.735511064529419 + }, + { + "auxiliary_loss_clip": 0.01367188, + "auxiliary_loss_mlp": 0.01030076, + "balance_loss_clip": 1.25217104, + "balance_loss_mlp": 1.01663518, + "epoch": 0.5384037276416654, + "flos": 23044600036080.0, + "grad_norm": 2.35016018985154, + "language_loss": 0.76871276, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.79268539, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13433838, + "step": 8955, + "time_per_iteration": 2.7856321334838867 + }, + { + "auxiliary_loss_clip": 0.01361315, + "auxiliary_loss_mlp": 0.01037085, + "balance_loss_clip": 1.24718928, + "balance_loss_mlp": 1.0235548, + "epoch": 0.5384638508943334, + "flos": 20636020712880.0, + "grad_norm": 1.5280023213153353, + "language_loss": 0.78543848, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80942249, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13543701, + "step": 8956, + "time_per_iteration": 2.878704309463501 + }, + { + "auxiliary_loss_clip": 0.01177197, + "auxiliary_loss_mlp": 0.01008947, + "balance_loss_clip": 1.13219571, + "balance_loss_mlp": 1.00637209, + "epoch": 0.5385239741470014, + "flos": 66751842888000.0, + "grad_norm": 0.868758559122881, + "language_loss": 0.63482487, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65668631, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02575684, + "step": 8957, + "time_per_iteration": 3.2104153633117676 + }, + { + "auxiliary_loss_clip": 0.01174372, + "auxiliary_loss_mlp": 0.01008632, + "balance_loss_clip": 1.12965596, + "balance_loss_mlp": 1.00645101, + "epoch": 0.5385840973996693, + "flos": 64732771575360.0, + "grad_norm": 0.7190542893658326, + "language_loss": 0.51614463, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53797472, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02185059, + "step": 8958, + "time_per_iteration": 4.7153167724609375 + }, + { + "auxiliary_loss_clip": 0.01379057, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.26067853, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5386442206523373, + "flos": 26147543931000.0, + "grad_norm": 3.257312260823676, + "language_loss": 0.77547836, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79958457, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.14013672, + "step": 8959, + "time_per_iteration": 4.218889951705933 + }, + { + "auxiliary_loss_clip": 0.01368419, + "auxiliary_loss_mlp": 0.01028368, + "balance_loss_clip": 1.25070274, + "balance_loss_mlp": 1.01490283, + "epoch": 0.5387043439050052, + "flos": 18255403568520.0, + "grad_norm": 2.0206315632327865, + "language_loss": 0.83739412, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.86136198, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13452148, + "step": 8960, + "time_per_iteration": 2.695993661880493 + }, + { + "auxiliary_loss_clip": 0.01364451, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.24861515, + "balance_loss_mlp": 1.01601672, + "epoch": 0.5387644671576732, + "flos": 29794263306840.0, + "grad_norm": 1.4172486589166986, + "language_loss": 0.78623813, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.81017226, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.12957764, + "step": 8961, + "time_per_iteration": 2.8634016513824463 + }, + { + "auxiliary_loss_clip": 0.01365515, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.24941492, + "balance_loss_mlp": 1.02340007, + "epoch": 0.5388245904103413, + "flos": 22378522510080.0, + "grad_norm": 1.6038163398345306, + "language_loss": 0.84454149, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86857009, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13952637, + "step": 8962, + "time_per_iteration": 2.7290713787078857 + }, + { + "auxiliary_loss_clip": 0.01178472, + "auxiliary_loss_mlp": 0.01007374, + "balance_loss_clip": 1.13357902, + "balance_loss_mlp": 1.00463247, + "epoch": 0.5388847136630092, + "flos": 69823048026600.0, + "grad_norm": 0.74664051278561, + "language_loss": 0.54192048, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56377894, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02746582, + "step": 8963, + "time_per_iteration": 3.2275938987731934 + }, + { + "auxiliary_loss_clip": 0.01178816, + "auxiliary_loss_mlp": 0.01007288, + "balance_loss_clip": 1.13365936, + "balance_loss_mlp": 1.00460553, + "epoch": 0.5389448369156772, + "flos": 69839453804040.0, + "grad_norm": 0.8027902726337227, + "language_loss": 0.63483059, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65669167, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02685547, + "step": 8964, + "time_per_iteration": 4.764710426330566 + }, + { + "auxiliary_loss_clip": 0.01376081, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.25452316, + "balance_loss_mlp": 1.02011204, + "epoch": 0.5390049601683451, + "flos": 31728717132480.0, + "grad_norm": 1.4737764845159704, + "language_loss": 0.70355129, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72764695, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.13360596, + "step": 8965, + "time_per_iteration": 2.827907085418701 + }, + { + "auxiliary_loss_clip": 0.0137157, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.25344336, + "balance_loss_mlp": 1.01800752, + "epoch": 0.5390650834210131, + "flos": 18118012194720.0, + "grad_norm": 2.2152063266348176, + "language_loss": 0.82700944, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.85104299, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13781738, + "step": 8966, + "time_per_iteration": 2.7574715614318848 + }, + { + "auxiliary_loss_clip": 0.01360686, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.24719262, + "balance_loss_mlp": 1.01583898, + "epoch": 0.539125206673681, + "flos": 17420683212720.0, + "grad_norm": 1.9030898228357422, + "language_loss": 0.72702301, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.75092578, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13775635, + "step": 8967, + "time_per_iteration": 2.7132959365844727 + }, + { + "auxiliary_loss_clip": 0.01365546, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.24997306, + "balance_loss_mlp": 1.01813364, + "epoch": 0.539185329926349, + "flos": 22203869643000.0, + "grad_norm": 1.654237816992011, + "language_loss": 0.81726575, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.84123242, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.12994385, + "step": 8968, + "time_per_iteration": 2.776726245880127 + }, + { + "auxiliary_loss_clip": 0.01366155, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.24893332, + "balance_loss_mlp": 1.01946294, + "epoch": 0.539245453179017, + "flos": 21439449004320.0, + "grad_norm": 1.4104783776641534, + "language_loss": 0.74092245, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76491725, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.1385498, + "step": 8969, + "time_per_iteration": 4.307426691055298 + }, + { + "auxiliary_loss_clip": 0.01377262, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.25769806, + "balance_loss_mlp": 1.01815379, + "epoch": 0.539305576431685, + "flos": 20739155612040.0, + "grad_norm": 3.8752656476293565, + "language_loss": 0.82553738, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84963351, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.14190674, + "step": 8970, + "time_per_iteration": 2.7291409969329834 + }, + { + "auxiliary_loss_clip": 0.01362598, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.24915886, + "balance_loss_mlp": 1.02096653, + "epoch": 0.5393656996843529, + "flos": 30926710133640.0, + "grad_norm": 1.3443324667971925, + "language_loss": 0.75724125, + "learning_rate": 1.842237354749146e-06, + "loss": 0.78120661, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.12982178, + "step": 8971, + "time_per_iteration": 2.8893449306488037 + }, + { + "auxiliary_loss_clip": 0.01181235, + "auxiliary_loss_mlp": 0.01008117, + "balance_loss_clip": 1.13653374, + "balance_loss_mlp": 1.00553024, + "epoch": 0.5394258229370209, + "flos": 50329626230520.0, + "grad_norm": 0.9054647341545635, + "language_loss": 0.60394526, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62583876, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02587891, + "step": 8972, + "time_per_iteration": 3.277463674545288 + }, + { + "auxiliary_loss_clip": 0.0136135, + "auxiliary_loss_mlp": 0.01040116, + "balance_loss_clip": 1.24624968, + "balance_loss_mlp": 1.02646661, + "epoch": 0.5394859461896888, + "flos": 25417582808760.0, + "grad_norm": 1.344256967222693, + "language_loss": 0.78726321, + "learning_rate": 1.841460870485045e-06, + "loss": 0.81127787, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 1.15087891, + "router_z_loss_mlp": 0.13641357, + "step": 8973, + "time_per_iteration": 2.8044114112854004 + }, + { + "auxiliary_loss_clip": 0.01370981, + "auxiliary_loss_mlp": 0.01040182, + "balance_loss_clip": 1.24966121, + "balance_loss_mlp": 1.02551866, + "epoch": 0.5395460694423568, + "flos": 25483293781200.0, + "grad_norm": 2.0044651572048062, + "language_loss": 0.74282986, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76694149, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14648438, + "step": 8974, + "time_per_iteration": 2.862541675567627 + }, + { + "auxiliary_loss_clip": 0.01178975, + "auxiliary_loss_mlp": 0.01009903, + "balance_loss_clip": 1.13522685, + "balance_loss_mlp": 1.00756633, + "epoch": 0.5396061926950249, + "flos": 53263399386960.0, + "grad_norm": 0.7849002180122127, + "language_loss": 0.51105833, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53294706, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02331543, + "step": 8975, + "time_per_iteration": 3.23006534576416 + }, + { + "auxiliary_loss_clip": 0.01366738, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.25269127, + "balance_loss_mlp": 1.02346516, + "epoch": 0.5396663159476928, + "flos": 26730854973360.0, + "grad_norm": 1.7877807828529808, + "language_loss": 0.72504562, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74907422, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12652588, + "step": 8976, + "time_per_iteration": 2.859297037124634 + }, + { + "auxiliary_loss_clip": 0.01362928, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.24861002, + "balance_loss_mlp": 1.02444267, + "epoch": 0.5397264392003608, + "flos": 23257976297400.0, + "grad_norm": 1.6868469550754888, + "language_loss": 0.70360446, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.72761273, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13452148, + "step": 8977, + "time_per_iteration": 2.772289514541626 + }, + { + "auxiliary_loss_clip": 0.01370723, + "auxiliary_loss_mlp": 0.0103745, + "balance_loss_clip": 1.25443387, + "balance_loss_mlp": 1.02366996, + "epoch": 0.5397865624530287, + "flos": 18298512665640.0, + "grad_norm": 1.6260445147872877, + "language_loss": 0.7280364, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.75211811, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13775635, + "step": 8978, + "time_per_iteration": 2.6996452808380127 + }, + { + "auxiliary_loss_clip": 0.01379679, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.25934172, + "balance_loss_mlp": 1.02099597, + "epoch": 0.5398466857056967, + "flos": 15300815304600.0, + "grad_norm": 1.803320500226706, + "language_loss": 0.7446751, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76882678, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.14489746, + "step": 8979, + "time_per_iteration": 2.7372055053710938 + }, + { + "auxiliary_loss_clip": 0.01374944, + "auxiliary_loss_mlp": 0.01043542, + "balance_loss_clip": 1.25488734, + "balance_loss_mlp": 1.02911758, + "epoch": 0.5399068089583646, + "flos": 17826255152640.0, + "grad_norm": 1.8611188647925345, + "language_loss": 0.7706055, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79479045, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.14422607, + "step": 8980, + "time_per_iteration": 2.7013356685638428 + }, + { + "auxiliary_loss_clip": 0.01363459, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.24749577, + "balance_loss_mlp": 1.0235498, + "epoch": 0.5399669322110326, + "flos": 27387470751480.0, + "grad_norm": 2.0083064223637055, + "language_loss": 0.82356083, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84756351, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.13262939, + "step": 8981, + "time_per_iteration": 2.830798864364624 + }, + { + "auxiliary_loss_clip": 0.0136683, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.24932933, + "balance_loss_mlp": 1.02149916, + "epoch": 0.5400270554637006, + "flos": 20454017732640.0, + "grad_norm": 1.7484918408967238, + "language_loss": 0.67200989, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69603109, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.13806152, + "step": 8982, + "time_per_iteration": 2.756523847579956 + }, + { + "auxiliary_loss_clip": 0.01365694, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.25145769, + "balance_loss_mlp": 1.01985264, + "epoch": 0.5400871787163686, + "flos": 21694634895240.0, + "grad_norm": 1.5802434623424844, + "language_loss": 0.83002234, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.85400093, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.12304688, + "step": 8983, + "time_per_iteration": 2.77193284034729 + }, + { + "auxiliary_loss_clip": 0.01359131, + "auxiliary_loss_mlp": 0.0104002, + "balance_loss_clip": 1.24561906, + "balance_loss_mlp": 1.02529144, + "epoch": 0.5401473019690365, + "flos": 19208974258800.0, + "grad_norm": 1.8473024445198016, + "language_loss": 0.71456897, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.73856044, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 1.13525391, + "router_z_loss_mlp": 0.1472168, + "step": 8984, + "time_per_iteration": 2.7065978050231934 + }, + { + "auxiliary_loss_clip": 0.0137235, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.25468826, + "balance_loss_mlp": 1.0203656, + "epoch": 0.5402074252217045, + "flos": 20631756835080.0, + "grad_norm": 1.570118694804103, + "language_loss": 0.80344492, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82751691, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.1449585, + "step": 8985, + "time_per_iteration": 2.894644260406494 + }, + { + "auxiliary_loss_clip": 0.01351354, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.24221516, + "balance_loss_mlp": 1.01775122, + "epoch": 0.5402675484743724, + "flos": 24978972645000.0, + "grad_norm": 1.3891441503800754, + "language_loss": 0.79371822, + "learning_rate": 1.83641431418363e-06, + "loss": 0.81754899, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13983154, + "step": 8986, + "time_per_iteration": 2.831561803817749 + }, + { + "auxiliary_loss_clip": 0.01361315, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.2470665, + "balance_loss_mlp": 1.01973772, + "epoch": 0.5403276717270404, + "flos": 19463144940720.0, + "grad_norm": 1.7162130543336425, + "language_loss": 0.7704615, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79440427, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13226318, + "step": 8987, + "time_per_iteration": 2.784956693649292 + }, + { + "auxiliary_loss_clip": 0.01364767, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.25005984, + "balance_loss_mlp": 1.01503611, + "epoch": 0.5403877949797083, + "flos": 18446746471560.0, + "grad_norm": 1.7230399327264916, + "language_loss": 0.71426833, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73820019, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13397217, + "step": 8988, + "time_per_iteration": 2.7881064414978027 + }, + { + "auxiliary_loss_clip": 0.01363311, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.24850893, + "balance_loss_mlp": 1.01906264, + "epoch": 0.5404479182323764, + "flos": 28298703903480.0, + "grad_norm": 2.3129974289098487, + "language_loss": 0.68697155, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.71094352, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14813232, + "step": 8989, + "time_per_iteration": 2.8110129833221436 + }, + { + "auxiliary_loss_clip": 0.01365041, + "auxiliary_loss_mlp": 0.01038003, + "balance_loss_clip": 1.24960113, + "balance_loss_mlp": 1.02370965, + "epoch": 0.5405080414850444, + "flos": 23372522145720.0, + "grad_norm": 1.4459804880862082, + "language_loss": 0.78231966, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.80635011, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14294434, + "step": 8990, + "time_per_iteration": 2.790360927581787 + }, + { + "auxiliary_loss_clip": 0.01355425, + "auxiliary_loss_mlp": 0.01027598, + "balance_loss_clip": 1.24212074, + "balance_loss_mlp": 1.01499808, + "epoch": 0.5405681647377123, + "flos": 21111486286320.0, + "grad_norm": 1.8392398633455282, + "language_loss": 0.6953336, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71916378, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.12585449, + "step": 8991, + "time_per_iteration": 2.7497901916503906 + }, + { + "auxiliary_loss_clip": 0.01360457, + "auxiliary_loss_mlp": 0.01030329, + "balance_loss_clip": 1.2455039, + "balance_loss_mlp": 1.01697743, + "epoch": 0.5406282879903803, + "flos": 20454098949360.0, + "grad_norm": 1.8942469078240156, + "language_loss": 0.76557857, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78948641, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13342285, + "step": 8992, + "time_per_iteration": 2.7588753700256348 + }, + { + "auxiliary_loss_clip": 0.01366787, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.25006962, + "balance_loss_mlp": 1.01768744, + "epoch": 0.5406884112430482, + "flos": 14213507992920.0, + "grad_norm": 5.32168872359415, + "language_loss": 0.76707578, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.79105407, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13366699, + "step": 8993, + "time_per_iteration": 2.7761294841766357 + }, + { + "auxiliary_loss_clip": 0.01363085, + "auxiliary_loss_mlp": 0.01025222, + "balance_loss_clip": 1.2508204, + "balance_loss_mlp": 1.01231766, + "epoch": 0.5407485344957162, + "flos": 23880498034320.0, + "grad_norm": 1.4480056622483324, + "language_loss": 0.70218527, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72606838, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12915039, + "step": 8994, + "time_per_iteration": 2.840291738510132 + }, + { + "auxiliary_loss_clip": 0.01367407, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.2515986, + "balance_loss_mlp": 1.01377726, + "epoch": 0.5408086577483842, + "flos": 23153663755800.0, + "grad_norm": 1.5988851686060903, + "language_loss": 0.75799727, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.78194845, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.1394043, + "step": 8995, + "time_per_iteration": 2.808349609375 + }, + { + "auxiliary_loss_clip": 0.0135419, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.24212706, + "balance_loss_mlp": 1.0193336, + "epoch": 0.5408687810010522, + "flos": 18775765006920.0, + "grad_norm": 1.7530412672106723, + "language_loss": 0.73701704, + "learning_rate": 1.832533059471282e-06, + "loss": 0.76087731, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.125, + "step": 8996, + "time_per_iteration": 2.7355501651763916 + }, + { + "auxiliary_loss_clip": 0.01354959, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.24356842, + "balance_loss_mlp": 1.02317846, + "epoch": 0.5409289042537201, + "flos": 13885585883280.0, + "grad_norm": 6.932264333114, + "language_loss": 0.73109412, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75500548, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13000488, + "step": 8997, + "time_per_iteration": 4.177504062652588 + }, + { + "auxiliary_loss_clip": 0.0136636, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.25270736, + "balance_loss_mlp": 1.01967084, + "epoch": 0.5409890275063881, + "flos": 14469465442680.0, + "grad_norm": 1.8982584455889329, + "language_loss": 0.72099233, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74498659, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13391113, + "step": 8998, + "time_per_iteration": 4.199387788772583 + }, + { + "auxiliary_loss_clip": 0.01362374, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.24892294, + "balance_loss_mlp": 1.02292299, + "epoch": 0.541049150759056, + "flos": 48985613537400.0, + "grad_norm": 1.5798270924472066, + "language_loss": 0.70624971, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.73023522, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13256836, + "step": 8999, + "time_per_iteration": 3.081439733505249 + }, + { + "auxiliary_loss_clip": 0.01358736, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.24720943, + "balance_loss_mlp": 1.01631737, + "epoch": 0.541109274011724, + "flos": 18151821977400.0, + "grad_norm": 2.174171429599714, + "language_loss": 0.80746257, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.83134604, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13299561, + "step": 9000, + "time_per_iteration": 2.8941359519958496 + }, + { + "auxiliary_loss_clip": 0.01357258, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.2458185, + "balance_loss_mlp": 1.015468, + "epoch": 0.541169397264392, + "flos": 20527525510200.0, + "grad_norm": 1.5122999035429365, + "language_loss": 0.72908419, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75294757, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13598633, + "step": 9001, + "time_per_iteration": 2.7985727787017822 + }, + { + "auxiliary_loss_clip": 0.0136722, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.24953175, + "balance_loss_mlp": 1.02026057, + "epoch": 0.54122952051706, + "flos": 20048405184360.0, + "grad_norm": 2.276773373172242, + "language_loss": 0.85290051, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87691784, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14263916, + "step": 9002, + "time_per_iteration": 2.8472702503204346 + }, + { + "auxiliary_loss_clip": 0.01357187, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.24611592, + "balance_loss_mlp": 1.01598203, + "epoch": 0.541289643769728, + "flos": 19066912923600.0, + "grad_norm": 1.678887176361747, + "language_loss": 0.78043145, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80429041, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12731934, + "step": 9003, + "time_per_iteration": 4.185221195220947 + }, + { + "auxiliary_loss_clip": 0.01355992, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.24375081, + "balance_loss_mlp": 1.01744449, + "epoch": 0.5413497670223959, + "flos": 22387375132560.0, + "grad_norm": 1.9339083548793554, + "language_loss": 0.69897717, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.72285068, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13916016, + "step": 9004, + "time_per_iteration": 2.8021833896636963 + }, + { + "auxiliary_loss_clip": 0.01184659, + "auxiliary_loss_mlp": 0.01002648, + "balance_loss_clip": 1.13985443, + "balance_loss_mlp": 1.00000119, + "epoch": 0.5414098902750639, + "flos": 70047672803640.0, + "grad_norm": 0.9878501980459892, + "language_loss": 0.59130633, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61317945, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02648926, + "step": 9005, + "time_per_iteration": 3.351001739501953 + }, + { + "auxiliary_loss_clip": 0.01363169, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.24656081, + "balance_loss_mlp": 1.02315092, + "epoch": 0.5414700135277318, + "flos": 21804023481840.0, + "grad_norm": 3.397866541916094, + "language_loss": 0.78722644, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.81121713, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.12744141, + "step": 9006, + "time_per_iteration": 2.7930374145507812 + }, + { + "auxiliary_loss_clip": 0.01350568, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.23906994, + "balance_loss_mlp": 1.02419186, + "epoch": 0.5415301367803999, + "flos": 16912098198720.0, + "grad_norm": 1.643948119960634, + "language_loss": 0.83502281, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85888761, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.11724854, + "step": 9007, + "time_per_iteration": 2.750727891921997 + }, + { + "auxiliary_loss_clip": 0.01361627, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.24760461, + "balance_loss_mlp": 1.01882124, + "epoch": 0.5415902600330678, + "flos": 25709908367880.0, + "grad_norm": 2.471042644894134, + "language_loss": 0.67122138, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69516367, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13787842, + "step": 9008, + "time_per_iteration": 4.3269572257995605 + }, + { + "auxiliary_loss_clip": 0.01364662, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.24643803, + "balance_loss_mlp": 1.01921129, + "epoch": 0.5416503832857358, + "flos": 19212832053000.0, + "grad_norm": 5.311619547426866, + "language_loss": 0.7458483, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76983333, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.14630127, + "step": 9009, + "time_per_iteration": 2.7565205097198486 + }, + { + "auxiliary_loss_clip": 0.0136238, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.24602365, + "balance_loss_mlp": 1.02632773, + "epoch": 0.5417105065384037, + "flos": 12717542505960.0, + "grad_norm": 2.090888934588533, + "language_loss": 0.88050634, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.90453112, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13763428, + "step": 9010, + "time_per_iteration": 2.7391412258148193 + }, + { + "auxiliary_loss_clip": 0.01363447, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.24934459, + "balance_loss_mlp": 1.02454674, + "epoch": 0.5417706297910717, + "flos": 30342018407040.0, + "grad_norm": 1.8540706619380454, + "language_loss": 0.65179539, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67580879, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13360596, + "step": 9011, + "time_per_iteration": 2.8601174354553223 + }, + { + "auxiliary_loss_clip": 0.01361251, + "auxiliary_loss_mlp": 0.01040001, + "balance_loss_clip": 1.24787045, + "balance_loss_mlp": 1.02684045, + "epoch": 0.5418307530437396, + "flos": 29026715824440.0, + "grad_norm": 3.315359712031737, + "language_loss": 0.79706216, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.82107466, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13153076, + "step": 9012, + "time_per_iteration": 2.8897502422332764 + }, + { + "auxiliary_loss_clip": 0.01366429, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.25200319, + "balance_loss_mlp": 1.01992047, + "epoch": 0.5418908762964076, + "flos": 16877882332440.0, + "grad_norm": 1.8545547537353815, + "language_loss": 0.74577194, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.7697677, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13226318, + "step": 9013, + "time_per_iteration": 2.791565179824829 + }, + { + "auxiliary_loss_clip": 0.0137003, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.25186813, + "balance_loss_mlp": 1.02367246, + "epoch": 0.5419509995490756, + "flos": 18953950801320.0, + "grad_norm": 1.7032352432599858, + "language_loss": 0.72032154, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74439359, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13500977, + "step": 9014, + "time_per_iteration": 2.7621941566467285 + }, + { + "auxiliary_loss_clip": 0.01362905, + "auxiliary_loss_mlp": 0.01039483, + "balance_loss_clip": 1.248945, + "balance_loss_mlp": 1.02583313, + "epoch": 0.5420111228017436, + "flos": 18082334427480.0, + "grad_norm": 1.6935560041285074, + "language_loss": 0.80745101, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.8314749, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13659668, + "step": 9015, + "time_per_iteration": 2.7094054222106934 + }, + { + "auxiliary_loss_clip": 0.01367415, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.25032353, + "balance_loss_mlp": 1.02429307, + "epoch": 0.5420712460544116, + "flos": 19066385014920.0, + "grad_norm": 2.056599868989066, + "language_loss": 0.81636184, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.84042037, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.14135742, + "step": 9016, + "time_per_iteration": 2.7617342472076416 + }, + { + "auxiliary_loss_clip": 0.01356369, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.24278891, + "balance_loss_mlp": 1.02019262, + "epoch": 0.5421313693070795, + "flos": 18191723014080.0, + "grad_norm": 1.7236918455509338, + "language_loss": 0.81633919, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.84022951, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.12463379, + "step": 9017, + "time_per_iteration": 2.7178118228912354 + }, + { + "auxiliary_loss_clip": 0.01355176, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.24464655, + "balance_loss_mlp": 1.0197804, + "epoch": 0.5421914925597475, + "flos": 13009868065080.0, + "grad_norm": 1.5657812432286957, + "language_loss": 0.77982485, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.80370998, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13580322, + "step": 9018, + "time_per_iteration": 2.7812423706054688 + }, + { + "auxiliary_loss_clip": 0.0136411, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.2472589, + "balance_loss_mlp": 1.0215795, + "epoch": 0.5422516158124154, + "flos": 46767199474800.0, + "grad_norm": 1.3477395706067383, + "language_loss": 0.6644119, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68840492, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.1361084, + "step": 9019, + "time_per_iteration": 2.971829891204834 + }, + { + "auxiliary_loss_clip": 0.01353129, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.24357438, + "balance_loss_mlp": 1.02037597, + "epoch": 0.5423117390650835, + "flos": 31764882200040.0, + "grad_norm": 1.779007200179038, + "language_loss": 0.69938737, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72324353, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12121582, + "step": 9020, + "time_per_iteration": 2.8226702213287354 + }, + { + "auxiliary_loss_clip": 0.01354515, + "auxiliary_loss_mlp": 0.01039219, + "balance_loss_clip": 1.24418974, + "balance_loss_mlp": 1.02594519, + "epoch": 0.5423718623177514, + "flos": 27208147923000.0, + "grad_norm": 1.4065616951444118, + "language_loss": 0.80413425, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82807159, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13287354, + "step": 9021, + "time_per_iteration": 2.791464328765869 + }, + { + "auxiliary_loss_clip": 0.0135878, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.24528384, + "balance_loss_mlp": 1.02341199, + "epoch": 0.5424319855704194, + "flos": 23551235848800.0, + "grad_norm": 1.648233818698015, + "language_loss": 0.79172373, + "learning_rate": 1.822444805916788e-06, + "loss": 0.81568861, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14306641, + "step": 9022, + "time_per_iteration": 2.8087844848632812 + }, + { + "auxiliary_loss_clip": 0.01362143, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.24855149, + "balance_loss_mlp": 1.02498627, + "epoch": 0.5424921088230873, + "flos": 26621385170040.0, + "grad_norm": 1.4932002241806512, + "language_loss": 0.82332015, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84732008, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.128479, + "step": 9023, + "time_per_iteration": 2.87241268157959 + }, + { + "auxiliary_loss_clip": 0.01355797, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.24281299, + "balance_loss_mlp": 1.01514792, + "epoch": 0.5425522320757553, + "flos": 23592111486120.0, + "grad_norm": 1.5386712441700319, + "language_loss": 0.71690756, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.74074686, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.12994385, + "step": 9024, + "time_per_iteration": 2.7642335891723633 + }, + { + "auxiliary_loss_clip": 0.01364028, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.24881494, + "balance_loss_mlp": 1.0170033, + "epoch": 0.5426123553284232, + "flos": 30598503765480.0, + "grad_norm": 1.6692495823709081, + "language_loss": 0.6562233, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.6801585, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.12493896, + "step": 9025, + "time_per_iteration": 2.8566713333129883 + }, + { + "auxiliary_loss_clip": 0.01365148, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.24894631, + "balance_loss_mlp": 1.01947892, + "epoch": 0.5426724785810912, + "flos": 12498968374560.0, + "grad_norm": 1.717924010890756, + "language_loss": 0.73500979, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.75898582, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.12988281, + "step": 9026, + "time_per_iteration": 2.7323739528656006 + }, + { + "auxiliary_loss_clip": 0.01362418, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.24635959, + "balance_loss_mlp": 1.02094269, + "epoch": 0.5427326018337592, + "flos": 26069812884000.0, + "grad_norm": 1.6676834543801753, + "language_loss": 0.79135609, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.81534481, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.15527344, + "step": 9027, + "time_per_iteration": 2.8883042335510254 + }, + { + "auxiliary_loss_clip": 0.0119861, + "auxiliary_loss_mlp": 0.01000775, + "balance_loss_clip": 1.15279698, + "balance_loss_mlp": 0.99831897, + "epoch": 0.5427927250864272, + "flos": 66000538749600.0, + "grad_norm": 0.7461000614158807, + "language_loss": 0.56652832, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.5885222, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02453613, + "step": 9028, + "time_per_iteration": 3.3454394340515137 + }, + { + "auxiliary_loss_clip": 0.01360191, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.24523664, + "balance_loss_mlp": 1.01400423, + "epoch": 0.5428528483390952, + "flos": 19980298318680.0, + "grad_norm": 1.7690929239637212, + "language_loss": 0.78171492, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80559784, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14105225, + "step": 9029, + "time_per_iteration": 2.764464855194092 + }, + { + "auxiliary_loss_clip": 0.0135328, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.24045396, + "balance_loss_mlp": 1.0158999, + "epoch": 0.5429129715917631, + "flos": 21836980488960.0, + "grad_norm": 1.3666909039801125, + "language_loss": 0.83361989, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85745049, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13903809, + "step": 9030, + "time_per_iteration": 2.8223328590393066 + }, + { + "auxiliary_loss_clip": 0.01350289, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.2395463, + "balance_loss_mlp": 1.01913083, + "epoch": 0.5429730948444311, + "flos": 27788535163440.0, + "grad_norm": 1.4992902402142503, + "language_loss": 0.74903011, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.7728548, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13037109, + "step": 9031, + "time_per_iteration": 2.8002662658691406 + }, + { + "auxiliary_loss_clip": 0.01346968, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.23843193, + "balance_loss_mlp": 1.01999319, + "epoch": 0.543033218097099, + "flos": 26766086048640.0, + "grad_norm": 1.8869726889986664, + "language_loss": 0.85470247, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87849128, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.11920166, + "step": 9032, + "time_per_iteration": 2.8014883995056152 + }, + { + "auxiliary_loss_clip": 0.01364213, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.2460382, + "balance_loss_mlp": 1.01872337, + "epoch": 0.5430933413497671, + "flos": 22680512858880.0, + "grad_norm": 1.6638098924510598, + "language_loss": 0.7376076, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.7615723, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13531494, + "step": 9033, + "time_per_iteration": 2.7612624168395996 + }, + { + "auxiliary_loss_clip": 0.01357042, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.24422002, + "balance_loss_mlp": 1.02326679, + "epoch": 0.543153464602435, + "flos": 24612814441440.0, + "grad_norm": 1.600622394284589, + "language_loss": 0.75602758, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77996755, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13690186, + "step": 9034, + "time_per_iteration": 2.8301150798797607 + }, + { + "auxiliary_loss_clip": 0.01355145, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.24271464, + "balance_loss_mlp": 1.02131438, + "epoch": 0.543213587855103, + "flos": 19030382380800.0, + "grad_norm": 1.5889096825367393, + "language_loss": 0.8452189, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86910927, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.12591553, + "step": 9035, + "time_per_iteration": 4.211485147476196 + }, + { + "auxiliary_loss_clip": 0.01192325, + "auxiliary_loss_mlp": 0.01001952, + "balance_loss_clip": 1.14745665, + "balance_loss_mlp": 0.99960369, + "epoch": 0.5432737111077709, + "flos": 65701350377640.0, + "grad_norm": 0.7524619058312276, + "language_loss": 0.55845523, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.58039796, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.0234375, + "step": 9036, + "time_per_iteration": 3.2144548892974854 + }, + { + "auxiliary_loss_clip": 0.01364028, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.24836254, + "balance_loss_mlp": 1.01665866, + "epoch": 0.5433338343604389, + "flos": 22096876949640.0, + "grad_norm": 1.476554224467969, + "language_loss": 0.75196207, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77590555, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13677979, + "step": 9037, + "time_per_iteration": 4.19917631149292 + }, + { + "auxiliary_loss_clip": 0.01355201, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.24225533, + "balance_loss_mlp": 1.02302933, + "epoch": 0.5433939576131068, + "flos": 34678635435000.0, + "grad_norm": 1.6142055793954808, + "language_loss": 0.66993809, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.69385087, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13043213, + "step": 9038, + "time_per_iteration": 2.911590814590454 + }, + { + "auxiliary_loss_clip": 0.01357398, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.24439859, + "balance_loss_mlp": 1.01531649, + "epoch": 0.5434540808657748, + "flos": 20308261036680.0, + "grad_norm": 2.76441637678111, + "language_loss": 0.78237402, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80622959, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.128479, + "step": 9039, + "time_per_iteration": 2.745497703552246 + }, + { + "auxiliary_loss_clip": 0.01360152, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.24682021, + "balance_loss_mlp": 1.02096474, + "epoch": 0.5435142041184428, + "flos": 23118067205280.0, + "grad_norm": 1.9772276535687758, + "language_loss": 0.76906312, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.79300785, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13354492, + "step": 9040, + "time_per_iteration": 4.260162591934204 + }, + { + "auxiliary_loss_clip": 0.0119527, + "auxiliary_loss_mlp": 0.01002542, + "balance_loss_clip": 1.14995193, + "balance_loss_mlp": 1.00055122, + "epoch": 0.5435743273711108, + "flos": 64028051871840.0, + "grad_norm": 0.654826247181577, + "language_loss": 0.52439582, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54637396, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.01989746, + "step": 9041, + "time_per_iteration": 3.304694652557373 + }, + { + "auxiliary_loss_clip": 0.01362581, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.24701476, + "balance_loss_mlp": 1.0240891, + "epoch": 0.5436344506237788, + "flos": 25124363865720.0, + "grad_norm": 1.5636825897199813, + "language_loss": 0.76373756, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78773689, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.13256836, + "step": 9042, + "time_per_iteration": 2.770780086517334 + }, + { + "auxiliary_loss_clip": 0.013564, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.24394107, + "balance_loss_mlp": 1.0189991, + "epoch": 0.5436945738764467, + "flos": 19577690789040.0, + "grad_norm": 1.461133977742022, + "language_loss": 0.67374301, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69762045, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12359619, + "step": 9043, + "time_per_iteration": 2.8008182048797607 + }, + { + "auxiliary_loss_clip": 0.01354904, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.24336481, + "balance_loss_mlp": 1.01676464, + "epoch": 0.5437546971291147, + "flos": 21147529528800.0, + "grad_norm": 1.7983231455005542, + "language_loss": 0.84760302, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.87145054, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13085938, + "step": 9044, + "time_per_iteration": 2.873029947280884 + }, + { + "auxiliary_loss_clip": 0.01369541, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.24902105, + "balance_loss_mlp": 1.01620102, + "epoch": 0.5438148203817826, + "flos": 25124120215560.0, + "grad_norm": 1.481422738683504, + "language_loss": 0.62336242, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64735973, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14001465, + "step": 9045, + "time_per_iteration": 2.859013319015503 + }, + { + "auxiliary_loss_clip": 0.01365554, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.25070202, + "balance_loss_mlp": 1.02163196, + "epoch": 0.5438749436344507, + "flos": 23008150710000.0, + "grad_norm": 1.5639887982453082, + "language_loss": 0.7041955, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72819531, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.12817383, + "step": 9046, + "time_per_iteration": 4.356630325317383 + }, + { + "auxiliary_loss_clip": 0.01358736, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.24624014, + "balance_loss_mlp": 1.01566732, + "epoch": 0.5439350668871186, + "flos": 15491264823720.0, + "grad_norm": 1.5417777318398997, + "language_loss": 0.77441597, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79829168, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.1317749, + "step": 9047, + "time_per_iteration": 2.7923460006713867 + }, + { + "auxiliary_loss_clip": 0.01369985, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.25466788, + "balance_loss_mlp": 1.0257895, + "epoch": 0.5439951901397866, + "flos": 17242903501920.0, + "grad_norm": 2.085730451029279, + "language_loss": 0.73066962, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.75476193, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13470459, + "step": 9048, + "time_per_iteration": 2.7549517154693604 + }, + { + "auxiliary_loss_clip": 0.01361359, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.24826157, + "balance_loss_mlp": 1.02191091, + "epoch": 0.5440553133924545, + "flos": 18665645469840.0, + "grad_norm": 1.9674442210655552, + "language_loss": 0.93586612, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95984435, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14550781, + "step": 9049, + "time_per_iteration": 2.742374897003174 + }, + { + "auxiliary_loss_clip": 0.01359178, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.24519134, + "balance_loss_mlp": 1.01721597, + "epoch": 0.5441154366451225, + "flos": 27128305241280.0, + "grad_norm": 1.6415941338421152, + "language_loss": 0.74304461, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76693618, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.12780762, + "step": 9050, + "time_per_iteration": 2.7787437438964844 + }, + { + "auxiliary_loss_clip": 0.0136954, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.2532475, + "balance_loss_mlp": 1.01717734, + "epoch": 0.5441755598977904, + "flos": 25999350733440.0, + "grad_norm": 1.7840536329950005, + "language_loss": 0.67035627, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69436395, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14056396, + "step": 9051, + "time_per_iteration": 2.8139092922210693 + }, + { + "auxiliary_loss_clip": 0.01362123, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.2480669, + "balance_loss_mlp": 1.01661503, + "epoch": 0.5442356831504584, + "flos": 32386510553040.0, + "grad_norm": 1.5404089789530702, + "language_loss": 0.67472506, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69864786, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13543701, + "step": 9052, + "time_per_iteration": 2.8516860008239746 + }, + { + "auxiliary_loss_clip": 0.01365788, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.25017929, + "balance_loss_mlp": 1.01995182, + "epoch": 0.5442958064031264, + "flos": 24168843974160.0, + "grad_norm": 1.7842885767193912, + "language_loss": 0.93428111, + "learning_rate": 1.810422473773436e-06, + "loss": 0.95827734, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.13891602, + "step": 9053, + "time_per_iteration": 2.8623838424682617 + }, + { + "auxiliary_loss_clip": 0.01368332, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.2508502, + "balance_loss_mlp": 1.02159166, + "epoch": 0.5443559296557944, + "flos": 18768577327200.0, + "grad_norm": 1.9275241515757748, + "language_loss": 0.83857715, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.86261886, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.14257812, + "step": 9054, + "time_per_iteration": 2.8149616718292236 + }, + { + "auxiliary_loss_clip": 0.01365276, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.24958181, + "balance_loss_mlp": 1.01943958, + "epoch": 0.5444160529084624, + "flos": 22636997678160.0, + "grad_norm": 2.0787269471543626, + "language_loss": 0.68942738, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.71341503, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.14056396, + "step": 9055, + "time_per_iteration": 2.77894926071167 + }, + { + "auxiliary_loss_clip": 0.01189977, + "auxiliary_loss_mlp": 0.01001713, + "balance_loss_clip": 1.14619708, + "balance_loss_mlp": 0.99859011, + "epoch": 0.5444761761611303, + "flos": 69688458629640.0, + "grad_norm": 1.044762209429197, + "language_loss": 0.57742923, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59934616, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.03112793, + "step": 9056, + "time_per_iteration": 3.2651357650756836 + }, + { + "auxiliary_loss_clip": 0.01367536, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.25039899, + "balance_loss_mlp": 1.01599455, + "epoch": 0.5445362994137983, + "flos": 14281939725480.0, + "grad_norm": 1.7443035489527183, + "language_loss": 0.69896495, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.72293675, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13635254, + "step": 9057, + "time_per_iteration": 2.733365297317505 + }, + { + "auxiliary_loss_clip": 0.01359134, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.24517119, + "balance_loss_mlp": 1.02053714, + "epoch": 0.5445964226664662, + "flos": 28992134482920.0, + "grad_norm": 1.8995483811620992, + "language_loss": 0.75337428, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77730662, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13549805, + "step": 9058, + "time_per_iteration": 2.8410818576812744 + }, + { + "auxiliary_loss_clip": 0.01187604, + "auxiliary_loss_mlp": 0.01000354, + "balance_loss_clip": 1.14397335, + "balance_loss_mlp": 0.99814814, + "epoch": 0.5446565459191343, + "flos": 68637478818960.0, + "grad_norm": 0.7899071801219657, + "language_loss": 0.62761939, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64949894, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02209473, + "step": 9059, + "time_per_iteration": 3.3282997608184814 + }, + { + "auxiliary_loss_clip": 0.01361642, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.24807715, + "balance_loss_mlp": 1.01953506, + "epoch": 0.5447166691718022, + "flos": 16220779254000.0, + "grad_norm": 1.6547463289073063, + "language_loss": 0.79521668, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81916296, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13446045, + "step": 9060, + "time_per_iteration": 2.744884729385376 + }, + { + "auxiliary_loss_clip": 0.01365732, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.2499969, + "balance_loss_mlp": 1.01906586, + "epoch": 0.5447767924244702, + "flos": 25854649854840.0, + "grad_norm": 1.6193089356722903, + "language_loss": 0.79689807, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.82088721, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.14123535, + "step": 9061, + "time_per_iteration": 2.7869021892547607 + }, + { + "auxiliary_loss_clip": 0.01360117, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.24626946, + "balance_loss_mlp": 1.01957989, + "epoch": 0.5448369156771381, + "flos": 19681840897200.0, + "grad_norm": 1.5560803686333298, + "language_loss": 0.87376314, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89769179, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13189697, + "step": 9062, + "time_per_iteration": 2.7249767780303955 + }, + { + "auxiliary_loss_clip": 0.01376087, + "auxiliary_loss_mlp": 0.01036917, + "balance_loss_clip": 1.25689495, + "balance_loss_mlp": 1.02234983, + "epoch": 0.5448970389298061, + "flos": 19286583480720.0, + "grad_norm": 1.901720209248488, + "language_loss": 0.82853603, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.85266602, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14556885, + "step": 9063, + "time_per_iteration": 2.7851614952087402 + }, + { + "auxiliary_loss_clip": 0.01369538, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.25268006, + "balance_loss_mlp": 1.02040493, + "epoch": 0.544957162182474, + "flos": 20996047054080.0, + "grad_norm": 1.712011264977186, + "language_loss": 0.6344853, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65852898, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14428711, + "step": 9064, + "time_per_iteration": 2.772914409637451 + }, + { + "auxiliary_loss_clip": 0.01371026, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.25223553, + "balance_loss_mlp": 1.02064896, + "epoch": 0.545017285435142, + "flos": 25380118273680.0, + "grad_norm": 2.0452147495889785, + "language_loss": 0.79914474, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82320857, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.1472168, + "step": 9065, + "time_per_iteration": 2.823779582977295 + }, + { + "auxiliary_loss_clip": 0.01357649, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.24534917, + "balance_loss_mlp": 1.02194238, + "epoch": 0.54507740868781, + "flos": 19139364883800.0, + "grad_norm": 2.1009417017725425, + "language_loss": 0.78467238, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80858678, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 1.12353516, + "router_z_loss_mlp": 0.11859131, + "step": 9066, + "time_per_iteration": 2.7427268028259277 + }, + { + "auxiliary_loss_clip": 0.01375763, + "auxiliary_loss_mlp": 0.01037637, + "balance_loss_clip": 1.25603032, + "balance_loss_mlp": 1.02337337, + "epoch": 0.545137531940478, + "flos": 26255389399920.0, + "grad_norm": 1.6050790957114025, + "language_loss": 0.75899041, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.78312445, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.14276123, + "step": 9067, + "time_per_iteration": 2.8321950435638428 + }, + { + "auxiliary_loss_clip": 0.01376734, + "auxiliary_loss_mlp": 0.0104357, + "balance_loss_clip": 1.25627255, + "balance_loss_mlp": 1.02763808, + "epoch": 0.545197655193146, + "flos": 37562071206240.0, + "grad_norm": 2.2375028933644816, + "language_loss": 0.63493693, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65913999, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.15942383, + "step": 9068, + "time_per_iteration": 2.911367416381836 + }, + { + "auxiliary_loss_clip": 0.01365199, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_clip": 1.25168657, + "balance_loss_mlp": 1.03030443, + "epoch": 0.5452577784458139, + "flos": 26036855876880.0, + "grad_norm": 1.5251098795658307, + "language_loss": 0.72388983, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74797666, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 1.13427734, + "router_z_loss_mlp": 0.13171387, + "step": 9069, + "time_per_iteration": 2.844419240951538 + }, + { + "auxiliary_loss_clip": 0.01366429, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.25394177, + "balance_loss_mlp": 1.01835942, + "epoch": 0.5453179016984819, + "flos": 17643480613560.0, + "grad_norm": 1.7693919011676704, + "language_loss": 0.73905337, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76301885, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.11761475, + "step": 9070, + "time_per_iteration": 2.8459484577178955 + }, + { + "auxiliary_loss_clip": 0.01370985, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.25596762, + "balance_loss_mlp": 1.01964378, + "epoch": 0.5453780249511498, + "flos": 23221242712800.0, + "grad_norm": 2.157801484341845, + "language_loss": 0.60576117, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.6298036, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13616943, + "step": 9071, + "time_per_iteration": 2.8253014087677 + }, + { + "auxiliary_loss_clip": 0.01188506, + "auxiliary_loss_mlp": 0.01009863, + "balance_loss_clip": 1.14571369, + "balance_loss_mlp": 1.00772953, + "epoch": 0.5454381482038179, + "flos": 68714194656960.0, + "grad_norm": 0.6992408443140582, + "language_loss": 0.57102609, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59300977, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.0213623, + "step": 9072, + "time_per_iteration": 3.4342570304870605 + }, + { + "auxiliary_loss_clip": 0.01359662, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.24661422, + "balance_loss_mlp": 1.02002072, + "epoch": 0.5454982714564858, + "flos": 13264485438960.0, + "grad_norm": 1.6442462122880592, + "language_loss": 0.70420057, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72813261, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13494873, + "step": 9073, + "time_per_iteration": 2.7837109565734863 + }, + { + "auxiliary_loss_clip": 0.01357418, + "auxiliary_loss_mlp": 0.01036785, + "balance_loss_clip": 1.24559617, + "balance_loss_mlp": 1.02413702, + "epoch": 0.5455583947091538, + "flos": 21841000716600.0, + "grad_norm": 1.6939862781988149, + "language_loss": 0.71269286, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73663485, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 1.11865234, + "router_z_loss_mlp": 0.12652588, + "step": 9074, + "time_per_iteration": 4.2831432819366455 + }, + { + "auxiliary_loss_clip": 0.01363029, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.24835062, + "balance_loss_mlp": 1.02250874, + "epoch": 0.5456185179618217, + "flos": 17821382149440.0, + "grad_norm": 1.9554955102196394, + "language_loss": 0.6840781, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70807326, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13970947, + "step": 9075, + "time_per_iteration": 2.792513608932495 + }, + { + "auxiliary_loss_clip": 0.01364094, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.25163233, + "balance_loss_mlp": 1.02243853, + "epoch": 0.5456786412144897, + "flos": 21074387226480.0, + "grad_norm": 1.5693690623734324, + "language_loss": 0.8111887, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83518726, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13330078, + "step": 9076, + "time_per_iteration": 4.234564542770386 + }, + { + "auxiliary_loss_clip": 0.01370758, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.25488567, + "balance_loss_mlp": 1.01889372, + "epoch": 0.5457387644671576, + "flos": 23300232618960.0, + "grad_norm": 1.6089710510163824, + "language_loss": 0.80502003, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82904774, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13110352, + "step": 9077, + "time_per_iteration": 2.763704538345337 + }, + { + "auxiliary_loss_clip": 0.01366673, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.24985504, + "balance_loss_mlp": 1.02094936, + "epoch": 0.5457988877198257, + "flos": 21622142326680.0, + "grad_norm": 2.0840764257774422, + "language_loss": 0.68078613, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.70480013, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13781738, + "step": 9078, + "time_per_iteration": 2.8182756900787354 + }, + { + "auxiliary_loss_clip": 0.01376615, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.25972867, + "balance_loss_mlp": 1.0223788, + "epoch": 0.5458590109724936, + "flos": 23766926786640.0, + "grad_norm": 1.748087788632649, + "language_loss": 0.81371683, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83784151, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13464355, + "step": 9079, + "time_per_iteration": 4.275875568389893 + }, + { + "auxiliary_loss_clip": 0.01381112, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.26003289, + "balance_loss_mlp": 1.01975441, + "epoch": 0.5459191342251616, + "flos": 24429471385320.0, + "grad_norm": 1.478083550640416, + "language_loss": 0.76052928, + "learning_rate": 1.799957023759277e-06, + "loss": 0.78468835, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.15039062, + "step": 9080, + "time_per_iteration": 2.7770583629608154 + }, + { + "auxiliary_loss_clip": 0.01368556, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.25179124, + "balance_loss_mlp": 1.01562333, + "epoch": 0.5459792574778296, + "flos": 23628276553680.0, + "grad_norm": 3.7755239072158133, + "language_loss": 0.84030449, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.86428583, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13952637, + "step": 9081, + "time_per_iteration": 2.847818613052368 + }, + { + "auxiliary_loss_clip": 0.01380477, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.26191008, + "balance_loss_mlp": 1.01651216, + "epoch": 0.5460393807304975, + "flos": 19140258267720.0, + "grad_norm": 1.8125343229269333, + "language_loss": 0.70538163, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.7294991, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14752197, + "step": 9082, + "time_per_iteration": 2.7596516609191895 + }, + { + "auxiliary_loss_clip": 0.01367564, + "auxiliary_loss_mlp": 0.01029224, + "balance_loss_clip": 1.25432634, + "balance_loss_mlp": 1.01582491, + "epoch": 0.5460995039831655, + "flos": 35925384459960.0, + "grad_norm": 1.7808805827015932, + "language_loss": 0.66944754, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.69341546, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.1340332, + "step": 9083, + "time_per_iteration": 2.932899236679077 + }, + { + "auxiliary_loss_clip": 0.01361928, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.2497139, + "balance_loss_mlp": 1.01530552, + "epoch": 0.5461596272358334, + "flos": 26764705364400.0, + "grad_norm": 1.511633102909762, + "language_loss": 0.79144675, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81535184, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.1328125, + "step": 9084, + "time_per_iteration": 4.3592822551727295 + }, + { + "auxiliary_loss_clip": 0.01373509, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.25708377, + "balance_loss_mlp": 1.01759958, + "epoch": 0.5462197504885015, + "flos": 20891490862320.0, + "grad_norm": 1.7787578399739101, + "language_loss": 0.75499308, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77903503, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.1307373, + "step": 9085, + "time_per_iteration": 2.778146266937256 + }, + { + "auxiliary_loss_clip": 0.01374892, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.25625896, + "balance_loss_mlp": 1.01400733, + "epoch": 0.5462798737411694, + "flos": 25809591556440.0, + "grad_norm": 2.243447128845104, + "language_loss": 0.74751961, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.7715494, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14080811, + "step": 9086, + "time_per_iteration": 2.8023300170898438 + }, + { + "auxiliary_loss_clip": 0.01368534, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.25301218, + "balance_loss_mlp": 1.01599514, + "epoch": 0.5463399969938374, + "flos": 25780492343520.0, + "grad_norm": 1.5031476546029583, + "language_loss": 0.77350187, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79748154, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13439941, + "step": 9087, + "time_per_iteration": 2.8165252208709717 + }, + { + "auxiliary_loss_clip": 0.0136492, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.24829841, + "balance_loss_mlp": 1.02507162, + "epoch": 0.5464001202465053, + "flos": 18847932708600.0, + "grad_norm": 2.150643569365418, + "language_loss": 0.77984309, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.80388594, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.14294434, + "step": 9088, + "time_per_iteration": 2.759596109390259 + }, + { + "auxiliary_loss_clip": 0.01188426, + "auxiliary_loss_mlp": 0.01012814, + "balance_loss_clip": 1.14499283, + "balance_loss_mlp": 1.01040637, + "epoch": 0.5464602434991733, + "flos": 69065733850920.0, + "grad_norm": 0.7361418909247669, + "language_loss": 0.57822406, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.60023642, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02404785, + "step": 9089, + "time_per_iteration": 3.3719494342803955 + }, + { + "auxiliary_loss_clip": 0.01375873, + "auxiliary_loss_mlp": 0.01033157, + "balance_loss_clip": 1.25808001, + "balance_loss_mlp": 1.0191021, + "epoch": 0.5465203667518412, + "flos": 27565372287360.0, + "grad_norm": 1.6658972195671078, + "language_loss": 0.77077115, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.79486144, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.14050293, + "step": 9090, + "time_per_iteration": 2.802415370941162 + }, + { + "auxiliary_loss_clip": 0.01376007, + "auxiliary_loss_mlp": 0.01036121, + "balance_loss_clip": 1.2564851, + "balance_loss_mlp": 1.02131534, + "epoch": 0.5465804900045093, + "flos": 21214539968760.0, + "grad_norm": 1.7114100158119492, + "language_loss": 0.74096596, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.76508725, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.14807129, + "step": 9091, + "time_per_iteration": 2.809856414794922 + }, + { + "auxiliary_loss_clip": 0.01374402, + "auxiliary_loss_mlp": 0.01039413, + "balance_loss_clip": 1.25797188, + "balance_loss_mlp": 1.02592409, + "epoch": 0.5466406132571772, + "flos": 22493677483800.0, + "grad_norm": 1.6801214736991668, + "language_loss": 0.78127003, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80540812, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.1350708, + "step": 9092, + "time_per_iteration": 2.7847068309783936 + }, + { + "auxiliary_loss_clip": 0.01376275, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.25877714, + "balance_loss_mlp": 1.02028632, + "epoch": 0.5467007365098452, + "flos": 17680336023240.0, + "grad_norm": 1.968553937373159, + "language_loss": 0.75424355, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77834994, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14074707, + "step": 9093, + "time_per_iteration": 2.7873964309692383 + }, + { + "auxiliary_loss_clip": 0.01371256, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.25455654, + "balance_loss_mlp": 1.02385139, + "epoch": 0.5467608597625132, + "flos": 15702204583440.0, + "grad_norm": 1.8114476726767024, + "language_loss": 0.693165, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.7172662, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.15026855, + "step": 9094, + "time_per_iteration": 2.8415980339050293 + }, + { + "auxiliary_loss_clip": 0.01366076, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.25156844, + "balance_loss_mlp": 1.0182209, + "epoch": 0.5468209830151811, + "flos": 24317727513840.0, + "grad_norm": 3.582508602883114, + "language_loss": 0.68026185, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70423472, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 1.14404297, + "router_z_loss_mlp": 0.12963867, + "step": 9095, + "time_per_iteration": 2.735628843307495 + }, + { + "auxiliary_loss_clip": 0.01374489, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.25949693, + "balance_loss_mlp": 1.01960111, + "epoch": 0.5468811062678491, + "flos": 29171985220080.0, + "grad_norm": 1.5384787904840023, + "language_loss": 0.67103636, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.69511002, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.1328125, + "step": 9096, + "time_per_iteration": 2.803647756576538 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01007225, + "balance_loss_clip": 1.14666128, + "balance_loss_mlp": 1.00485229, + "epoch": 0.546941229520517, + "flos": 67881122262720.0, + "grad_norm": 0.7378774393157398, + "language_loss": 0.5761407, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59811437, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02368164, + "step": 9097, + "time_per_iteration": 3.3462576866149902 + }, + { + "auxiliary_loss_clip": 0.0119151, + "auxiliary_loss_mlp": 0.0100791, + "balance_loss_clip": 1.14814746, + "balance_loss_mlp": 1.00587106, + "epoch": 0.5470013527731851, + "flos": 58283010645840.0, + "grad_norm": 0.9547815210931718, + "language_loss": 0.6487394, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.67073357, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02038574, + "step": 9098, + "time_per_iteration": 3.1662585735321045 + }, + { + "auxiliary_loss_clip": 0.01369249, + "auxiliary_loss_mlp": 0.01041098, + "balance_loss_clip": 1.25307143, + "balance_loss_mlp": 1.02678084, + "epoch": 0.547061476025853, + "flos": 22971051650160.0, + "grad_norm": 1.4336053620402438, + "language_loss": 0.74234861, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.76645207, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14331055, + "step": 9099, + "time_per_iteration": 2.8340225219726562 + }, + { + "auxiliary_loss_clip": 0.01369733, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.25552535, + "balance_loss_mlp": 1.02165914, + "epoch": 0.547121599278521, + "flos": 29973911002200.0, + "grad_norm": 2.4988600463457282, + "language_loss": 0.72994059, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.75398308, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.12860107, + "step": 9100, + "time_per_iteration": 2.8144044876098633 + }, + { + "auxiliary_loss_clip": 0.01364445, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.25054967, + "balance_loss_mlp": 1.01901138, + "epoch": 0.5471817225311889, + "flos": 36541936767960.0, + "grad_norm": 2.5862914002701105, + "language_loss": 0.68324947, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.70721781, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13378906, + "step": 9101, + "time_per_iteration": 2.8694887161254883 + }, + { + "auxiliary_loss_clip": 0.01370513, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.25490248, + "balance_loss_mlp": 1.01828909, + "epoch": 0.5472418457838569, + "flos": 25780735993680.0, + "grad_norm": 1.6512174553234966, + "language_loss": 0.78671467, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.81073606, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.13330078, + "step": 9102, + "time_per_iteration": 2.774991273880005 + }, + { + "auxiliary_loss_clip": 0.01361945, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.25037169, + "balance_loss_mlp": 1.02203858, + "epoch": 0.5473019690365248, + "flos": 27892969530120.0, + "grad_norm": 1.3583197677153955, + "language_loss": 0.72425759, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74822855, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13122559, + "step": 9103, + "time_per_iteration": 2.8368566036224365 + }, + { + "auxiliary_loss_clip": 0.01366042, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.25372291, + "balance_loss_mlp": 1.0184989, + "epoch": 0.5473620922891929, + "flos": 57641687238240.0, + "grad_norm": 1.2565947464053595, + "language_loss": 0.65393317, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67790908, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.1305542, + "step": 9104, + "time_per_iteration": 3.0877695083618164 + }, + { + "auxiliary_loss_clip": 0.01372745, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.25584567, + "balance_loss_mlp": 1.01708674, + "epoch": 0.5474222155418608, + "flos": 19358751182400.0, + "grad_norm": 2.1699898470412635, + "language_loss": 0.81721449, + "learning_rate": 1.790271716558888e-06, + "loss": 0.84124839, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.13555908, + "step": 9105, + "time_per_iteration": 2.8159615993499756 + }, + { + "auxiliary_loss_clip": 0.01364314, + "auxiliary_loss_mlp": 0.01027001, + "balance_loss_clip": 1.251369, + "balance_loss_mlp": 1.0141145, + "epoch": 0.5474823387945288, + "flos": 25126110025200.0, + "grad_norm": 1.409230393429556, + "language_loss": 0.80495542, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82886857, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.12902832, + "step": 9106, + "time_per_iteration": 2.847515344619751 + }, + { + "auxiliary_loss_clip": 0.01366051, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.25219131, + "balance_loss_mlp": 1.02434957, + "epoch": 0.5475424620471967, + "flos": 18008704824840.0, + "grad_norm": 1.695504629724734, + "language_loss": 0.6994884, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.72351962, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1272583, + "step": 9107, + "time_per_iteration": 2.7831079959869385 + }, + { + "auxiliary_loss_clip": 0.01376396, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.25964391, + "balance_loss_mlp": 1.01922715, + "epoch": 0.5476025852998647, + "flos": 22314395263680.0, + "grad_norm": 2.2852901963593304, + "language_loss": 0.63513529, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65922636, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13476562, + "step": 9108, + "time_per_iteration": 2.8405373096466064 + }, + { + "auxiliary_loss_clip": 0.01368016, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.25418413, + "balance_loss_mlp": 1.01489973, + "epoch": 0.5476627085525327, + "flos": 20125324064160.0, + "grad_norm": 1.645313618706379, + "language_loss": 0.75225449, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77620924, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.12567139, + "step": 9109, + "time_per_iteration": 2.8096487522125244 + }, + { + "auxiliary_loss_clip": 0.01364085, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.2523942, + "balance_loss_mlp": 1.01619744, + "epoch": 0.5477228318052006, + "flos": 17717110216200.0, + "grad_norm": 1.7271722894423907, + "language_loss": 0.77922463, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.80316031, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13287354, + "step": 9110, + "time_per_iteration": 2.8048105239868164 + }, + { + "auxiliary_loss_clip": 0.0136128, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.24984562, + "balance_loss_mlp": 1.02074933, + "epoch": 0.5477829550578687, + "flos": 25854487421400.0, + "grad_norm": 1.5046771587073569, + "language_loss": 0.71397948, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73792815, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.1282959, + "step": 9111, + "time_per_iteration": 2.8378729820251465 + }, + { + "auxiliary_loss_clip": 0.01364207, + "auxiliary_loss_mlp": 0.0104028, + "balance_loss_clip": 1.2493329, + "balance_loss_mlp": 1.02648163, + "epoch": 0.5478430783105366, + "flos": 23044924902960.0, + "grad_norm": 1.519072042901358, + "language_loss": 0.71439576, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73844063, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13800049, + "step": 9112, + "time_per_iteration": 4.258775472640991 + }, + { + "auxiliary_loss_clip": 0.01369039, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.25417995, + "balance_loss_mlp": 1.01819921, + "epoch": 0.5479032015632046, + "flos": 16075591075080.0, + "grad_norm": 1.8073391905185632, + "language_loss": 0.88758123, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.91158646, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13311768, + "step": 9113, + "time_per_iteration": 2.7227463722229004 + }, + { + "auxiliary_loss_clip": 0.01368545, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.25263262, + "balance_loss_mlp": 1.01484728, + "epoch": 0.5479633248158725, + "flos": 24283917731160.0, + "grad_norm": 1.376656141524278, + "language_loss": 0.73203075, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75599951, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.1348877, + "step": 9114, + "time_per_iteration": 4.348244905471802 + }, + { + "auxiliary_loss_clip": 0.01365068, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.25199103, + "balance_loss_mlp": 1.02132237, + "epoch": 0.5480234480685405, + "flos": 26363681560800.0, + "grad_norm": 1.98024760483411, + "language_loss": 0.72196198, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.7459451, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.11914062, + "step": 9115, + "time_per_iteration": 2.775815486907959 + }, + { + "auxiliary_loss_clip": 0.01374483, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.25667119, + "balance_loss_mlp": 1.0221945, + "epoch": 0.5480835713212084, + "flos": 22060265190120.0, + "grad_norm": 1.6178087537192054, + "language_loss": 0.7249524, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.7490598, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.14086914, + "step": 9116, + "time_per_iteration": 2.7645246982574463 + }, + { + "auxiliary_loss_clip": 0.01367281, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.25252533, + "balance_loss_mlp": 1.02227569, + "epoch": 0.5481436945738765, + "flos": 25306488671040.0, + "grad_norm": 1.8533849830361229, + "language_loss": 0.76623136, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.79025668, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.12976074, + "step": 9117, + "time_per_iteration": 2.783994674682617 + }, + { + "auxiliary_loss_clip": 0.0135754, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.24640775, + "balance_loss_mlp": 1.02048278, + "epoch": 0.5482038178265444, + "flos": 33587064245520.0, + "grad_norm": 1.533613496473092, + "language_loss": 0.63103575, + "learning_rate": 1.785237306671674e-06, + "loss": 0.6549381, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12207031, + "step": 9118, + "time_per_iteration": 4.283658266067505 + }, + { + "auxiliary_loss_clip": 0.01373838, + "auxiliary_loss_mlp": 0.01036016, + "balance_loss_clip": 1.25700462, + "balance_loss_mlp": 1.0220921, + "epoch": 0.5482639410792124, + "flos": 19030910289480.0, + "grad_norm": 2.2140947681378296, + "language_loss": 0.79113036, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81522888, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13909912, + "step": 9119, + "time_per_iteration": 2.7464284896850586 + }, + { + "auxiliary_loss_clip": 0.01363306, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.25129414, + "balance_loss_mlp": 1.02368665, + "epoch": 0.5483240643318803, + "flos": 25415755432560.0, + "grad_norm": 2.708673380021244, + "language_loss": 0.82356477, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84754956, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.11499023, + "step": 9120, + "time_per_iteration": 2.7631518840789795 + }, + { + "auxiliary_loss_clip": 0.01368757, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.25322914, + "balance_loss_mlp": 1.01996493, + "epoch": 0.5483841875845483, + "flos": 21471431410800.0, + "grad_norm": 1.7020474449459735, + "language_loss": 0.80136824, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82538617, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13067627, + "step": 9121, + "time_per_iteration": 2.7562716007232666 + }, + { + "auxiliary_loss_clip": 0.01371694, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.25482917, + "balance_loss_mlp": 1.02199268, + "epoch": 0.5484443108372163, + "flos": 24751789541280.0, + "grad_norm": 1.7283315011137093, + "language_loss": 0.61299741, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63707054, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13635254, + "step": 9122, + "time_per_iteration": 2.7567574977874756 + }, + { + "auxiliary_loss_clip": 0.0136202, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.2508781, + "balance_loss_mlp": 1.02406907, + "epoch": 0.5485044340898843, + "flos": 25381011657600.0, + "grad_norm": 1.550586139038017, + "language_loss": 0.71572387, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73970139, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.11669922, + "step": 9123, + "time_per_iteration": 4.32279896736145 + }, + { + "auxiliary_loss_clip": 0.01366768, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.25376892, + "balance_loss_mlp": 1.02256227, + "epoch": 0.5485645573425523, + "flos": 12645049937400.0, + "grad_norm": 1.7990040343250524, + "language_loss": 0.83554631, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85956287, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.12335205, + "step": 9124, + "time_per_iteration": 2.8054397106170654 + }, + { + "auxiliary_loss_clip": 0.01364205, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.25317001, + "balance_loss_mlp": 1.02353799, + "epoch": 0.5486246805952202, + "flos": 28335275054640.0, + "grad_norm": 1.625308954952596, + "language_loss": 0.80195004, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82594848, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12097168, + "step": 9125, + "time_per_iteration": 2.8345532417297363 + }, + { + "auxiliary_loss_clip": 0.01371147, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.25407088, + "balance_loss_mlp": 1.01904976, + "epoch": 0.5486848038478882, + "flos": 16804374554880.0, + "grad_norm": 2.203936639413092, + "language_loss": 0.74378479, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76781893, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13201904, + "step": 9126, + "time_per_iteration": 2.7480006217956543 + }, + { + "auxiliary_loss_clip": 0.01372416, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.25426888, + "balance_loss_mlp": 1.01995873, + "epoch": 0.5487449271005561, + "flos": 17240426391960.0, + "grad_norm": 2.473495664332561, + "language_loss": 0.66912854, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.6931929, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.140625, + "step": 9127, + "time_per_iteration": 2.674273729324341 + }, + { + "auxiliary_loss_clip": 0.01360457, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.24650264, + "balance_loss_mlp": 1.02254188, + "epoch": 0.5488050503532241, + "flos": 17344454675040.0, + "grad_norm": 1.5982285669923413, + "language_loss": 0.83521599, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85918319, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13720703, + "step": 9128, + "time_per_iteration": 2.7953202724456787 + }, + { + "auxiliary_loss_clip": 0.01361408, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.24923658, + "balance_loss_mlp": 1.02458978, + "epoch": 0.548865173605892, + "flos": 17243756277480.0, + "grad_norm": 1.7349836824887415, + "language_loss": 0.74479091, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76877558, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12487793, + "step": 9129, + "time_per_iteration": 2.789928436279297 + }, + { + "auxiliary_loss_clip": 0.01372693, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.25566578, + "balance_loss_mlp": 1.02071881, + "epoch": 0.5489252968585601, + "flos": 17461152766440.0, + "grad_norm": 2.101913941685959, + "language_loss": 0.62820709, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65227908, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13763428, + "step": 9130, + "time_per_iteration": 2.750885248184204 + }, + { + "auxiliary_loss_clip": 0.0136996, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.25353301, + "balance_loss_mlp": 1.02382243, + "epoch": 0.548985420111228, + "flos": 26329059610920.0, + "grad_norm": 2.18750687722109, + "language_loss": 0.63640499, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.66048157, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13873291, + "step": 9131, + "time_per_iteration": 2.812589645385742 + }, + { + "auxiliary_loss_clip": 0.01368533, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.25381005, + "balance_loss_mlp": 1.02440298, + "epoch": 0.549045543363896, + "flos": 18697871526480.0, + "grad_norm": 1.6957727163307164, + "language_loss": 0.75507736, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.77914894, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14233398, + "step": 9132, + "time_per_iteration": 2.7148585319519043 + }, + { + "auxiliary_loss_clip": 0.01365749, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.24985719, + "balance_loss_mlp": 1.0203321, + "epoch": 0.5491056666165639, + "flos": 24722771545080.0, + "grad_norm": 1.4529963075957888, + "language_loss": 0.81658149, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.84056926, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.12713623, + "step": 9133, + "time_per_iteration": 2.864567995071411 + }, + { + "auxiliary_loss_clip": 0.01363607, + "auxiliary_loss_mlp": 0.01036732, + "balance_loss_clip": 1.2494154, + "balance_loss_mlp": 1.02375031, + "epoch": 0.5491657898692319, + "flos": 21581550947880.0, + "grad_norm": 1.7250841242456885, + "language_loss": 0.70301485, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72701824, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.12969971, + "step": 9134, + "time_per_iteration": 2.7302300930023193 + }, + { + "auxiliary_loss_clip": 0.01365226, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.24901068, + "balance_loss_mlp": 1.02090812, + "epoch": 0.5492259131219, + "flos": 50485193168400.0, + "grad_norm": 2.1484309686186163, + "language_loss": 0.61405337, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63803512, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.12042236, + "step": 9135, + "time_per_iteration": 2.9961764812469482 + }, + { + "auxiliary_loss_clip": 0.01369515, + "auxiliary_loss_mlp": 0.0102943, + "balance_loss_clip": 1.25315452, + "balance_loss_mlp": 1.01578081, + "epoch": 0.5492860363745679, + "flos": 25124769949320.0, + "grad_norm": 1.6449958101961781, + "language_loss": 0.72915161, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.75314111, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13659668, + "step": 9136, + "time_per_iteration": 2.766324281692505 + }, + { + "auxiliary_loss_clip": 0.01374082, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.2536453, + "balance_loss_mlp": 1.02431655, + "epoch": 0.5493461596272359, + "flos": 22638378362400.0, + "grad_norm": 2.0061978662537148, + "language_loss": 0.68291378, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70704484, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.14709473, + "step": 9137, + "time_per_iteration": 2.7387638092041016 + }, + { + "auxiliary_loss_clip": 0.01187959, + "auxiliary_loss_mlp": 0.01013765, + "balance_loss_clip": 1.14220715, + "balance_loss_mlp": 1.01125014, + "epoch": 0.5494062828799038, + "flos": 66165608043720.0, + "grad_norm": 0.7692845652612602, + "language_loss": 0.65376025, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67577749, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02514648, + "step": 9138, + "time_per_iteration": 3.278052806854248 + }, + { + "auxiliary_loss_clip": 0.01363091, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.24712026, + "balance_loss_mlp": 1.02073526, + "epoch": 0.5494664061325718, + "flos": 21110958377640.0, + "grad_norm": 1.7121111466977788, + "language_loss": 0.75653183, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7805087, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13842773, + "step": 9139, + "time_per_iteration": 2.718338966369629 + }, + { + "auxiliary_loss_clip": 0.01358079, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.24488401, + "balance_loss_mlp": 1.01879263, + "epoch": 0.5495265293852397, + "flos": 14396688615600.0, + "grad_norm": 1.741284336915909, + "language_loss": 0.7155301, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73942333, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.12451172, + "step": 9140, + "time_per_iteration": 2.760258674621582 + }, + { + "auxiliary_loss_clip": 0.01358934, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.24393106, + "balance_loss_mlp": 1.01938295, + "epoch": 0.5495866526379077, + "flos": 25553674715040.0, + "grad_norm": 2.0428680331799387, + "language_loss": 0.76711059, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.79103506, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14141846, + "step": 9141, + "time_per_iteration": 2.779320001602173 + }, + { + "auxiliary_loss_clip": 0.013526, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.24145496, + "balance_loss_mlp": 1.01810598, + "epoch": 0.5496467758905756, + "flos": 21321289011960.0, + "grad_norm": 1.8035546419864896, + "language_loss": 0.75197852, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77580559, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12011719, + "step": 9142, + "time_per_iteration": 2.7882299423217773 + }, + { + "auxiliary_loss_clip": 0.01365842, + "auxiliary_loss_mlp": 0.01039557, + "balance_loss_clip": 1.24925554, + "balance_loss_mlp": 1.02521646, + "epoch": 0.5497068991432437, + "flos": 22237720034040.0, + "grad_norm": 2.0523075956564707, + "language_loss": 0.76967853, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.79373258, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14337158, + "step": 9143, + "time_per_iteration": 2.7442800998687744 + }, + { + "auxiliary_loss_clip": 0.01360408, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.24666238, + "balance_loss_mlp": 1.01840913, + "epoch": 0.5497670223959116, + "flos": 18483764314680.0, + "grad_norm": 1.8717336812882432, + "language_loss": 0.798244, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82216078, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12866211, + "step": 9144, + "time_per_iteration": 2.7676939964294434 + }, + { + "auxiliary_loss_clip": 0.01369063, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.25304008, + "balance_loss_mlp": 1.02320433, + "epoch": 0.5498271456485796, + "flos": 29211074089560.0, + "grad_norm": 1.7218381218809726, + "language_loss": 0.7135607, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73761773, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13415527, + "step": 9145, + "time_per_iteration": 2.796156883239746 + }, + { + "auxiliary_loss_clip": 0.01367207, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.2521503, + "balance_loss_mlp": 1.02185082, + "epoch": 0.5498872689012475, + "flos": 34831133118720.0, + "grad_norm": 1.4203045010222126, + "language_loss": 0.70637912, + "learning_rate": 1.774398678985076e-06, + "loss": 0.73039389, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.12408447, + "step": 9146, + "time_per_iteration": 2.9051899909973145 + }, + { + "auxiliary_loss_clip": 0.01356231, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.24509382, + "balance_loss_mlp": 1.01876473, + "epoch": 0.5499473921539155, + "flos": 25927548507000.0, + "grad_norm": 1.6974518958298748, + "language_loss": 0.64401805, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66788387, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.11584473, + "step": 9147, + "time_per_iteration": 2.797525405883789 + }, + { + "auxiliary_loss_clip": 0.01362953, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.24898076, + "balance_loss_mlp": 1.02323985, + "epoch": 0.5500075154065835, + "flos": 22278879929880.0, + "grad_norm": 4.330676507072945, + "language_loss": 0.8075285, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.83151901, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.12866211, + "step": 9148, + "time_per_iteration": 2.7379088401794434 + }, + { + "auxiliary_loss_clip": 0.0136241, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.24843311, + "balance_loss_mlp": 1.02178478, + "epoch": 0.5500676386592515, + "flos": 28043314970760.0, + "grad_norm": 1.8106055391374218, + "language_loss": 0.7932021, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81717241, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.12835693, + "step": 9149, + "time_per_iteration": 2.8379759788513184 + }, + { + "auxiliary_loss_clip": 0.0135894, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.24604166, + "balance_loss_mlp": 1.01872838, + "epoch": 0.5501277619119195, + "flos": 23920114812480.0, + "grad_norm": 7.133175024945236, + "language_loss": 0.71976089, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74366581, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.12823486, + "step": 9150, + "time_per_iteration": 2.7916011810302734 + }, + { + "auxiliary_loss_clip": 0.01362245, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.24364042, + "balance_loss_mlp": 1.02450144, + "epoch": 0.5501878851645874, + "flos": 20928792963960.0, + "grad_norm": 2.059355836160669, + "language_loss": 0.7547031, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77871251, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14190674, + "step": 9151, + "time_per_iteration": 2.7343204021453857 + }, + { + "auxiliary_loss_clip": 0.01364464, + "auxiliary_loss_mlp": 0.01034439, + "balance_loss_clip": 1.24931431, + "balance_loss_mlp": 1.02132034, + "epoch": 0.5502480084172554, + "flos": 20669871103920.0, + "grad_norm": 1.7658896636648984, + "language_loss": 0.76619953, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.79018855, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13122559, + "step": 9152, + "time_per_iteration": 5.724074125289917 + }, + { + "auxiliary_loss_clip": 0.01356229, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.24276114, + "balance_loss_mlp": 1.01978624, + "epoch": 0.5503081316699233, + "flos": 26438204547360.0, + "grad_norm": 1.641081488142825, + "language_loss": 0.82764447, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.85154486, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.14019775, + "step": 9153, + "time_per_iteration": 3.0434343814849854 + }, + { + "auxiliary_loss_clip": 0.01359691, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.24669325, + "balance_loss_mlp": 1.01996779, + "epoch": 0.5503682549225913, + "flos": 30635440391880.0, + "grad_norm": 2.4004639768432923, + "language_loss": 0.7422713, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76620042, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13262939, + "step": 9154, + "time_per_iteration": 2.8319461345672607 + }, + { + "auxiliary_loss_clip": 0.01373598, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.25372243, + "balance_loss_mlp": 1.01714623, + "epoch": 0.5504283781752592, + "flos": 22570636971960.0, + "grad_norm": 1.5147981911622863, + "language_loss": 0.73149395, + "learning_rate": 1.770916243273199e-06, + "loss": 0.75554007, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.13867188, + "step": 9155, + "time_per_iteration": 2.7693912982940674 + }, + { + "auxiliary_loss_clip": 0.0118648, + "auxiliary_loss_mlp": 0.01008062, + "balance_loss_clip": 1.14118886, + "balance_loss_mlp": 1.00553465, + "epoch": 0.5504885014279273, + "flos": 67914769611960.0, + "grad_norm": 0.7598900469910871, + "language_loss": 0.55428284, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57622826, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02526855, + "step": 9156, + "time_per_iteration": 3.406799793243408 + }, + { + "auxiliary_loss_clip": 0.01360492, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.24634051, + "balance_loss_mlp": 1.01885033, + "epoch": 0.5505486246805952, + "flos": 22454466789240.0, + "grad_norm": 1.9586677425585475, + "language_loss": 0.82722008, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85114419, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.1305542, + "step": 9157, + "time_per_iteration": 4.21427845954895 + }, + { + "auxiliary_loss_clip": 0.01374872, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.2545197, + "balance_loss_mlp": 1.01943243, + "epoch": 0.5506087479332632, + "flos": 26912614303440.0, + "grad_norm": 3.6896561848375073, + "language_loss": 0.75860369, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.78269154, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.14477539, + "step": 9158, + "time_per_iteration": 2.867436408996582 + }, + { + "auxiliary_loss_clip": 0.01354821, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.24334431, + "balance_loss_mlp": 1.01861036, + "epoch": 0.5506688711859311, + "flos": 22935495708000.0, + "grad_norm": 1.4661049935354473, + "language_loss": 0.70040762, + "learning_rate": 1.769368719290979e-06, + "loss": 0.72426355, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.1217041, + "step": 9159, + "time_per_iteration": 2.770796060562134 + }, + { + "auxiliary_loss_clip": 0.01360856, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.24493313, + "balance_loss_mlp": 1.01930928, + "epoch": 0.5507289944385991, + "flos": 29612422760040.0, + "grad_norm": 1.5966656742923255, + "language_loss": 0.68287408, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.70681524, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.13952637, + "step": 9160, + "time_per_iteration": 2.8514223098754883 + }, + { + "auxiliary_loss_clip": 0.01358603, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.24425077, + "balance_loss_mlp": 1.01736736, + "epoch": 0.5507891176912671, + "flos": 15337792539360.0, + "grad_norm": 1.827538472866524, + "language_loss": 0.72104061, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.74492705, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.12658691, + "step": 9161, + "time_per_iteration": 2.747532606124878 + }, + { + "auxiliary_loss_clip": 0.01356436, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.24192548, + "balance_loss_mlp": 1.0232625, + "epoch": 0.5508492409439351, + "flos": 26583961243320.0, + "grad_norm": 1.6238899877591915, + "language_loss": 0.70043886, + "learning_rate": 1.768208168081359e-06, + "loss": 0.72436732, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.1315918, + "step": 9162, + "time_per_iteration": 4.35046648979187 + }, + { + "auxiliary_loss_clip": 0.01356839, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.24286485, + "balance_loss_mlp": 1.02366376, + "epoch": 0.5509093641966031, + "flos": 25448428181160.0, + "grad_norm": 2.515701483994973, + "language_loss": 0.8583349, + "learning_rate": 1.767821335237733e-06, + "loss": 0.88227475, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13494873, + "step": 9163, + "time_per_iteration": 2.810152292251587 + }, + { + "auxiliary_loss_clip": 0.0135333, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.24099874, + "balance_loss_mlp": 1.01869655, + "epoch": 0.550969487449271, + "flos": 18703313046720.0, + "grad_norm": 1.6150547640506554, + "language_loss": 0.80689132, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.83073366, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12213135, + "step": 9164, + "time_per_iteration": 2.7985100746154785 + }, + { + "auxiliary_loss_clip": 0.01371892, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.25434875, + "balance_loss_mlp": 1.01748252, + "epoch": 0.551029610701939, + "flos": 22713591691080.0, + "grad_norm": 1.8169230751462888, + "language_loss": 0.73663956, + "learning_rate": 1.767047695977863e-06, + "loss": 0.76067209, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.13873291, + "step": 9165, + "time_per_iteration": 2.7626078128814697 + }, + { + "auxiliary_loss_clip": 0.01350149, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.23855591, + "balance_loss_mlp": 1.01469111, + "epoch": 0.5510897339546069, + "flos": 12424120521120.0, + "grad_norm": 1.8774814246661327, + "language_loss": 0.79534119, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81912208, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13226318, + "step": 9166, + "time_per_iteration": 2.8308961391448975 + }, + { + "auxiliary_loss_clip": 0.01360465, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.24437392, + "balance_loss_mlp": 1.01748145, + "epoch": 0.5511498572072749, + "flos": 18775237098240.0, + "grad_norm": 2.04027217504251, + "language_loss": 0.77073342, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.79464751, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13458252, + "step": 9167, + "time_per_iteration": 2.734874725341797 + }, + { + "auxiliary_loss_clip": 0.01353687, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.2398901, + "balance_loss_mlp": 1.0153029, + "epoch": 0.5512099804599428, + "flos": 19578096872640.0, + "grad_norm": 2.0694491148456584, + "language_loss": 0.8094523, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.83327925, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.137146, + "step": 9168, + "time_per_iteration": 2.8058557510375977 + }, + { + "auxiliary_loss_clip": 0.0136374, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.24705029, + "balance_loss_mlp": 1.02199376, + "epoch": 0.5512701037126109, + "flos": 26250638221800.0, + "grad_norm": 1.515938957958835, + "language_loss": 0.68670332, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71070039, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13970947, + "step": 9169, + "time_per_iteration": 2.7889628410339355 + }, + { + "auxiliary_loss_clip": 0.01348399, + "auxiliary_loss_mlp": 0.01026626, + "balance_loss_clip": 1.23716795, + "balance_loss_mlp": 1.01430535, + "epoch": 0.5513302269652788, + "flos": 21950551736640.0, + "grad_norm": 1.9559009642196277, + "language_loss": 0.85975742, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.88350767, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12322998, + "step": 9170, + "time_per_iteration": 2.7498779296875 + }, + { + "auxiliary_loss_clip": 0.01182097, + "auxiliary_loss_mlp": 0.01013648, + "balance_loss_clip": 1.13725471, + "balance_loss_mlp": 1.01137066, + "epoch": 0.5513903502179468, + "flos": 68250610351800.0, + "grad_norm": 0.9294240572318018, + "language_loss": 0.59890884, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.6208663, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02282715, + "step": 9171, + "time_per_iteration": 3.3014369010925293 + }, + { + "auxiliary_loss_clip": 0.013555, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.24224389, + "balance_loss_mlp": 1.02059042, + "epoch": 0.5514504734706147, + "flos": 18738909597240.0, + "grad_norm": 1.5663457889455017, + "language_loss": 0.71012849, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.7340185, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12921143, + "step": 9172, + "time_per_iteration": 2.735595941543579 + }, + { + "auxiliary_loss_clip": 0.01353581, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.23991394, + "balance_loss_mlp": 1.01806855, + "epoch": 0.5515105967232827, + "flos": 22275793694520.0, + "grad_norm": 1.6828177414288428, + "language_loss": 0.7641921, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.78804439, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13562012, + "step": 9173, + "time_per_iteration": 2.7689876556396484 + }, + { + "auxiliary_loss_clip": 0.0135197, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.23909998, + "balance_loss_mlp": 1.01588941, + "epoch": 0.5515707199759508, + "flos": 22561865566200.0, + "grad_norm": 2.9542710497631455, + "language_loss": 0.75188565, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77569729, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13287354, + "step": 9174, + "time_per_iteration": 2.7383224964141846 + }, + { + "auxiliary_loss_clip": 0.0135707, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.24223816, + "balance_loss_mlp": 1.01661849, + "epoch": 0.5516308432286187, + "flos": 28296673485480.0, + "grad_norm": 1.9028067330462657, + "language_loss": 0.73045313, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.75431859, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.12860107, + "step": 9175, + "time_per_iteration": 2.8129687309265137 + }, + { + "auxiliary_loss_clip": 0.0135681, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.24263597, + "balance_loss_mlp": 1.02090645, + "epoch": 0.5516909664812867, + "flos": 18768333677040.0, + "grad_norm": 1.6746842587346122, + "language_loss": 0.6898666, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71377146, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.12780762, + "step": 9176, + "time_per_iteration": 2.7168421745300293 + }, + { + "auxiliary_loss_clip": 0.01352537, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.24057841, + "balance_loss_mlp": 1.01880515, + "epoch": 0.5517510897339546, + "flos": 27745426066320.0, + "grad_norm": 1.5502381428088785, + "language_loss": 0.71105903, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73490024, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12768555, + "step": 9177, + "time_per_iteration": 2.8968710899353027 + }, + { + "auxiliary_loss_clip": 0.01356154, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.24226809, + "balance_loss_mlp": 1.01657248, + "epoch": 0.5518112129866226, + "flos": 18408916461240.0, + "grad_norm": 1.5200316255176543, + "language_loss": 0.80157292, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82542181, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12164307, + "step": 9178, + "time_per_iteration": 2.711481809616089 + }, + { + "auxiliary_loss_clip": 0.01366001, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.24909592, + "balance_loss_mlp": 1.02312899, + "epoch": 0.5518713362392905, + "flos": 25088158189800.0, + "grad_norm": 1.5364529933915285, + "language_loss": 0.75526774, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77929878, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.13989258, + "step": 9179, + "time_per_iteration": 2.783384323120117 + }, + { + "auxiliary_loss_clip": 0.01359747, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.24637008, + "balance_loss_mlp": 1.02077198, + "epoch": 0.5519314594919585, + "flos": 36545956995600.0, + "grad_norm": 1.588365907213528, + "language_loss": 0.70224071, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72617137, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.12561035, + "step": 9180, + "time_per_iteration": 2.9076344966888428 + }, + { + "auxiliary_loss_clip": 0.01360157, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.24492359, + "balance_loss_mlp": 1.01967144, + "epoch": 0.5519915827446265, + "flos": 20453449215600.0, + "grad_norm": 1.753447017834717, + "language_loss": 0.67513084, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69906837, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13922119, + "step": 9181, + "time_per_iteration": 2.7402329444885254 + }, + { + "auxiliary_loss_clip": 0.01369442, + "auxiliary_loss_mlp": 0.01036139, + "balance_loss_clip": 1.25125003, + "balance_loss_mlp": 1.02204812, + "epoch": 0.5520517059972945, + "flos": 23773221082440.0, + "grad_norm": 2.018660692640311, + "language_loss": 0.79319382, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.8172496, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.14086914, + "step": 9182, + "time_per_iteration": 2.791189432144165 + }, + { + "auxiliary_loss_clip": 0.01357368, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.24195719, + "balance_loss_mlp": 1.016855, + "epoch": 0.5521118292499624, + "flos": 22201067666160.0, + "grad_norm": 1.9611162774825746, + "language_loss": 0.83152997, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85540211, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13006592, + "step": 9183, + "time_per_iteration": 2.7351162433624268 + }, + { + "auxiliary_loss_clip": 0.01357348, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.24331045, + "balance_loss_mlp": 1.01738286, + "epoch": 0.5521719525026304, + "flos": 23588131866840.0, + "grad_norm": 1.2485810417797227, + "language_loss": 0.67572713, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.6996026, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.1282959, + "step": 9184, + "time_per_iteration": 2.796403646469116 + }, + { + "auxiliary_loss_clip": 0.0135419, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.23952496, + "balance_loss_mlp": 1.01909113, + "epoch": 0.5522320757552983, + "flos": 26142833361240.0, + "grad_norm": 1.401072943081936, + "language_loss": 0.76304311, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78691554, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13946533, + "step": 9185, + "time_per_iteration": 2.774510622024536 + }, + { + "auxiliary_loss_clip": 0.01364006, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.24764943, + "balance_loss_mlp": 1.02511549, + "epoch": 0.5522921990079663, + "flos": 24680921307120.0, + "grad_norm": 2.6825356931022033, + "language_loss": 0.73891252, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76293945, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13574219, + "step": 9186, + "time_per_iteration": 2.7717411518096924 + }, + { + "auxiliary_loss_clip": 0.01366036, + "auxiliary_loss_mlp": 0.01040229, + "balance_loss_clip": 1.2493118, + "balance_loss_mlp": 1.02697897, + "epoch": 0.5523523222606344, + "flos": 22753411511040.0, + "grad_norm": 1.7860761794403501, + "language_loss": 0.66496062, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68902326, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13256836, + "step": 9187, + "time_per_iteration": 2.7279653549194336 + }, + { + "auxiliary_loss_clip": 0.0136072, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.24603117, + "balance_loss_mlp": 1.020046, + "epoch": 0.5524124455133023, + "flos": 19760627761560.0, + "grad_norm": 1.5622399826349311, + "language_loss": 0.78021848, + "learning_rate": 1.758153413657318e-06, + "loss": 0.80415493, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.12884521, + "step": 9188, + "time_per_iteration": 2.7960739135742188 + }, + { + "auxiliary_loss_clip": 0.01360561, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.24595082, + "balance_loss_mlp": 1.02127659, + "epoch": 0.5524725687659703, + "flos": 23300070185520.0, + "grad_norm": 1.7432652421805688, + "language_loss": 0.81684899, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.84080255, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13500977, + "step": 9189, + "time_per_iteration": 2.791584014892578 + }, + { + "auxiliary_loss_clip": 0.01354977, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.2428205, + "balance_loss_mlp": 1.01635361, + "epoch": 0.5525326920186382, + "flos": 24867431815320.0, + "grad_norm": 1.2993077612836728, + "language_loss": 0.76961643, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.79346275, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13311768, + "step": 9190, + "time_per_iteration": 4.196221113204956 + }, + { + "auxiliary_loss_clip": 0.0137026, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.24965239, + "balance_loss_mlp": 1.02234852, + "epoch": 0.5525928152713062, + "flos": 13739585537160.0, + "grad_norm": 2.279654360802633, + "language_loss": 0.78870451, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81277603, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.14538574, + "step": 9191, + "time_per_iteration": 4.196937322616577 + }, + { + "auxiliary_loss_clip": 0.01356208, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.24201775, + "balance_loss_mlp": 1.02063704, + "epoch": 0.5526529385239741, + "flos": 13074563828520.0, + "grad_norm": 1.8603414024445262, + "language_loss": 0.68989098, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71378881, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.12927246, + "step": 9192, + "time_per_iteration": 2.708378791809082 + }, + { + "auxiliary_loss_clip": 0.01355976, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.24340916, + "balance_loss_mlp": 1.01882565, + "epoch": 0.5527130617766421, + "flos": 23153541930720.0, + "grad_norm": 1.4451317429255108, + "language_loss": 0.77879459, + "learning_rate": 1.756220509823588e-06, + "loss": 0.80266619, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12359619, + "step": 9193, + "time_per_iteration": 2.714066743850708 + }, + { + "auxiliary_loss_clip": 0.0135907, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.24523687, + "balance_loss_mlp": 1.02679396, + "epoch": 0.55277318502931, + "flos": 21290199989400.0, + "grad_norm": 1.4069153584108134, + "language_loss": 0.7834183, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80740368, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.12689209, + "step": 9194, + "time_per_iteration": 2.756432056427002 + }, + { + "auxiliary_loss_clip": 0.01370148, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.24994802, + "balance_loss_mlp": 1.02492964, + "epoch": 0.5528333082819781, + "flos": 38331446064840.0, + "grad_norm": 2.074439705318254, + "language_loss": 0.70220816, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.7262941, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.1350708, + "step": 9195, + "time_per_iteration": 2.8547046184539795 + }, + { + "auxiliary_loss_clip": 0.01374496, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.25237656, + "balance_loss_mlp": 1.02247, + "epoch": 0.552893431534646, + "flos": 13557663773640.0, + "grad_norm": 2.2009626217133045, + "language_loss": 0.74629998, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.77041084, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.14117432, + "step": 9196, + "time_per_iteration": 4.163687467575073 + }, + { + "auxiliary_loss_clip": 0.013597, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.24589777, + "balance_loss_mlp": 1.01750302, + "epoch": 0.552953554787314, + "flos": 21943770140520.0, + "grad_norm": 1.4795318162118904, + "language_loss": 0.77144122, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79534531, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13208008, + "step": 9197, + "time_per_iteration": 2.8700015544891357 + }, + { + "auxiliary_loss_clip": 0.01356869, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.24368453, + "balance_loss_mlp": 1.01843691, + "epoch": 0.5530136780399819, + "flos": 43666448431320.0, + "grad_norm": 1.4165599325419178, + "language_loss": 0.76707637, + "learning_rate": 1.754287837093407e-06, + "loss": 0.79095149, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.12207031, + "step": 9198, + "time_per_iteration": 2.948744773864746 + }, + { + "auxiliary_loss_clip": 0.01361816, + "auxiliary_loss_mlp": 0.01026441, + "balance_loss_clip": 1.24716401, + "balance_loss_mlp": 1.0138762, + "epoch": 0.5530738012926499, + "flos": 25050896696520.0, + "grad_norm": 1.3841030153204892, + "language_loss": 0.79760981, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.82149231, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.12567139, + "step": 9199, + "time_per_iteration": 2.7992451190948486 + }, + { + "auxiliary_loss_clip": 0.01360313, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.2449441, + "balance_loss_mlp": 1.01737571, + "epoch": 0.553133924545318, + "flos": 16476452445240.0, + "grad_norm": 1.8073554396423714, + "language_loss": 0.64135087, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66525316, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.12524414, + "step": 9200, + "time_per_iteration": 4.297309637069702 + }, + { + "auxiliary_loss_clip": 0.01371559, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.25256658, + "balance_loss_mlp": 1.01687539, + "epoch": 0.5531940477979859, + "flos": 24611352540480.0, + "grad_norm": 1.8688039733596868, + "language_loss": 0.66367626, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68770617, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.14550781, + "step": 9201, + "time_per_iteration": 2.792078733444214 + }, + { + "auxiliary_loss_clip": 0.01366268, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.25156665, + "balance_loss_mlp": 1.0184828, + "epoch": 0.5532541710506539, + "flos": 22164131039760.0, + "grad_norm": 2.708328353190614, + "language_loss": 0.60826021, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.6322453, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13757324, + "step": 9202, + "time_per_iteration": 2.8082900047302246 + }, + { + "auxiliary_loss_clip": 0.01352836, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.24007404, + "balance_loss_mlp": 1.01868188, + "epoch": 0.5533142943033218, + "flos": 21402228119400.0, + "grad_norm": 1.9200827746833067, + "language_loss": 0.64413089, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66797322, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.12731934, + "step": 9203, + "time_per_iteration": 2.7692484855651855 + }, + { + "auxiliary_loss_clip": 0.01361289, + "auxiliary_loss_mlp": 0.01029687, + "balance_loss_clip": 1.2457298, + "balance_loss_mlp": 1.01576364, + "epoch": 0.5533744175559898, + "flos": 23555865201840.0, + "grad_norm": 1.482523304548092, + "language_loss": 0.64268029, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.66659003, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13922119, + "step": 9204, + "time_per_iteration": 2.7522404193878174 + }, + { + "auxiliary_loss_clip": 0.01350507, + "auxiliary_loss_mlp": 0.01024747, + "balance_loss_clip": 1.23812222, + "balance_loss_mlp": 1.01258779, + "epoch": 0.5534345408086577, + "flos": 24066927325800.0, + "grad_norm": 1.5742713923429044, + "language_loss": 0.77319592, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79694843, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.12158203, + "step": 9205, + "time_per_iteration": 2.773467779159546 + }, + { + "auxiliary_loss_clip": 0.01352707, + "auxiliary_loss_mlp": 0.01034896, + "balance_loss_clip": 1.24141908, + "balance_loss_mlp": 1.02228951, + "epoch": 0.5534946640613257, + "flos": 33779178707400.0, + "grad_norm": 1.4016534592276615, + "language_loss": 0.72362542, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74750149, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12609863, + "step": 9206, + "time_per_iteration": 2.869590997695923 + }, + { + "auxiliary_loss_clip": 0.01360933, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.24588954, + "balance_loss_mlp": 1.02095199, + "epoch": 0.5535547873139937, + "flos": 15163505147520.0, + "grad_norm": 1.8326069176191435, + "language_loss": 0.75252557, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77647918, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13464355, + "step": 9207, + "time_per_iteration": 2.7352399826049805 + }, + { + "auxiliary_loss_clip": 0.01371569, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.25179827, + "balance_loss_mlp": 1.01611698, + "epoch": 0.5536149105666617, + "flos": 16985037459240.0, + "grad_norm": 3.1780655771327058, + "language_loss": 0.62228966, + "learning_rate": 1.750423192272189e-06, + "loss": 0.64631164, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.1451416, + "step": 9208, + "time_per_iteration": 2.7344837188720703 + }, + { + "auxiliary_loss_clip": 0.01365805, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.24872267, + "balance_loss_mlp": 1.02220535, + "epoch": 0.5536750338193296, + "flos": 18154461520800.0, + "grad_norm": 1.8901118422792953, + "language_loss": 0.65153837, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.67554891, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13049316, + "step": 9209, + "time_per_iteration": 2.778099775314331 + }, + { + "auxiliary_loss_clip": 0.01357919, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.24404776, + "balance_loss_mlp": 1.01760006, + "epoch": 0.5537351570719976, + "flos": 22753005427440.0, + "grad_norm": 1.9318499821915462, + "language_loss": 0.82892323, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.85281706, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13861084, + "step": 9210, + "time_per_iteration": 2.7273595333099365 + }, + { + "auxiliary_loss_clip": 0.01356294, + "auxiliary_loss_mlp": 0.01026863, + "balance_loss_clip": 1.24316299, + "balance_loss_mlp": 1.01464438, + "epoch": 0.5537952803246655, + "flos": 26361204450840.0, + "grad_norm": 1.608501002969501, + "language_loss": 0.7327072, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75653875, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12237549, + "step": 9211, + "time_per_iteration": 2.792140007019043 + }, + { + "auxiliary_loss_clip": 0.01365081, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.24710488, + "balance_loss_mlp": 1.02078414, + "epoch": 0.5538554035773335, + "flos": 18041540006880.0, + "grad_norm": 2.4577383403895148, + "language_loss": 0.67293453, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.69693589, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.14257812, + "step": 9212, + "time_per_iteration": 2.734128475189209 + }, + { + "auxiliary_loss_clip": 0.01365654, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.24664021, + "balance_loss_mlp": 1.01281869, + "epoch": 0.5539155268300014, + "flos": 31692552064920.0, + "grad_norm": 1.4566182745189928, + "language_loss": 0.52126324, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.54520071, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.15270996, + "step": 9213, + "time_per_iteration": 2.8411965370178223 + }, + { + "auxiliary_loss_clip": 0.01370578, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.2533164, + "balance_loss_mlp": 1.01308787, + "epoch": 0.5539756500826695, + "flos": 15197802230520.0, + "grad_norm": 1.915687424880062, + "language_loss": 0.86196041, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.88593256, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13549805, + "step": 9214, + "time_per_iteration": 2.744474411010742 + }, + { + "auxiliary_loss_clip": 0.01356647, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.2431128, + "balance_loss_mlp": 1.01492655, + "epoch": 0.5540357733353375, + "flos": 26357915173680.0, + "grad_norm": 1.7520531452012749, + "language_loss": 0.7032305, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72707021, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.1239624, + "step": 9215, + "time_per_iteration": 2.760891914367676 + }, + { + "auxiliary_loss_clip": 0.01364198, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.24674487, + "balance_loss_mlp": 1.01977408, + "epoch": 0.5540958965880054, + "flos": 21328557908400.0, + "grad_norm": 1.5316425158239586, + "language_loss": 0.73754406, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.7615217, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.13781738, + "step": 9216, + "time_per_iteration": 2.762519359588623 + }, + { + "auxiliary_loss_clip": 0.01354637, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.242805, + "balance_loss_mlp": 1.01695561, + "epoch": 0.5541560198406734, + "flos": 25672443832800.0, + "grad_norm": 2.1107137671282414, + "language_loss": 0.72228694, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.74613619, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13330078, + "step": 9217, + "time_per_iteration": 2.7514564990997314 + }, + { + "auxiliary_loss_clip": 0.01357744, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.24390054, + "balance_loss_mlp": 1.01445711, + "epoch": 0.5542161430933413, + "flos": 21944013790680.0, + "grad_norm": 1.532844901993427, + "language_loss": 0.78248668, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80633807, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.1293335, + "step": 9218, + "time_per_iteration": 2.7976043224334717 + }, + { + "auxiliary_loss_clip": 0.01362574, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.24389935, + "balance_loss_mlp": 1.01703715, + "epoch": 0.5542762663460093, + "flos": 19575985237920.0, + "grad_norm": 1.6765810184189205, + "language_loss": 0.72561419, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74956113, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.1506958, + "step": 9219, + "time_per_iteration": 2.769594192504883 + }, + { + "auxiliary_loss_clip": 0.01363545, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.24770975, + "balance_loss_mlp": 1.01759338, + "epoch": 0.5543363895986773, + "flos": 19503655102800.0, + "grad_norm": 1.755262541430367, + "language_loss": 0.71584594, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73980105, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14355469, + "step": 9220, + "time_per_iteration": 2.7503530979156494 + }, + { + "auxiliary_loss_clip": 0.0135562, + "auxiliary_loss_mlp": 0.01023083, + "balance_loss_clip": 1.24252605, + "balance_loss_mlp": 1.01059031, + "epoch": 0.5543965128513453, + "flos": 22640449388760.0, + "grad_norm": 1.834199260182774, + "language_loss": 0.79816419, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.82195127, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.12493896, + "step": 9221, + "time_per_iteration": 2.775636672973633 + }, + { + "auxiliary_loss_clip": 0.01357926, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.24575329, + "balance_loss_mlp": 1.01611876, + "epoch": 0.5544566361040132, + "flos": 25994924422200.0, + "grad_norm": 1.7628891246829097, + "language_loss": 0.83693308, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86080551, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13214111, + "step": 9222, + "time_per_iteration": 2.8360233306884766 + }, + { + "auxiliary_loss_clip": 0.01368057, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.24949431, + "balance_loss_mlp": 1.01657581, + "epoch": 0.5545167593566812, + "flos": 28263554044920.0, + "grad_norm": 1.6716512311257907, + "language_loss": 0.75662303, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.7806142, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14471436, + "step": 9223, + "time_per_iteration": 2.8325693607330322 + }, + { + "auxiliary_loss_clip": 0.01357783, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.24331713, + "balance_loss_mlp": 1.01240635, + "epoch": 0.5545768826093491, + "flos": 28482574868280.0, + "grad_norm": 1.583652615908205, + "language_loss": 0.81668645, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84052616, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13806152, + "step": 9224, + "time_per_iteration": 2.8123700618743896 + }, + { + "auxiliary_loss_clip": 0.01365504, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.24947011, + "balance_loss_mlp": 1.02157259, + "epoch": 0.5546370058620171, + "flos": 18482586672240.0, + "grad_norm": 1.8446275677261574, + "language_loss": 0.57744712, + "learning_rate": 1.743855475904141e-06, + "loss": 0.6014595, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.14147949, + "step": 9225, + "time_per_iteration": 2.80195951461792 + }, + { + "auxiliary_loss_clip": 0.01361146, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.24451375, + "balance_loss_mlp": 1.02156949, + "epoch": 0.554697129114685, + "flos": 22935779966520.0, + "grad_norm": 1.5786943509248983, + "language_loss": 0.67494261, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69891071, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14099121, + "step": 9226, + "time_per_iteration": 2.8047962188720703 + }, + { + "auxiliary_loss_clip": 0.01366471, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.24907637, + "balance_loss_mlp": 1.0176487, + "epoch": 0.5547572523673531, + "flos": 21801993063840.0, + "grad_norm": 1.3930099576629287, + "language_loss": 0.74827683, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.77225161, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13360596, + "step": 9227, + "time_per_iteration": 2.8212714195251465 + }, + { + "auxiliary_loss_clip": 0.01369686, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.2533958, + "balance_loss_mlp": 1.01666164, + "epoch": 0.5548173756200211, + "flos": 22347514704240.0, + "grad_norm": 1.735257370047242, + "language_loss": 0.73627597, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.76027191, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13232422, + "step": 9228, + "time_per_iteration": 2.74436354637146 + }, + { + "auxiliary_loss_clip": 0.01364285, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.24919116, + "balance_loss_mlp": 1.02018046, + "epoch": 0.554877498872689, + "flos": 17863313604120.0, + "grad_norm": 1.9906698263095315, + "language_loss": 0.76175165, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.78573084, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13439941, + "step": 9229, + "time_per_iteration": 4.309906959533691 + }, + { + "auxiliary_loss_clip": 0.01366634, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.25077355, + "balance_loss_mlp": 1.02534032, + "epoch": 0.554937622125357, + "flos": 17242822285200.0, + "grad_norm": 1.3532659314777207, + "language_loss": 0.69013017, + "learning_rate": 1.741924325613172e-06, + "loss": 0.71418738, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13745117, + "step": 9230, + "time_per_iteration": 4.208350896835327 + }, + { + "auxiliary_loss_clip": 0.01368601, + "auxiliary_loss_mlp": 0.01032431, + "balance_loss_clip": 1.25022936, + "balance_loss_mlp": 1.01743412, + "epoch": 0.5549977453780249, + "flos": 25372768160520.0, + "grad_norm": 2.849324971579155, + "language_loss": 0.69335914, + "learning_rate": 1.741538124855163e-06, + "loss": 0.71736944, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14990234, + "step": 9231, + "time_per_iteration": 2.782762289047241 + }, + { + "auxiliary_loss_clip": 0.01370531, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.25035834, + "balance_loss_mlp": 1.01987004, + "epoch": 0.555057868630693, + "flos": 25084137962160.0, + "grad_norm": 1.6092450260993392, + "language_loss": 0.7856226, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80967212, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.14544678, + "step": 9232, + "time_per_iteration": 2.7574760913848877 + }, + { + "auxiliary_loss_clip": 0.01356465, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.24329782, + "balance_loss_mlp": 1.01641595, + "epoch": 0.5551179918833609, + "flos": 26109835745760.0, + "grad_norm": 1.6250720144785131, + "language_loss": 0.82453156, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84837711, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.11669922, + "step": 9233, + "time_per_iteration": 2.811521053314209 + }, + { + "auxiliary_loss_clip": 0.01371049, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.25291932, + "balance_loss_mlp": 1.02211881, + "epoch": 0.5551781151360289, + "flos": 19388703170880.0, + "grad_norm": 2.0287293935856425, + "language_loss": 0.75625437, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.78031397, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.12786865, + "step": 9234, + "time_per_iteration": 2.7136125564575195 + }, + { + "auxiliary_loss_clip": 0.0135757, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.24458587, + "balance_loss_mlp": 1.0151813, + "epoch": 0.5552382383886968, + "flos": 21731003004600.0, + "grad_norm": 1.8037967383501974, + "language_loss": 0.64822263, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.67208028, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13000488, + "step": 9235, + "time_per_iteration": 4.29327654838562 + }, + { + "auxiliary_loss_clip": 0.01368301, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.2527945, + "balance_loss_mlp": 1.01520729, + "epoch": 0.5552983616413648, + "flos": 14360320506240.0, + "grad_norm": 2.013675727161141, + "language_loss": 0.68247223, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70643759, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13018799, + "step": 9236, + "time_per_iteration": 2.7479536533355713 + }, + { + "auxiliary_loss_clip": 0.01353955, + "auxiliary_loss_mlp": 0.01026204, + "balance_loss_clip": 1.24258375, + "balance_loss_mlp": 1.01245308, + "epoch": 0.5553584848940327, + "flos": 25483374997920.0, + "grad_norm": 1.654939839873721, + "language_loss": 0.86668235, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.89048398, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13769531, + "step": 9237, + "time_per_iteration": 2.76975154876709 + }, + { + "auxiliary_loss_clip": 0.01354373, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.24222302, + "balance_loss_mlp": 1.01906407, + "epoch": 0.5554186081467007, + "flos": 22168963434600.0, + "grad_norm": 1.9643385845617378, + "language_loss": 0.73338693, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75725508, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13354492, + "step": 9238, + "time_per_iteration": 2.8013248443603516 + }, + { + "auxiliary_loss_clip": 0.01367347, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.25009692, + "balance_loss_mlp": 1.01632202, + "epoch": 0.5554787313993687, + "flos": 49755313262880.0, + "grad_norm": 1.5692567945293543, + "language_loss": 0.78945237, + "learning_rate": 1.73844887285358e-06, + "loss": 0.8134222, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13305664, + "step": 9239, + "time_per_iteration": 4.558143138885498 + }, + { + "auxiliary_loss_clip": 0.0136207, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.24640584, + "balance_loss_mlp": 1.01970541, + "epoch": 0.5555388546520367, + "flos": 22132473500160.0, + "grad_norm": 1.4911068607606783, + "language_loss": 0.8024056, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82636285, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.13952637, + "step": 9240, + "time_per_iteration": 2.787846565246582 + }, + { + "auxiliary_loss_clip": 0.01359693, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.2464509, + "balance_loss_mlp": 1.01922679, + "epoch": 0.5555989779047047, + "flos": 24687662294880.0, + "grad_norm": 1.654478061930405, + "language_loss": 0.65292335, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67684245, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13000488, + "step": 9241, + "time_per_iteration": 2.9400722980499268 + }, + { + "auxiliary_loss_clip": 0.01361941, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.24668002, + "balance_loss_mlp": 1.01993155, + "epoch": 0.5556591011573726, + "flos": 16110740933640.0, + "grad_norm": 4.8269591400912635, + "language_loss": 0.72988904, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.75383568, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.12799072, + "step": 9242, + "time_per_iteration": 2.8213589191436768 + }, + { + "auxiliary_loss_clip": 0.01362142, + "auxiliary_loss_mlp": 0.01039947, + "balance_loss_clip": 1.2451607, + "balance_loss_mlp": 1.02576077, + "epoch": 0.5557192244100406, + "flos": 12937700363400.0, + "grad_norm": 1.736574782351704, + "language_loss": 0.64183885, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.6658597, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14178467, + "step": 9243, + "time_per_iteration": 2.758348226547241 + }, + { + "auxiliary_loss_clip": 0.01366475, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.25231743, + "balance_loss_mlp": 1.01903915, + "epoch": 0.5557793476627085, + "flos": 23116848954480.0, + "grad_norm": 1.79150823419497, + "language_loss": 0.75245655, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77644157, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13000488, + "step": 9244, + "time_per_iteration": 2.892760992050171 + }, + { + "auxiliary_loss_clip": 0.01350352, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.24057794, + "balance_loss_mlp": 1.02446818, + "epoch": 0.5558394709153766, + "flos": 21432261324600.0, + "grad_norm": 1.9579936011823669, + "language_loss": 0.75068927, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77455944, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12188721, + "step": 9245, + "time_per_iteration": 2.7515556812286377 + }, + { + "auxiliary_loss_clip": 0.01371833, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.25358224, + "balance_loss_mlp": 1.02042365, + "epoch": 0.5558995941680445, + "flos": 25083244578240.0, + "grad_norm": 6.882149522025104, + "language_loss": 0.79842985, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82249618, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.1439209, + "step": 9246, + "time_per_iteration": 2.8327207565307617 + }, + { + "auxiliary_loss_clip": 0.01362632, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.24854219, + "balance_loss_mlp": 1.01918972, + "epoch": 0.5559597174207125, + "flos": 20015976085920.0, + "grad_norm": 1.6975757672838294, + "language_loss": 0.74220669, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76615906, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13397217, + "step": 9247, + "time_per_iteration": 2.733825922012329 + }, + { + "auxiliary_loss_clip": 0.01365311, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.24976778, + "balance_loss_mlp": 1.0210681, + "epoch": 0.5560198406733804, + "flos": 16839930497040.0, + "grad_norm": 2.3097341263727964, + "language_loss": 0.7513454, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.77534974, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.14050293, + "step": 9248, + "time_per_iteration": 2.782252788543701 + }, + { + "auxiliary_loss_clip": 0.01186703, + "auxiliary_loss_mlp": 0.0100334, + "balance_loss_clip": 1.14090765, + "balance_loss_mlp": 1.00095546, + "epoch": 0.5560799639260484, + "flos": 70714197021600.0, + "grad_norm": 0.8683145400733536, + "language_loss": 0.5948658, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61676621, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02380371, + "step": 9249, + "time_per_iteration": 3.416473388671875 + }, + { + "auxiliary_loss_clip": 0.01363149, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.24631119, + "balance_loss_mlp": 1.01745689, + "epoch": 0.5561400871787163, + "flos": 23153785580880.0, + "grad_norm": 1.8510421110722721, + "language_loss": 0.79909521, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82303154, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.13024902, + "step": 9250, + "time_per_iteration": 2.788677453994751 + }, + { + "auxiliary_loss_clip": 0.01365698, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.24821222, + "balance_loss_mlp": 1.02119851, + "epoch": 0.5562002104313843, + "flos": 17570906828280.0, + "grad_norm": 2.1251421921978255, + "language_loss": 0.69824266, + "learning_rate": 1.733816187358836e-06, + "loss": 0.72225183, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 1.17333984, + "router_z_loss_mlp": 0.14031982, + "step": 9251, + "time_per_iteration": 2.728111982345581 + }, + { + "auxiliary_loss_clip": 0.01360444, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.24541473, + "balance_loss_mlp": 1.01977158, + "epoch": 0.5562603336840523, + "flos": 25050815479800.0, + "grad_norm": 1.6974832938100068, + "language_loss": 0.75501513, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77894473, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.12744141, + "step": 9252, + "time_per_iteration": 2.78060245513916 + }, + { + "auxiliary_loss_clip": 0.01366672, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.24858642, + "balance_loss_mlp": 1.01991844, + "epoch": 0.5563204569367203, + "flos": 29064951918360.0, + "grad_norm": 1.5359076623095251, + "language_loss": 0.72924435, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.75324726, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.137146, + "step": 9253, + "time_per_iteration": 2.8352479934692383 + }, + { + "auxiliary_loss_clip": 0.01359016, + "auxiliary_loss_mlp": 0.01023423, + "balance_loss_clip": 1.24463391, + "balance_loss_mlp": 1.011127, + "epoch": 0.5563805801893883, + "flos": 22095496265400.0, + "grad_norm": 1.8956639997454203, + "language_loss": 0.83183861, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.855663, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.12304688, + "step": 9254, + "time_per_iteration": 2.715986967086792 + }, + { + "auxiliary_loss_clip": 0.01186376, + "auxiliary_loss_mlp": 0.01004913, + "balance_loss_clip": 1.14131629, + "balance_loss_mlp": 1.00274301, + "epoch": 0.5564407034420562, + "flos": 58649006415960.0, + "grad_norm": 0.8688558826768771, + "language_loss": 0.64875817, + "learning_rate": 1.732272280610387e-06, + "loss": 0.67067099, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.02172852, + "step": 9255, + "time_per_iteration": 3.097163200378418 + }, + { + "auxiliary_loss_clip": 0.01362502, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.24904156, + "balance_loss_mlp": 1.02197707, + "epoch": 0.5565008266947242, + "flos": 23117539296600.0, + "grad_norm": 1.6165626387341374, + "language_loss": 0.69245726, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71642601, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 1.13427734, + "router_z_loss_mlp": 0.12390137, + "step": 9256, + "time_per_iteration": 2.7605206966400146 + }, + { + "auxiliary_loss_clip": 0.01352506, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.24113536, + "balance_loss_mlp": 1.01727796, + "epoch": 0.5565609499473921, + "flos": 21583215890640.0, + "grad_norm": 1.4616003313293393, + "language_loss": 0.76018083, + "learning_rate": 1.73150038809119e-06, + "loss": 0.78399885, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12011719, + "step": 9257, + "time_per_iteration": 2.770265817642212 + }, + { + "auxiliary_loss_clip": 0.01360361, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.24436808, + "balance_loss_mlp": 1.01995313, + "epoch": 0.5566210732000602, + "flos": 18374294511360.0, + "grad_norm": 2.2004306345838587, + "language_loss": 0.61315691, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63709354, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13348389, + "step": 9258, + "time_per_iteration": 2.7826192378997803 + }, + { + "auxiliary_loss_clip": 0.01360391, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.24520421, + "balance_loss_mlp": 1.01797593, + "epoch": 0.5566811964527281, + "flos": 25708974375600.0, + "grad_norm": 1.636003931025909, + "language_loss": 0.7933045, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81722772, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13983154, + "step": 9259, + "time_per_iteration": 2.7822680473327637 + }, + { + "auxiliary_loss_clip": 0.01354854, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.23941302, + "balance_loss_mlp": 1.01887035, + "epoch": 0.5567413197053961, + "flos": 26949794580000.0, + "grad_norm": 3.690787496545596, + "language_loss": 0.8215487, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.84542066, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13470459, + "step": 9260, + "time_per_iteration": 2.898918390274048 + }, + { + "auxiliary_loss_clip": 0.01363548, + "auxiliary_loss_mlp": 0.01040094, + "balance_loss_clip": 1.24858069, + "balance_loss_mlp": 1.02589583, + "epoch": 0.556801442958064, + "flos": 20855569444920.0, + "grad_norm": 1.4965318691799583, + "language_loss": 0.69198614, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71602255, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14202881, + "step": 9261, + "time_per_iteration": 2.8063607215881348 + }, + { + "auxiliary_loss_clip": 0.0118391, + "auxiliary_loss_mlp": 0.01005757, + "balance_loss_clip": 1.13920927, + "balance_loss_mlp": 1.00377786, + "epoch": 0.556861566210732, + "flos": 70513189328880.0, + "grad_norm": 0.7263041332381251, + "language_loss": 0.61117244, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63306916, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.01977539, + "step": 9262, + "time_per_iteration": 3.3086252212524414 + }, + { + "auxiliary_loss_clip": 0.01366668, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.25014949, + "balance_loss_mlp": 1.01895058, + "epoch": 0.5569216894633999, + "flos": 25342694346960.0, + "grad_norm": 1.4657608760703045, + "language_loss": 0.64442247, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66841519, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13641357, + "step": 9263, + "time_per_iteration": 2.826972007751465 + }, + { + "auxiliary_loss_clip": 0.01355664, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.24210262, + "balance_loss_mlp": 1.01798582, + "epoch": 0.556981812716068, + "flos": 22643982316080.0, + "grad_norm": 1.683882203555373, + "language_loss": 0.73093367, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75480151, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13134766, + "step": 9264, + "time_per_iteration": 2.7921361923217773 + }, + { + "auxiliary_loss_clip": 0.01361163, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.24660528, + "balance_loss_mlp": 1.01920366, + "epoch": 0.5570419359687359, + "flos": 11039533430400.0, + "grad_norm": 3.2673429012518547, + "language_loss": 0.76885545, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.79278827, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.12927246, + "step": 9265, + "time_per_iteration": 2.7341692447662354 + }, + { + "auxiliary_loss_clip": 0.01348257, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.23958051, + "balance_loss_mlp": 1.01853931, + "epoch": 0.5571020592214039, + "flos": 22828665448080.0, + "grad_norm": 1.5071984815035229, + "language_loss": 0.7107662, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73455763, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12347412, + "step": 9266, + "time_per_iteration": 2.956153631210327 + }, + { + "auxiliary_loss_clip": 0.01358605, + "auxiliary_loss_mlp": 0.01035102, + "balance_loss_clip": 1.24641871, + "balance_loss_mlp": 1.02229333, + "epoch": 0.5571621824740719, + "flos": 22932693731160.0, + "grad_norm": 1.6480432776409184, + "language_loss": 0.6796577, + "learning_rate": 1.727641538728533e-06, + "loss": 0.7035948, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12811279, + "step": 9267, + "time_per_iteration": 2.824549913406372 + }, + { + "auxiliary_loss_clip": 0.0134703, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.23746836, + "balance_loss_mlp": 1.01998436, + "epoch": 0.5572223057267398, + "flos": 22971945034080.0, + "grad_norm": 1.7705368414821612, + "language_loss": 0.75384307, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.77764261, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12915039, + "step": 9268, + "time_per_iteration": 4.301886796951294 + }, + { + "auxiliary_loss_clip": 0.01357606, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.24596596, + "balance_loss_mlp": 1.01746035, + "epoch": 0.5572824289794078, + "flos": 20965039248240.0, + "grad_norm": 1.9585686457627662, + "language_loss": 0.75175536, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77563059, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12463379, + "step": 9269, + "time_per_iteration": 4.195640802383423 + }, + { + "auxiliary_loss_clip": 0.01360913, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.24702322, + "balance_loss_mlp": 1.01941681, + "epoch": 0.5573425522320757, + "flos": 25047404377560.0, + "grad_norm": 1.6143758709059373, + "language_loss": 0.83231366, + "learning_rate": 1.726484084647256e-06, + "loss": 0.85624272, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.12579346, + "step": 9270, + "time_per_iteration": 2.814593553543091 + }, + { + "auxiliary_loss_clip": 0.01365517, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.25020802, + "balance_loss_mlp": 1.01879704, + "epoch": 0.5574026754847438, + "flos": 23664928921560.0, + "grad_norm": 2.210026984421974, + "language_loss": 0.79805005, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.82203221, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13897705, + "step": 9271, + "time_per_iteration": 2.785905122756958 + }, + { + "auxiliary_loss_clip": 0.01361993, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.24720418, + "balance_loss_mlp": 1.01901364, + "epoch": 0.5574627987374117, + "flos": 24786573924600.0, + "grad_norm": 1.6901474199126645, + "language_loss": 0.90290594, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92685127, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13513184, + "step": 9272, + "time_per_iteration": 2.7462923526763916 + }, + { + "auxiliary_loss_clip": 0.01353919, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.24308109, + "balance_loss_mlp": 1.02015209, + "epoch": 0.5575229219900797, + "flos": 21839944899240.0, + "grad_norm": 1.8982736312916058, + "language_loss": 0.8432734, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.86714315, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12921143, + "step": 9273, + "time_per_iteration": 2.7819125652313232 + }, + { + "auxiliary_loss_clip": 0.01359668, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.2461282, + "balance_loss_mlp": 1.0225172, + "epoch": 0.5575830452427476, + "flos": 27820476961560.0, + "grad_norm": 1.9694026738504733, + "language_loss": 0.74195552, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76591998, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.14263916, + "step": 9274, + "time_per_iteration": 4.324695825576782 + }, + { + "auxiliary_loss_clip": 0.01373859, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_clip": 1.25246561, + "balance_loss_mlp": 1.026214, + "epoch": 0.5576431684954156, + "flos": 17816143671000.0, + "grad_norm": 2.825852018119199, + "language_loss": 0.78196478, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.80611658, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.15106201, + "step": 9275, + "time_per_iteration": 2.721778631210327 + }, + { + "auxiliary_loss_clip": 0.01363086, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.24907231, + "balance_loss_mlp": 1.02286839, + "epoch": 0.5577032917480835, + "flos": 15491102390280.0, + "grad_norm": 1.8186729356629674, + "language_loss": 0.75078368, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.77476847, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12518311, + "step": 9276, + "time_per_iteration": 2.746910810470581 + }, + { + "auxiliary_loss_clip": 0.01356614, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.2426039, + "balance_loss_mlp": 1.01899695, + "epoch": 0.5577634150007516, + "flos": 21584759008320.0, + "grad_norm": 1.776720790230206, + "language_loss": 0.75885653, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.78273833, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.12585449, + "step": 9277, + "time_per_iteration": 4.248003721237183 + }, + { + "auxiliary_loss_clip": 0.01355928, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.24406409, + "balance_loss_mlp": 1.02745378, + "epoch": 0.5578235382534195, + "flos": 21144402685080.0, + "grad_norm": 1.4593598960905296, + "language_loss": 0.71911865, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.74307632, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12390137, + "step": 9278, + "time_per_iteration": 2.791783571243286 + }, + { + "auxiliary_loss_clip": 0.01364914, + "auxiliary_loss_mlp": 0.01037265, + "balance_loss_clip": 1.24891567, + "balance_loss_mlp": 1.02319801, + "epoch": 0.5578836615060875, + "flos": 26510818941000.0, + "grad_norm": 1.525702121116762, + "language_loss": 0.76084089, + "learning_rate": 1.723012284057868e-06, + "loss": 0.78486264, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.14074707, + "step": 9279, + "time_per_iteration": 2.883999824523926 + }, + { + "auxiliary_loss_clip": 0.01360452, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.24569297, + "balance_loss_mlp": 1.02079105, + "epoch": 0.5579437847587555, + "flos": 20158199854560.0, + "grad_norm": 1.7322900503147616, + "language_loss": 0.6791231, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.70307374, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13818359, + "step": 9280, + "time_per_iteration": 2.847243070602417 + }, + { + "auxiliary_loss_clip": 0.01369942, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.25289953, + "balance_loss_mlp": 1.02483439, + "epoch": 0.5580039080114234, + "flos": 26107642894320.0, + "grad_norm": 1.8297908609500388, + "language_loss": 0.73659241, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.76067096, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13085938, + "step": 9281, + "time_per_iteration": 2.850001096725464 + }, + { + "auxiliary_loss_clip": 0.01354441, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.2423327, + "balance_loss_mlp": 1.02310073, + "epoch": 0.5580640312640914, + "flos": 13775628779640.0, + "grad_norm": 2.8823834785804014, + "language_loss": 0.7505579, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77446234, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12902832, + "step": 9282, + "time_per_iteration": 2.736820697784424 + }, + { + "auxiliary_loss_clip": 0.01354863, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.24303544, + "balance_loss_mlp": 1.02236712, + "epoch": 0.5581241545167593, + "flos": 17680498456680.0, + "grad_norm": 1.8276643497906901, + "language_loss": 0.66188872, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68578714, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.12615967, + "step": 9283, + "time_per_iteration": 2.7788829803466797 + }, + { + "auxiliary_loss_clip": 0.01357823, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.24297893, + "balance_loss_mlp": 1.02235341, + "epoch": 0.5581842777694274, + "flos": 19573386302880.0, + "grad_norm": 1.7111801283059052, + "language_loss": 0.83258653, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85650831, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 1.14990234, + "router_z_loss_mlp": 0.12011719, + "step": 9284, + "time_per_iteration": 2.7584967613220215 + }, + { + "auxiliary_loss_clip": 0.01365287, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.25177264, + "balance_loss_mlp": 1.02529371, + "epoch": 0.5582444010220953, + "flos": 20600099295480.0, + "grad_norm": 2.349875226928953, + "language_loss": 0.85411763, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87815583, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13250732, + "step": 9285, + "time_per_iteration": 2.806925058364868 + }, + { + "auxiliary_loss_clip": 0.01358701, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.24502814, + "balance_loss_mlp": 1.02659321, + "epoch": 0.5583045242747633, + "flos": 19140217659360.0, + "grad_norm": 2.3543281211638445, + "language_loss": 0.7468493, + "learning_rate": 1.720312582354912e-06, + "loss": 0.77082956, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.12738037, + "step": 9286, + "time_per_iteration": 2.9478464126586914 + }, + { + "auxiliary_loss_clip": 0.01359667, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.24617434, + "balance_loss_mlp": 1.02364051, + "epoch": 0.5583646475274312, + "flos": 27460410012000.0, + "grad_norm": 1.5073876358315736, + "language_loss": 0.74393225, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76789933, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13409424, + "step": 9287, + "time_per_iteration": 2.800596237182617 + }, + { + "auxiliary_loss_clip": 0.01370228, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.25362861, + "balance_loss_mlp": 1.02220607, + "epoch": 0.5584247707800992, + "flos": 23658309758880.0, + "grad_norm": 2.6141517913417784, + "language_loss": 0.75037158, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77443206, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13616943, + "step": 9288, + "time_per_iteration": 2.7981619834899902 + }, + { + "auxiliary_loss_clip": 0.01364041, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.25006437, + "balance_loss_mlp": 1.0227046, + "epoch": 0.5584848940327671, + "flos": 13702851952560.0, + "grad_norm": 2.060036561818298, + "language_loss": 0.77472007, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.7987299, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14233398, + "step": 9289, + "time_per_iteration": 2.6999096870422363 + }, + { + "auxiliary_loss_clip": 0.01376365, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.25650215, + "balance_loss_mlp": 1.02394581, + "epoch": 0.5585450172854352, + "flos": 27021921673320.0, + "grad_norm": 1.660108935175417, + "language_loss": 0.61733603, + "learning_rate": 1.718770128672817e-06, + "loss": 0.64147556, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.13647461, + "step": 9290, + "time_per_iteration": 2.8028674125671387 + }, + { + "auxiliary_loss_clip": 0.01366046, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.24886107, + "balance_loss_mlp": 1.01596344, + "epoch": 0.5586051405381031, + "flos": 23190925249080.0, + "grad_norm": 1.8586323982512654, + "language_loss": 0.68522811, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70917821, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.12994385, + "step": 9291, + "time_per_iteration": 2.81549072265625 + }, + { + "auxiliary_loss_clip": 0.0136551, + "auxiliary_loss_mlp": 0.01040322, + "balance_loss_clip": 1.24947286, + "balance_loss_mlp": 1.02623177, + "epoch": 0.5586652637907711, + "flos": 20780234291160.0, + "grad_norm": 2.2586074585233247, + "language_loss": 0.84454179, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86860013, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14111328, + "step": 9292, + "time_per_iteration": 2.7707996368408203 + }, + { + "auxiliary_loss_clip": 0.01358844, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.24576688, + "balance_loss_mlp": 1.02158046, + "epoch": 0.5587253870434391, + "flos": 28225724034600.0, + "grad_norm": 2.8802367808763685, + "language_loss": 0.73768771, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.76162124, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.12945557, + "step": 9293, + "time_per_iteration": 2.853672504425049 + }, + { + "auxiliary_loss_clip": 0.01358043, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.24516678, + "balance_loss_mlp": 1.02248847, + "epoch": 0.558785510296107, + "flos": 26621506995120.0, + "grad_norm": 1.725839239022826, + "language_loss": 0.72640771, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.75034142, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.128479, + "step": 9294, + "time_per_iteration": 2.7855193614959717 + }, + { + "auxiliary_loss_clip": 0.01365108, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.25017822, + "balance_loss_mlp": 1.02185059, + "epoch": 0.558845633548775, + "flos": 20161570348440.0, + "grad_norm": 1.964322519463621, + "language_loss": 0.68554598, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70954877, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13330078, + "step": 9295, + "time_per_iteration": 2.7579495906829834 + }, + { + "auxiliary_loss_clip": 0.01360912, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.2469883, + "balance_loss_mlp": 1.02074814, + "epoch": 0.5589057568014429, + "flos": 24355882391040.0, + "grad_norm": 1.497126876096842, + "language_loss": 0.80784965, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.83180946, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14337158, + "step": 9296, + "time_per_iteration": 2.766066789627075 + }, + { + "auxiliary_loss_clip": 0.01360051, + "auxiliary_loss_mlp": 0.01025828, + "balance_loss_clip": 1.24625444, + "balance_loss_mlp": 1.01210093, + "epoch": 0.558965880054111, + "flos": 21110064993720.0, + "grad_norm": 1.6307106316279172, + "language_loss": 0.65753448, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.68139327, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.1373291, + "step": 9297, + "time_per_iteration": 2.770141124725342 + }, + { + "auxiliary_loss_clip": 0.01369606, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.25191009, + "balance_loss_mlp": 1.02279353, + "epoch": 0.5590260033067789, + "flos": 18439883658720.0, + "grad_norm": 2.121214638960171, + "language_loss": 0.75372118, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77778757, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14245605, + "step": 9298, + "time_per_iteration": 2.7309463024139404 + }, + { + "auxiliary_loss_clip": 0.01187062, + "auxiliary_loss_mlp": 0.00999972, + "balance_loss_clip": 1.14226305, + "balance_loss_mlp": 0.99757576, + "epoch": 0.5590861265594469, + "flos": 70593235052400.0, + "grad_norm": 0.6840398682435843, + "language_loss": 0.52405262, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54592299, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02392578, + "step": 9299, + "time_per_iteration": 3.330033302307129 + }, + { + "auxiliary_loss_clip": 0.01355328, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.24337125, + "balance_loss_mlp": 1.01768363, + "epoch": 0.5591462498121148, + "flos": 30670549642080.0, + "grad_norm": 2.324378521924603, + "language_loss": 0.6901772, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.71403897, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1317749, + "step": 9300, + "time_per_iteration": 2.8993444442749023 + }, + { + "auxiliary_loss_clip": 0.01364864, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.24758637, + "balance_loss_mlp": 1.02161717, + "epoch": 0.5592063730647828, + "flos": 18155030037840.0, + "grad_norm": 1.7681730314092836, + "language_loss": 0.82757747, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.85158229, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13995361, + "step": 9301, + "time_per_iteration": 2.7709500789642334 + }, + { + "auxiliary_loss_clip": 0.01367039, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.2516762, + "balance_loss_mlp": 1.01761198, + "epoch": 0.5592664963174507, + "flos": 24065668466640.0, + "grad_norm": 2.4595756006627334, + "language_loss": 0.68262684, + "learning_rate": 1.714143795138756e-06, + "loss": 0.70660746, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13415527, + "step": 9302, + "time_per_iteration": 2.886561393737793 + }, + { + "auxiliary_loss_clip": 0.01370734, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.25143027, + "balance_loss_mlp": 1.01711619, + "epoch": 0.5593266195701188, + "flos": 19832592421440.0, + "grad_norm": 3.6730542223911007, + "language_loss": 0.70783478, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73186147, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.14825439, + "step": 9303, + "time_per_iteration": 2.754721164703369 + }, + { + "auxiliary_loss_clip": 0.01354338, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.24343884, + "balance_loss_mlp": 1.01730299, + "epoch": 0.5593867428227867, + "flos": 25306163804160.0, + "grad_norm": 1.5257333251724325, + "language_loss": 0.72989446, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.75373638, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12548828, + "step": 9304, + "time_per_iteration": 2.8382136821746826 + }, + { + "auxiliary_loss_clip": 0.01357097, + "auxiliary_loss_mlp": 0.01029575, + "balance_loss_clip": 1.24277151, + "balance_loss_mlp": 1.01681376, + "epoch": 0.5594468660754547, + "flos": 12936928804560.0, + "grad_norm": 1.9922629845304012, + "language_loss": 0.78139818, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80526483, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.12762451, + "step": 9305, + "time_per_iteration": 2.7332396507263184 + }, + { + "auxiliary_loss_clip": 0.01350369, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.24013567, + "balance_loss_mlp": 1.01780486, + "epoch": 0.5595069893281227, + "flos": 19067440832280.0, + "grad_norm": 2.177959105425339, + "language_loss": 0.69800776, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.72181928, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12976074, + "step": 9306, + "time_per_iteration": 4.209521055221558 + }, + { + "auxiliary_loss_clip": 0.01188483, + "auxiliary_loss_mlp": 0.01003097, + "balance_loss_clip": 1.14399636, + "balance_loss_mlp": 1.00079632, + "epoch": 0.5595671125807906, + "flos": 70287549342840.0, + "grad_norm": 0.9345723842601705, + "language_loss": 0.60313165, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62504745, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02294922, + "step": 9307, + "time_per_iteration": 3.444333791732788 + }, + { + "auxiliary_loss_clip": 0.01357634, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.24463761, + "balance_loss_mlp": 1.02031446, + "epoch": 0.5596272358334586, + "flos": 20670114754080.0, + "grad_norm": 1.578082037970582, + "language_loss": 0.74352372, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76742798, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.12493896, + "step": 9308, + "time_per_iteration": 4.232651948928833 + }, + { + "auxiliary_loss_clip": 0.01363827, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.24678028, + "balance_loss_mlp": 1.02234125, + "epoch": 0.5596873590861265, + "flos": 25045820651520.0, + "grad_norm": 1.8860642724570478, + "language_loss": 0.70375061, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.727763, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.15075684, + "step": 9309, + "time_per_iteration": 2.93279767036438 + }, + { + "auxiliary_loss_clip": 0.0136372, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.24858665, + "balance_loss_mlp": 1.01853573, + "epoch": 0.5597474823387946, + "flos": 25963997833080.0, + "grad_norm": 1.8524852254314592, + "language_loss": 0.75358796, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77755618, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14550781, + "step": 9310, + "time_per_iteration": 2.8405919075012207 + }, + { + "auxiliary_loss_clip": 0.01366294, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.24911153, + "balance_loss_mlp": 1.02039778, + "epoch": 0.5598076055914625, + "flos": 26183099873160.0, + "grad_norm": 2.402461509283709, + "language_loss": 0.69925344, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.72325879, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13842773, + "step": 9311, + "time_per_iteration": 2.802769422531128 + }, + { + "auxiliary_loss_clip": 0.01354365, + "auxiliary_loss_mlp": 0.01023876, + "balance_loss_clip": 1.24100745, + "balance_loss_mlp": 1.01063228, + "epoch": 0.5598677288441305, + "flos": 11659496840640.0, + "grad_norm": 1.8653359440048556, + "language_loss": 0.72265404, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74643642, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13244629, + "step": 9312, + "time_per_iteration": 4.229341745376587 + }, + { + "auxiliary_loss_clip": 0.01361707, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.24862623, + "balance_loss_mlp": 1.01745605, + "epoch": 0.5599278520967984, + "flos": 22971823209000.0, + "grad_norm": 1.948233596383905, + "language_loss": 0.89674288, + "learning_rate": 1.709904360003822e-06, + "loss": 0.92067236, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13806152, + "step": 9313, + "time_per_iteration": 2.7750022411346436 + }, + { + "auxiliary_loss_clip": 0.01359614, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.24714625, + "balance_loss_mlp": 1.01914096, + "epoch": 0.5599879753494664, + "flos": 21220590614400.0, + "grad_norm": 1.4231758325006594, + "language_loss": 0.77751058, + "learning_rate": 1.709519022520204e-06, + "loss": 0.80143285, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13470459, + "step": 9314, + "time_per_iteration": 2.7650394439697266 + }, + { + "auxiliary_loss_clip": 0.01360016, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.24671829, + "balance_loss_mlp": 1.01503813, + "epoch": 0.5600480986021343, + "flos": 31909298820120.0, + "grad_norm": 1.609383842934054, + "language_loss": 0.70410275, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72799373, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14044189, + "step": 9315, + "time_per_iteration": 2.8448073863983154 + }, + { + "auxiliary_loss_clip": 0.01369619, + "auxiliary_loss_mlp": 0.01035098, + "balance_loss_clip": 1.25227523, + "balance_loss_mlp": 1.02127564, + "epoch": 0.5601082218548024, + "flos": 28482006351240.0, + "grad_norm": 1.8048280565347163, + "language_loss": 0.67293143, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.69697857, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.13824463, + "step": 9316, + "time_per_iteration": 2.8156373500823975 + }, + { + "auxiliary_loss_clip": 0.01351549, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.2381686, + "balance_loss_mlp": 1.01510429, + "epoch": 0.5601683451074703, + "flos": 24102483267960.0, + "grad_norm": 1.9884288457868748, + "language_loss": 0.86841738, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.89222056, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13665771, + "step": 9317, + "time_per_iteration": 4.252440929412842 + }, + { + "auxiliary_loss_clip": 0.01362362, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.24517524, + "balance_loss_mlp": 1.02147293, + "epoch": 0.5602284683601383, + "flos": 26361407492640.0, + "grad_norm": 1.8393396754449285, + "language_loss": 0.77428997, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79828334, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.15496826, + "step": 9318, + "time_per_iteration": 2.819103479385376 + }, + { + "auxiliary_loss_clip": 0.01357071, + "auxiliary_loss_mlp": 0.01040828, + "balance_loss_clip": 1.24371052, + "balance_loss_mlp": 1.0280968, + "epoch": 0.5602885916128063, + "flos": 24501476653560.0, + "grad_norm": 1.4989203314849626, + "language_loss": 0.7659539, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78993291, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12750244, + "step": 9319, + "time_per_iteration": 2.7915773391723633 + }, + { + "auxiliary_loss_clip": 0.01355716, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.24296045, + "balance_loss_mlp": 1.02025032, + "epoch": 0.5603487148654742, + "flos": 27350818383600.0, + "grad_norm": 1.3456742533571782, + "language_loss": 0.85524583, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87913442, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.12902832, + "step": 9320, + "time_per_iteration": 2.849205493927002 + }, + { + "auxiliary_loss_clip": 0.01194846, + "auxiliary_loss_mlp": 0.01005654, + "balance_loss_clip": 1.14987874, + "balance_loss_mlp": 1.0031147, + "epoch": 0.5604088381181422, + "flos": 54101652670080.0, + "grad_norm": 0.8708535319159008, + "language_loss": 0.52653861, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54854357, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02539062, + "step": 9321, + "time_per_iteration": 3.119814395904541 + }, + { + "auxiliary_loss_clip": 0.01356246, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.2456336, + "balance_loss_mlp": 1.01927686, + "epoch": 0.5604689613708101, + "flos": 22241456003160.0, + "grad_norm": 1.4066837098538432, + "language_loss": 0.74899411, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.77287829, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12884521, + "step": 9322, + "time_per_iteration": 2.8239505290985107 + }, + { + "auxiliary_loss_clip": 0.01357758, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.2429924, + "balance_loss_mlp": 1.01842713, + "epoch": 0.5605290846234782, + "flos": 35304487057440.0, + "grad_norm": 1.9881047024042697, + "language_loss": 0.73945761, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76335752, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13800049, + "step": 9323, + "time_per_iteration": 2.9039700031280518 + }, + { + "auxiliary_loss_clip": 0.01368119, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.25038373, + "balance_loss_mlp": 1.02416205, + "epoch": 0.5605892078761461, + "flos": 20267750874600.0, + "grad_norm": 1.7590764771004281, + "language_loss": 0.62182522, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.64589238, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.14404297, + "step": 9324, + "time_per_iteration": 2.7980966567993164 + }, + { + "auxiliary_loss_clip": 0.01364416, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.24907494, + "balance_loss_mlp": 1.01673889, + "epoch": 0.5606493311288141, + "flos": 17312431660200.0, + "grad_norm": 1.8383837999384085, + "language_loss": 0.87706935, + "learning_rate": 1.705281040409226e-06, + "loss": 0.90102392, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14300537, + "step": 9325, + "time_per_iteration": 2.740600347518921 + }, + { + "auxiliary_loss_clip": 0.01365122, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.24908042, + "balance_loss_mlp": 1.02375627, + "epoch": 0.560709454381482, + "flos": 21658023135720.0, + "grad_norm": 1.5214967610418975, + "language_loss": 0.74051929, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.7645573, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14916992, + "step": 9326, + "time_per_iteration": 2.767191171646118 + }, + { + "auxiliary_loss_clip": 0.01372304, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.25293231, + "balance_loss_mlp": 1.01829088, + "epoch": 0.56076957763415, + "flos": 20308261036680.0, + "grad_norm": 1.7156869741471856, + "language_loss": 0.78234768, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80639827, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.14459229, + "step": 9327, + "time_per_iteration": 2.7670836448669434 + }, + { + "auxiliary_loss_clip": 0.01363042, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.24699259, + "balance_loss_mlp": 1.01608515, + "epoch": 0.5608297008868179, + "flos": 25051343388480.0, + "grad_norm": 1.519173133897904, + "language_loss": 0.78558427, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80951536, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13983154, + "step": 9328, + "time_per_iteration": 2.804792642593384 + }, + { + "auxiliary_loss_clip": 0.01358383, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.24364352, + "balance_loss_mlp": 1.0143851, + "epoch": 0.560889824139486, + "flos": 19871843724360.0, + "grad_norm": 4.278006241774334, + "language_loss": 0.74058783, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.76445699, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14172363, + "step": 9329, + "time_per_iteration": 2.7531192302703857 + }, + { + "auxiliary_loss_clip": 0.01377805, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.25719225, + "balance_loss_mlp": 1.02426326, + "epoch": 0.5609499473921539, + "flos": 22934521107360.0, + "grad_norm": 1.4943475126379697, + "language_loss": 0.84067589, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.86485147, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.15484619, + "step": 9330, + "time_per_iteration": 2.7903084754943848 + }, + { + "auxiliary_loss_clip": 0.01190872, + "auxiliary_loss_mlp": 0.01001523, + "balance_loss_clip": 1.14509857, + "balance_loss_mlp": 0.99897236, + "epoch": 0.5610100706448219, + "flos": 53050388600880.0, + "grad_norm": 0.7250238669816413, + "language_loss": 0.5790658, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.60098976, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.0255127, + "step": 9331, + "time_per_iteration": 3.309375286102295 + }, + { + "auxiliary_loss_clip": 0.01365576, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.24786425, + "balance_loss_mlp": 1.0269208, + "epoch": 0.5610701938974898, + "flos": 21839944899240.0, + "grad_norm": 1.7362272980645745, + "language_loss": 0.81914306, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84320253, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13464355, + "step": 9332, + "time_per_iteration": 2.820852279663086 + }, + { + "auxiliary_loss_clip": 0.01373816, + "auxiliary_loss_mlp": 0.01044613, + "balance_loss_clip": 1.25331843, + "balance_loss_mlp": 1.02891278, + "epoch": 0.5611303171501578, + "flos": 17461680675120.0, + "grad_norm": 2.227396646447007, + "language_loss": 0.82166576, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.84584999, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.15698242, + "step": 9333, + "time_per_iteration": 2.7505886554718018 + }, + { + "auxiliary_loss_clip": 0.01362317, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.2461096, + "balance_loss_mlp": 1.0165503, + "epoch": 0.5611904404028258, + "flos": 22642926498720.0, + "grad_norm": 1.5738566259530375, + "language_loss": 0.72987443, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75379527, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13226318, + "step": 9334, + "time_per_iteration": 2.741516590118408 + }, + { + "auxiliary_loss_clip": 0.01364324, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.2490108, + "balance_loss_mlp": 1.01632667, + "epoch": 0.5612505636554938, + "flos": 14320013385960.0, + "grad_norm": 1.7198662905512851, + "language_loss": 0.71113896, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73508501, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13970947, + "step": 9335, + "time_per_iteration": 2.7440030574798584 + }, + { + "auxiliary_loss_clip": 0.01363463, + "auxiliary_loss_mlp": 0.01037663, + "balance_loss_clip": 1.24687636, + "balance_loss_mlp": 1.02376282, + "epoch": 0.5613106869081618, + "flos": 16512617512800.0, + "grad_norm": 1.6685625748011084, + "language_loss": 0.76822889, + "learning_rate": 1.701044410566205e-06, + "loss": 0.7922402, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13903809, + "step": 9336, + "time_per_iteration": 2.705244302749634 + }, + { + "auxiliary_loss_clip": 0.01362059, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.24735236, + "balance_loss_mlp": 1.02200603, + "epoch": 0.5613708101608297, + "flos": 24063597440280.0, + "grad_norm": 2.0208799827757313, + "language_loss": 0.65120983, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.6751802, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.12976074, + "step": 9337, + "time_per_iteration": 2.759220600128174 + }, + { + "auxiliary_loss_clip": 0.01187633, + "auxiliary_loss_mlp": 0.01007428, + "balance_loss_clip": 1.14179635, + "balance_loss_mlp": 1.00500762, + "epoch": 0.5614309334134977, + "flos": 64918655976960.0, + "grad_norm": 0.9013697684312185, + "language_loss": 0.62649095, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64844155, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02416992, + "step": 9338, + "time_per_iteration": 3.2027642726898193 + }, + { + "auxiliary_loss_clip": 0.01364831, + "auxiliary_loss_mlp": 0.01041615, + "balance_loss_clip": 1.24799919, + "balance_loss_mlp": 1.02775145, + "epoch": 0.5614910566661656, + "flos": 32925413030760.0, + "grad_norm": 1.7119116929113742, + "language_loss": 0.65611881, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.68018329, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.13873291, + "step": 9339, + "time_per_iteration": 2.8333375453948975 + }, + { + "auxiliary_loss_clip": 0.01356875, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.24286723, + "balance_loss_mlp": 1.02165782, + "epoch": 0.5615511799188336, + "flos": 18593599593240.0, + "grad_norm": 1.7392713018056636, + "language_loss": 0.700872, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72479099, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13360596, + "step": 9340, + "time_per_iteration": 2.6985461711883545 + }, + { + "auxiliary_loss_clip": 0.01352935, + "auxiliary_loss_mlp": 0.01032974, + "balance_loss_clip": 1.24140191, + "balance_loss_mlp": 1.01983726, + "epoch": 0.5616113031715015, + "flos": 22825132520760.0, + "grad_norm": 1.4574258667173303, + "language_loss": 0.77422357, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79808271, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13134766, + "step": 9341, + "time_per_iteration": 2.7937724590301514 + }, + { + "auxiliary_loss_clip": 0.01367945, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.24936247, + "balance_loss_mlp": 1.02023411, + "epoch": 0.5616714264241696, + "flos": 22350763373040.0, + "grad_norm": 1.5062791794346786, + "language_loss": 0.79533029, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81936175, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.14959717, + "step": 9342, + "time_per_iteration": 2.782857656478882 + }, + { + "auxiliary_loss_clip": 0.01371592, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.25172997, + "balance_loss_mlp": 1.01965117, + "epoch": 0.5617315496768375, + "flos": 18812539199880.0, + "grad_norm": 1.7380295747216559, + "language_loss": 0.76527977, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78934002, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.14794922, + "step": 9343, + "time_per_iteration": 2.745591163635254 + }, + { + "auxiliary_loss_clip": 0.01364891, + "auxiliary_loss_mlp": 0.01040957, + "balance_loss_clip": 1.25026059, + "balance_loss_mlp": 1.02597213, + "epoch": 0.5617916729295055, + "flos": 18373929036120.0, + "grad_norm": 1.6866504258530453, + "language_loss": 0.69155413, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.71561265, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.15002441, + "step": 9344, + "time_per_iteration": 4.154307842254639 + }, + { + "auxiliary_loss_clip": 0.0136878, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.25036979, + "balance_loss_mlp": 1.02745438, + "epoch": 0.5618517961821734, + "flos": 28185335697600.0, + "grad_norm": 2.070389715710546, + "language_loss": 0.66599625, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.69010019, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.14172363, + "step": 9345, + "time_per_iteration": 2.807097911834717 + }, + { + "auxiliary_loss_clip": 0.01369203, + "auxiliary_loss_mlp": 0.01036087, + "balance_loss_clip": 1.25279999, + "balance_loss_mlp": 1.02303934, + "epoch": 0.5619119194348414, + "flos": 15491102390280.0, + "grad_norm": 1.776112040757407, + "language_loss": 0.87433994, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.8983928, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.1305542, + "step": 9346, + "time_per_iteration": 4.236160755157471 + }, + { + "auxiliary_loss_clip": 0.01365756, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.25053644, + "balance_loss_mlp": 1.02499962, + "epoch": 0.5619720426875094, + "flos": 29134155209760.0, + "grad_norm": 3.1024749670038707, + "language_loss": 0.59187484, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61593032, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14801025, + "step": 9347, + "time_per_iteration": 2.789585828781128 + }, + { + "auxiliary_loss_clip": 0.01369761, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.25157547, + "balance_loss_mlp": 1.02228403, + "epoch": 0.5620321659401774, + "flos": 18008217524520.0, + "grad_norm": 2.1388530030702464, + "language_loss": 0.69637978, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.72045243, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.15216064, + "step": 9348, + "time_per_iteration": 2.6941580772399902 + }, + { + "auxiliary_loss_clip": 0.01375653, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.25367999, + "balance_loss_mlp": 1.02144182, + "epoch": 0.5620922891928454, + "flos": 20599368345000.0, + "grad_norm": 1.8329067641709145, + "language_loss": 0.79183817, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81596327, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.15405273, + "step": 9349, + "time_per_iteration": 2.75998854637146 + }, + { + "auxiliary_loss_clip": 0.01365092, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.24839592, + "balance_loss_mlp": 1.02189076, + "epoch": 0.5621524124455133, + "flos": 26292691501560.0, + "grad_norm": 2.002948234998638, + "language_loss": 0.67158711, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69561303, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.15606689, + "step": 9350, + "time_per_iteration": 2.781430959701538 + }, + { + "auxiliary_loss_clip": 0.01374863, + "auxiliary_loss_mlp": 0.01033347, + "balance_loss_clip": 1.25554132, + "balance_loss_mlp": 1.01926851, + "epoch": 0.5622125356981813, + "flos": 12754154265480.0, + "grad_norm": 1.9959387763247214, + "language_loss": 0.78812015, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.81220216, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14086914, + "step": 9351, + "time_per_iteration": 4.151336431503296 + }, + { + "auxiliary_loss_clip": 0.01379512, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.25814688, + "balance_loss_mlp": 1.0271796, + "epoch": 0.5622726589508492, + "flos": 23810604400800.0, + "grad_norm": 1.4121811119814096, + "language_loss": 0.59205663, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.61626685, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.14331055, + "step": 9352, + "time_per_iteration": 2.769397497177124 + }, + { + "auxiliary_loss_clip": 0.0135436, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.24299955, + "balance_loss_mlp": 1.01787138, + "epoch": 0.5623327822035172, + "flos": 24723705537360.0, + "grad_norm": 1.240336364082574, + "language_loss": 0.72005475, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74390435, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12738037, + "step": 9353, + "time_per_iteration": 2.830842971801758 + }, + { + "auxiliary_loss_clip": 0.01366355, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.24837744, + "balance_loss_mlp": 1.02284956, + "epoch": 0.5623929054561851, + "flos": 14022246306600.0, + "grad_norm": 3.5958991225077837, + "language_loss": 0.7713061, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.79533565, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.13763428, + "step": 9354, + "time_per_iteration": 2.682373285293579 + }, + { + "auxiliary_loss_clip": 0.01378858, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.25643826, + "balance_loss_mlp": 1.02765906, + "epoch": 0.5624530287088532, + "flos": 20709325448640.0, + "grad_norm": 1.7365304426267811, + "language_loss": 0.73270059, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.75691819, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.15240479, + "step": 9355, + "time_per_iteration": 2.771766185760498 + }, + { + "auxiliary_loss_clip": 0.01367141, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.25090528, + "balance_loss_mlp": 1.02099693, + "epoch": 0.5625131519615211, + "flos": 21475979547120.0, + "grad_norm": 1.4057244097174735, + "language_loss": 0.73888588, + "learning_rate": 1.693344975084274e-06, + "loss": 0.76290458, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13745117, + "step": 9356, + "time_per_iteration": 4.217599630355835 + }, + { + "auxiliary_loss_clip": 0.01364511, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.25074768, + "balance_loss_mlp": 1.0250746, + "epoch": 0.5625732752141891, + "flos": 18702703921320.0, + "grad_norm": 1.9217410451143755, + "language_loss": 0.84070444, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.86474228, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14202881, + "step": 9357, + "time_per_iteration": 2.767646074295044 + }, + { + "auxiliary_loss_clip": 0.0136887, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.25263262, + "balance_loss_mlp": 1.02384722, + "epoch": 0.562633398466857, + "flos": 16221266554320.0, + "grad_norm": 1.7998480195044118, + "language_loss": 0.72662318, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.75068176, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13153076, + "step": 9358, + "time_per_iteration": 2.7529947757720947 + }, + { + "auxiliary_loss_clip": 0.01360947, + "auxiliary_loss_mlp": 0.01043795, + "balance_loss_clip": 1.24676752, + "balance_loss_mlp": 1.02968037, + "epoch": 0.562693521719525, + "flos": 22497047977680.0, + "grad_norm": 1.585431179263261, + "language_loss": 0.77894276, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.8029902, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14117432, + "step": 9359, + "time_per_iteration": 2.7650084495544434 + }, + { + "auxiliary_loss_clip": 0.01370686, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.2544179, + "balance_loss_mlp": 1.02268648, + "epoch": 0.562753644972193, + "flos": 25335790925760.0, + "grad_norm": 1.652665507577937, + "language_loss": 0.70709538, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.73116755, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13848877, + "step": 9360, + "time_per_iteration": 2.813527822494507 + }, + { + "auxiliary_loss_clip": 0.01185938, + "auxiliary_loss_mlp": 0.01007828, + "balance_loss_clip": 1.14059448, + "balance_loss_mlp": 1.00522947, + "epoch": 0.562813768224861, + "flos": 67406631289920.0, + "grad_norm": 0.7779440060606191, + "language_loss": 0.55590916, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57784688, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02600098, + "step": 9361, + "time_per_iteration": 3.180340528488159 + }, + { + "auxiliary_loss_clip": 0.01358936, + "auxiliary_loss_mlp": 0.01040508, + "balance_loss_clip": 1.2456944, + "balance_loss_mlp": 1.02744246, + "epoch": 0.562873891477529, + "flos": 23336397686520.0, + "grad_norm": 1.408135254897716, + "language_loss": 0.82056439, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84455884, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13079834, + "step": 9362, + "time_per_iteration": 2.7992897033691406 + }, + { + "auxiliary_loss_clip": 0.01361995, + "auxiliary_loss_mlp": 0.01035565, + "balance_loss_clip": 1.24702179, + "balance_loss_mlp": 1.02207613, + "epoch": 0.5629340147301969, + "flos": 38480045346000.0, + "grad_norm": 1.585390685847475, + "language_loss": 0.7472111, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.77118677, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.1350708, + "step": 9363, + "time_per_iteration": 2.940842628479004 + }, + { + "auxiliary_loss_clip": 0.01367984, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.24999118, + "balance_loss_mlp": 1.01773405, + "epoch": 0.5629941379828649, + "flos": 29248051324320.0, + "grad_norm": 1.7235404750193193, + "language_loss": 0.83109325, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85509503, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14471436, + "step": 9364, + "time_per_iteration": 2.8478169441223145 + }, + { + "auxiliary_loss_clip": 0.01354989, + "auxiliary_loss_mlp": 0.01033436, + "balance_loss_clip": 1.24165154, + "balance_loss_mlp": 1.02041268, + "epoch": 0.5630542612355328, + "flos": 19424787021720.0, + "grad_norm": 2.123240759918927, + "language_loss": 0.65086359, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67474788, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13024902, + "step": 9365, + "time_per_iteration": 2.7754790782928467 + }, + { + "auxiliary_loss_clip": 0.01380672, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.2580694, + "balance_loss_mlp": 1.01926684, + "epoch": 0.5631143844882008, + "flos": 22270514607720.0, + "grad_norm": 2.6772652176531877, + "language_loss": 0.81760275, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.84175575, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.15356445, + "step": 9366, + "time_per_iteration": 2.776451826095581 + }, + { + "auxiliary_loss_clip": 0.01360758, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.24826169, + "balance_loss_mlp": 1.01725805, + "epoch": 0.5631745077408687, + "flos": 22970361308040.0, + "grad_norm": 1.5429591843762565, + "language_loss": 0.73786485, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.76177448, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.12939453, + "step": 9367, + "time_per_iteration": 2.8069937229156494 + }, + { + "auxiliary_loss_clip": 0.01182721, + "auxiliary_loss_mlp": 0.01004702, + "balance_loss_clip": 1.13746166, + "balance_loss_mlp": 1.00166249, + "epoch": 0.5632346309935368, + "flos": 65097224227800.0, + "grad_norm": 0.6313662587425096, + "language_loss": 0.53551698, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55739117, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.03039551, + "step": 9368, + "time_per_iteration": 3.3707733154296875 + }, + { + "auxiliary_loss_clip": 0.01358755, + "auxiliary_loss_mlp": 0.01032564, + "balance_loss_clip": 1.24461865, + "balance_loss_mlp": 1.01827097, + "epoch": 0.5632947542462047, + "flos": 23008110101640.0, + "grad_norm": 1.600445920734586, + "language_loss": 0.68897063, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71288383, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.1428833, + "step": 9369, + "time_per_iteration": 2.7978968620300293 + }, + { + "auxiliary_loss_clip": 0.01361556, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.24580228, + "balance_loss_mlp": 1.01916623, + "epoch": 0.5633548774988727, + "flos": 30488790312000.0, + "grad_norm": 1.888852907460339, + "language_loss": 0.76248693, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78644127, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14709473, + "step": 9370, + "time_per_iteration": 2.833278179168701 + }, + { + "auxiliary_loss_clip": 0.01372976, + "auxiliary_loss_mlp": 0.01035781, + "balance_loss_clip": 1.25420642, + "balance_loss_mlp": 1.02096367, + "epoch": 0.5634150007515406, + "flos": 18519645123720.0, + "grad_norm": 2.053557461331247, + "language_loss": 0.75677037, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78085792, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.14819336, + "step": 9371, + "time_per_iteration": 2.7670304775238037 + }, + { + "auxiliary_loss_clip": 0.01360012, + "auxiliary_loss_mlp": 0.01040646, + "balance_loss_clip": 1.24576354, + "balance_loss_mlp": 1.02724671, + "epoch": 0.5634751240042086, + "flos": 19249362595800.0, + "grad_norm": 2.23386198146353, + "language_loss": 0.76069576, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78470242, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.1340332, + "step": 9372, + "time_per_iteration": 2.7639946937561035 + }, + { + "auxiliary_loss_clip": 0.01358613, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.24407053, + "balance_loss_mlp": 1.01991415, + "epoch": 0.5635352472568766, + "flos": 12024396185040.0, + "grad_norm": 2.074977725141281, + "language_loss": 0.71361119, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73753941, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1428833, + "step": 9373, + "time_per_iteration": 2.6824886798858643 + }, + { + "auxiliary_loss_clip": 0.01360683, + "auxiliary_loss_mlp": 0.0103882, + "balance_loss_clip": 1.24526381, + "balance_loss_mlp": 1.0238955, + "epoch": 0.5635953705095446, + "flos": 21876678483840.0, + "grad_norm": 2.0299499005270403, + "language_loss": 0.83067763, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.85467267, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14910889, + "step": 9374, + "time_per_iteration": 2.7834792137145996 + }, + { + "auxiliary_loss_clip": 0.01358257, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.24353206, + "balance_loss_mlp": 1.01530051, + "epoch": 0.5636554937622126, + "flos": 27131959993680.0, + "grad_norm": 1.5479410398067743, + "language_loss": 0.66492987, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.6887995, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13391113, + "step": 9375, + "time_per_iteration": 2.8086941242218018 + }, + { + "auxiliary_loss_clip": 0.01370496, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.25284159, + "balance_loss_mlp": 1.02302301, + "epoch": 0.5637156170148805, + "flos": 12929456866320.0, + "grad_norm": 2.142507378749697, + "language_loss": 0.81282115, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.8368938, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.13751221, + "step": 9376, + "time_per_iteration": 2.7128868103027344 + }, + { + "auxiliary_loss_clip": 0.01370411, + "auxiliary_loss_mlp": 0.01038424, + "balance_loss_clip": 1.24949265, + "balance_loss_mlp": 1.02351642, + "epoch": 0.5637757402675485, + "flos": 45559173844080.0, + "grad_norm": 1.373311788424933, + "language_loss": 0.69808149, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.72216988, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.14904785, + "step": 9377, + "time_per_iteration": 2.9643445014953613 + }, + { + "auxiliary_loss_clip": 0.01358101, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.24689889, + "balance_loss_mlp": 1.01735127, + "epoch": 0.5638358635202164, + "flos": 20890881736920.0, + "grad_norm": 1.5148169367580642, + "language_loss": 0.74761146, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.77149433, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12841797, + "step": 9378, + "time_per_iteration": 2.7899112701416016 + }, + { + "auxiliary_loss_clip": 0.01378287, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.25505233, + "balance_loss_mlp": 1.0179261, + "epoch": 0.5638959867728844, + "flos": 18811158515640.0, + "grad_norm": 2.664994957418623, + "language_loss": 0.81894779, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84306145, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.15148926, + "step": 9379, + "time_per_iteration": 2.8203282356262207 + }, + { + "auxiliary_loss_clip": 0.01362365, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.24512362, + "balance_loss_mlp": 1.02153707, + "epoch": 0.5639561100255523, + "flos": 27496087779240.0, + "grad_norm": 2.521949548363061, + "language_loss": 0.71938944, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.74336684, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.13842773, + "step": 9380, + "time_per_iteration": 2.853930950164795 + }, + { + "auxiliary_loss_clip": 0.01367317, + "auxiliary_loss_mlp": 0.01037372, + "balance_loss_clip": 1.24906397, + "balance_loss_mlp": 1.02225661, + "epoch": 0.5640162332782204, + "flos": 18081034959960.0, + "grad_norm": 2.082664890856657, + "language_loss": 0.74357367, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76762056, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.15093994, + "step": 9381, + "time_per_iteration": 2.7624971866607666 + }, + { + "auxiliary_loss_clip": 0.01365638, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.24697161, + "balance_loss_mlp": 1.02214766, + "epoch": 0.5640763565308883, + "flos": 20889541661040.0, + "grad_norm": 1.9313812090620328, + "language_loss": 0.7246784, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74869967, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14324951, + "step": 9382, + "time_per_iteration": 2.749039888381958 + }, + { + "auxiliary_loss_clip": 0.01179151, + "auxiliary_loss_mlp": 0.01003163, + "balance_loss_clip": 1.13447046, + "balance_loss_mlp": 1.00074291, + "epoch": 0.5641364797835563, + "flos": 64462926066480.0, + "grad_norm": 0.7242788218917502, + "language_loss": 0.54448932, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56631249, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02416992, + "step": 9383, + "time_per_iteration": 3.43761944770813 + }, + { + "auxiliary_loss_clip": 0.01372211, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.25157166, + "balance_loss_mlp": 1.01551867, + "epoch": 0.5641966030362242, + "flos": 18665564253120.0, + "grad_norm": 3.169943053025629, + "language_loss": 0.70208925, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.72611558, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.14892578, + "step": 9384, + "time_per_iteration": 4.279662847518921 + }, + { + "auxiliary_loss_clip": 0.01362536, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.24487877, + "balance_loss_mlp": 1.01915514, + "epoch": 0.5642567262888922, + "flos": 22497494669640.0, + "grad_norm": 1.8899125811983246, + "language_loss": 0.76268816, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78664827, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.14300537, + "step": 9385, + "time_per_iteration": 4.2448296546936035 + }, + { + "auxiliary_loss_clip": 0.01358232, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.24269247, + "balance_loss_mlp": 1.02062631, + "epoch": 0.5643168495415603, + "flos": 13007634605280.0, + "grad_norm": 1.909993367367493, + "language_loss": 0.82452643, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84845179, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13696289, + "step": 9386, + "time_per_iteration": 2.713294744491577 + }, + { + "auxiliary_loss_clip": 0.01376068, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.25465465, + "balance_loss_mlp": 1.02403438, + "epoch": 0.5643769727942282, + "flos": 18592949859480.0, + "grad_norm": 2.2826462245278027, + "language_loss": 0.70465946, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72881329, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.15283203, + "step": 9387, + "time_per_iteration": 2.7469873428344727 + }, + { + "auxiliary_loss_clip": 0.01375045, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.25496447, + "balance_loss_mlp": 1.02214038, + "epoch": 0.5644370960468962, + "flos": 33813353965320.0, + "grad_norm": 1.4576423517622772, + "language_loss": 0.7497524, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.77386904, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14483643, + "step": 9388, + "time_per_iteration": 2.8592002391815186 + }, + { + "auxiliary_loss_clip": 0.01356538, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.24207687, + "balance_loss_mlp": 1.02294385, + "epoch": 0.5644972192995641, + "flos": 21219859663920.0, + "grad_norm": 1.504344214378416, + "language_loss": 0.82927239, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.85319316, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1260376, + "step": 9389, + "time_per_iteration": 2.805429697036743 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01044077, + "balance_loss_clip": 1.24698472, + "balance_loss_mlp": 1.02833569, + "epoch": 0.5645573425522321, + "flos": 18592300125720.0, + "grad_norm": 2.9665419598549603, + "language_loss": 0.64062428, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66473925, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.1574707, + "step": 9390, + "time_per_iteration": 4.228619575500488 + }, + { + "auxiliary_loss_clip": 0.01360243, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.24590468, + "balance_loss_mlp": 1.01873231, + "epoch": 0.5646174658049, + "flos": 18118377669960.0, + "grad_norm": 1.701231058789832, + "language_loss": 0.92592734, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94985169, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13470459, + "step": 9391, + "time_per_iteration": 2.6935412883758545 + }, + { + "auxiliary_loss_clip": 0.01379136, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.25588858, + "balance_loss_mlp": 1.02371979, + "epoch": 0.564677589057568, + "flos": 28335884180040.0, + "grad_norm": 2.005192617034408, + "language_loss": 0.61171389, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.63589978, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.15734863, + "step": 9392, + "time_per_iteration": 2.8175759315490723 + }, + { + "auxiliary_loss_clip": 0.01366742, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.24979842, + "balance_loss_mlp": 1.01447511, + "epoch": 0.564737712310236, + "flos": 22168841609520.0, + "grad_norm": 1.8266834714755997, + "language_loss": 0.81387854, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83783567, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.14501953, + "step": 9393, + "time_per_iteration": 2.743070363998413 + }, + { + "auxiliary_loss_clip": 0.01365625, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.24911106, + "balance_loss_mlp": 1.02035189, + "epoch": 0.564797835562904, + "flos": 20963861605800.0, + "grad_norm": 1.6154408995750695, + "language_loss": 0.87356877, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89757586, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14733887, + "step": 9394, + "time_per_iteration": 2.826427936553955 + }, + { + "auxiliary_loss_clip": 0.01358011, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.2434361, + "balance_loss_mlp": 1.02070606, + "epoch": 0.5648579588155719, + "flos": 17424744048720.0, + "grad_norm": 1.7897386861074631, + "language_loss": 0.84607595, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.86999375, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13061523, + "step": 9395, + "time_per_iteration": 4.1983864307403564 + }, + { + "auxiliary_loss_clip": 0.01179134, + "auxiliary_loss_mlp": 0.01002482, + "balance_loss_clip": 1.13407397, + "balance_loss_mlp": 1.00009751, + "epoch": 0.5649180820682399, + "flos": 69946446497400.0, + "grad_norm": 0.7885119318372735, + "language_loss": 0.58331263, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60512882, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.02380371, + "step": 9396, + "time_per_iteration": 3.234492063522339 + }, + { + "auxiliary_loss_clip": 0.01365805, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.24750912, + "balance_loss_mlp": 1.01567554, + "epoch": 0.5649782053209078, + "flos": 24978525953040.0, + "grad_norm": 1.8058870428202194, + "language_loss": 0.70470715, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.7286607, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.13867188, + "step": 9397, + "time_per_iteration": 2.9177725315093994 + }, + { + "auxiliary_loss_clip": 0.01368468, + "auxiliary_loss_mlp": 0.01037798, + "balance_loss_clip": 1.24922287, + "balance_loss_mlp": 1.02404082, + "epoch": 0.5650383285735758, + "flos": 21731733955080.0, + "grad_norm": 1.6766378883436783, + "language_loss": 0.67008501, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69414771, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13757324, + "step": 9398, + "time_per_iteration": 2.775319814682007 + }, + { + "auxiliary_loss_clip": 0.0117929, + "auxiliary_loss_mlp": 0.0099848, + "balance_loss_clip": 1.13402903, + "balance_loss_mlp": 0.9960956, + "epoch": 0.5650984518262439, + "flos": 65919315383640.0, + "grad_norm": 0.7659840485455661, + "language_loss": 0.58218002, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60395765, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02380371, + "step": 9399, + "time_per_iteration": 3.195155620574951 + }, + { + "auxiliary_loss_clip": 0.01369719, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.24846709, + "balance_loss_mlp": 1.02000999, + "epoch": 0.5651585750789118, + "flos": 21037531816800.0, + "grad_norm": 1.8329761525044657, + "language_loss": 0.73274583, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75679761, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.15454102, + "step": 9400, + "time_per_iteration": 2.761753797531128 + }, + { + "auxiliary_loss_clip": 0.01371332, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.25095606, + "balance_loss_mlp": 1.01905394, + "epoch": 0.5652186983315798, + "flos": 18556581750120.0, + "grad_norm": 1.8471994164005177, + "language_loss": 0.61319691, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63725781, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.15710449, + "step": 9401, + "time_per_iteration": 2.7431206703186035 + }, + { + "auxiliary_loss_clip": 0.01361379, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.24510002, + "balance_loss_mlp": 1.013448, + "epoch": 0.5652788215842477, + "flos": 18483277014360.0, + "grad_norm": 1.987558174378119, + "language_loss": 0.81305182, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83692801, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.12799072, + "step": 9402, + "time_per_iteration": 2.771876573562622 + }, + { + "auxiliary_loss_clip": 0.01360325, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.24497533, + "balance_loss_mlp": 1.02248573, + "epoch": 0.5653389448369157, + "flos": 30050098931520.0, + "grad_norm": 1.8793508503416998, + "language_loss": 0.77952111, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80348051, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13110352, + "step": 9403, + "time_per_iteration": 2.80692458152771 + }, + { + "auxiliary_loss_clip": 0.01368735, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.25122201, + "balance_loss_mlp": 1.01610827, + "epoch": 0.5653990680895836, + "flos": 16731963203040.0, + "grad_norm": 1.5753421520942947, + "language_loss": 0.69177306, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71576023, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.13885498, + "step": 9404, + "time_per_iteration": 2.8689608573913574 + }, + { + "auxiliary_loss_clip": 0.01354809, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.24056947, + "balance_loss_mlp": 1.01723433, + "epoch": 0.5654591913422516, + "flos": 14542364094840.0, + "grad_norm": 1.8799337513865655, + "language_loss": 0.67740673, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.70125383, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.12677002, + "step": 9405, + "time_per_iteration": 2.7979001998901367 + }, + { + "auxiliary_loss_clip": 0.01351735, + "auxiliary_loss_mlp": 0.01035879, + "balance_loss_clip": 1.24110126, + "balance_loss_mlp": 1.02304053, + "epoch": 0.5655193145949196, + "flos": 26215041671280.0, + "grad_norm": 1.674723218894885, + "language_loss": 0.74350482, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76738095, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12841797, + "step": 9406, + "time_per_iteration": 2.808493137359619 + }, + { + "auxiliary_loss_clip": 0.01365977, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.24677289, + "balance_loss_mlp": 1.01779151, + "epoch": 0.5655794378475876, + "flos": 25052114947320.0, + "grad_norm": 1.8661593051954812, + "language_loss": 0.80438614, + "learning_rate": 1.673732740698882e-06, + "loss": 0.82837605, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.15209961, + "step": 9407, + "time_per_iteration": 2.783137321472168 + }, + { + "auxiliary_loss_clip": 0.01352189, + "auxiliary_loss_mlp": 0.01035957, + "balance_loss_clip": 1.2406311, + "balance_loss_mlp": 1.02227199, + "epoch": 0.5656395611002555, + "flos": 31039509822480.0, + "grad_norm": 1.4244074909786602, + "language_loss": 0.71251053, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73639196, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13677979, + "step": 9408, + "time_per_iteration": 2.834710121154785 + }, + { + "auxiliary_loss_clip": 0.0135665, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.24274111, + "balance_loss_mlp": 1.01865447, + "epoch": 0.5656996843529235, + "flos": 20234306567160.0, + "grad_norm": 1.8697329807692398, + "language_loss": 0.81331486, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83720887, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14105225, + "step": 9409, + "time_per_iteration": 2.7522127628326416 + }, + { + "auxiliary_loss_clip": 0.01364142, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.24671435, + "balance_loss_mlp": 1.01474762, + "epoch": 0.5657598076055914, + "flos": 21001204315800.0, + "grad_norm": 2.5310640067902335, + "language_loss": 0.78620422, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.8101368, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14361572, + "step": 9410, + "time_per_iteration": 2.7404768466949463 + }, + { + "auxiliary_loss_clip": 0.01362008, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.24639106, + "balance_loss_mlp": 1.01820016, + "epoch": 0.5658199308582594, + "flos": 11549661562080.0, + "grad_norm": 2.0859372621480556, + "language_loss": 0.84098166, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.86492217, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.1383667, + "step": 9411, + "time_per_iteration": 2.734121561050415 + }, + { + "auxiliary_loss_clip": 0.01372147, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.25208247, + "balance_loss_mlp": 1.01640892, + "epoch": 0.5658800541109275, + "flos": 14176043457840.0, + "grad_norm": 2.133355686014933, + "language_loss": 0.67672461, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.70075679, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14648438, + "step": 9412, + "time_per_iteration": 2.703768014907837 + }, + { + "auxiliary_loss_clip": 0.01349322, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.23784053, + "balance_loss_mlp": 1.01849008, + "epoch": 0.5659401773635954, + "flos": 27310430046600.0, + "grad_norm": 1.4762067069777836, + "language_loss": 0.58409762, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60789579, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.11999512, + "step": 9413, + "time_per_iteration": 2.783754825592041 + }, + { + "auxiliary_loss_clip": 0.01356531, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.24270308, + "balance_loss_mlp": 1.02128696, + "epoch": 0.5660003006162634, + "flos": 16732694153520.0, + "grad_norm": 1.5195505033108787, + "language_loss": 0.69473255, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71863997, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.12927246, + "step": 9414, + "time_per_iteration": 2.714510679244995 + }, + { + "auxiliary_loss_clip": 0.01356097, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.24264801, + "balance_loss_mlp": 1.01915741, + "epoch": 0.5660604238689313, + "flos": 21658551044400.0, + "grad_norm": 2.045119318693086, + "language_loss": 0.78686452, + "learning_rate": 1.670659182280247e-06, + "loss": 0.81074214, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.12512207, + "step": 9415, + "time_per_iteration": 2.8066582679748535 + }, + { + "auxiliary_loss_clip": 0.01180396, + "auxiliary_loss_mlp": 0.01007917, + "balance_loss_clip": 1.13575816, + "balance_loss_mlp": 1.00572348, + "epoch": 0.5661205471215993, + "flos": 68839322306040.0, + "grad_norm": 0.6971383867282561, + "language_loss": 0.49213523, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51401836, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02197266, + "step": 9416, + "time_per_iteration": 3.379948616027832 + }, + { + "auxiliary_loss_clip": 0.01366408, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.25050592, + "balance_loss_mlp": 1.01971459, + "epoch": 0.5661806703742672, + "flos": 28627722438840.0, + "grad_norm": 1.7458756125555452, + "language_loss": 0.63006508, + "learning_rate": 1.6698909172706e-06, + "loss": 0.6540693, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.14312744, + "step": 9417, + "time_per_iteration": 2.803860902786255 + }, + { + "auxiliary_loss_clip": 0.01366333, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.24778116, + "balance_loss_mlp": 1.02027893, + "epoch": 0.5662407936269352, + "flos": 21402999678240.0, + "grad_norm": 1.631401700612082, + "language_loss": 0.68822253, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71223068, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14208984, + "step": 9418, + "time_per_iteration": 2.805828332901001 + }, + { + "auxiliary_loss_clip": 0.0136197, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.24656987, + "balance_loss_mlp": 1.01894259, + "epoch": 0.5663009168796032, + "flos": 25664728244400.0, + "grad_norm": 1.7807953275972968, + "language_loss": 0.64338422, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66734564, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.15209961, + "step": 9419, + "time_per_iteration": 2.8815956115722656 + }, + { + "auxiliary_loss_clip": 0.01179312, + "auxiliary_loss_mlp": 0.01003859, + "balance_loss_clip": 1.13472438, + "balance_loss_mlp": 1.0017128, + "epoch": 0.5663610401322712, + "flos": 67948068467160.0, + "grad_norm": 0.7802330741499277, + "language_loss": 0.59720325, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61903501, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02148438, + "step": 9420, + "time_per_iteration": 3.3213911056518555 + }, + { + "auxiliary_loss_clip": 0.01355715, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.24297273, + "balance_loss_mlp": 1.01869392, + "epoch": 0.5664211633849391, + "flos": 24614926076160.0, + "grad_norm": 1.6069880457907244, + "language_loss": 0.74430549, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76817763, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12811279, + "step": 9421, + "time_per_iteration": 2.841266632080078 + }, + { + "auxiliary_loss_clip": 0.0136307, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.24516034, + "balance_loss_mlp": 1.01931715, + "epoch": 0.5664812866376071, + "flos": 11651618818800.0, + "grad_norm": 2.0471480685291463, + "language_loss": 0.73199743, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.75595868, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13708496, + "step": 9422, + "time_per_iteration": 4.20029354095459 + }, + { + "auxiliary_loss_clip": 0.01354766, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.2426393, + "balance_loss_mlp": 1.01856732, + "epoch": 0.566541409890275, + "flos": 24649304375880.0, + "grad_norm": 1.5890744115198372, + "language_loss": 0.81714761, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.84101075, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12982178, + "step": 9423, + "time_per_iteration": 4.180302619934082 + }, + { + "auxiliary_loss_clip": 0.01355298, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.24070001, + "balance_loss_mlp": 1.01861477, + "epoch": 0.566601533142943, + "flos": 22275347002560.0, + "grad_norm": 1.4829998117219325, + "language_loss": 0.81067753, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.83455396, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13739014, + "step": 9424, + "time_per_iteration": 2.771548271179199 + }, + { + "auxiliary_loss_clip": 0.01369561, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.25030696, + "balance_loss_mlp": 1.01664376, + "epoch": 0.5666616563956111, + "flos": 29977768796400.0, + "grad_norm": 2.741293028546563, + "language_loss": 0.78987026, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.81387222, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.13989258, + "step": 9425, + "time_per_iteration": 2.7965760231018066 + }, + { + "auxiliary_loss_clip": 0.01356375, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.24221337, + "balance_loss_mlp": 1.01784086, + "epoch": 0.566721779648279, + "flos": 17785460732040.0, + "grad_norm": 2.0827271157862173, + "language_loss": 0.59543109, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61930448, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13140869, + "step": 9426, + "time_per_iteration": 2.737725257873535 + }, + { + "auxiliary_loss_clip": 0.01364226, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.24575758, + "balance_loss_mlp": 1.02286148, + "epoch": 0.566781902900947, + "flos": 21038547025800.0, + "grad_norm": 1.6594293693662785, + "language_loss": 0.8178494, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84185934, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.13909912, + "step": 9427, + "time_per_iteration": 2.7376227378845215 + }, + { + "auxiliary_loss_clip": 0.01350795, + "auxiliary_loss_mlp": 0.01037636, + "balance_loss_clip": 1.23962867, + "balance_loss_mlp": 1.02381349, + "epoch": 0.5668420261536149, + "flos": 23153785580880.0, + "grad_norm": 2.607160614958179, + "language_loss": 0.86235416, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88623846, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13824463, + "step": 9428, + "time_per_iteration": 2.728790283203125 + }, + { + "auxiliary_loss_clip": 0.01368757, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.25008893, + "balance_loss_mlp": 1.02595806, + "epoch": 0.5669021494062829, + "flos": 22606761431160.0, + "grad_norm": 2.1830674585319216, + "language_loss": 0.74172413, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.76581192, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14056396, + "step": 9429, + "time_per_iteration": 4.207144498825073 + }, + { + "auxiliary_loss_clip": 0.01364908, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.24695325, + "balance_loss_mlp": 1.01818705, + "epoch": 0.5669622726589508, + "flos": 17385330312360.0, + "grad_norm": 1.9084499751666573, + "language_loss": 0.75454903, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77852392, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.1439209, + "step": 9430, + "time_per_iteration": 2.75118350982666 + }, + { + "auxiliary_loss_clip": 0.01356897, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.24094713, + "balance_loss_mlp": 1.01849627, + "epoch": 0.5670223959116188, + "flos": 18766952992800.0, + "grad_norm": 1.6636843116792226, + "language_loss": 0.72697806, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75086498, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13293457, + "step": 9431, + "time_per_iteration": 2.7382760047912598 + }, + { + "auxiliary_loss_clip": 0.01340368, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.23339128, + "balance_loss_mlp": 1.0194999, + "epoch": 0.5670825191642868, + "flos": 13557338906760.0, + "grad_norm": 1.7930039585989326, + "language_loss": 0.73873818, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.76245868, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12182617, + "step": 9432, + "time_per_iteration": 2.7152209281921387 + }, + { + "auxiliary_loss_clip": 0.01354714, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.24005604, + "balance_loss_mlp": 1.0175755, + "epoch": 0.5671426424169548, + "flos": 22059087547680.0, + "grad_norm": 1.4492675741660284, + "language_loss": 0.78188002, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80573285, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.12988281, + "step": 9433, + "time_per_iteration": 2.741968870162964 + }, + { + "auxiliary_loss_clip": 0.01366073, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.2482996, + "balance_loss_mlp": 1.01791263, + "epoch": 0.5672027656696227, + "flos": 21328882775280.0, + "grad_norm": 2.1005599957057712, + "language_loss": 0.63497251, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65896893, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.15655518, + "step": 9434, + "time_per_iteration": 4.268208742141724 + }, + { + "auxiliary_loss_clip": 0.01346691, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.23493481, + "balance_loss_mlp": 1.01611841, + "epoch": 0.5672628889222907, + "flos": 23519375267400.0, + "grad_norm": 1.6703555661845795, + "language_loss": 0.66849363, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.69225425, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.13256836, + "step": 9435, + "time_per_iteration": 2.8054051399230957 + }, + { + "auxiliary_loss_clip": 0.01349006, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.23600876, + "balance_loss_mlp": 1.01766765, + "epoch": 0.5673230121749586, + "flos": 27127168207200.0, + "grad_norm": 1.4961115369473306, + "language_loss": 0.7188459, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.7426461, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13354492, + "step": 9436, + "time_per_iteration": 2.764706611633301 + }, + { + "auxiliary_loss_clip": 0.0136264, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.24476767, + "balance_loss_mlp": 1.01939893, + "epoch": 0.5673831354276266, + "flos": 31148614150560.0, + "grad_norm": 1.4172436198146818, + "language_loss": 0.74208784, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76604617, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.13775635, + "step": 9437, + "time_per_iteration": 2.838181495666504 + }, + { + "auxiliary_loss_clip": 0.01361335, + "auxiliary_loss_mlp": 0.01037456, + "balance_loss_clip": 1.24589634, + "balance_loss_mlp": 1.0233531, + "epoch": 0.5674432586802945, + "flos": 27679024751760.0, + "grad_norm": 1.795919507217323, + "language_loss": 0.6080898, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63207775, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14105225, + "step": 9438, + "time_per_iteration": 2.7666819095611572 + }, + { + "auxiliary_loss_clip": 0.01353477, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.23672175, + "balance_loss_mlp": 1.01515627, + "epoch": 0.5675033819329626, + "flos": 26620857261360.0, + "grad_norm": 1.472769379708755, + "language_loss": 0.75110471, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77492195, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.13104248, + "step": 9439, + "time_per_iteration": 2.807042121887207 + }, + { + "auxiliary_loss_clip": 0.01351923, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.23980474, + "balance_loss_mlp": 1.01915598, + "epoch": 0.5675635051856306, + "flos": 19102996774440.0, + "grad_norm": 1.6626260578106324, + "language_loss": 0.8351202, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.85898197, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.15112305, + "step": 9440, + "time_per_iteration": 2.729428291320801 + }, + { + "auxiliary_loss_clip": 0.01371047, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.24994087, + "balance_loss_mlp": 1.02266455, + "epoch": 0.5676236284382985, + "flos": 17570785003200.0, + "grad_norm": 2.551727403641636, + "language_loss": 0.75755095, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.78163123, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.14331055, + "step": 9441, + "time_per_iteration": 2.877932548522949 + }, + { + "auxiliary_loss_clip": 0.01355831, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.24224901, + "balance_loss_mlp": 1.01907563, + "epoch": 0.5676837516909665, + "flos": 15957431082720.0, + "grad_norm": 1.941114359526022, + "language_loss": 0.82350695, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.84738964, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13354492, + "step": 9442, + "time_per_iteration": 2.791189432144165 + }, + { + "auxiliary_loss_clip": 0.01335904, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.22810733, + "balance_loss_mlp": 1.02009022, + "epoch": 0.5677438749436344, + "flos": 18300461866920.0, + "grad_norm": 1.832962286016101, + "language_loss": 0.75014371, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.77383411, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1305542, + "step": 9443, + "time_per_iteration": 2.740074396133423 + }, + { + "auxiliary_loss_clip": 0.01355324, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.24128211, + "balance_loss_mlp": 1.01829147, + "epoch": 0.5678039981963025, + "flos": 17935846781040.0, + "grad_norm": 1.941218289139477, + "language_loss": 0.77669084, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.80056334, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13635254, + "step": 9444, + "time_per_iteration": 2.7521772384643555 + }, + { + "auxiliary_loss_clip": 0.01355986, + "auxiliary_loss_mlp": 0.01042166, + "balance_loss_clip": 1.23957384, + "balance_loss_mlp": 1.02815866, + "epoch": 0.5678641214489704, + "flos": 19320880563720.0, + "grad_norm": 1.5894773788428291, + "language_loss": 0.8104046, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83438611, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14013672, + "step": 9445, + "time_per_iteration": 2.7292416095733643 + }, + { + "auxiliary_loss_clip": 0.01353609, + "auxiliary_loss_mlp": 0.01024855, + "balance_loss_clip": 1.23886585, + "balance_loss_mlp": 1.01161051, + "epoch": 0.5679242447016384, + "flos": 27757567965960.0, + "grad_norm": 1.2792706945535426, + "language_loss": 0.70678413, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73056877, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13250732, + "step": 9446, + "time_per_iteration": 2.8395748138427734 + }, + { + "auxiliary_loss_clip": 0.01361966, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.24254906, + "balance_loss_mlp": 1.01952434, + "epoch": 0.5679843679543063, + "flos": 23774398724880.0, + "grad_norm": 1.8264084953186601, + "language_loss": 0.73578095, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.7597326, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13665771, + "step": 9447, + "time_per_iteration": 2.738410234451294 + }, + { + "auxiliary_loss_clip": 0.01359798, + "auxiliary_loss_mlp": 0.01032962, + "balance_loss_clip": 1.2416923, + "balance_loss_mlp": 1.01922917, + "epoch": 0.5680444912069743, + "flos": 25597230504120.0, + "grad_norm": 2.0350762578936554, + "language_loss": 0.75221843, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77614605, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.1373291, + "step": 9448, + "time_per_iteration": 2.7848684787750244 + }, + { + "auxiliary_loss_clip": 0.01365359, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.24752843, + "balance_loss_mlp": 1.02391994, + "epoch": 0.5681046144596422, + "flos": 23701053380760.0, + "grad_norm": 2.055537062385475, + "language_loss": 0.75953144, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78356302, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.13891602, + "step": 9449, + "time_per_iteration": 2.7342069149017334 + }, + { + "auxiliary_loss_clip": 0.01353676, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.23706245, + "balance_loss_mlp": 1.02460444, + "epoch": 0.5681647377123102, + "flos": 28006540777800.0, + "grad_norm": 1.5900410136800873, + "language_loss": 0.75137067, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.7752943, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14080811, + "step": 9450, + "time_per_iteration": 2.7929065227508545 + }, + { + "auxiliary_loss_clip": 0.01352183, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.23557293, + "balance_loss_mlp": 1.02345681, + "epoch": 0.5682248609649782, + "flos": 22752964819080.0, + "grad_norm": 1.965643782710182, + "language_loss": 0.67342472, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69731736, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13623047, + "step": 9451, + "time_per_iteration": 2.7364141941070557 + }, + { + "auxiliary_loss_clip": 0.01370348, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.24757457, + "balance_loss_mlp": 1.01740265, + "epoch": 0.5682849842176462, + "flos": 21293651700000.0, + "grad_norm": 1.808689957141762, + "language_loss": 0.7221911, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74622375, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.15509033, + "step": 9452, + "time_per_iteration": 2.785126209259033 + }, + { + "auxiliary_loss_clip": 0.01345132, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.23331046, + "balance_loss_mlp": 1.0186106, + "epoch": 0.5683451074703142, + "flos": 21146595536520.0, + "grad_norm": 1.4588011511153829, + "language_loss": 0.70596486, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72973365, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13153076, + "step": 9453, + "time_per_iteration": 2.7499868869781494 + }, + { + "auxiliary_loss_clip": 0.01355972, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.24109101, + "balance_loss_mlp": 1.02357125, + "epoch": 0.5684052307229821, + "flos": 22349382688800.0, + "grad_norm": 2.4665302394636646, + "language_loss": 0.70286012, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.72678697, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13153076, + "step": 9454, + "time_per_iteration": 2.7747106552124023 + }, + { + "auxiliary_loss_clip": 0.013447, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.23211849, + "balance_loss_mlp": 1.02022433, + "epoch": 0.5684653539756501, + "flos": 21803658006600.0, + "grad_norm": 1.9699333521034044, + "language_loss": 0.60851586, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.63228518, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 1.12646484, + "router_z_loss_mlp": 0.12011719, + "step": 9455, + "time_per_iteration": 2.7646963596343994 + }, + { + "auxiliary_loss_clip": 0.01366083, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.24676168, + "balance_loss_mlp": 1.02026391, + "epoch": 0.568525477228318, + "flos": 23004414740880.0, + "grad_norm": 1.9863555325783633, + "language_loss": 0.7399357, + "learning_rate": 1.6549199011198e-06, + "loss": 0.76394069, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.1416626, + "step": 9456, + "time_per_iteration": 2.731799602508545 + }, + { + "auxiliary_loss_clip": 0.01356045, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.24123001, + "balance_loss_mlp": 1.01967883, + "epoch": 0.568585600480986, + "flos": 21396827207520.0, + "grad_norm": 1.517935303564246, + "language_loss": 0.7734915, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79737294, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.12445068, + "step": 9457, + "time_per_iteration": 2.7868497371673584 + }, + { + "auxiliary_loss_clip": 0.01355731, + "auxiliary_loss_mlp": 0.01039641, + "balance_loss_clip": 1.2380321, + "balance_loss_mlp": 1.02468061, + "epoch": 0.568645723733654, + "flos": 30013365346920.0, + "grad_norm": 1.735976155716197, + "language_loss": 0.66466182, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68861556, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.1496582, + "step": 9458, + "time_per_iteration": 2.78631854057312 + }, + { + "auxiliary_loss_clip": 0.01360082, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.24253905, + "balance_loss_mlp": 1.01534891, + "epoch": 0.568705846986322, + "flos": 20417568406560.0, + "grad_norm": 2.0926498848883, + "language_loss": 0.68477017, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70866776, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14343262, + "step": 9459, + "time_per_iteration": 2.7558786869049072 + }, + { + "auxiliary_loss_clip": 0.01360943, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.24254024, + "balance_loss_mlp": 1.02072883, + "epoch": 0.5687659702389899, + "flos": 17461193374800.0, + "grad_norm": 2.0840305193902804, + "language_loss": 0.76968074, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79363394, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.13671875, + "step": 9460, + "time_per_iteration": 4.1404688358306885 + }, + { + "auxiliary_loss_clip": 0.01366565, + "auxiliary_loss_mlp": 0.01033177, + "balance_loss_clip": 1.24774075, + "balance_loss_mlp": 1.01910448, + "epoch": 0.5688260934916579, + "flos": 25410963646080.0, + "grad_norm": 1.9267483111635444, + "language_loss": 0.72125387, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74525136, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.14093018, + "step": 9461, + "time_per_iteration": 4.3386571407318115 + }, + { + "auxiliary_loss_clip": 0.01356107, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.24121499, + "balance_loss_mlp": 1.01800084, + "epoch": 0.5688862167443258, + "flos": 21611949628320.0, + "grad_norm": 1.771423312157656, + "language_loss": 0.73264754, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75652885, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14019775, + "step": 9462, + "time_per_iteration": 2.71740460395813 + }, + { + "auxiliary_loss_clip": 0.01342834, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.2323401, + "balance_loss_mlp": 1.0177772, + "epoch": 0.5689463399969938, + "flos": 22424108717160.0, + "grad_norm": 1.9864459444316156, + "language_loss": 0.72466028, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.74838382, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.11743164, + "step": 9463, + "time_per_iteration": 2.877622127532959 + }, + { + "auxiliary_loss_clip": 0.0135821, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.24207699, + "balance_loss_mlp": 1.02054274, + "epoch": 0.5690064632496618, + "flos": 18301477075920.0, + "grad_norm": 1.6407868171971025, + "language_loss": 0.74400353, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76792705, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.13574219, + "step": 9464, + "time_per_iteration": 2.872636318206787 + }, + { + "auxiliary_loss_clip": 0.01356196, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.23964071, + "balance_loss_mlp": 1.0239327, + "epoch": 0.5690665865023298, + "flos": 21584190491280.0, + "grad_norm": 2.051564369163061, + "language_loss": 0.84570014, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86964488, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14343262, + "step": 9465, + "time_per_iteration": 2.756239175796509 + }, + { + "auxiliary_loss_clip": 0.01345946, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.23432481, + "balance_loss_mlp": 1.01784396, + "epoch": 0.5691267097549978, + "flos": 24426263324880.0, + "grad_norm": 1.6061304717970188, + "language_loss": 0.72373068, + "learning_rate": 1.651084350506125e-06, + "loss": 0.7474944, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.12567139, + "step": 9466, + "time_per_iteration": 2.7750723361968994 + }, + { + "auxiliary_loss_clip": 0.01201684, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.15570927, + "balance_loss_mlp": 1.03092432, + "epoch": 0.5691868330076657, + "flos": 61675089020640.0, + "grad_norm": 0.732188743192971, + "language_loss": 0.55420446, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57655591, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02539062, + "step": 9467, + "time_per_iteration": 4.740269184112549 + }, + { + "auxiliary_loss_clip": 0.01363083, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.24682152, + "balance_loss_mlp": 1.01692104, + "epoch": 0.5692469562603337, + "flos": 21330263459520.0, + "grad_norm": 2.991573047046074, + "language_loss": 0.63714349, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66109407, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.15063477, + "step": 9468, + "time_per_iteration": 2.7736847400665283 + }, + { + "auxiliary_loss_clip": 0.01354061, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.24048829, + "balance_loss_mlp": 1.02026153, + "epoch": 0.5693070795130016, + "flos": 23373171879480.0, + "grad_norm": 2.0605245223571926, + "language_loss": 0.79248571, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81636298, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13391113, + "step": 9469, + "time_per_iteration": 2.74756121635437 + }, + { + "auxiliary_loss_clip": 0.01367771, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.24856794, + "balance_loss_mlp": 1.01961923, + "epoch": 0.5693672027656697, + "flos": 18701851145760.0, + "grad_norm": 1.8784518712952838, + "language_loss": 0.69531524, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71933472, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.14550781, + "step": 9470, + "time_per_iteration": 2.7373719215393066 + }, + { + "auxiliary_loss_clip": 0.01355872, + "auxiliary_loss_mlp": 0.01033815, + "balance_loss_clip": 1.24029517, + "balance_loss_mlp": 1.01923585, + "epoch": 0.5694273260183376, + "flos": 20454098949360.0, + "grad_norm": 1.5061148700435985, + "language_loss": 0.74982095, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77371782, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14550781, + "step": 9471, + "time_per_iteration": 2.759554862976074 + }, + { + "auxiliary_loss_clip": 0.01348077, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.2364136, + "balance_loss_mlp": 1.01888371, + "epoch": 0.5694874492710056, + "flos": 17607721629600.0, + "grad_norm": 2.120848993818007, + "language_loss": 0.57970607, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.60350323, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12756348, + "step": 9472, + "time_per_iteration": 4.251052618026733 + }, + { + "auxiliary_loss_clip": 0.01346261, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.23540592, + "balance_loss_mlp": 1.0217886, + "epoch": 0.5695475725236735, + "flos": 13374808017840.0, + "grad_norm": 1.8003596637424302, + "language_loss": 0.74030608, + "learning_rate": 1.648400251450638e-06, + "loss": 0.76411736, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.1307373, + "step": 9473, + "time_per_iteration": 2.7093465328216553 + }, + { + "auxiliary_loss_clip": 0.01200659, + "auxiliary_loss_mlp": 0.01005201, + "balance_loss_clip": 1.15563226, + "balance_loss_mlp": 1.00250721, + "epoch": 0.5696076957763415, + "flos": 68190543941400.0, + "grad_norm": 0.7139596443338401, + "language_loss": 0.57631159, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59837019, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02697754, + "step": 9474, + "time_per_iteration": 3.296753168106079 + }, + { + "auxiliary_loss_clip": 0.01350921, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.23884273, + "balance_loss_mlp": 1.02058434, + "epoch": 0.5696678190290094, + "flos": 33844808463120.0, + "grad_norm": 1.6828513145675528, + "language_loss": 0.54095411, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.5648042, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13513184, + "step": 9475, + "time_per_iteration": 2.8303844928741455 + }, + { + "auxiliary_loss_clip": 0.0135875, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.24321878, + "balance_loss_mlp": 1.01721001, + "epoch": 0.5697279422816774, + "flos": 26361854184600.0, + "grad_norm": 1.531027945521136, + "language_loss": 0.79579806, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81969357, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13592529, + "step": 9476, + "time_per_iteration": 2.8497679233551025 + }, + { + "auxiliary_loss_clip": 0.01362885, + "auxiliary_loss_mlp": 0.01040672, + "balance_loss_clip": 1.24640012, + "balance_loss_mlp": 1.02677226, + "epoch": 0.5697880655343454, + "flos": 22935779966520.0, + "grad_norm": 11.808102353789522, + "language_loss": 0.67155445, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.69559002, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.13897705, + "step": 9477, + "time_per_iteration": 2.7758901119232178 + }, + { + "auxiliary_loss_clip": 0.01356699, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.24083364, + "balance_loss_mlp": 1.02368188, + "epoch": 0.5698481887870134, + "flos": 26767060649280.0, + "grad_norm": 1.6464732061912508, + "language_loss": 0.71201932, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73596632, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14318848, + "step": 9478, + "time_per_iteration": 2.79546856880188 + }, + { + "auxiliary_loss_clip": 0.01345779, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.23619103, + "balance_loss_mlp": 1.01696086, + "epoch": 0.5699083120396814, + "flos": 15746531931360.0, + "grad_norm": 1.6518958355625772, + "language_loss": 0.69384539, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71759915, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1262207, + "step": 9479, + "time_per_iteration": 2.692105770111084 + }, + { + "auxiliary_loss_clip": 0.0134566, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.23531771, + "balance_loss_mlp": 1.02058291, + "epoch": 0.5699684352923493, + "flos": 19542094238520.0, + "grad_norm": 1.3794430560029187, + "language_loss": 0.71327329, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.7370708, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13500977, + "step": 9480, + "time_per_iteration": 2.7566776275634766 + }, + { + "auxiliary_loss_clip": 0.01352306, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.23848403, + "balance_loss_mlp": 1.01854372, + "epoch": 0.5700285585450173, + "flos": 16257715880400.0, + "grad_norm": 2.070161300657689, + "language_loss": 0.72539139, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.7492429, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14306641, + "step": 9481, + "time_per_iteration": 2.697368621826172 + }, + { + "auxiliary_loss_clip": 0.013524, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.23950005, + "balance_loss_mlp": 1.02222848, + "epoch": 0.5700886817976852, + "flos": 19869772698000.0, + "grad_norm": 1.634595924650817, + "language_loss": 0.78460371, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.8084836, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13378906, + "step": 9482, + "time_per_iteration": 2.7306008338928223 + }, + { + "auxiliary_loss_clip": 0.01350719, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.23812628, + "balance_loss_mlp": 1.01909769, + "epoch": 0.5701488050503533, + "flos": 23846972510160.0, + "grad_norm": 1.6966907067005077, + "language_loss": 0.78183943, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80567086, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13317871, + "step": 9483, + "time_per_iteration": 2.789371967315674 + }, + { + "auxiliary_loss_clip": 0.01354086, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.24079728, + "balance_loss_mlp": 1.02328229, + "epoch": 0.5702089283030212, + "flos": 23664969529920.0, + "grad_norm": 1.669715027114449, + "language_loss": 0.81351632, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83742142, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13140869, + "step": 9484, + "time_per_iteration": 2.759446620941162 + }, + { + "auxiliary_loss_clip": 0.01353697, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.23944473, + "balance_loss_mlp": 1.01631296, + "epoch": 0.5702690515556892, + "flos": 27896908541040.0, + "grad_norm": 1.8860490507459886, + "language_loss": 0.60851878, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.63235933, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.14031982, + "step": 9485, + "time_per_iteration": 2.7767374515533447 + }, + { + "auxiliary_loss_clip": 0.01356918, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.2421602, + "balance_loss_mlp": 1.0208497, + "epoch": 0.5703291748083571, + "flos": 24029219140560.0, + "grad_norm": 1.8471085149889774, + "language_loss": 0.65373927, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.6776554, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.1383667, + "step": 9486, + "time_per_iteration": 2.791239023208618 + }, + { + "auxiliary_loss_clip": 0.01191979, + "auxiliary_loss_mlp": 0.01014748, + "balance_loss_clip": 1.14722037, + "balance_loss_mlp": 1.01238811, + "epoch": 0.5703892980610251, + "flos": 57039852137760.0, + "grad_norm": 0.66947245500886, + "language_loss": 0.47999924, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50206655, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02355957, + "step": 9487, + "time_per_iteration": 3.3436546325683594 + }, + { + "auxiliary_loss_clip": 0.01352603, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.23908079, + "balance_loss_mlp": 1.01844406, + "epoch": 0.570449421313693, + "flos": 24356166649560.0, + "grad_norm": 1.567881926037877, + "language_loss": 0.86334968, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88719213, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13189697, + "step": 9488, + "time_per_iteration": 2.7773499488830566 + }, + { + "auxiliary_loss_clip": 0.01359406, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.24217594, + "balance_loss_mlp": 1.01805341, + "epoch": 0.570509544566361, + "flos": 24834190549680.0, + "grad_norm": 1.429143574616039, + "language_loss": 0.79342645, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81733692, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.13580322, + "step": 9489, + "time_per_iteration": 2.8095295429229736 + }, + { + "auxiliary_loss_clip": 0.0135247, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.23825979, + "balance_loss_mlp": 1.01898456, + "epoch": 0.570569667819029, + "flos": 21402268727760.0, + "grad_norm": 1.7071241643956385, + "language_loss": 0.70380199, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72764719, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13085938, + "step": 9490, + "time_per_iteration": 2.7743356227874756 + }, + { + "auxiliary_loss_clip": 0.01348934, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.23639727, + "balance_loss_mlp": 1.01537037, + "epoch": 0.570629791071697, + "flos": 23221323929520.0, + "grad_norm": 1.5690096030806506, + "language_loss": 0.76710522, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.79087543, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.12701416, + "step": 9491, + "time_per_iteration": 2.7389659881591797 + }, + { + "auxiliary_loss_clip": 0.01194844, + "auxiliary_loss_mlp": 0.01010943, + "balance_loss_clip": 1.14976585, + "balance_loss_mlp": 1.00859499, + "epoch": 0.570689914324365, + "flos": 65299473798480.0, + "grad_norm": 0.7993043410353418, + "language_loss": 0.57484519, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59690309, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.0234375, + "step": 9492, + "time_per_iteration": 3.2714200019836426 + }, + { + "auxiliary_loss_clip": 0.01352648, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.2398026, + "balance_loss_mlp": 1.01671386, + "epoch": 0.5707500375770329, + "flos": 21146839186680.0, + "grad_norm": 1.7112727945043669, + "language_loss": 0.72275871, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.74658507, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13287354, + "step": 9493, + "time_per_iteration": 2.7219674587249756 + }, + { + "auxiliary_loss_clip": 0.0136042, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.24467492, + "balance_loss_mlp": 1.01703382, + "epoch": 0.5708101608297009, + "flos": 20817536392800.0, + "grad_norm": 1.7188251337727496, + "language_loss": 0.78096259, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80487448, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.1373291, + "step": 9494, + "time_per_iteration": 2.762939929962158 + }, + { + "auxiliary_loss_clip": 0.01368805, + "auxiliary_loss_mlp": 0.01034345, + "balance_loss_clip": 1.24925625, + "balance_loss_mlp": 1.01958084, + "epoch": 0.5708702840823688, + "flos": 25817916270240.0, + "grad_norm": 2.1972697529300422, + "language_loss": 0.80044734, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82447886, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14758301, + "step": 9495, + "time_per_iteration": 2.783356189727783 + }, + { + "auxiliary_loss_clip": 0.01369017, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.24865842, + "balance_loss_mlp": 1.02418709, + "epoch": 0.5709304073350369, + "flos": 23656198124160.0, + "grad_norm": 2.425101539317727, + "language_loss": 0.66637683, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.69046825, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.15948486, + "step": 9496, + "time_per_iteration": 2.8089988231658936 + }, + { + "auxiliary_loss_clip": 0.01363852, + "auxiliary_loss_mlp": 0.01038525, + "balance_loss_clip": 1.24633312, + "balance_loss_mlp": 1.02482796, + "epoch": 0.5709905305877048, + "flos": 16111918576080.0, + "grad_norm": 7.470521214692688, + "language_loss": 0.69765544, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.72167921, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.13690186, + "step": 9497, + "time_per_iteration": 2.7323319911956787 + }, + { + "auxiliary_loss_clip": 0.01361495, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.24504828, + "balance_loss_mlp": 1.01361871, + "epoch": 0.5710506538403728, + "flos": 24755809768920.0, + "grad_norm": 1.7610222532694464, + "language_loss": 0.81616336, + "learning_rate": 1.638819551358182e-06, + "loss": 0.84005779, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14324951, + "step": 9498, + "time_per_iteration": 4.16823148727417 + }, + { + "auxiliary_loss_clip": 0.01360626, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.24444342, + "balance_loss_mlp": 1.01759887, + "epoch": 0.5711107770930407, + "flos": 21987447754680.0, + "grad_norm": 1.975299155819826, + "language_loss": 0.67063862, + "learning_rate": 1.638436499891469e-06, + "loss": 0.69456959, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14874268, + "step": 9499, + "time_per_iteration": 2.779231548309326 + }, + { + "auxiliary_loss_clip": 0.01364416, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.24941802, + "balance_loss_mlp": 1.01975131, + "epoch": 0.5711709003457087, + "flos": 19578868431480.0, + "grad_norm": 1.5130433364776732, + "language_loss": 0.7210837, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.74506342, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13818359, + "step": 9500, + "time_per_iteration": 4.268391847610474 + }, + { + "auxiliary_loss_clip": 0.01365149, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.24744034, + "balance_loss_mlp": 1.01701713, + "epoch": 0.5712310235983766, + "flos": 24247955705400.0, + "grad_norm": 1.8868861147886513, + "language_loss": 0.76756853, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.79153216, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14196777, + "step": 9501, + "time_per_iteration": 2.795006513595581 + }, + { + "auxiliary_loss_clip": 0.01359748, + "auxiliary_loss_mlp": 0.01032922, + "balance_loss_clip": 1.24371171, + "balance_loss_mlp": 1.01939201, + "epoch": 0.5712911468510447, + "flos": 21001082490720.0, + "grad_norm": 1.7068631263274205, + "language_loss": 0.75114059, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77506733, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13513184, + "step": 9502, + "time_per_iteration": 2.777010440826416 + }, + { + "auxiliary_loss_clip": 0.01358815, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.24575901, + "balance_loss_mlp": 1.01662529, + "epoch": 0.5713512701037126, + "flos": 18921724744680.0, + "grad_norm": 1.512652295130917, + "language_loss": 0.82155144, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84544027, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13433838, + "step": 9503, + "time_per_iteration": 2.765104293823242 + }, + { + "auxiliary_loss_clip": 0.01355333, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.24239969, + "balance_loss_mlp": 1.02208519, + "epoch": 0.5714113933563806, + "flos": 17416987851960.0, + "grad_norm": 1.8443190328865637, + "language_loss": 0.85836715, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.8822751, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13378906, + "step": 9504, + "time_per_iteration": 2.7808806896209717 + }, + { + "auxiliary_loss_clip": 0.01352676, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.24162126, + "balance_loss_mlp": 1.01785159, + "epoch": 0.5714715166090486, + "flos": 20198182107960.0, + "grad_norm": 1.791598978275559, + "language_loss": 0.75870746, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.78254855, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13586426, + "step": 9505, + "time_per_iteration": 2.763540744781494 + }, + { + "auxiliary_loss_clip": 0.01352294, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.23965096, + "balance_loss_mlp": 1.02007401, + "epoch": 0.5715316398617165, + "flos": 18556297491600.0, + "grad_norm": 2.067442150637464, + "language_loss": 0.81921196, + "learning_rate": 1.635755524332509e-06, + "loss": 0.843063, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.12756348, + "step": 9506, + "time_per_iteration": 4.174624919891357 + }, + { + "auxiliary_loss_clip": 0.01356485, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.24305964, + "balance_loss_mlp": 1.0194391, + "epoch": 0.5715917631143845, + "flos": 18482749105680.0, + "grad_norm": 1.5641172757764075, + "language_loss": 0.77420259, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79808879, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.12695312, + "step": 9507, + "time_per_iteration": 2.7695345878601074 + }, + { + "auxiliary_loss_clip": 0.01361264, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.24521852, + "balance_loss_mlp": 1.02766299, + "epoch": 0.5716518863670524, + "flos": 24024792829320.0, + "grad_norm": 1.507584472568667, + "language_loss": 0.68996918, + "learning_rate": 1.63498965540751e-06, + "loss": 0.71400809, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14959717, + "step": 9508, + "time_per_iteration": 2.8109731674194336 + }, + { + "auxiliary_loss_clip": 0.01363024, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.24645722, + "balance_loss_mlp": 1.01702547, + "epoch": 0.5717120096197205, + "flos": 17823696825960.0, + "grad_norm": 1.9728294407868645, + "language_loss": 0.79943776, + "learning_rate": 1.634606741699593e-06, + "loss": 0.82337999, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14172363, + "step": 9509, + "time_per_iteration": 2.716449737548828 + }, + { + "auxiliary_loss_clip": 0.01350029, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.23901176, + "balance_loss_mlp": 1.01917195, + "epoch": 0.5717721328723884, + "flos": 21870343579680.0, + "grad_norm": 1.8737822559311341, + "language_loss": 0.72290057, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74672991, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.1373291, + "step": 9510, + "time_per_iteration": 2.7298741340637207 + }, + { + "auxiliary_loss_clip": 0.0135296, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.24059165, + "balance_loss_mlp": 1.02187169, + "epoch": 0.5718322561250564, + "flos": 28443323565360.0, + "grad_norm": 1.3956098915932535, + "language_loss": 0.69855356, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.72243023, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.1282959, + "step": 9511, + "time_per_iteration": 4.352480173110962 + }, + { + "auxiliary_loss_clip": 0.01357159, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.24340892, + "balance_loss_mlp": 1.0215832, + "epoch": 0.5718923793777243, + "flos": 13555633355640.0, + "grad_norm": 1.7984089563697474, + "language_loss": 0.61923552, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.6431545, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 1.13720703, + "router_z_loss_mlp": 0.13165283, + "step": 9512, + "time_per_iteration": 2.7580509185791016 + }, + { + "auxiliary_loss_clip": 0.01351713, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.23826408, + "balance_loss_mlp": 1.02013516, + "epoch": 0.5719525026303923, + "flos": 17826580019520.0, + "grad_norm": 2.494500215275191, + "language_loss": 0.76176655, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78560889, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12408447, + "step": 9513, + "time_per_iteration": 2.805821657180786 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01008723, + "balance_loss_clip": 1.14627671, + "balance_loss_mlp": 1.00661302, + "epoch": 0.5720126258830602, + "flos": 61311066078960.0, + "grad_norm": 0.8920441180958528, + "language_loss": 0.66844743, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.69043487, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02111816, + "step": 9514, + "time_per_iteration": 3.243126630783081 + }, + { + "auxiliary_loss_clip": 0.01364468, + "auxiliary_loss_mlp": 0.01040196, + "balance_loss_clip": 1.24788857, + "balance_loss_mlp": 1.02626681, + "epoch": 0.5720727491357283, + "flos": 23993378939880.0, + "grad_norm": 1.9515714011905307, + "language_loss": 0.81712472, + "learning_rate": 1.63230955093099e-06, + "loss": 0.84117138, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13928223, + "step": 9515, + "time_per_iteration": 2.792851209640503 + }, + { + "auxiliary_loss_clip": 0.01350095, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.24027145, + "balance_loss_mlp": 1.01680732, + "epoch": 0.5721328723883962, + "flos": 23410839456360.0, + "grad_norm": 1.6360969524265654, + "language_loss": 0.86031938, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88412738, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13903809, + "step": 9516, + "time_per_iteration": 2.7589614391326904 + }, + { + "auxiliary_loss_clip": 0.01351013, + "auxiliary_loss_mlp": 0.01025524, + "balance_loss_clip": 1.23840666, + "balance_loss_mlp": 1.01157093, + "epoch": 0.5721929956410642, + "flos": 18809290531080.0, + "grad_norm": 1.7874327520449929, + "language_loss": 0.87804937, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.9018147, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13952637, + "step": 9517, + "time_per_iteration": 2.786933660507202 + }, + { + "auxiliary_loss_clip": 0.01355182, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.24211192, + "balance_loss_mlp": 1.01528883, + "epoch": 0.5722531188937322, + "flos": 27202137885720.0, + "grad_norm": 1.5661691122702954, + "language_loss": 0.8558861, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87973034, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13946533, + "step": 9518, + "time_per_iteration": 2.9102940559387207 + }, + { + "auxiliary_loss_clip": 0.01348584, + "auxiliary_loss_mlp": 0.01028566, + "balance_loss_clip": 1.23902416, + "balance_loss_mlp": 1.01560175, + "epoch": 0.5723132421464001, + "flos": 15199954473600.0, + "grad_norm": 1.9474638126468244, + "language_loss": 0.79152024, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81529176, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12969971, + "step": 9519, + "time_per_iteration": 2.7623977661132812 + }, + { + "auxiliary_loss_clip": 0.01356298, + "auxiliary_loss_mlp": 0.01027666, + "balance_loss_clip": 1.24363136, + "balance_loss_mlp": 1.01495838, + "epoch": 0.5723733653990681, + "flos": 27605151498960.0, + "grad_norm": 1.5604373652138699, + "language_loss": 0.82908314, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.8529228, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.1270752, + "step": 9520, + "time_per_iteration": 2.8858046531677246 + }, + { + "auxiliary_loss_clip": 0.01361868, + "auxiliary_loss_mlp": 0.01040891, + "balance_loss_clip": 1.24700963, + "balance_loss_mlp": 1.02703297, + "epoch": 0.572433488651736, + "flos": 18227400781320.0, + "grad_norm": 2.103743883196804, + "language_loss": 0.73057604, + "learning_rate": 1.630012862105243e-06, + "loss": 0.75460362, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13861084, + "step": 9521, + "time_per_iteration": 2.7183997631073 + }, + { + "auxiliary_loss_clip": 0.01352841, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.23955381, + "balance_loss_mlp": 1.02016544, + "epoch": 0.5724936119044041, + "flos": 31255444410480.0, + "grad_norm": 2.0019857489419643, + "language_loss": 0.78258806, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80644929, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13110352, + "step": 9522, + "time_per_iteration": 2.8569676876068115 + }, + { + "auxiliary_loss_clip": 0.0134673, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.23764908, + "balance_loss_mlp": 1.01723814, + "epoch": 0.572553735157072, + "flos": 19206091065240.0, + "grad_norm": 1.5057634218364921, + "language_loss": 0.7214427, + "learning_rate": 1.629247411248102e-06, + "loss": 0.74520385, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12164307, + "step": 9523, + "time_per_iteration": 2.7800064086914062 + }, + { + "auxiliary_loss_clip": 0.01350155, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.23992991, + "balance_loss_mlp": 1.01946533, + "epoch": 0.57261385840974, + "flos": 21219859663920.0, + "grad_norm": 1.7443221263120505, + "language_loss": 0.70484227, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72866654, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12805176, + "step": 9524, + "time_per_iteration": 2.764134168624878 + }, + { + "auxiliary_loss_clip": 0.01354647, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.24235392, + "balance_loss_mlp": 1.01668572, + "epoch": 0.5726739816624079, + "flos": 33991093067760.0, + "grad_norm": 1.8547925274689072, + "language_loss": 0.65747738, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.68132269, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13214111, + "step": 9525, + "time_per_iteration": 2.8585009574890137 + }, + { + "auxiliary_loss_clip": 0.01349177, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.23787689, + "balance_loss_mlp": 1.01798093, + "epoch": 0.5727341049150759, + "flos": 24280831495800.0, + "grad_norm": 1.6305737871160408, + "language_loss": 0.72637182, + "learning_rate": 1.628099340440984e-06, + "loss": 0.75017393, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13043213, + "step": 9526, + "time_per_iteration": 2.806457281112671 + }, + { + "auxiliary_loss_clip": 0.01348259, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.23864067, + "balance_loss_mlp": 1.01710761, + "epoch": 0.5727942281677438, + "flos": 28406183897160.0, + "grad_norm": 1.5432429055411891, + "language_loss": 0.80484051, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82861942, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12530518, + "step": 9527, + "time_per_iteration": 2.82030987739563 + }, + { + "auxiliary_loss_clip": 0.01349148, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.24006414, + "balance_loss_mlp": 1.02512872, + "epoch": 0.5728543514204119, + "flos": 19541525721480.0, + "grad_norm": 1.701157133125045, + "language_loss": 0.72856575, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.75244331, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.1348877, + "step": 9528, + "time_per_iteration": 2.7388389110565186 + }, + { + "auxiliary_loss_clip": 0.01353691, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.24106693, + "balance_loss_mlp": 1.01876092, + "epoch": 0.5729144746730798, + "flos": 21511738531080.0, + "grad_norm": 1.7399450626858446, + "language_loss": 0.86756349, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.89142203, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13397217, + "step": 9529, + "time_per_iteration": 2.7851529121398926 + }, + { + "auxiliary_loss_clip": 0.01189556, + "auxiliary_loss_mlp": 0.01014682, + "balance_loss_clip": 1.14532459, + "balance_loss_mlp": 1.01242948, + "epoch": 0.5729745979257478, + "flos": 58695689048760.0, + "grad_norm": 0.7658212125000742, + "language_loss": 0.56156325, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58360565, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.02258301, + "step": 9530, + "time_per_iteration": 3.1019110679626465 + }, + { + "auxiliary_loss_clip": 0.01364229, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.24899757, + "balance_loss_mlp": 1.01810622, + "epoch": 0.5730347211784158, + "flos": 18556784791920.0, + "grad_norm": 1.6908763016121893, + "language_loss": 0.67015767, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.69411355, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13269043, + "step": 9531, + "time_per_iteration": 2.7776076793670654 + }, + { + "auxiliary_loss_clip": 0.01358518, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.24457622, + "balance_loss_mlp": 1.01598907, + "epoch": 0.5730948444310837, + "flos": 38038186513440.0, + "grad_norm": 2.252581552773886, + "language_loss": 0.76164693, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78553396, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14196777, + "step": 9532, + "time_per_iteration": 2.8562569618225098 + }, + { + "auxiliary_loss_clip": 0.01357282, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.24328446, + "balance_loss_mlp": 1.01748276, + "epoch": 0.5731549676837517, + "flos": 25232371768080.0, + "grad_norm": 1.3071607917941124, + "language_loss": 0.78753555, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81141925, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13598633, + "step": 9533, + "time_per_iteration": 2.8129799365997314 + }, + { + "auxiliary_loss_clip": 0.01351013, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.24143732, + "balance_loss_mlp": 1.01580679, + "epoch": 0.5732150909364196, + "flos": 23373537354720.0, + "grad_norm": 1.5232266751174717, + "language_loss": 0.85880494, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.88260067, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.12762451, + "step": 9534, + "time_per_iteration": 2.768453598022461 + }, + { + "auxiliary_loss_clip": 0.01356174, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.24378872, + "balance_loss_mlp": 1.01715279, + "epoch": 0.5732752141890877, + "flos": 23085028981440.0, + "grad_norm": 1.8684396489327146, + "language_loss": 0.75110447, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77498418, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14642334, + "step": 9535, + "time_per_iteration": 2.839590311050415 + }, + { + "auxiliary_loss_clip": 0.01366269, + "auxiliary_loss_mlp": 0.01034145, + "balance_loss_clip": 1.24993539, + "balance_loss_mlp": 1.0200901, + "epoch": 0.5733353374417556, + "flos": 24357587942160.0, + "grad_norm": 1.7445644946200394, + "language_loss": 0.71011603, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73412013, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.140625, + "step": 9536, + "time_per_iteration": 2.786259174346924 + }, + { + "auxiliary_loss_clip": 0.01355992, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.24448264, + "balance_loss_mlp": 1.01605368, + "epoch": 0.5733954606944236, + "flos": 27204614995680.0, + "grad_norm": 1.9548083513943422, + "language_loss": 0.69965315, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.72350603, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13226318, + "step": 9537, + "time_per_iteration": 4.308429479598999 + }, + { + "auxiliary_loss_clip": 0.01356245, + "auxiliary_loss_mlp": 0.01035507, + "balance_loss_clip": 1.2429111, + "balance_loss_mlp": 1.0218215, + "epoch": 0.5734555839470915, + "flos": 28770758374680.0, + "grad_norm": 2.485838816839336, + "language_loss": 0.63114309, + "learning_rate": 1.623508330355902e-06, + "loss": 0.65506065, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13677979, + "step": 9538, + "time_per_iteration": 4.235944986343384 + }, + { + "auxiliary_loss_clip": 0.01359235, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.24703741, + "balance_loss_mlp": 1.02055728, + "epoch": 0.5735157071997595, + "flos": 22972066859160.0, + "grad_norm": 1.6432984355710656, + "language_loss": 0.83770436, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.86164242, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14013672, + "step": 9539, + "time_per_iteration": 2.8273983001708984 + }, + { + "auxiliary_loss_clip": 0.01363498, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.2482121, + "balance_loss_mlp": 1.01633763, + "epoch": 0.5735758304524274, + "flos": 18994136096520.0, + "grad_norm": 3.806612749874315, + "language_loss": 0.73018444, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75412619, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14361572, + "step": 9540, + "time_per_iteration": 2.7338528633117676 + }, + { + "auxiliary_loss_clip": 0.01356335, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.24365628, + "balance_loss_mlp": 1.01589501, + "epoch": 0.5736359537050955, + "flos": 28403219486880.0, + "grad_norm": 1.7407069199225649, + "language_loss": 0.80326188, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82711375, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12963867, + "step": 9541, + "time_per_iteration": 2.828282117843628 + }, + { + "auxiliary_loss_clip": 0.01371569, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.25498343, + "balance_loss_mlp": 1.01878285, + "epoch": 0.5736960769577634, + "flos": 15630727223880.0, + "grad_norm": 1.995007254018245, + "language_loss": 0.65083337, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.67487621, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13909912, + "step": 9542, + "time_per_iteration": 2.756741523742676 + }, + { + "auxiliary_loss_clip": 0.0136163, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.24837613, + "balance_loss_mlp": 1.02080369, + "epoch": 0.5737562002104314, + "flos": 18008501783040.0, + "grad_norm": 2.1949170469672135, + "language_loss": 0.83642936, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.86038303, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12939453, + "step": 9543, + "time_per_iteration": 2.823530673980713 + }, + { + "auxiliary_loss_clip": 0.01366598, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.24993527, + "balance_loss_mlp": 1.0199151, + "epoch": 0.5738163234630994, + "flos": 20702300202360.0, + "grad_norm": 1.769145921407272, + "language_loss": 0.73739445, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.76140404, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14453125, + "step": 9544, + "time_per_iteration": 2.7661349773406982 + }, + { + "auxiliary_loss_clip": 0.01363035, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.2476902, + "balance_loss_mlp": 1.01388884, + "epoch": 0.5738764467157673, + "flos": 23154597748080.0, + "grad_norm": 1.7610908052876437, + "language_loss": 0.76307279, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78697968, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13775635, + "step": 9545, + "time_per_iteration": 4.350647926330566 + }, + { + "auxiliary_loss_clip": 0.01357131, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.24435091, + "balance_loss_mlp": 1.01859283, + "epoch": 0.5739365699684353, + "flos": 29498404820400.0, + "grad_norm": 2.3310135258718137, + "language_loss": 0.56373286, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58762836, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.1383667, + "step": 9546, + "time_per_iteration": 2.791762590408325 + }, + { + "auxiliary_loss_clip": 0.01366272, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.25010657, + "balance_loss_mlp": 1.02057683, + "epoch": 0.5739966932211032, + "flos": 14031261362520.0, + "grad_norm": 2.2902212100491455, + "language_loss": 0.77217114, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.79617345, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.1338501, + "step": 9547, + "time_per_iteration": 2.746405601501465 + }, + { + "auxiliary_loss_clip": 0.01357061, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.24331927, + "balance_loss_mlp": 1.017712, + "epoch": 0.5740568164737713, + "flos": 19066669273440.0, + "grad_norm": 1.9345790972647776, + "language_loss": 0.74340212, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76729435, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14465332, + "step": 9548, + "time_per_iteration": 2.7669684886932373 + }, + { + "auxiliary_loss_clip": 0.01360587, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.24561942, + "balance_loss_mlp": 1.01764023, + "epoch": 0.5741169397264392, + "flos": 22133001408840.0, + "grad_norm": 2.08973343009088, + "language_loss": 0.69875336, + "learning_rate": 1.619301709822355e-06, + "loss": 0.72267276, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13720703, + "step": 9549, + "time_per_iteration": 2.8662962913513184 + }, + { + "auxiliary_loss_clip": 0.01359375, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.24844038, + "balance_loss_mlp": 1.01890516, + "epoch": 0.5741770629791072, + "flos": 24942320277120.0, + "grad_norm": 1.5330095784557347, + "language_loss": 0.79509652, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81901145, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 1.10888672, + "router_z_loss_mlp": 0.13214111, + "step": 9550, + "time_per_iteration": 4.280382871627808 + }, + { + "auxiliary_loss_clip": 0.01361179, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.2475853, + "balance_loss_mlp": 1.01740575, + "epoch": 0.5742371862317751, + "flos": 18805392128520.0, + "grad_norm": 1.865881752635382, + "language_loss": 0.68460053, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70852566, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13928223, + "step": 9551, + "time_per_iteration": 2.77695631980896 + }, + { + "auxiliary_loss_clip": 0.01367409, + "auxiliary_loss_mlp": 0.01037883, + "balance_loss_clip": 1.25078881, + "balance_loss_mlp": 1.02429891, + "epoch": 0.5742973094844431, + "flos": 24465839494680.0, + "grad_norm": 1.6913602483997454, + "language_loss": 0.72257674, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74662966, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13568115, + "step": 9552, + "time_per_iteration": 2.8062469959259033 + }, + { + "auxiliary_loss_clip": 0.01356301, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.24454832, + "balance_loss_mlp": 1.01575112, + "epoch": 0.574357432737111, + "flos": 21657779485560.0, + "grad_norm": 1.7741505612970385, + "language_loss": 0.8012073, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82505429, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.12640381, + "step": 9553, + "time_per_iteration": 2.7330355644226074 + }, + { + "auxiliary_loss_clip": 0.01365919, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.25020897, + "balance_loss_mlp": 1.01437068, + "epoch": 0.5744175559897791, + "flos": 16549066838880.0, + "grad_norm": 2.1407427274696844, + "language_loss": 0.83327377, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85720342, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.12677002, + "step": 9554, + "time_per_iteration": 2.74714994430542 + }, + { + "auxiliary_loss_clip": 0.01368075, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.25298476, + "balance_loss_mlp": 1.01970077, + "epoch": 0.574477679242447, + "flos": 24213049497000.0, + "grad_norm": 1.6543827579351875, + "language_loss": 0.71072555, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.73474211, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13885498, + "step": 9555, + "time_per_iteration": 2.7999374866485596 + }, + { + "auxiliary_loss_clip": 0.01357566, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.24473464, + "balance_loss_mlp": 1.01554322, + "epoch": 0.574537802495115, + "flos": 14907141614160.0, + "grad_norm": 2.409571366570797, + "language_loss": 0.73244387, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.75631225, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13739014, + "step": 9556, + "time_per_iteration": 2.743173599243164 + }, + { + "auxiliary_loss_clip": 0.01355449, + "auxiliary_loss_mlp": 0.01030139, + "balance_loss_clip": 1.24342287, + "balance_loss_mlp": 1.01673412, + "epoch": 0.5745979257477829, + "flos": 24940208642400.0, + "grad_norm": 1.5420130428697751, + "language_loss": 0.7444014, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76825726, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1340332, + "step": 9557, + "time_per_iteration": 2.837790012359619 + }, + { + "auxiliary_loss_clip": 0.01352021, + "auxiliary_loss_mlp": 0.0103685, + "balance_loss_clip": 1.23976934, + "balance_loss_mlp": 1.02383208, + "epoch": 0.5746580490004509, + "flos": 17239898483280.0, + "grad_norm": 1.4956841992710748, + "language_loss": 0.68283832, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70672697, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13000488, + "step": 9558, + "time_per_iteration": 2.7118349075317383 + }, + { + "auxiliary_loss_clip": 0.01373969, + "auxiliary_loss_mlp": 0.01044027, + "balance_loss_clip": 1.2548697, + "balance_loss_mlp": 1.02788615, + "epoch": 0.5747181722531189, + "flos": 13191749220240.0, + "grad_norm": 1.904712319641505, + "language_loss": 0.70877081, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73295081, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.16137695, + "step": 9559, + "time_per_iteration": 2.7426769733428955 + }, + { + "auxiliary_loss_clip": 0.01359116, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.24795842, + "balance_loss_mlp": 1.0160203, + "epoch": 0.5747782955057869, + "flos": 22967762373000.0, + "grad_norm": 1.6530328582508123, + "language_loss": 0.7892983, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81316817, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.11853027, + "step": 9560, + "time_per_iteration": 2.8267552852630615 + }, + { + "auxiliary_loss_clip": 0.01365152, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.24999547, + "balance_loss_mlp": 1.01787508, + "epoch": 0.5748384187584549, + "flos": 23408362346400.0, + "grad_norm": 2.028584361502932, + "language_loss": 0.64396226, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66793287, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14019775, + "step": 9561, + "time_per_iteration": 2.8487775325775146 + }, + { + "auxiliary_loss_clip": 0.0137581, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.25616217, + "balance_loss_mlp": 1.0195812, + "epoch": 0.5748985420111228, + "flos": 17790049476720.0, + "grad_norm": 1.5379845875324878, + "language_loss": 0.71245229, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73655218, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.14605713, + "step": 9562, + "time_per_iteration": 2.772620439529419 + }, + { + "auxiliary_loss_clip": 0.01357731, + "auxiliary_loss_mlp": 0.01037302, + "balance_loss_clip": 1.24537778, + "balance_loss_mlp": 1.02439165, + "epoch": 0.5749586652637908, + "flos": 19871193990600.0, + "grad_norm": 1.5154137834461272, + "language_loss": 0.84184337, + "learning_rate": 1.613950357999751e-06, + "loss": 0.8657937, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12902832, + "step": 9563, + "time_per_iteration": 2.760073184967041 + }, + { + "auxiliary_loss_clip": 0.01371151, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.25409412, + "balance_loss_mlp": 1.02688885, + "epoch": 0.5750187885164587, + "flos": 21292108582320.0, + "grad_norm": 1.7790565620872427, + "language_loss": 0.5787369, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.6028555, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.13818359, + "step": 9564, + "time_per_iteration": 2.736394166946411 + }, + { + "auxiliary_loss_clip": 0.01343845, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.23564887, + "balance_loss_mlp": 1.01942587, + "epoch": 0.5750789117691267, + "flos": 18809006272560.0, + "grad_norm": 1.6295696689159138, + "language_loss": 0.75919139, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78294766, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1237793, + "step": 9565, + "time_per_iteration": 2.725135087966919 + }, + { + "auxiliary_loss_clip": 0.01196138, + "auxiliary_loss_mlp": 0.01007947, + "balance_loss_clip": 1.15190578, + "balance_loss_mlp": 1.00512159, + "epoch": 0.5751390350217946, + "flos": 70680712105800.0, + "grad_norm": 0.7379145353033765, + "language_loss": 0.60816681, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.63020766, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02819824, + "step": 9566, + "time_per_iteration": 3.3283233642578125 + }, + { + "auxiliary_loss_clip": 0.01358304, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.24519622, + "balance_loss_mlp": 1.02181685, + "epoch": 0.5751991582744627, + "flos": 14250404010960.0, + "grad_norm": 2.0826317905330476, + "language_loss": 0.75819927, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.78213078, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13031006, + "step": 9567, + "time_per_iteration": 2.7053537368774414 + }, + { + "auxiliary_loss_clip": 0.01359707, + "auxiliary_loss_mlp": 0.01035601, + "balance_loss_clip": 1.24707258, + "balance_loss_mlp": 1.02264917, + "epoch": 0.5752592815271306, + "flos": 18332160014880.0, + "grad_norm": 1.7035387704174558, + "language_loss": 0.74874812, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.77270114, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.1295166, + "step": 9568, + "time_per_iteration": 2.748555898666382 + }, + { + "auxiliary_loss_clip": 0.01363066, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.24931967, + "balance_loss_mlp": 1.01660943, + "epoch": 0.5753194047797986, + "flos": 20927615321520.0, + "grad_norm": 1.6862202644889406, + "language_loss": 0.71179831, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.7357291, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13409424, + "step": 9569, + "time_per_iteration": 2.7367000579833984 + }, + { + "auxiliary_loss_clip": 0.01361144, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.24822855, + "balance_loss_mlp": 1.02711368, + "epoch": 0.5753795280324665, + "flos": 19286583480720.0, + "grad_norm": 2.1535962006163683, + "language_loss": 0.55844247, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.58246404, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13885498, + "step": 9570, + "time_per_iteration": 2.768772840499878 + }, + { + "auxiliary_loss_clip": 0.01350579, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.24065113, + "balance_loss_mlp": 1.02159059, + "epoch": 0.5754396512851345, + "flos": 21657901310640.0, + "grad_norm": 1.4201355288470408, + "language_loss": 0.6472708, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.67111945, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12719727, + "step": 9571, + "time_per_iteration": 2.7551231384277344 + }, + { + "auxiliary_loss_clip": 0.01356418, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.24486697, + "balance_loss_mlp": 1.01726747, + "epoch": 0.5754997745378025, + "flos": 51030268116840.0, + "grad_norm": 1.485293779352867, + "language_loss": 0.67333531, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69720316, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13098145, + "step": 9572, + "time_per_iteration": 3.011691093444824 + }, + { + "auxiliary_loss_clip": 0.01352252, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.24091041, + "balance_loss_mlp": 1.02171302, + "epoch": 0.5755598977904705, + "flos": 22862069147160.0, + "grad_norm": 1.9497203076511818, + "language_loss": 0.72921318, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.75309163, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13885498, + "step": 9573, + "time_per_iteration": 2.7903683185577393 + }, + { + "auxiliary_loss_clip": 0.01344582, + "auxiliary_loss_mlp": 0.01026998, + "balance_loss_clip": 1.23828387, + "balance_loss_mlp": 1.01498723, + "epoch": 0.5756200210431385, + "flos": 38480045346000.0, + "grad_norm": 1.8884389746292871, + "language_loss": 0.76382959, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78754544, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12005615, + "step": 9574, + "time_per_iteration": 2.8923697471618652 + }, + { + "auxiliary_loss_clip": 0.01370263, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.25289917, + "balance_loss_mlp": 1.02028477, + "epoch": 0.5756801442958064, + "flos": 23914673292240.0, + "grad_norm": 2.927194463546883, + "language_loss": 0.67378366, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.69783384, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14459229, + "step": 9575, + "time_per_iteration": 2.8832242488861084 + }, + { + "auxiliary_loss_clip": 0.01343783, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.23578084, + "balance_loss_mlp": 1.01967144, + "epoch": 0.5757402675484744, + "flos": 21110186818800.0, + "grad_norm": 1.435657154079831, + "language_loss": 0.79911447, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.82287824, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.12908936, + "step": 9576, + "time_per_iteration": 4.232696294784546 + }, + { + "auxiliary_loss_clip": 0.01354918, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.24520326, + "balance_loss_mlp": 1.01489747, + "epoch": 0.5758003908011423, + "flos": 20564096661360.0, + "grad_norm": 1.767795141150298, + "language_loss": 0.69771981, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.72154433, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.12640381, + "step": 9577, + "time_per_iteration": 2.7393107414245605 + }, + { + "auxiliary_loss_clip": 0.01360148, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.2444613, + "balance_loss_mlp": 1.01914752, + "epoch": 0.5758605140538103, + "flos": 16477508262600.0, + "grad_norm": 1.67485963000108, + "language_loss": 0.66673791, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.69066024, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.12939453, + "step": 9578, + "time_per_iteration": 4.146129608154297 + }, + { + "auxiliary_loss_clip": 0.01347537, + "auxiliary_loss_mlp": 0.0102745, + "balance_loss_clip": 1.23755479, + "balance_loss_mlp": 1.0150578, + "epoch": 0.5759206373064782, + "flos": 21292474057560.0, + "grad_norm": 1.8821149310031149, + "language_loss": 0.72672915, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75047904, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12408447, + "step": 9579, + "time_per_iteration": 2.761507034301758 + }, + { + "auxiliary_loss_clip": 0.0136722, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.24993706, + "balance_loss_mlp": 1.01581836, + "epoch": 0.5759807605591463, + "flos": 26073508244760.0, + "grad_norm": 2.939140472077949, + "language_loss": 0.65073216, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.67470944, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.14685059, + "step": 9580, + "time_per_iteration": 2.7651054859161377 + }, + { + "auxiliary_loss_clip": 0.01357698, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.2443378, + "balance_loss_mlp": 1.01891994, + "epoch": 0.5760408838118142, + "flos": 18876828879720.0, + "grad_norm": 1.71641095171937, + "language_loss": 0.85496491, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87886554, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13439941, + "step": 9581, + "time_per_iteration": 2.7881290912628174 + }, + { + "auxiliary_loss_clip": 0.01367467, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.25013578, + "balance_loss_mlp": 1.01997399, + "epoch": 0.5761010070644822, + "flos": 15382444754160.0, + "grad_norm": 2.6756796269811733, + "language_loss": 0.67599547, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.70000732, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.1373291, + "step": 9582, + "time_per_iteration": 4.18100905418396 + }, + { + "auxiliary_loss_clip": 0.01202364, + "auxiliary_loss_mlp": 0.01004204, + "balance_loss_clip": 1.15749109, + "balance_loss_mlp": 1.00168824, + "epoch": 0.5761611303171501, + "flos": 71490312867960.0, + "grad_norm": 0.6465553991813274, + "language_loss": 0.57299531, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59506094, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02514648, + "step": 9583, + "time_per_iteration": 3.4575130939483643 + }, + { + "auxiliary_loss_clip": 0.01356235, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.24441314, + "balance_loss_mlp": 1.0159632, + "epoch": 0.5762212535698181, + "flos": 16249594208400.0, + "grad_norm": 1.8038766406751385, + "language_loss": 0.82945061, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.85330248, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13000488, + "step": 9584, + "time_per_iteration": 2.719637870788574 + }, + { + "auxiliary_loss_clip": 0.01200201, + "auxiliary_loss_mlp": 0.01002293, + "balance_loss_clip": 1.15521121, + "balance_loss_mlp": 0.99968225, + "epoch": 0.5762813768224861, + "flos": 70202485163880.0, + "grad_norm": 0.659298768623583, + "language_loss": 0.49647582, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51850075, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02612305, + "step": 9585, + "time_per_iteration": 3.2635107040405273 + }, + { + "auxiliary_loss_clip": 0.01342064, + "auxiliary_loss_mlp": 0.01027708, + "balance_loss_clip": 1.23173082, + "balance_loss_mlp": 1.01514316, + "epoch": 0.5763415000751541, + "flos": 20522408856840.0, + "grad_norm": 1.4373488683989701, + "language_loss": 0.85049999, + "learning_rate": 1.605165098835465e-06, + "loss": 0.87419766, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12579346, + "step": 9586, + "time_per_iteration": 2.76348614692688 + }, + { + "auxiliary_loss_clip": 0.01353375, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.24127853, + "balance_loss_mlp": 1.01945686, + "epoch": 0.5764016233278221, + "flos": 15819836667120.0, + "grad_norm": 1.6763933607419914, + "language_loss": 0.80259752, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82646734, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14141846, + "step": 9587, + "time_per_iteration": 4.176113605499268 + }, + { + "auxiliary_loss_clip": 0.01353323, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.24217141, + "balance_loss_mlp": 1.02183223, + "epoch": 0.57646174658049, + "flos": 20776579538760.0, + "grad_norm": 1.5263419021049465, + "language_loss": 0.66326427, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68714941, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13348389, + "step": 9588, + "time_per_iteration": 2.7345733642578125 + }, + { + "auxiliary_loss_clip": 0.01354804, + "auxiliary_loss_mlp": 0.01030273, + "balance_loss_clip": 1.24205947, + "balance_loss_mlp": 1.01634383, + "epoch": 0.576521869833158, + "flos": 23555174859720.0, + "grad_norm": 1.819014119627254, + "language_loss": 0.78908759, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81293833, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13928223, + "step": 9589, + "time_per_iteration": 2.7883810997009277 + }, + { + "auxiliary_loss_clip": 0.01340978, + "auxiliary_loss_mlp": 0.01023061, + "balance_loss_clip": 1.23311114, + "balance_loss_mlp": 1.01072323, + "epoch": 0.5765819930858259, + "flos": 20271446235360.0, + "grad_norm": 1.9287505440906816, + "language_loss": 0.80052221, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.8241626, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12335205, + "step": 9590, + "time_per_iteration": 2.750042676925659 + }, + { + "auxiliary_loss_clip": 0.01352206, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.2414093, + "balance_loss_mlp": 1.0185225, + "epoch": 0.5766421163384939, + "flos": 23153988622680.0, + "grad_norm": 1.722435424029928, + "language_loss": 0.6305595, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65439016, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12347412, + "step": 9591, + "time_per_iteration": 2.801720380783081 + }, + { + "auxiliary_loss_clip": 0.01356369, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.24299979, + "balance_loss_mlp": 1.02640784, + "epoch": 0.5767022395911618, + "flos": 25854487421400.0, + "grad_norm": 1.9527556572129752, + "language_loss": 0.7806046, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80456853, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 1.13525391, + "router_z_loss_mlp": 0.1362915, + "step": 9592, + "time_per_iteration": 2.8162007331848145 + }, + { + "auxiliary_loss_clip": 0.01197319, + "auxiliary_loss_mlp": 0.01006724, + "balance_loss_clip": 1.1525588, + "balance_loss_mlp": 1.00439918, + "epoch": 0.5767623628438299, + "flos": 68310693743400.0, + "grad_norm": 0.7340502626557428, + "language_loss": 0.59637457, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.618415, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02319336, + "step": 9593, + "time_per_iteration": 3.3669798374176025 + }, + { + "auxiliary_loss_clip": 0.01361497, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.24594879, + "balance_loss_mlp": 1.0226903, + "epoch": 0.5768224860964978, + "flos": 30194312509800.0, + "grad_norm": 1.7678955740648539, + "language_loss": 0.71249843, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73647523, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13500977, + "step": 9594, + "time_per_iteration": 2.920069456100464 + }, + { + "auxiliary_loss_clip": 0.01346972, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.23578751, + "balance_loss_mlp": 1.0161252, + "epoch": 0.5768826093491658, + "flos": 17899600496760.0, + "grad_norm": 1.9518022992019286, + "language_loss": 0.71588802, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73964119, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12213135, + "step": 9595, + "time_per_iteration": 2.818167209625244 + }, + { + "auxiliary_loss_clip": 0.01353507, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.24246311, + "balance_loss_mlp": 1.01980436, + "epoch": 0.5769427326018337, + "flos": 17461680675120.0, + "grad_norm": 2.088963126122798, + "language_loss": 0.70260727, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72647077, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13037109, + "step": 9596, + "time_per_iteration": 2.825456142425537 + }, + { + "auxiliary_loss_clip": 0.0136815, + "auxiliary_loss_mlp": 0.01037355, + "balance_loss_clip": 1.25115609, + "balance_loss_mlp": 1.02276957, + "epoch": 0.5770028558545017, + "flos": 39429108508320.0, + "grad_norm": 2.152798097260962, + "language_loss": 0.67649376, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.70054877, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.14581299, + "step": 9597, + "time_per_iteration": 2.9175164699554443 + }, + { + "auxiliary_loss_clip": 0.01354794, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.24324584, + "balance_loss_mlp": 1.02193403, + "epoch": 0.5770629791071697, + "flos": 21540472268760.0, + "grad_norm": 1.8138766079815778, + "language_loss": 0.81402743, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83792371, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12902832, + "step": 9598, + "time_per_iteration": 2.772503137588501 + }, + { + "auxiliary_loss_clip": 0.01354113, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.24141264, + "balance_loss_mlp": 1.02514958, + "epoch": 0.5771231023598377, + "flos": 20891734512480.0, + "grad_norm": 1.5732853825493125, + "language_loss": 0.73463881, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.75856102, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.1295166, + "step": 9599, + "time_per_iteration": 2.7732744216918945 + }, + { + "auxiliary_loss_clip": 0.01344109, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.23632455, + "balance_loss_mlp": 1.01829517, + "epoch": 0.5771832256125057, + "flos": 18081765910440.0, + "grad_norm": 1.6978699132547141, + "language_loss": 0.78247476, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80621958, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12078857, + "step": 9600, + "time_per_iteration": 2.7287116050720215 + }, + { + "auxiliary_loss_clip": 0.01358735, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.24477649, + "balance_loss_mlp": 1.02537274, + "epoch": 0.5772433488651736, + "flos": 26364656161440.0, + "grad_norm": 1.5823960966962172, + "language_loss": 0.72801644, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.75198865, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13116455, + "step": 9601, + "time_per_iteration": 2.83225154876709 + }, + { + "auxiliary_loss_clip": 0.01349778, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.23999214, + "balance_loss_mlp": 1.0213089, + "epoch": 0.5773034721178416, + "flos": 19685536257960.0, + "grad_norm": 1.6034294048421782, + "language_loss": 0.6869148, + "learning_rate": 1.599058274973348e-06, + "loss": 0.71075231, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12652588, + "step": 9602, + "time_per_iteration": 2.736306667327881 + }, + { + "auxiliary_loss_clip": 0.0134007, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.23414838, + "balance_loss_mlp": 1.01574326, + "epoch": 0.5773635953705095, + "flos": 25088401839960.0, + "grad_norm": 1.5085233745125424, + "language_loss": 0.73273438, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75640523, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11260986, + "step": 9603, + "time_per_iteration": 2.8258743286132812 + }, + { + "auxiliary_loss_clip": 0.0134978, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.24034667, + "balance_loss_mlp": 1.01545966, + "epoch": 0.5774237186231775, + "flos": 21037978508760.0, + "grad_norm": 2.3595836539631287, + "language_loss": 0.767308, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.79108298, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12249756, + "step": 9604, + "time_per_iteration": 2.73648738861084 + }, + { + "auxiliary_loss_clip": 0.01356762, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.24373567, + "balance_loss_mlp": 1.02199876, + "epoch": 0.5774838418758454, + "flos": 15235957107720.0, + "grad_norm": 1.6625765418113696, + "language_loss": 0.83797193, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.86189574, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13616943, + "step": 9605, + "time_per_iteration": 2.754310131072998 + }, + { + "auxiliary_loss_clip": 0.01377321, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.25855255, + "balance_loss_mlp": 1.01506996, + "epoch": 0.5775439651285135, + "flos": 23587360308000.0, + "grad_norm": 1.6831783206245174, + "language_loss": 0.78063416, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80471092, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.15270996, + "step": 9606, + "time_per_iteration": 2.7486040592193604 + }, + { + "auxiliary_loss_clip": 0.01354632, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.24302983, + "balance_loss_mlp": 1.01669776, + "epoch": 0.5776040883811814, + "flos": 18045032325840.0, + "grad_norm": 1.9918206795330866, + "language_loss": 0.74050045, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76434016, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12652588, + "step": 9607, + "time_per_iteration": 2.7352030277252197 + }, + { + "auxiliary_loss_clip": 0.01353466, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.2413137, + "balance_loss_mlp": 1.01639581, + "epoch": 0.5776642116338494, + "flos": 18629277360480.0, + "grad_norm": 1.6513262195472296, + "language_loss": 0.69254571, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71637619, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13183594, + "step": 9608, + "time_per_iteration": 2.7558205127716064 + }, + { + "auxiliary_loss_clip": 0.01355081, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.24301052, + "balance_loss_mlp": 1.01638746, + "epoch": 0.5777243348865173, + "flos": 28408417356960.0, + "grad_norm": 1.8755829667263715, + "language_loss": 0.77255476, + "learning_rate": 1.596387759940665e-06, + "loss": 0.79640192, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13256836, + "step": 9609, + "time_per_iteration": 2.8114655017852783 + }, + { + "auxiliary_loss_clip": 0.01350389, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.23685527, + "balance_loss_mlp": 1.01515126, + "epoch": 0.5777844581391853, + "flos": 24030071916120.0, + "grad_norm": 1.617897672747286, + "language_loss": 0.77747703, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.80126548, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13305664, + "step": 9610, + "time_per_iteration": 2.8052170276641846 + }, + { + "auxiliary_loss_clip": 0.0135503, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.24258494, + "balance_loss_mlp": 1.01351047, + "epoch": 0.5778445813918534, + "flos": 17780831379000.0, + "grad_norm": 2.231930329622829, + "language_loss": 0.6929391, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.71676433, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13977051, + "step": 9611, + "time_per_iteration": 2.7049901485443115 + }, + { + "auxiliary_loss_clip": 0.01348624, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.23813844, + "balance_loss_mlp": 1.01367188, + "epoch": 0.5779047046445213, + "flos": 22237841859120.0, + "grad_norm": 1.7511377652191091, + "language_loss": 0.8360287, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85978508, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13330078, + "step": 9612, + "time_per_iteration": 2.8493425846099854 + }, + { + "auxiliary_loss_clip": 0.01349705, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.23975253, + "balance_loss_mlp": 1.01901007, + "epoch": 0.5779648278971893, + "flos": 21439408395960.0, + "grad_norm": 2.495210581257992, + "language_loss": 0.79867691, + "learning_rate": 1.594862087742667e-06, + "loss": 0.82249069, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12664795, + "step": 9613, + "time_per_iteration": 2.7352097034454346 + }, + { + "auxiliary_loss_clip": 0.01346156, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.23562217, + "balance_loss_mlp": 1.02010727, + "epoch": 0.5780249511498572, + "flos": 19030829072760.0, + "grad_norm": 1.7098495660467756, + "language_loss": 0.77877373, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.80256546, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12915039, + "step": 9614, + "time_per_iteration": 4.209221601486206 + }, + { + "auxiliary_loss_clip": 0.01357423, + "auxiliary_loss_mlp": 0.01027227, + "balance_loss_clip": 1.24281359, + "balance_loss_mlp": 1.01411366, + "epoch": 0.5780850744025252, + "flos": 12127734126000.0, + "grad_norm": 2.022450139486207, + "language_loss": 0.81367826, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83752477, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13110352, + "step": 9615, + "time_per_iteration": 2.728101968765259 + }, + { + "auxiliary_loss_clip": 0.01358639, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.24522865, + "balance_loss_mlp": 1.01757646, + "epoch": 0.5781451976551931, + "flos": 25049678445720.0, + "grad_norm": 1.4496621165254442, + "language_loss": 0.67092091, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69481504, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13208008, + "step": 9616, + "time_per_iteration": 4.320052623748779 + }, + { + "auxiliary_loss_clip": 0.01349041, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.23947716, + "balance_loss_mlp": 1.01737916, + "epoch": 0.5782053209078611, + "flos": 19250418413160.0, + "grad_norm": 1.571804771301, + "language_loss": 0.77780426, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.80160201, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13354492, + "step": 9617, + "time_per_iteration": 2.725492000579834 + }, + { + "auxiliary_loss_clip": 0.01347688, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.23756421, + "balance_loss_mlp": 1.01858044, + "epoch": 0.578265444160529, + "flos": 25999310125080.0, + "grad_norm": 1.4412459507921604, + "language_loss": 0.75471151, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77850902, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 1.10107422, + "router_z_loss_mlp": 0.13494873, + "step": 9618, + "time_per_iteration": 2.8324966430664062 + }, + { + "auxiliary_loss_clip": 0.0134464, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.23547339, + "balance_loss_mlp": 1.01248884, + "epoch": 0.5783255674131971, + "flos": 21803617398240.0, + "grad_norm": 1.6321452463392383, + "language_loss": 0.81764501, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84134585, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12945557, + "step": 9619, + "time_per_iteration": 2.7671728134155273 + }, + { + "auxiliary_loss_clip": 0.01349365, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.23708189, + "balance_loss_mlp": 1.01554191, + "epoch": 0.578385690665865, + "flos": 24795061071840.0, + "grad_norm": 1.936419743047936, + "language_loss": 0.7275238, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.751306, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13305664, + "step": 9620, + "time_per_iteration": 2.88624906539917 + }, + { + "auxiliary_loss_clip": 0.01354825, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.24235725, + "balance_loss_mlp": 1.01421928, + "epoch": 0.578445813918533, + "flos": 21217544987400.0, + "grad_norm": 1.568083002827529, + "language_loss": 0.77630293, + "learning_rate": 1.591811481689916e-06, + "loss": 0.80012512, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13195801, + "step": 9621, + "time_per_iteration": 4.249150991439819 + }, + { + "auxiliary_loss_clip": 0.01355367, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.24191594, + "balance_loss_mlp": 1.01934516, + "epoch": 0.5785059371712009, + "flos": 25052358597480.0, + "grad_norm": 1.4159262500957774, + "language_loss": 0.70414221, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72802591, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13641357, + "step": 9622, + "time_per_iteration": 2.79308819770813 + }, + { + "auxiliary_loss_clip": 0.01196032, + "auxiliary_loss_mlp": 0.01006792, + "balance_loss_clip": 1.15073133, + "balance_loss_mlp": 1.00472975, + "epoch": 0.5785660604238689, + "flos": 70860359801160.0, + "grad_norm": 0.7744242406954986, + "language_loss": 0.56076276, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58279103, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02062988, + "step": 9623, + "time_per_iteration": 3.284377336502075 + }, + { + "auxiliary_loss_clip": 0.01360592, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.24549448, + "balance_loss_mlp": 1.01797915, + "epoch": 0.578626183676537, + "flos": 31656305780640.0, + "grad_norm": 2.0047120309017203, + "language_loss": 0.71618831, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.74011481, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.140625, + "step": 9624, + "time_per_iteration": 2.827465057373047 + }, + { + "auxiliary_loss_clip": 0.01347903, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.23698747, + "balance_loss_mlp": 1.01895702, + "epoch": 0.5786863069292049, + "flos": 21869368979040.0, + "grad_norm": 2.486994949113607, + "language_loss": 0.82266867, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84647524, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13787842, + "step": 9625, + "time_per_iteration": 2.748669147491455 + }, + { + "auxiliary_loss_clip": 0.01344164, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.23386383, + "balance_loss_mlp": 1.01509917, + "epoch": 0.5787464301818729, + "flos": 23369841993960.0, + "grad_norm": 1.454381965087166, + "language_loss": 0.70507503, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72880906, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14147949, + "step": 9626, + "time_per_iteration": 4.229094505310059 + }, + { + "auxiliary_loss_clip": 0.01345512, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.23520875, + "balance_loss_mlp": 1.02001762, + "epoch": 0.5788065534345408, + "flos": 30009588769440.0, + "grad_norm": 1.4337128574899176, + "language_loss": 0.72141087, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.74519694, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1307373, + "step": 9627, + "time_per_iteration": 2.7982800006866455 + }, + { + "auxiliary_loss_clip": 0.01344605, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.23442423, + "balance_loss_mlp": 1.01842165, + "epoch": 0.5788666766872088, + "flos": 24532281417600.0, + "grad_norm": 2.31513111711863, + "language_loss": 0.8430897, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86684859, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12872314, + "step": 9628, + "time_per_iteration": 2.8398964405059814 + }, + { + "auxiliary_loss_clip": 0.01351365, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.23852825, + "balance_loss_mlp": 1.0177263, + "epoch": 0.5789267999398767, + "flos": 23738152440600.0, + "grad_norm": 1.5279679217006168, + "language_loss": 0.72078741, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74460739, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.12915039, + "step": 9629, + "time_per_iteration": 2.78745698928833 + }, + { + "auxiliary_loss_clip": 0.01350587, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.23978472, + "balance_loss_mlp": 1.01968873, + "epoch": 0.5789869231925447, + "flos": 21139082989920.0, + "grad_norm": 1.8767890753304308, + "language_loss": 0.75044644, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.77428246, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13330078, + "step": 9630, + "time_per_iteration": 2.722261428833008 + }, + { + "auxiliary_loss_clip": 0.01339131, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.23107421, + "balance_loss_mlp": 1.01664793, + "epoch": 0.5790470464452127, + "flos": 21213890235000.0, + "grad_norm": 1.5486044294454007, + "language_loss": 0.79099447, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81467605, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.1237793, + "step": 9631, + "time_per_iteration": 2.8494720458984375 + }, + { + "auxiliary_loss_clip": 0.01346202, + "auxiliary_loss_mlp": 0.01029166, + "balance_loss_clip": 1.23499095, + "balance_loss_mlp": 1.01639259, + "epoch": 0.5791071696978807, + "flos": 23409661813920.0, + "grad_norm": 1.6168631002708345, + "language_loss": 0.75684273, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.78059644, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12786865, + "step": 9632, + "time_per_iteration": 2.8739964962005615 + }, + { + "auxiliary_loss_clip": 0.01352053, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.24003899, + "balance_loss_mlp": 1.01502061, + "epoch": 0.5791672929505486, + "flos": 24211831246200.0, + "grad_norm": 2.249933780705409, + "language_loss": 0.79417133, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81798017, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13806152, + "step": 9633, + "time_per_iteration": 2.8002567291259766 + }, + { + "auxiliary_loss_clip": 0.01371968, + "auxiliary_loss_mlp": 0.01038957, + "balance_loss_clip": 1.25256383, + "balance_loss_mlp": 1.02430022, + "epoch": 0.5792274162032166, + "flos": 24353730147960.0, + "grad_norm": 2.414479992399952, + "language_loss": 0.77819484, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.80230403, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.14654541, + "step": 9634, + "time_per_iteration": 2.7727701663970947 + }, + { + "auxiliary_loss_clip": 0.0136097, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.24704814, + "balance_loss_mlp": 1.02277505, + "epoch": 0.5792875394558845, + "flos": 20454342599520.0, + "grad_norm": 2.3213008505048602, + "language_loss": 0.64453042, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.66849953, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13171387, + "step": 9635, + "time_per_iteration": 2.745900869369507 + }, + { + "auxiliary_loss_clip": 0.01347063, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.23784149, + "balance_loss_mlp": 1.02129722, + "epoch": 0.5793476627085525, + "flos": 24065587249920.0, + "grad_norm": 1.4071286863490051, + "language_loss": 0.77456421, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79837334, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12548828, + "step": 9636, + "time_per_iteration": 2.7719759941101074 + }, + { + "auxiliary_loss_clip": 0.01339373, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.23210216, + "balance_loss_mlp": 1.01754832, + "epoch": 0.5794077859612206, + "flos": 22059371806200.0, + "grad_norm": 1.5757193226076822, + "language_loss": 0.68355697, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70725381, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12762451, + "step": 9637, + "time_per_iteration": 2.7898154258728027 + }, + { + "auxiliary_loss_clip": 0.0135976, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.24509835, + "balance_loss_mlp": 1.02138042, + "epoch": 0.5794679092138885, + "flos": 11438283165840.0, + "grad_norm": 3.120512195627033, + "language_loss": 0.73128963, + "learning_rate": 1.585332242234043e-06, + "loss": 0.75523508, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.1340332, + "step": 9638, + "time_per_iteration": 2.73884654045105 + }, + { + "auxiliary_loss_clip": 0.0135376, + "auxiliary_loss_mlp": 0.01032679, + "balance_loss_clip": 1.24311066, + "balance_loss_mlp": 1.01998901, + "epoch": 0.5795280324665565, + "flos": 18884909943360.0, + "grad_norm": 1.6187129634577455, + "language_loss": 0.72403765, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74790204, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12683105, + "step": 9639, + "time_per_iteration": 2.7139639854431152 + }, + { + "auxiliary_loss_clip": 0.01356448, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.24511647, + "balance_loss_mlp": 1.02287769, + "epoch": 0.5795881557192244, + "flos": 13009868065080.0, + "grad_norm": 1.8516805268377332, + "language_loss": 0.70338666, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.72731197, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13195801, + "step": 9640, + "time_per_iteration": 2.8249638080596924 + }, + { + "auxiliary_loss_clip": 0.01374427, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.25469995, + "balance_loss_mlp": 1.03078997, + "epoch": 0.5796482789718924, + "flos": 19936458271080.0, + "grad_norm": 2.282500204016512, + "language_loss": 0.78132689, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.8055191, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.13995361, + "step": 9641, + "time_per_iteration": 2.7157318592071533 + }, + { + "auxiliary_loss_clip": 0.01351624, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.23999476, + "balance_loss_mlp": 1.02493405, + "epoch": 0.5797084022245603, + "flos": 21655546025760.0, + "grad_norm": 1.7268689874852132, + "language_loss": 0.7421664, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.76606053, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.12860107, + "step": 9642, + "time_per_iteration": 2.809234380722046 + }, + { + "auxiliary_loss_clip": 0.01351734, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.24062347, + "balance_loss_mlp": 1.01967955, + "epoch": 0.5797685254772283, + "flos": 26037221352120.0, + "grad_norm": 1.561617300847185, + "language_loss": 0.73806226, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.76191044, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.1338501, + "step": 9643, + "time_per_iteration": 2.798153877258301 + }, + { + "auxiliary_loss_clip": 0.01362051, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.24778473, + "balance_loss_mlp": 1.01951182, + "epoch": 0.5798286487298963, + "flos": 22709571463440.0, + "grad_norm": 1.9336956529070288, + "language_loss": 0.67535472, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.6993053, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1348877, + "step": 9644, + "time_per_iteration": 2.7964487075805664 + }, + { + "auxiliary_loss_clip": 0.01363435, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.24833989, + "balance_loss_mlp": 1.02287149, + "epoch": 0.5798887719825643, + "flos": 23154232272840.0, + "grad_norm": 2.2983628545356636, + "language_loss": 0.85707295, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.88107467, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.1385498, + "step": 9645, + "time_per_iteration": 2.7772510051727295 + }, + { + "auxiliary_loss_clip": 0.01357225, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.24600613, + "balance_loss_mlp": 1.01909518, + "epoch": 0.5799488952352322, + "flos": 24431461194960.0, + "grad_norm": 2.1271943243120432, + "language_loss": 0.75509638, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77898633, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 1.11181641, + "router_z_loss_mlp": 0.12670898, + "step": 9646, + "time_per_iteration": 2.8182177543640137 + }, + { + "auxiliary_loss_clip": 0.01367698, + "auxiliary_loss_mlp": 0.01037678, + "balance_loss_clip": 1.25246263, + "balance_loss_mlp": 1.02389765, + "epoch": 0.5800090184879002, + "flos": 38402598557520.0, + "grad_norm": 1.798720047465539, + "language_loss": 0.5949719, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61902571, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13769531, + "step": 9647, + "time_per_iteration": 2.917003631591797 + }, + { + "auxiliary_loss_clip": 0.01363073, + "auxiliary_loss_mlp": 0.0104078, + "balance_loss_clip": 1.24844193, + "balance_loss_mlp": 1.02677298, + "epoch": 0.5800691417405681, + "flos": 19788671157120.0, + "grad_norm": 1.969393654657682, + "language_loss": 0.84162396, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86566252, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14007568, + "step": 9648, + "time_per_iteration": 2.737781286239624 + }, + { + "auxiliary_loss_clip": 0.01195759, + "auxiliary_loss_mlp": 0.01005742, + "balance_loss_clip": 1.15060997, + "balance_loss_mlp": 1.00327396, + "epoch": 0.5801292649932361, + "flos": 70329358972440.0, + "grad_norm": 0.8662827082866081, + "language_loss": 0.63109082, + "learning_rate": 1.581142210256242e-06, + "loss": 0.6531058, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.0246582, + "step": 9649, + "time_per_iteration": 3.3555150032043457 + }, + { + "auxiliary_loss_clip": 0.01348857, + "auxiliary_loss_mlp": 0.01031766, + "balance_loss_clip": 1.23910499, + "balance_loss_mlp": 1.01928425, + "epoch": 0.5801893882459042, + "flos": 18739559331000.0, + "grad_norm": 2.2670645861632908, + "language_loss": 0.82083154, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84463769, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12475586, + "step": 9650, + "time_per_iteration": 2.801048755645752 + }, + { + "auxiliary_loss_clip": 0.01359441, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.24310219, + "balance_loss_mlp": 1.01740694, + "epoch": 0.5802495114985721, + "flos": 15600775235400.0, + "grad_norm": 2.3377124379572782, + "language_loss": 0.77844751, + "learning_rate": 1.580380592177698e-06, + "loss": 0.80235279, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13671875, + "step": 9651, + "time_per_iteration": 2.907421827316284 + }, + { + "auxiliary_loss_clip": 0.01361689, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.24742174, + "balance_loss_mlp": 1.02231956, + "epoch": 0.5803096347512401, + "flos": 18259423796160.0, + "grad_norm": 1.9096292648827067, + "language_loss": 0.74695468, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.77093822, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.14355469, + "step": 9652, + "time_per_iteration": 2.864872932434082 + }, + { + "auxiliary_loss_clip": 0.01356698, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.24197721, + "balance_loss_mlp": 1.01673877, + "epoch": 0.580369758003908, + "flos": 22898315431440.0, + "grad_norm": 2.015889048411727, + "language_loss": 0.76191002, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78578484, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14031982, + "step": 9653, + "time_per_iteration": 4.401332378387451 + }, + { + "auxiliary_loss_clip": 0.01360118, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.24511611, + "balance_loss_mlp": 1.01900864, + "epoch": 0.580429881256576, + "flos": 18702338446080.0, + "grad_norm": 2.3274618757511862, + "language_loss": 0.74591124, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76984191, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.1394043, + "step": 9654, + "time_per_iteration": 2.862919330596924 + }, + { + "auxiliary_loss_clip": 0.01340916, + "auxiliary_loss_mlp": 0.01037675, + "balance_loss_clip": 1.23242736, + "balance_loss_mlp": 1.02486634, + "epoch": 0.5804900045092439, + "flos": 24687499861440.0, + "grad_norm": 10.213245229198456, + "language_loss": 0.71052945, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.73431545, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12811279, + "step": 9655, + "time_per_iteration": 2.899343252182007 + }, + { + "auxiliary_loss_clip": 0.01365171, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.24674773, + "balance_loss_mlp": 1.01847386, + "epoch": 0.580550127761912, + "flos": 23118067205280.0, + "grad_norm": 1.9977436719740134, + "language_loss": 0.70193398, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.72591388, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.14343262, + "step": 9656, + "time_per_iteration": 4.4112708568573 + }, + { + "auxiliary_loss_clip": 0.01343614, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.23518133, + "balance_loss_mlp": 1.01681435, + "epoch": 0.5806102510145799, + "flos": 18479906520480.0, + "grad_norm": 1.5936942814460358, + "language_loss": 0.71477389, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73850441, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12609863, + "step": 9657, + "time_per_iteration": 2.8974287509918213 + }, + { + "auxiliary_loss_clip": 0.01365083, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.24870586, + "balance_loss_mlp": 1.01951551, + "epoch": 0.5806703742672479, + "flos": 23921048804760.0, + "grad_norm": 2.006520896384617, + "language_loss": 0.71037847, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73437297, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14855957, + "step": 9658, + "time_per_iteration": 2.938750982284546 + }, + { + "auxiliary_loss_clip": 0.01190858, + "auxiliary_loss_mlp": 0.0100362, + "balance_loss_clip": 1.14565873, + "balance_loss_mlp": 1.00131941, + "epoch": 0.5807304975199158, + "flos": 66327689300400.0, + "grad_norm": 0.692211044262425, + "language_loss": 0.53612822, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55807304, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02294922, + "step": 9659, + "time_per_iteration": 4.831789016723633 + }, + { + "auxiliary_loss_clip": 0.01362548, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.24730361, + "balance_loss_mlp": 1.02651179, + "epoch": 0.5807906207725838, + "flos": 31728717132480.0, + "grad_norm": 2.0584870220750013, + "language_loss": 0.62493765, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64897001, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14178467, + "step": 9660, + "time_per_iteration": 2.8659937381744385 + }, + { + "auxiliary_loss_clip": 0.01358035, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.2421962, + "balance_loss_mlp": 1.01969123, + "epoch": 0.5808507440252517, + "flos": 23805893831040.0, + "grad_norm": 1.591520165810055, + "language_loss": 0.65633351, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.68024886, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.13781738, + "step": 9661, + "time_per_iteration": 2.880028486251831 + }, + { + "auxiliary_loss_clip": 0.01340129, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.23252404, + "balance_loss_mlp": 1.01541948, + "epoch": 0.5809108672779197, + "flos": 13702364652240.0, + "grad_norm": 1.4564550363832036, + "language_loss": 0.74329954, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76697624, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12115479, + "step": 9662, + "time_per_iteration": 2.949723243713379 + }, + { + "auxiliary_loss_clip": 0.01184886, + "auxiliary_loss_mlp": 0.01008117, + "balance_loss_clip": 1.14066577, + "balance_loss_mlp": 1.00551784, + "epoch": 0.5809709905305876, + "flos": 69152682995640.0, + "grad_norm": 0.8698665677657295, + "language_loss": 0.584405, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60633504, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02600098, + "step": 9663, + "time_per_iteration": 3.2905147075653076 + }, + { + "auxiliary_loss_clip": 0.01351572, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.2399981, + "balance_loss_mlp": 1.01715422, + "epoch": 0.5810311137832557, + "flos": 19832186337840.0, + "grad_norm": 2.5696869794331634, + "language_loss": 0.82191765, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84573573, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13092041, + "step": 9664, + "time_per_iteration": 4.328108787536621 + }, + { + "auxiliary_loss_clip": 0.01357349, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.2414546, + "balance_loss_mlp": 1.01477933, + "epoch": 0.5810912370359237, + "flos": 29244071705040.0, + "grad_norm": 1.8256645541134686, + "language_loss": 0.81885803, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.84271586, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13653564, + "step": 9665, + "time_per_iteration": 2.837172031402588 + }, + { + "auxiliary_loss_clip": 0.01369668, + "auxiliary_loss_mlp": 0.01039969, + "balance_loss_clip": 1.25187385, + "balance_loss_mlp": 1.02440023, + "epoch": 0.5811513602885916, + "flos": 22790957262840.0, + "grad_norm": 1.6172718058777376, + "language_loss": 0.81205213, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.8361485, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.15551758, + "step": 9666, + "time_per_iteration": 2.9163641929626465 + }, + { + "auxiliary_loss_clip": 0.013523, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.24285495, + "balance_loss_mlp": 1.01712179, + "epoch": 0.5812114835412596, + "flos": 18739031422320.0, + "grad_norm": 1.8047189435509619, + "language_loss": 0.80191171, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82574064, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13482666, + "step": 9667, + "time_per_iteration": 2.889054536819458 + }, + { + "auxiliary_loss_clip": 0.01372104, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.2520442, + "balance_loss_mlp": 1.02028298, + "epoch": 0.5812716067939275, + "flos": 26436783254760.0, + "grad_norm": 1.3954479407003373, + "language_loss": 0.79009998, + "learning_rate": 1.573909419957653e-06, + "loss": 0.81417179, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.14794922, + "step": 9668, + "time_per_iteration": 2.8147990703582764 + }, + { + "auxiliary_loss_clip": 0.01354909, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.24160123, + "balance_loss_mlp": 1.01911879, + "epoch": 0.5813317300465956, + "flos": 43405658586720.0, + "grad_norm": 1.9088792951109945, + "language_loss": 0.64239508, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66626823, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.1328125, + "step": 9669, + "time_per_iteration": 3.044222831726074 + }, + { + "auxiliary_loss_clip": 0.01359168, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.24619281, + "balance_loss_mlp": 1.02131116, + "epoch": 0.5813918532992635, + "flos": 24790228677000.0, + "grad_norm": 1.5048307845026376, + "language_loss": 0.73889536, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.76284957, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.1494751, + "step": 9670, + "time_per_iteration": 2.8099217414855957 + }, + { + "auxiliary_loss_clip": 0.01359946, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_clip": 1.24453771, + "balance_loss_mlp": 1.02851081, + "epoch": 0.5814519765519315, + "flos": 22862597055840.0, + "grad_norm": 2.16838403044818, + "language_loss": 0.79123044, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81525511, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14007568, + "step": 9671, + "time_per_iteration": 2.8476674556732178 + }, + { + "auxiliary_loss_clip": 0.01375902, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.25750732, + "balance_loss_mlp": 1.01674569, + "epoch": 0.5815120998045994, + "flos": 24066155766960.0, + "grad_norm": 4.710652507617608, + "language_loss": 0.60986376, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.63394308, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.15283203, + "step": 9672, + "time_per_iteration": 2.8843138217926025 + }, + { + "auxiliary_loss_clip": 0.01353227, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.24183059, + "balance_loss_mlp": 1.01964808, + "epoch": 0.5815722230572674, + "flos": 24284811115080.0, + "grad_norm": 1.6089910884471768, + "language_loss": 0.81618118, + "learning_rate": 1.572007019492342e-06, + "loss": 0.84003848, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12854004, + "step": 9673, + "time_per_iteration": 2.847299098968506 + }, + { + "auxiliary_loss_clip": 0.01364413, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.2478807, + "balance_loss_mlp": 1.02022159, + "epoch": 0.5816323463099353, + "flos": 22205493977400.0, + "grad_norm": 1.6890679832331343, + "language_loss": 0.88099843, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90498912, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14440918, + "step": 9674, + "time_per_iteration": 2.797135591506958 + }, + { + "auxiliary_loss_clip": 0.01358438, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.2448554, + "balance_loss_mlp": 1.01485062, + "epoch": 0.5816924695626033, + "flos": 24139866586320.0, + "grad_norm": 1.3983922731740202, + "language_loss": 0.78878087, + "learning_rate": 1.571246172811984e-06, + "loss": 0.81264633, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13262939, + "step": 9675, + "time_per_iteration": 2.765608549118042 + }, + { + "auxiliary_loss_clip": 0.01358377, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.24525452, + "balance_loss_mlp": 1.01861668, + "epoch": 0.5817525928152713, + "flos": 21329167033800.0, + "grad_norm": 2.0761710787458836, + "language_loss": 0.70258784, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.7264992, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14147949, + "step": 9676, + "time_per_iteration": 2.792706251144409 + }, + { + "auxiliary_loss_clip": 0.01363571, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.24868929, + "balance_loss_mlp": 1.02184129, + "epoch": 0.5818127160679393, + "flos": 26939073972960.0, + "grad_norm": 2.1884085870388126, + "language_loss": 0.63881046, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.66280472, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14007568, + "step": 9677, + "time_per_iteration": 2.814152479171753 + }, + { + "auxiliary_loss_clip": 0.0118248, + "auxiliary_loss_mlp": 0.01004256, + "balance_loss_clip": 1.13743472, + "balance_loss_mlp": 1.00135958, + "epoch": 0.5818728393206073, + "flos": 63935921838240.0, + "grad_norm": 0.8083906259196725, + "language_loss": 0.54275143, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56461877, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.02893066, + "step": 9678, + "time_per_iteration": 3.328047752380371 + }, + { + "auxiliary_loss_clip": 0.01179149, + "auxiliary_loss_mlp": 0.01011096, + "balance_loss_clip": 1.1344502, + "balance_loss_mlp": 1.00814009, + "epoch": 0.5819329625732752, + "flos": 64968587278560.0, + "grad_norm": 0.743681806907672, + "language_loss": 0.56212348, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58402592, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02954102, + "step": 9679, + "time_per_iteration": 3.1487979888916016 + }, + { + "auxiliary_loss_clip": 0.01363525, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.24920034, + "balance_loss_mlp": 1.01632857, + "epoch": 0.5819930858259432, + "flos": 21220265747520.0, + "grad_norm": 1.6598435105300218, + "language_loss": 0.65943956, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.68336797, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.12994385, + "step": 9680, + "time_per_iteration": 2.7403998374938965 + }, + { + "auxiliary_loss_clip": 0.01363629, + "auxiliary_loss_mlp": 0.01027347, + "balance_loss_clip": 1.24974942, + "balance_loss_mlp": 1.01387596, + "epoch": 0.5820532090786111, + "flos": 19462576423680.0, + "grad_norm": 1.8469425913921593, + "language_loss": 0.83342934, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85733914, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13476562, + "step": 9681, + "time_per_iteration": 2.776214361190796 + }, + { + "auxiliary_loss_clip": 0.01358869, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.24638593, + "balance_loss_mlp": 1.01505244, + "epoch": 0.5821133323312792, + "flos": 17717150824560.0, + "grad_norm": 1.6110982061199026, + "language_loss": 0.75863445, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.78251946, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14575195, + "step": 9682, + "time_per_iteration": 2.7246711254119873 + }, + { + "auxiliary_loss_clip": 0.01369503, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.25228238, + "balance_loss_mlp": 1.01876545, + "epoch": 0.5821734555839471, + "flos": 24577217890920.0, + "grad_norm": 2.0221303206960313, + "language_loss": 0.75446498, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77849424, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14660645, + "step": 9683, + "time_per_iteration": 2.967496156692505 + }, + { + "auxiliary_loss_clip": 0.01364961, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.24656749, + "balance_loss_mlp": 1.02021289, + "epoch": 0.5822335788366151, + "flos": 22387334524200.0, + "grad_norm": 1.6563428112737852, + "language_loss": 0.74692953, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.77092671, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.14544678, + "step": 9684, + "time_per_iteration": 2.749542474746704 + }, + { + "auxiliary_loss_clip": 0.01365634, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.2491504, + "balance_loss_mlp": 1.02327514, + "epoch": 0.582293702089283, + "flos": 26727931171440.0, + "grad_norm": 2.5158455413446257, + "language_loss": 0.78735322, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.81138539, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14294434, + "step": 9685, + "time_per_iteration": 2.7944252490997314 + }, + { + "auxiliary_loss_clip": 0.01363187, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.24823785, + "balance_loss_mlp": 1.02233803, + "epoch": 0.582353825341951, + "flos": 17353266689160.0, + "grad_norm": 1.8841237318273885, + "language_loss": 0.75563252, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77962852, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14074707, + "step": 9686, + "time_per_iteration": 2.761836528778076 + }, + { + "auxiliary_loss_clip": 0.01179276, + "auxiliary_loss_mlp": 0.00999011, + "balance_loss_clip": 1.13301706, + "balance_loss_mlp": 0.99619812, + "epoch": 0.5824139485946189, + "flos": 55487214361440.0, + "grad_norm": 0.8602088326031476, + "language_loss": 0.57502455, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59680742, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.02807617, + "step": 9687, + "time_per_iteration": 3.1292643547058105 + }, + { + "auxiliary_loss_clip": 0.01362065, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.24670374, + "balance_loss_mlp": 1.01335716, + "epoch": 0.582474071847287, + "flos": 20307895561440.0, + "grad_norm": 1.8261112290903319, + "language_loss": 0.70321202, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72711015, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14373779, + "step": 9688, + "time_per_iteration": 2.91207218170166 + }, + { + "auxiliary_loss_clip": 0.0136292, + "auxiliary_loss_mlp": 0.01028994, + "balance_loss_clip": 1.24870825, + "balance_loss_mlp": 1.01563692, + "epoch": 0.5825341950999549, + "flos": 23883259402800.0, + "grad_norm": 3.194692642938526, + "language_loss": 0.6558255, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67974472, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13342285, + "step": 9689, + "time_per_iteration": 2.910698175430298 + }, + { + "auxiliary_loss_clip": 0.01358254, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.24538946, + "balance_loss_mlp": 1.01688111, + "epoch": 0.5825943183526229, + "flos": 23118351463800.0, + "grad_norm": 1.4730167931384615, + "language_loss": 0.73067951, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75457287, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14202881, + "step": 9690, + "time_per_iteration": 2.9512224197387695 + }, + { + "auxiliary_loss_clip": 0.01368084, + "auxiliary_loss_mlp": 0.01036449, + "balance_loss_clip": 1.25015152, + "balance_loss_mlp": 1.02067733, + "epoch": 0.5826544416052909, + "flos": 22862678272560.0, + "grad_norm": 1.7946590796506852, + "language_loss": 0.76357102, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78761637, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.15777588, + "step": 9691, + "time_per_iteration": 2.925698757171631 + }, + { + "auxiliary_loss_clip": 0.01364362, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.2475822, + "balance_loss_mlp": 1.02022326, + "epoch": 0.5827145648579588, + "flos": 31507868932920.0, + "grad_norm": 2.0824841578453075, + "language_loss": 0.8059994, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82998151, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13641357, + "step": 9692, + "time_per_iteration": 2.9722442626953125 + }, + { + "auxiliary_loss_clip": 0.011779, + "auxiliary_loss_mlp": 0.01005354, + "balance_loss_clip": 1.1315577, + "balance_loss_mlp": 1.00250483, + "epoch": 0.5827746881106268, + "flos": 69827596162920.0, + "grad_norm": 0.8528332088023852, + "language_loss": 0.57006818, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.59190071, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.02844238, + "step": 9693, + "time_per_iteration": 4.732581377029419 + }, + { + "auxiliary_loss_clip": 0.01360942, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.24641776, + "balance_loss_mlp": 1.01552153, + "epoch": 0.5828348113632947, + "flos": 23117620513320.0, + "grad_norm": 1.630415727284177, + "language_loss": 0.7894268, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81332088, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.12927246, + "step": 9694, + "time_per_iteration": 2.87624192237854 + }, + { + "auxiliary_loss_clip": 0.01349904, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.23991537, + "balance_loss_mlp": 1.02193475, + "epoch": 0.5828949346159628, + "flos": 21878302818240.0, + "grad_norm": 1.4508606324827142, + "language_loss": 0.76201367, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78585577, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12371826, + "step": 9695, + "time_per_iteration": 4.362673759460449 + }, + { + "auxiliary_loss_clip": 0.01177283, + "auxiliary_loss_mlp": 0.01004416, + "balance_loss_clip": 1.13098001, + "balance_loss_mlp": 1.00194836, + "epoch": 0.5829550578686307, + "flos": 65980210942440.0, + "grad_norm": 0.7733126474741434, + "language_loss": 0.55021667, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57203364, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.0246582, + "step": 9696, + "time_per_iteration": 3.370386838912964 + }, + { + "auxiliary_loss_clip": 0.01363234, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.24912357, + "balance_loss_mlp": 1.0211277, + "epoch": 0.5830151811212987, + "flos": 16294165206480.0, + "grad_norm": 2.9883856418433803, + "language_loss": 0.76589036, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78987181, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13787842, + "step": 9697, + "time_per_iteration": 2.819463014602661 + }, + { + "auxiliary_loss_clip": 0.0136828, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.25038552, + "balance_loss_mlp": 1.02252483, + "epoch": 0.5830753043739666, + "flos": 24174001235880.0, + "grad_norm": 1.7372322014892467, + "language_loss": 0.78051698, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.80458045, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.15539551, + "step": 9698, + "time_per_iteration": 4.399362087249756 + }, + { + "auxiliary_loss_clip": 0.01358608, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.2446475, + "balance_loss_mlp": 1.0282625, + "epoch": 0.5831354276266346, + "flos": 27065599287480.0, + "grad_norm": 1.6411548178696214, + "language_loss": 0.83545482, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.8594653, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14178467, + "step": 9699, + "time_per_iteration": 2.889220952987671 + }, + { + "auxiliary_loss_clip": 0.01361644, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.24521232, + "balance_loss_mlp": 1.02379298, + "epoch": 0.5831955508793025, + "flos": 23628804462360.0, + "grad_norm": 2.203417202123579, + "language_loss": 0.65942311, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68342185, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14440918, + "step": 9700, + "time_per_iteration": 2.815650463104248 + }, + { + "auxiliary_loss_clip": 0.01362938, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.24808645, + "balance_loss_mlp": 1.02554464, + "epoch": 0.5832556741319705, + "flos": 24978728994840.0, + "grad_norm": 1.672507437906973, + "language_loss": 0.71806264, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.74209106, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14361572, + "step": 9701, + "time_per_iteration": 2.917397975921631 + }, + { + "auxiliary_loss_clip": 0.01361028, + "auxiliary_loss_mlp": 0.0103879, + "balance_loss_clip": 1.24751043, + "balance_loss_mlp": 1.02503371, + "epoch": 0.5833157973846385, + "flos": 23226765449760.0, + "grad_norm": 2.2696491551789983, + "language_loss": 0.84998977, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87398791, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13769531, + "step": 9702, + "time_per_iteration": 2.8014590740203857 + }, + { + "auxiliary_loss_clip": 0.01351214, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.24021101, + "balance_loss_mlp": 1.02732015, + "epoch": 0.5833759206373065, + "flos": 21982574751480.0, + "grad_norm": 1.3406305206791171, + "language_loss": 0.78075278, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80467093, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13293457, + "step": 9703, + "time_per_iteration": 4.318129062652588 + }, + { + "auxiliary_loss_clip": 0.01362367, + "auxiliary_loss_mlp": 0.01041602, + "balance_loss_clip": 1.24599767, + "balance_loss_mlp": 1.02676058, + "epoch": 0.5834360438899745, + "flos": 21767452330680.0, + "grad_norm": 1.5882655472109461, + "language_loss": 0.71090925, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73494899, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14819336, + "step": 9704, + "time_per_iteration": 2.8087732791900635 + }, + { + "auxiliary_loss_clip": 0.01361098, + "auxiliary_loss_mlp": 0.01044362, + "balance_loss_clip": 1.24679279, + "balance_loss_mlp": 1.03023577, + "epoch": 0.5834961671426424, + "flos": 15996195085320.0, + "grad_norm": 1.8742726554542004, + "language_loss": 0.81866932, + "learning_rate": 1.559841341236335e-06, + "loss": 0.84272397, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14135742, + "step": 9705, + "time_per_iteration": 2.8277676105499268 + }, + { + "auxiliary_loss_clip": 0.01358061, + "auxiliary_loss_mlp": 0.01037376, + "balance_loss_clip": 1.24383736, + "balance_loss_mlp": 1.02402425, + "epoch": 0.5835562903953104, + "flos": 22823223927840.0, + "grad_norm": 1.6134705754042018, + "language_loss": 0.805996, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82995033, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13342285, + "step": 9706, + "time_per_iteration": 2.7647690773010254 + }, + { + "auxiliary_loss_clip": 0.01362632, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_clip": 1.25071216, + "balance_loss_mlp": 1.02850521, + "epoch": 0.5836164136479783, + "flos": 48476338181280.0, + "grad_norm": 1.699675662925746, + "language_loss": 0.74610811, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77016592, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14648438, + "step": 9707, + "time_per_iteration": 2.9915714263916016 + }, + { + "auxiliary_loss_clip": 0.01353792, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.24257636, + "balance_loss_mlp": 1.01990521, + "epoch": 0.5836765369006464, + "flos": 26911599094440.0, + "grad_norm": 1.7082497731057984, + "language_loss": 0.81622279, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.84010214, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.14233398, + "step": 9708, + "time_per_iteration": 2.8466455936431885 + }, + { + "auxiliary_loss_clip": 0.01356501, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.24491477, + "balance_loss_mlp": 1.02403903, + "epoch": 0.5837366601533143, + "flos": 20088874738080.0, + "grad_norm": 1.38657965312343, + "language_loss": 0.78390408, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80784786, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.1383667, + "step": 9709, + "time_per_iteration": 2.7460718154907227 + }, + { + "auxiliary_loss_clip": 0.01176169, + "auxiliary_loss_mlp": 0.01021777, + "balance_loss_clip": 1.12895215, + "balance_loss_mlp": 1.01860571, + "epoch": 0.5837967834059823, + "flos": 65379519522000.0, + "grad_norm": 0.7817843318327947, + "language_loss": 0.56581604, + "learning_rate": 1.557941985915844e-06, + "loss": 0.5877955, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.03173828, + "step": 9710, + "time_per_iteration": 3.2584950923919678 + }, + { + "auxiliary_loss_clip": 0.01360385, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.24865007, + "balance_loss_mlp": 1.02376592, + "epoch": 0.5838569066586502, + "flos": 25344237464640.0, + "grad_norm": 1.6009839084530346, + "language_loss": 0.65635097, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.6803205, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12811279, + "step": 9711, + "time_per_iteration": 2.790611743927002 + }, + { + "auxiliary_loss_clip": 0.01370735, + "auxiliary_loss_mlp": 0.01042196, + "balance_loss_clip": 1.25094032, + "balance_loss_mlp": 1.02685928, + "epoch": 0.5839170299113182, + "flos": 22233374939520.0, + "grad_norm": 1.7756573370807658, + "language_loss": 0.78951764, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.81364691, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.15338135, + "step": 9712, + "time_per_iteration": 2.8126537799835205 + }, + { + "auxiliary_loss_clip": 0.01359073, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.24556756, + "balance_loss_mlp": 1.02050948, + "epoch": 0.5839771531639861, + "flos": 22205087893800.0, + "grad_norm": 1.5130105328310803, + "language_loss": 0.73877579, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.76271427, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.1427002, + "step": 9713, + "time_per_iteration": 2.8706247806549072 + }, + { + "auxiliary_loss_clip": 0.01371921, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.25277007, + "balance_loss_mlp": 1.02442074, + "epoch": 0.5840372764166541, + "flos": 22424514800760.0, + "grad_norm": 4.18141688504086, + "language_loss": 0.69379354, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71791947, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.16247559, + "step": 9714, + "time_per_iteration": 2.8765695095062256 + }, + { + "auxiliary_loss_clip": 0.0136065, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.24526227, + "balance_loss_mlp": 1.02362466, + "epoch": 0.5840973996693221, + "flos": 19832998505040.0, + "grad_norm": 1.7126881444188795, + "language_loss": 0.80082476, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82481062, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14306641, + "step": 9715, + "time_per_iteration": 2.8468148708343506 + }, + { + "auxiliary_loss_clip": 0.0135693, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.24369586, + "balance_loss_mlp": 1.02003694, + "epoch": 0.5841575229219901, + "flos": 21148057437480.0, + "grad_norm": 1.9595837984773172, + "language_loss": 0.73210675, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.75601637, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13983154, + "step": 9716, + "time_per_iteration": 2.7325267791748047 + }, + { + "auxiliary_loss_clip": 0.01351478, + "auxiliary_loss_mlp": 0.01034648, + "balance_loss_clip": 1.24077773, + "balance_loss_mlp": 1.02108765, + "epoch": 0.5842176461746581, + "flos": 24645162323160.0, + "grad_norm": 1.5529068872121001, + "language_loss": 0.74910259, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77296388, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13549805, + "step": 9717, + "time_per_iteration": 2.798288583755493 + }, + { + "auxiliary_loss_clip": 0.01360407, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.24716294, + "balance_loss_mlp": 1.02492523, + "epoch": 0.584277769427326, + "flos": 19135304047800.0, + "grad_norm": 3.9472764567037046, + "language_loss": 0.80200368, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82599998, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14294434, + "step": 9718, + "time_per_iteration": 2.709791898727417 + }, + { + "auxiliary_loss_clip": 0.01356705, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.24421263, + "balance_loss_mlp": 1.02056837, + "epoch": 0.584337892679994, + "flos": 22680350425440.0, + "grad_norm": 1.595573008507261, + "language_loss": 0.67919612, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.70311594, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 1.12451172, + "router_z_loss_mlp": 0.14733887, + "step": 9719, + "time_per_iteration": 2.770812749862671 + }, + { + "auxiliary_loss_clip": 0.01359006, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.24359202, + "balance_loss_mlp": 1.01931632, + "epoch": 0.5843980159326619, + "flos": 31290228793800.0, + "grad_norm": 2.2033957587935005, + "language_loss": 0.75687015, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78079891, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14538574, + "step": 9720, + "time_per_iteration": 2.8099451065063477 + }, + { + "auxiliary_loss_clip": 0.01365419, + "auxiliary_loss_mlp": 0.0103901, + "balance_loss_clip": 1.24945903, + "balance_loss_mlp": 1.02481186, + "epoch": 0.58445813918533, + "flos": 22753370902680.0, + "grad_norm": 1.6056852302118496, + "language_loss": 0.83081746, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.8548618, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.1418457, + "step": 9721, + "time_per_iteration": 2.788156509399414 + }, + { + "auxiliary_loss_clip": 0.01173798, + "auxiliary_loss_mlp": 0.01003007, + "balance_loss_clip": 1.12534857, + "balance_loss_mlp": 0.99985981, + "epoch": 0.5845182624379979, + "flos": 60700645633320.0, + "grad_norm": 0.946664896512793, + "language_loss": 0.71322453, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73499262, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.03149414, + "step": 9722, + "time_per_iteration": 3.2615115642547607 + }, + { + "auxiliary_loss_clip": 0.01356925, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.24460912, + "balance_loss_mlp": 1.02140856, + "epoch": 0.5845783856906659, + "flos": 16366779600120.0, + "grad_norm": 2.029952108382291, + "language_loss": 0.89837778, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.92229527, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13433838, + "step": 9723, + "time_per_iteration": 2.7376270294189453 + }, + { + "auxiliary_loss_clip": 0.01354973, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.24201131, + "balance_loss_mlp": 1.02393484, + "epoch": 0.5846385089433338, + "flos": 20088143787600.0, + "grad_norm": 1.3861311330523693, + "language_loss": 0.68676794, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.71068907, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13201904, + "step": 9724, + "time_per_iteration": 2.8288021087646484 + }, + { + "auxiliary_loss_clip": 0.01359892, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.24519253, + "balance_loss_mlp": 1.01765776, + "epoch": 0.5846986321960018, + "flos": 17315517895560.0, + "grad_norm": 1.752512489645152, + "language_loss": 0.86490989, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88883078, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.1451416, + "step": 9725, + "time_per_iteration": 2.715750217437744 + }, + { + "auxiliary_loss_clip": 0.01368666, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.25120449, + "balance_loss_mlp": 1.02494955, + "epoch": 0.5847587554486697, + "flos": 17201865431160.0, + "grad_norm": 1.4829968309296897, + "language_loss": 0.82813811, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85221839, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14398193, + "step": 9726, + "time_per_iteration": 2.869291067123413 + }, + { + "auxiliary_loss_clip": 0.0136402, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.24955356, + "balance_loss_mlp": 1.02160037, + "epoch": 0.5848188787013378, + "flos": 24533702710200.0, + "grad_norm": 1.8447276801496117, + "language_loss": 0.67284328, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69683111, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13153076, + "step": 9727, + "time_per_iteration": 2.8662972450256348 + }, + { + "auxiliary_loss_clip": 0.01361187, + "auxiliary_loss_mlp": 0.01042114, + "balance_loss_clip": 1.2469418, + "balance_loss_mlp": 1.0274632, + "epoch": 0.5848790019540057, + "flos": 20632893869160.0, + "grad_norm": 1.7508974947881624, + "language_loss": 0.81646293, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84049594, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1463623, + "step": 9728, + "time_per_iteration": 2.784599542617798 + }, + { + "auxiliary_loss_clip": 0.01352114, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.24227023, + "balance_loss_mlp": 1.02071929, + "epoch": 0.5849391252066737, + "flos": 22423946283720.0, + "grad_norm": 1.7115056469108472, + "language_loss": 0.78023517, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80409235, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12890625, + "step": 9729, + "time_per_iteration": 2.8051061630249023 + }, + { + "auxiliary_loss_clip": 0.01363106, + "auxiliary_loss_mlp": 0.01036668, + "balance_loss_clip": 1.24789059, + "balance_loss_mlp": 1.02198744, + "epoch": 0.5849992484593417, + "flos": 25416445774680.0, + "grad_norm": 2.2757941521084026, + "language_loss": 0.70963508, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.7336328, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14672852, + "step": 9730, + "time_per_iteration": 2.810882806777954 + }, + { + "auxiliary_loss_clip": 0.01364625, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.24887753, + "balance_loss_mlp": 1.02218747, + "epoch": 0.5850593717120096, + "flos": 21069595440000.0, + "grad_norm": 2.543606331071323, + "language_loss": 0.78761047, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81162429, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14575195, + "step": 9731, + "time_per_iteration": 2.759592294692993 + }, + { + "auxiliary_loss_clip": 0.01358031, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.24432242, + "balance_loss_mlp": 1.02539253, + "epoch": 0.5851194949646776, + "flos": 25307057188080.0, + "grad_norm": 1.8849161947902457, + "language_loss": 0.70362425, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72760165, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.14324951, + "step": 9732, + "time_per_iteration": 4.154609680175781 + }, + { + "auxiliary_loss_clip": 0.01367743, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.25039101, + "balance_loss_mlp": 1.02464104, + "epoch": 0.5851796182173455, + "flos": 23592598786440.0, + "grad_norm": 1.851772543378086, + "language_loss": 0.52798128, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.55206609, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 1.17236328, + "router_z_loss_mlp": 0.16101074, + "step": 9733, + "time_per_iteration": 4.2228124141693115 + }, + { + "auxiliary_loss_clip": 0.01362681, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.24819386, + "balance_loss_mlp": 1.02229762, + "epoch": 0.5852397414700136, + "flos": 24827571387000.0, + "grad_norm": 2.135542816674639, + "language_loss": 0.87993908, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.90392894, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.14019775, + "step": 9734, + "time_per_iteration": 2.8306164741516113 + }, + { + "auxiliary_loss_clip": 0.01350037, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.24113846, + "balance_loss_mlp": 1.02120972, + "epoch": 0.5852998647226815, + "flos": 19942549525080.0, + "grad_norm": 1.402225599086461, + "language_loss": 0.72152913, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74537379, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13232422, + "step": 9735, + "time_per_iteration": 2.745027542114258 + }, + { + "auxiliary_loss_clip": 0.01365113, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.24810338, + "balance_loss_mlp": 1.02440095, + "epoch": 0.5853599879753495, + "flos": 16723841531040.0, + "grad_norm": 2.405519568759907, + "language_loss": 0.74277163, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76682675, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.15991211, + "step": 9736, + "time_per_iteration": 2.676856756210327 + }, + { + "auxiliary_loss_clip": 0.01359039, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.24593222, + "balance_loss_mlp": 1.01704597, + "epoch": 0.5854201112280174, + "flos": 44464597635960.0, + "grad_norm": 1.5036283646951571, + "language_loss": 0.70612669, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.73002714, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.1394043, + "step": 9737, + "time_per_iteration": 4.464637756347656 + }, + { + "auxiliary_loss_clip": 0.01353515, + "auxiliary_loss_mlp": 0.01037498, + "balance_loss_clip": 1.24199522, + "balance_loss_mlp": 1.02420068, + "epoch": 0.5854802344806854, + "flos": 20344020020640.0, + "grad_norm": 1.631067177363168, + "language_loss": 0.8291617, + "learning_rate": 1.547313391573169e-06, + "loss": 0.85307181, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13287354, + "step": 9738, + "time_per_iteration": 2.7606236934661865 + }, + { + "auxiliary_loss_clip": 0.01368894, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.25213218, + "balance_loss_mlp": 1.02368784, + "epoch": 0.5855403577333533, + "flos": 20925706728600.0, + "grad_norm": 1.889954269685807, + "language_loss": 0.68554306, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70961827, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14941406, + "step": 9739, + "time_per_iteration": 2.74752140045166 + }, + { + "auxiliary_loss_clip": 0.0136794, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.25128961, + "balance_loss_mlp": 1.01820207, + "epoch": 0.5856004809860214, + "flos": 20453489823960.0, + "grad_norm": 2.350491674234863, + "language_loss": 0.58618605, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.6101886, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14099121, + "step": 9740, + "time_per_iteration": 2.75648832321167 + }, + { + "auxiliary_loss_clip": 0.01367725, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.25231957, + "balance_loss_mlp": 1.01920867, + "epoch": 0.5856606042386893, + "flos": 19644823054080.0, + "grad_norm": 2.6828907789760996, + "language_loss": 0.7497499, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77376312, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14398193, + "step": 9741, + "time_per_iteration": 2.752997875213623 + }, + { + "auxiliary_loss_clip": 0.01365044, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.24996984, + "balance_loss_mlp": 1.01791513, + "epoch": 0.5857207274913573, + "flos": 21690777101040.0, + "grad_norm": 2.2004786348945617, + "language_loss": 0.75950599, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.78348082, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14538574, + "step": 9742, + "time_per_iteration": 4.298662185668945 + }, + { + "auxiliary_loss_clip": 0.01358032, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.24608219, + "balance_loss_mlp": 1.02318549, + "epoch": 0.5857808507440253, + "flos": 23187595363560.0, + "grad_norm": 1.6261679038626684, + "language_loss": 0.75142336, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77536982, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13415527, + "step": 9743, + "time_per_iteration": 2.781954526901245 + }, + { + "auxiliary_loss_clip": 0.01358038, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.2469058, + "balance_loss_mlp": 1.0172435, + "epoch": 0.5858409739966932, + "flos": 27241064321760.0, + "grad_norm": 1.7881843903941825, + "language_loss": 0.8176477, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.84153908, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13861084, + "step": 9744, + "time_per_iteration": 2.8151607513427734 + }, + { + "auxiliary_loss_clip": 0.01369819, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.25196552, + "balance_loss_mlp": 1.02144444, + "epoch": 0.5859010972493612, + "flos": 27861352598880.0, + "grad_norm": 1.5896978207968917, + "language_loss": 0.71949184, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.74354565, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14111328, + "step": 9745, + "time_per_iteration": 2.8371779918670654 + }, + { + "auxiliary_loss_clip": 0.01169976, + "auxiliary_loss_mlp": 0.01004477, + "balance_loss_clip": 1.12255621, + "balance_loss_mlp": 1.0018065, + "epoch": 0.5859612205020291, + "flos": 70024543019640.0, + "grad_norm": 0.7826529196639815, + "language_loss": 0.53370142, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55544591, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.0267334, + "step": 9746, + "time_per_iteration": 3.319457769393921 + }, + { + "auxiliary_loss_clip": 0.01364644, + "auxiliary_loss_mlp": 0.0104017, + "balance_loss_clip": 1.24886608, + "balance_loss_mlp": 1.02489328, + "epoch": 0.5860213437546972, + "flos": 24060714246720.0, + "grad_norm": 3.1222449850238028, + "language_loss": 0.73085266, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75490081, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.152771, + "step": 9747, + "time_per_iteration": 2.851350784301758 + }, + { + "auxiliary_loss_clip": 0.01368019, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.25304317, + "balance_loss_mlp": 1.01990271, + "epoch": 0.5860814670073651, + "flos": 18951554908080.0, + "grad_norm": 2.350538483985116, + "language_loss": 0.81453502, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83855283, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.1383667, + "step": 9748, + "time_per_iteration": 2.7758939266204834 + }, + { + "auxiliary_loss_clip": 0.01363914, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.24820232, + "balance_loss_mlp": 1.01889658, + "epoch": 0.5861415902600331, + "flos": 22566576135960.0, + "grad_norm": 1.8636691876084215, + "language_loss": 0.72138488, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74536085, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14782715, + "step": 9749, + "time_per_iteration": 2.8960773944854736 + }, + { + "auxiliary_loss_clip": 0.01360852, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.24908161, + "balance_loss_mlp": 1.01760125, + "epoch": 0.586201713512701, + "flos": 14396079490200.0, + "grad_norm": 2.3803447144432717, + "language_loss": 0.75505525, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77898371, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.144104, + "step": 9750, + "time_per_iteration": 2.8152685165405273 + }, + { + "auxiliary_loss_clip": 0.01361587, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.24812448, + "balance_loss_mlp": 1.02190506, + "epoch": 0.586261836765369, + "flos": 19503086585760.0, + "grad_norm": 1.731440996858836, + "language_loss": 0.71472347, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73871344, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.1550293, + "step": 9751, + "time_per_iteration": 2.749213218688965 + }, + { + "auxiliary_loss_clip": 0.01375814, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.25648069, + "balance_loss_mlp": 1.02295446, + "epoch": 0.5863219600180369, + "flos": 20706401646720.0, + "grad_norm": 1.8113666837206905, + "language_loss": 0.74660528, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.7707445, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.15148926, + "step": 9752, + "time_per_iteration": 2.756016492843628 + }, + { + "auxiliary_loss_clip": 0.01363849, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.24996758, + "balance_loss_mlp": 1.02070236, + "epoch": 0.586382083270705, + "flos": 19796914654200.0, + "grad_norm": 1.7970148246280555, + "language_loss": 0.78059673, + "learning_rate": 1.541625017642943e-06, + "loss": 0.80458039, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13818359, + "step": 9753, + "time_per_iteration": 2.7397568225860596 + }, + { + "auxiliary_loss_clip": 0.0135948, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.24857283, + "balance_loss_mlp": 1.01639605, + "epoch": 0.5864422065233729, + "flos": 16503886715400.0, + "grad_norm": 1.7182168815237915, + "language_loss": 0.70992887, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73382044, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13287354, + "step": 9754, + "time_per_iteration": 2.809643268585205 + }, + { + "auxiliary_loss_clip": 0.01362628, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.24764299, + "balance_loss_mlp": 1.01959729, + "epoch": 0.5865023297760409, + "flos": 20418258748680.0, + "grad_norm": 1.755605610243864, + "language_loss": 0.72684944, + "learning_rate": 1.540866862214043e-06, + "loss": 0.75081968, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14776611, + "step": 9755, + "time_per_iteration": 2.7779743671417236 + }, + { + "auxiliary_loss_clip": 0.01170351, + "auxiliary_loss_mlp": 0.01001495, + "balance_loss_clip": 1.1234268, + "balance_loss_mlp": 0.99911082, + "epoch": 0.5865624530287089, + "flos": 63365304231360.0, + "grad_norm": 0.7420327349350396, + "language_loss": 0.56886184, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59058022, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02380371, + "step": 9756, + "time_per_iteration": 3.207878351211548 + }, + { + "auxiliary_loss_clip": 0.01362668, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.24940062, + "balance_loss_mlp": 1.02499795, + "epoch": 0.5866225762813768, + "flos": 27022002890040.0, + "grad_norm": 1.702537468913496, + "language_loss": 0.76781493, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.79182136, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13000488, + "step": 9757, + "time_per_iteration": 2.8127031326293945 + }, + { + "auxiliary_loss_clip": 0.011677, + "auxiliary_loss_mlp": 0.01002429, + "balance_loss_clip": 1.12080324, + "balance_loss_mlp": 0.99994963, + "epoch": 0.5866826995340448, + "flos": 73003357493280.0, + "grad_norm": 1.1108115683819515, + "language_loss": 0.60527068, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62697202, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02478027, + "step": 9758, + "time_per_iteration": 3.2476630210876465 + }, + { + "auxiliary_loss_clip": 0.01373935, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.25478292, + "balance_loss_mlp": 1.0220927, + "epoch": 0.5867428227867127, + "flos": 21290403031200.0, + "grad_norm": 2.0656013151183, + "language_loss": 0.73015189, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.75426531, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.15319824, + "step": 9759, + "time_per_iteration": 2.7330756187438965 + }, + { + "auxiliary_loss_clip": 0.01363143, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.24919271, + "balance_loss_mlp": 1.02268159, + "epoch": 0.5868029460393808, + "flos": 33474711248640.0, + "grad_norm": 1.6456839106667402, + "language_loss": 0.7357868, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75978184, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13677979, + "step": 9760, + "time_per_iteration": 2.8316810131073 + }, + { + "auxiliary_loss_clip": 0.0136078, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.24769306, + "balance_loss_mlp": 1.02383661, + "epoch": 0.5868630692920487, + "flos": 17893306200960.0, + "grad_norm": 1.8818975690672604, + "language_loss": 0.72610867, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.75009114, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13635254, + "step": 9761, + "time_per_iteration": 2.7553904056549072 + }, + { + "auxiliary_loss_clip": 0.0137004, + "auxiliary_loss_mlp": 0.0103778, + "balance_loss_clip": 1.25153446, + "balance_loss_mlp": 1.02205658, + "epoch": 0.5869231925447167, + "flos": 21040293185280.0, + "grad_norm": 1.6406142796514165, + "language_loss": 0.74759912, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77167737, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.15722656, + "step": 9762, + "time_per_iteration": 2.7997753620147705 + }, + { + "auxiliary_loss_clip": 0.01358987, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.24684858, + "balance_loss_mlp": 1.02482581, + "epoch": 0.5869833157973846, + "flos": 74749984362000.0, + "grad_norm": 1.2459390800336059, + "language_loss": 0.72723496, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.75121856, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14550781, + "step": 9763, + "time_per_iteration": 3.1486592292785645 + }, + { + "auxiliary_loss_clip": 0.01357022, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.24412131, + "balance_loss_mlp": 1.02089262, + "epoch": 0.5870434390500526, + "flos": 17643440005200.0, + "grad_norm": 1.6205349283420212, + "language_loss": 0.80380887, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82772064, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13256836, + "step": 9764, + "time_per_iteration": 2.741079092025757 + }, + { + "auxiliary_loss_clip": 0.01364048, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.24944806, + "balance_loss_mlp": 1.02563274, + "epoch": 0.5871035623027205, + "flos": 21511048188960.0, + "grad_norm": 1.616661151294094, + "language_loss": 0.79480046, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81883854, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14129639, + "step": 9765, + "time_per_iteration": 2.8104188442230225 + }, + { + "auxiliary_loss_clip": 0.01355815, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.24375844, + "balance_loss_mlp": 1.02035236, + "epoch": 0.5871636855553886, + "flos": 13555673964000.0, + "grad_norm": 1.6536244290787052, + "language_loss": 0.8362397, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.86013681, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13525391, + "step": 9766, + "time_per_iteration": 2.7595744132995605 + }, + { + "auxiliary_loss_clip": 0.01367701, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.24993372, + "balance_loss_mlp": 1.02548933, + "epoch": 0.5872238088080565, + "flos": 26218452773520.0, + "grad_norm": 1.6068093043869034, + "language_loss": 0.69901824, + "learning_rate": 1.536319396136257e-06, + "loss": 0.7230826, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.13262939, + "step": 9767, + "time_per_iteration": 2.7966432571411133 + }, + { + "auxiliary_loss_clip": 0.01363621, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.24874735, + "balance_loss_mlp": 1.0267204, + "epoch": 0.5872839320607245, + "flos": 30671443026000.0, + "grad_norm": 1.7437862132562973, + "language_loss": 0.63692153, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.66096854, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14367676, + "step": 9768, + "time_per_iteration": 2.844559907913208 + }, + { + "auxiliary_loss_clip": 0.01171224, + "auxiliary_loss_mlp": 0.01003976, + "balance_loss_clip": 1.12455034, + "balance_loss_mlp": 1.0013653, + "epoch": 0.5873440553133924, + "flos": 60320007226440.0, + "grad_norm": 0.7174230818782313, + "language_loss": 0.53965098, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.56140298, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.02612305, + "step": 9769, + "time_per_iteration": 4.641117334365845 + }, + { + "auxiliary_loss_clip": 0.01359191, + "auxiliary_loss_mlp": 0.01039437, + "balance_loss_clip": 1.24628711, + "balance_loss_mlp": 1.02579331, + "epoch": 0.5874041785660604, + "flos": 21543761545920.0, + "grad_norm": 1.447819244418744, + "language_loss": 0.70982707, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73381329, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13647461, + "step": 9770, + "time_per_iteration": 2.7761666774749756 + }, + { + "auxiliary_loss_clip": 0.01354375, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.24180686, + "balance_loss_mlp": 1.01636648, + "epoch": 0.5874643018187284, + "flos": 24394280918400.0, + "grad_norm": 1.7158647409760635, + "language_loss": 0.68348074, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.70732319, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13494873, + "step": 9771, + "time_per_iteration": 3.0367658138275146 + }, + { + "auxiliary_loss_clip": 0.01370594, + "auxiliary_loss_mlp": 0.01038266, + "balance_loss_clip": 1.25409329, + "balance_loss_mlp": 1.02346015, + "epoch": 0.5875244250713964, + "flos": 28153637549640.0, + "grad_norm": 1.5228228164159334, + "language_loss": 0.66315389, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68724251, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14807129, + "step": 9772, + "time_per_iteration": 4.309790849685669 + }, + { + "auxiliary_loss_clip": 0.01367043, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.25044358, + "balance_loss_mlp": 1.02969313, + "epoch": 0.5875845483240644, + "flos": 25817997486960.0, + "grad_norm": 1.5153944895799858, + "language_loss": 0.75089902, + "learning_rate": 1.534046611017519e-06, + "loss": 0.77501881, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.15246582, + "step": 9773, + "time_per_iteration": 2.8593688011169434 + }, + { + "auxiliary_loss_clip": 0.01365361, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.25033903, + "balance_loss_mlp": 1.02302313, + "epoch": 0.5876446715767323, + "flos": 26912248828200.0, + "grad_norm": 1.9747168136236026, + "language_loss": 0.53669018, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.56071728, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14318848, + "step": 9774, + "time_per_iteration": 4.250225305557251 + }, + { + "auxiliary_loss_clip": 0.01363662, + "auxiliary_loss_mlp": 0.01037753, + "balance_loss_clip": 1.25038183, + "balance_loss_mlp": 1.02301908, + "epoch": 0.5877047948294003, + "flos": 36691794908280.0, + "grad_norm": 2.157728504082741, + "language_loss": 0.65455633, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67857051, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.1472168, + "step": 9775, + "time_per_iteration": 2.896616220474243 + }, + { + "auxiliary_loss_clip": 0.01357334, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.2435863, + "balance_loss_mlp": 1.01726317, + "epoch": 0.5877649180820682, + "flos": 26730448889760.0, + "grad_norm": 1.595075099292729, + "language_loss": 0.73945069, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.76333916, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14257812, + "step": 9776, + "time_per_iteration": 2.7764077186584473 + }, + { + "auxiliary_loss_clip": 0.01367351, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.25255418, + "balance_loss_mlp": 1.02641463, + "epoch": 0.5878250413347362, + "flos": 21037085124840.0, + "grad_norm": 3.469452736074659, + "language_loss": 0.74645144, + "learning_rate": 1.532531774126821e-06, + "loss": 0.77052772, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.1385498, + "step": 9777, + "time_per_iteration": 2.742391347885132 + }, + { + "auxiliary_loss_clip": 0.01350999, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.24232864, + "balance_loss_mlp": 1.02166486, + "epoch": 0.5878851645874041, + "flos": 25489831727160.0, + "grad_norm": 1.4485626376359244, + "language_loss": 0.74467486, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76853925, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13781738, + "step": 9778, + "time_per_iteration": 2.7619473934173584 + }, + { + "auxiliary_loss_clip": 0.0135867, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.24653411, + "balance_loss_mlp": 1.01521695, + "epoch": 0.5879452878400722, + "flos": 23774439333240.0, + "grad_norm": 1.6807718271027625, + "language_loss": 0.70220256, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72607642, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 1.12060547, + "router_z_loss_mlp": 0.13513184, + "step": 9779, + "time_per_iteration": 2.776543140411377 + }, + { + "auxiliary_loss_clip": 0.01368552, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.25239623, + "balance_loss_mlp": 1.01964998, + "epoch": 0.5880054110927401, + "flos": 17829381996360.0, + "grad_norm": 1.8749909919625927, + "language_loss": 0.67031008, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69433522, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14318848, + "step": 9780, + "time_per_iteration": 4.362837314605713 + }, + { + "auxiliary_loss_clip": 0.01367145, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.25311196, + "balance_loss_mlp": 1.02396452, + "epoch": 0.5880655343454081, + "flos": 19468099160640.0, + "grad_norm": 1.8110001649114005, + "language_loss": 0.72942507, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.75347877, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.1427002, + "step": 9781, + "time_per_iteration": 2.757042407989502 + }, + { + "auxiliary_loss_clip": 0.01356922, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.24498892, + "balance_loss_mlp": 1.01994908, + "epoch": 0.588125657598076, + "flos": 21402593594640.0, + "grad_norm": 1.416739913775121, + "language_loss": 0.70098335, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72488993, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13769531, + "step": 9782, + "time_per_iteration": 2.754257917404175 + }, + { + "auxiliary_loss_clip": 0.01362753, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.24751019, + "balance_loss_mlp": 1.01879549, + "epoch": 0.588185780850744, + "flos": 16039019923920.0, + "grad_norm": 2.7618625725212747, + "language_loss": 0.7088936, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.73285013, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14123535, + "step": 9783, + "time_per_iteration": 2.7813994884490967 + }, + { + "auxiliary_loss_clip": 0.01367569, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.25169158, + "balance_loss_mlp": 1.01797318, + "epoch": 0.588245904103412, + "flos": 23732954570520.0, + "grad_norm": 1.860960096808682, + "language_loss": 0.69275057, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71675396, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14801025, + "step": 9784, + "time_per_iteration": 2.889906644821167 + }, + { + "auxiliary_loss_clip": 0.01364362, + "auxiliary_loss_mlp": 0.01035808, + "balance_loss_clip": 1.24806142, + "balance_loss_mlp": 1.02177095, + "epoch": 0.58830602735608, + "flos": 33809130695880.0, + "grad_norm": 1.6944065218541124, + "language_loss": 0.69935316, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.72335482, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14031982, + "step": 9785, + "time_per_iteration": 2.9875569343566895 + }, + { + "auxiliary_loss_clip": 0.01352613, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.24028826, + "balance_loss_mlp": 1.01526427, + "epoch": 0.588366150608748, + "flos": 17094953954520.0, + "grad_norm": 1.9460394765999252, + "language_loss": 0.77434409, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79814994, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12719727, + "step": 9786, + "time_per_iteration": 2.7154769897460938 + }, + { + "auxiliary_loss_clip": 0.01361826, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.24755311, + "balance_loss_mlp": 1.01967919, + "epoch": 0.5884262738614159, + "flos": 22132270458360.0, + "grad_norm": 2.488387365469339, + "language_loss": 0.80134296, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.82529694, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13903809, + "step": 9787, + "time_per_iteration": 2.8073034286499023 + }, + { + "auxiliary_loss_clip": 0.01362517, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.24869621, + "balance_loss_mlp": 1.0178169, + "epoch": 0.5884863971140839, + "flos": 21036679041240.0, + "grad_norm": 1.4046513064055475, + "language_loss": 0.66456461, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68850064, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13269043, + "step": 9788, + "time_per_iteration": 2.7362961769104004 + }, + { + "auxiliary_loss_clip": 0.01358675, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.24811935, + "balance_loss_mlp": 1.01894832, + "epoch": 0.5885465203667518, + "flos": 23810360750640.0, + "grad_norm": 2.152828086348643, + "language_loss": 0.7992515, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.8231557, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12805176, + "step": 9789, + "time_per_iteration": 2.7899153232574463 + }, + { + "auxiliary_loss_clip": 0.01349705, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.238976, + "balance_loss_mlp": 1.01723003, + "epoch": 0.5886066436194198, + "flos": 18885316026960.0, + "grad_norm": 1.398841780370144, + "language_loss": 0.70503354, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72883749, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13464355, + "step": 9790, + "time_per_iteration": 2.744896411895752 + }, + { + "auxiliary_loss_clip": 0.01358716, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.24715614, + "balance_loss_mlp": 1.0140512, + "epoch": 0.5886667668720877, + "flos": 24795507763800.0, + "grad_norm": 1.6842708850970518, + "language_loss": 0.83339447, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85726786, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14569092, + "step": 9791, + "time_per_iteration": 2.8161463737487793 + }, + { + "auxiliary_loss_clip": 0.01365482, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.25136352, + "balance_loss_mlp": 1.02319503, + "epoch": 0.5887268901247558, + "flos": 21619137308040.0, + "grad_norm": 1.5071397217597449, + "language_loss": 0.7649557, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.7889868, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14440918, + "step": 9792, + "time_per_iteration": 2.8359766006469727 + }, + { + "auxiliary_loss_clip": 0.01362957, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.24647999, + "balance_loss_mlp": 1.01601005, + "epoch": 0.5887870133774237, + "flos": 20486121964200.0, + "grad_norm": 1.8299821978672133, + "language_loss": 0.69138384, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71531653, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14312744, + "step": 9793, + "time_per_iteration": 2.769756555557251 + }, + { + "auxiliary_loss_clip": 0.01356404, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.24526882, + "balance_loss_mlp": 1.02147233, + "epoch": 0.5888471366300917, + "flos": 19210720418280.0, + "grad_norm": 1.7978928051507574, + "language_loss": 0.60381615, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62773788, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.14300537, + "step": 9794, + "time_per_iteration": 2.7521045207977295 + }, + { + "auxiliary_loss_clip": 0.01369509, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.25462854, + "balance_loss_mlp": 1.02286243, + "epoch": 0.5889072598827596, + "flos": 19977496341840.0, + "grad_norm": 1.4363768819288296, + "language_loss": 0.65322179, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67728728, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14172363, + "step": 9795, + "time_per_iteration": 2.7729718685150146 + }, + { + "auxiliary_loss_clip": 0.01353163, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.24303055, + "balance_loss_mlp": 1.02113962, + "epoch": 0.5889673831354276, + "flos": 20746221466680.0, + "grad_norm": 1.9242843996412422, + "language_loss": 0.7442469, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.7681185, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.128479, + "step": 9796, + "time_per_iteration": 2.772195339202881 + }, + { + "auxiliary_loss_clip": 0.01357929, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.24623096, + "balance_loss_mlp": 1.01366758, + "epoch": 0.5890275063880956, + "flos": 25306366845960.0, + "grad_norm": 1.357368526730742, + "language_loss": 0.83121699, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85506034, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.1272583, + "step": 9797, + "time_per_iteration": 2.8061470985412598 + }, + { + "auxiliary_loss_clip": 0.01352984, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.24254644, + "balance_loss_mlp": 1.01865149, + "epoch": 0.5890876296407636, + "flos": 11769129077400.0, + "grad_norm": 2.20287231999315, + "language_loss": 0.79080689, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.8146559, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13256836, + "step": 9798, + "time_per_iteration": 2.7446157932281494 + }, + { + "auxiliary_loss_clip": 0.01348721, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.23921311, + "balance_loss_mlp": 1.01935363, + "epoch": 0.5891477528934316, + "flos": 13593422757600.0, + "grad_norm": 2.010141274780463, + "language_loss": 0.74848574, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.77229393, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12756348, + "step": 9799, + "time_per_iteration": 2.7151997089385986 + }, + { + "auxiliary_loss_clip": 0.01363403, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.24897838, + "balance_loss_mlp": 1.01560092, + "epoch": 0.5892078761460995, + "flos": 15053385610440.0, + "grad_norm": 1.9612048408298635, + "language_loss": 0.76535934, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78930026, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.15081787, + "step": 9800, + "time_per_iteration": 2.740023612976074 + }, + { + "auxiliary_loss_clip": 0.01360457, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.24675465, + "balance_loss_mlp": 1.02151704, + "epoch": 0.5892679993987675, + "flos": 15782412740400.0, + "grad_norm": 2.389883000584978, + "language_loss": 0.79165757, + "learning_rate": 1.523448741022722e-06, + "loss": 0.81561315, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13586426, + "step": 9801, + "time_per_iteration": 2.7363288402557373 + }, + { + "auxiliary_loss_clip": 0.0136561, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.25134683, + "balance_loss_mlp": 1.02015471, + "epoch": 0.5893281226514354, + "flos": 25270729687080.0, + "grad_norm": 1.7012340215377548, + "language_loss": 0.66661346, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.69060707, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13586426, + "step": 9802, + "time_per_iteration": 2.7512667179107666 + }, + { + "auxiliary_loss_clip": 0.01349405, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.23885798, + "balance_loss_mlp": 1.01500845, + "epoch": 0.5893882459041034, + "flos": 19462251556800.0, + "grad_norm": 1.5660895064312903, + "language_loss": 0.78578353, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80956542, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13787842, + "step": 9803, + "time_per_iteration": 2.7347190380096436 + }, + { + "auxiliary_loss_clip": 0.01359901, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.24584889, + "balance_loss_mlp": 1.02557003, + "epoch": 0.5894483691567713, + "flos": 20639309990040.0, + "grad_norm": 1.5335801388995907, + "language_loss": 0.73078334, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75476825, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13024902, + "step": 9804, + "time_per_iteration": 2.7642714977264404 + }, + { + "auxiliary_loss_clip": 0.01353005, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.24178159, + "balance_loss_mlp": 1.01606965, + "epoch": 0.5895084924094394, + "flos": 17781805979640.0, + "grad_norm": 1.8847677708773107, + "language_loss": 0.7510128, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.77483672, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13305664, + "step": 9805, + "time_per_iteration": 2.778334856033325 + }, + { + "auxiliary_loss_clip": 0.01372541, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.25390506, + "balance_loss_mlp": 1.02112186, + "epoch": 0.5895686156621073, + "flos": 20125973797920.0, + "grad_norm": 1.7481939960022468, + "language_loss": 0.78455532, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.80863893, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.14703369, + "step": 9806, + "time_per_iteration": 2.8540618419647217 + }, + { + "auxiliary_loss_clip": 0.0135394, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.24252319, + "balance_loss_mlp": 1.01991415, + "epoch": 0.5896287389147753, + "flos": 20854960319520.0, + "grad_norm": 2.030597935612896, + "language_loss": 0.77246815, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79634368, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 1.11376953, + "router_z_loss_mlp": 0.137146, + "step": 9807, + "time_per_iteration": 2.7758371829986572 + }, + { + "auxiliary_loss_clip": 0.0136883, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.25376081, + "balance_loss_mlp": 1.01579547, + "epoch": 0.5896888621674432, + "flos": 14541958011240.0, + "grad_norm": 1.6393745005400986, + "language_loss": 0.74857163, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.77255529, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13757324, + "step": 9808, + "time_per_iteration": 2.6953835487365723 + }, + { + "auxiliary_loss_clip": 0.01363424, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.24906349, + "balance_loss_mlp": 1.01339078, + "epoch": 0.5897489854201112, + "flos": 20891693904120.0, + "grad_norm": 2.0561753902415427, + "language_loss": 0.72612447, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.75003779, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14520264, + "step": 9809, + "time_per_iteration": 4.239224433898926 + }, + { + "auxiliary_loss_clip": 0.01361854, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.24614489, + "balance_loss_mlp": 1.02113271, + "epoch": 0.5898091086727792, + "flos": 20015976085920.0, + "grad_norm": 1.8535938494048336, + "language_loss": 0.82588041, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84985375, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.14355469, + "step": 9810, + "time_per_iteration": 2.8120882511138916 + }, + { + "auxiliary_loss_clip": 0.01353153, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.24377656, + "balance_loss_mlp": 1.01593494, + "epoch": 0.5898692319254472, + "flos": 16257594055320.0, + "grad_norm": 1.588669002873118, + "language_loss": 0.8144272, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83824903, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13104248, + "step": 9811, + "time_per_iteration": 4.155629396438599 + }, + { + "auxiliary_loss_clip": 0.0136559, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.25016427, + "balance_loss_mlp": 1.01493597, + "epoch": 0.5899293551781152, + "flos": 20453367998880.0, + "grad_norm": 1.6742321416730823, + "language_loss": 0.77086306, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79481268, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14416504, + "step": 9812, + "time_per_iteration": 2.7638635635375977 + }, + { + "auxiliary_loss_clip": 0.013625, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.25132084, + "balance_loss_mlp": 1.01762807, + "epoch": 0.5899894784307831, + "flos": 13885748316720.0, + "grad_norm": 1.6568756065404342, + "language_loss": 0.70800877, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.73192859, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.11853027, + "step": 9813, + "time_per_iteration": 4.222770690917969 + }, + { + "auxiliary_loss_clip": 0.01355631, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.24313176, + "balance_loss_mlp": 1.02412415, + "epoch": 0.5900496016834511, + "flos": 20088590479560.0, + "grad_norm": 1.4912567458928978, + "language_loss": 0.72198212, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74591249, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13299561, + "step": 9814, + "time_per_iteration": 2.7645630836486816 + }, + { + "auxiliary_loss_clip": 0.01359671, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.24864626, + "balance_loss_mlp": 1.01913428, + "epoch": 0.590109724936119, + "flos": 20263040304840.0, + "grad_norm": 5.053458486456494, + "language_loss": 0.78669703, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.81062007, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.1350708, + "step": 9815, + "time_per_iteration": 2.7735049724578857 + }, + { + "auxiliary_loss_clip": 0.01371466, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.25404942, + "balance_loss_mlp": 1.02469254, + "epoch": 0.590169848188787, + "flos": 24239346733080.0, + "grad_norm": 13.700475208063736, + "language_loss": 0.76490915, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.78901374, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14282227, + "step": 9816, + "time_per_iteration": 2.8452816009521484 + }, + { + "auxiliary_loss_clip": 0.01354559, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.2432704, + "balance_loss_mlp": 1.02259707, + "epoch": 0.590229971441455, + "flos": 17789156092800.0, + "grad_norm": 1.7590047637361563, + "language_loss": 0.81284547, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83675051, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13360596, + "step": 9817, + "time_per_iteration": 2.6879498958587646 + }, + { + "auxiliary_loss_clip": 0.01365118, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.25158119, + "balance_loss_mlp": 1.02324855, + "epoch": 0.590290094694123, + "flos": 22242024520200.0, + "grad_norm": 1.6982979446594333, + "language_loss": 0.76595163, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78996861, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13348389, + "step": 9818, + "time_per_iteration": 2.7460155487060547 + }, + { + "auxiliary_loss_clip": 0.01357499, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.24754977, + "balance_loss_mlp": 1.01831293, + "epoch": 0.5903502179467909, + "flos": 19103281032960.0, + "grad_norm": 1.8712313861625396, + "language_loss": 0.66907918, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.69296753, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13012695, + "step": 9819, + "time_per_iteration": 4.323061227798462 + }, + { + "auxiliary_loss_clip": 0.01357932, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.24667442, + "balance_loss_mlp": 1.01701331, + "epoch": 0.5904103411994589, + "flos": 24239834033400.0, + "grad_norm": 1.6159948492842557, + "language_loss": 0.78197014, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80585355, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.1340332, + "step": 9820, + "time_per_iteration": 2.7640767097473145 + }, + { + "auxiliary_loss_clip": 0.01176231, + "auxiliary_loss_mlp": 0.01009101, + "balance_loss_clip": 1.12880945, + "balance_loss_mlp": 1.00597787, + "epoch": 0.5904704644521268, + "flos": 64890490756320.0, + "grad_norm": 0.9376883530062378, + "language_loss": 0.65114844, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67300177, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.03112793, + "step": 9821, + "time_per_iteration": 3.273561477661133 + }, + { + "auxiliary_loss_clip": 0.01354099, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.24400902, + "balance_loss_mlp": 1.01999915, + "epoch": 0.5905305877047948, + "flos": 19614789848880.0, + "grad_norm": 1.9566417252541153, + "language_loss": 0.61751962, + "learning_rate": 1.515509618752521e-06, + "loss": 0.64139092, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13024902, + "step": 9822, + "time_per_iteration": 2.746978521347046 + }, + { + "auxiliary_loss_clip": 0.01362845, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.25013423, + "balance_loss_mlp": 1.02231312, + "epoch": 0.5905907109574628, + "flos": 18994339138320.0, + "grad_norm": 1.7197005848325453, + "language_loss": 0.82678497, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.85077602, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13934326, + "step": 9823, + "time_per_iteration": 2.796250581741333 + }, + { + "auxiliary_loss_clip": 0.01353483, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.243451, + "balance_loss_mlp": 1.0150373, + "epoch": 0.5906508342101308, + "flos": 22205534585760.0, + "grad_norm": 1.9744570202389804, + "language_loss": 0.73283994, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75666058, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13562012, + "step": 9824, + "time_per_iteration": 2.7968366146087646 + }, + { + "auxiliary_loss_clip": 0.0137908, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.25774598, + "balance_loss_mlp": 1.02201366, + "epoch": 0.5907109574627988, + "flos": 20891612687400.0, + "grad_norm": 2.088899916282801, + "language_loss": 0.83415532, + "learning_rate": 1.514376116721693e-06, + "loss": 0.85832357, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.1572876, + "step": 9825, + "time_per_iteration": 2.735687017440796 + }, + { + "auxiliary_loss_clip": 0.01343808, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.23588109, + "balance_loss_mlp": 1.01900291, + "epoch": 0.5907710807154667, + "flos": 21511616706000.0, + "grad_norm": 1.7439660214240915, + "language_loss": 0.77018833, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79393333, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.11700439, + "step": 9826, + "time_per_iteration": 2.828899621963501 + }, + { + "auxiliary_loss_clip": 0.01348427, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.23830342, + "balance_loss_mlp": 1.01871002, + "epoch": 0.5908312039681347, + "flos": 22023531605520.0, + "grad_norm": 2.097546552395404, + "language_loss": 0.72017634, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74397552, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.12792969, + "step": 9827, + "time_per_iteration": 2.751483917236328 + }, + { + "auxiliary_loss_clip": 0.01354715, + "auxiliary_loss_mlp": 0.01026679, + "balance_loss_clip": 1.2426374, + "balance_loss_mlp": 1.01418555, + "epoch": 0.5908913272208026, + "flos": 18484251615000.0, + "grad_norm": 1.5837481827776627, + "language_loss": 0.79746765, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.82128155, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12506104, + "step": 9828, + "time_per_iteration": 2.7686707973480225 + }, + { + "auxiliary_loss_clip": 0.01359615, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.24619937, + "balance_loss_mlp": 1.01736975, + "epoch": 0.5909514504734706, + "flos": 12315950185320.0, + "grad_norm": 2.7017304458111746, + "language_loss": 0.88575119, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90966827, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.1472168, + "step": 9829, + "time_per_iteration": 2.716803550720215 + }, + { + "auxiliary_loss_clip": 0.01174184, + "auxiliary_loss_mlp": 0.01006823, + "balance_loss_clip": 1.12732613, + "balance_loss_mlp": 1.00395036, + "epoch": 0.5910115737261386, + "flos": 70229107266840.0, + "grad_norm": 0.9324578538800516, + "language_loss": 0.57925057, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.60106063, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02868652, + "step": 9830, + "time_per_iteration": 3.1845390796661377 + }, + { + "auxiliary_loss_clip": 0.01365697, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.24734366, + "balance_loss_mlp": 1.0166862, + "epoch": 0.5910716969788066, + "flos": 22022841263400.0, + "grad_norm": 2.193144036146183, + "language_loss": 0.76233447, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78630853, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.15008545, + "step": 9831, + "time_per_iteration": 2.9167490005493164 + }, + { + "auxiliary_loss_clip": 0.01342547, + "auxiliary_loss_mlp": 0.01028812, + "balance_loss_clip": 1.23535752, + "balance_loss_mlp": 1.01561522, + "epoch": 0.5911318202314745, + "flos": 21256633856880.0, + "grad_norm": 1.6733409881735135, + "language_loss": 0.77639717, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80011082, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13201904, + "step": 9832, + "time_per_iteration": 2.7898664474487305 + }, + { + "auxiliary_loss_clip": 0.01351321, + "auxiliary_loss_mlp": 0.01026101, + "balance_loss_clip": 1.24087703, + "balance_loss_mlp": 1.01301146, + "epoch": 0.5911919434841425, + "flos": 17826092719200.0, + "grad_norm": 1.8220575934178254, + "language_loss": 0.84190369, + "learning_rate": 1.511354255945847e-06, + "loss": 0.86567789, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13098145, + "step": 9833, + "time_per_iteration": 2.8951334953308105 + }, + { + "auxiliary_loss_clip": 0.0135888, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.24748182, + "balance_loss_mlp": 1.01853514, + "epoch": 0.5912520667368104, + "flos": 20379494746080.0, + "grad_norm": 1.4701143807317447, + "language_loss": 0.74324358, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76715326, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13555908, + "step": 9834, + "time_per_iteration": 2.7771201133728027 + }, + { + "auxiliary_loss_clip": 0.01353696, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.24173355, + "balance_loss_mlp": 1.01966143, + "epoch": 0.5913121899894784, + "flos": 17934994005480.0, + "grad_norm": 3.6502445008427222, + "language_loss": 0.7852158, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80908322, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1340332, + "step": 9835, + "time_per_iteration": 2.7282421588897705 + }, + { + "auxiliary_loss_clip": 0.01360698, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.24667084, + "balance_loss_mlp": 1.0208919, + "epoch": 0.5913723132421465, + "flos": 22131986199840.0, + "grad_norm": 1.8956135978578335, + "language_loss": 0.73795193, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76190126, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13354492, + "step": 9836, + "time_per_iteration": 2.720146656036377 + }, + { + "auxiliary_loss_clip": 0.01361557, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.24873078, + "balance_loss_mlp": 1.01623046, + "epoch": 0.5914324364948144, + "flos": 15701148766080.0, + "grad_norm": 2.8135180898127126, + "language_loss": 0.82686442, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.85078025, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13781738, + "step": 9837, + "time_per_iteration": 2.8865885734558105 + }, + { + "auxiliary_loss_clip": 0.01361801, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.24830174, + "balance_loss_mlp": 1.01634359, + "epoch": 0.5914925597474824, + "flos": 22752477518760.0, + "grad_norm": 1.7400639882139632, + "language_loss": 0.79593664, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81985497, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13696289, + "step": 9838, + "time_per_iteration": 2.817615509033203 + }, + { + "auxiliary_loss_clip": 0.01357944, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.24547315, + "balance_loss_mlp": 1.02147126, + "epoch": 0.5915526830001503, + "flos": 18296928939600.0, + "grad_norm": 1.8564944535922105, + "language_loss": 0.69989276, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.72382104, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13415527, + "step": 9839, + "time_per_iteration": 2.7082583904266357 + }, + { + "auxiliary_loss_clip": 0.01361155, + "auxiliary_loss_mlp": 0.01037645, + "balance_loss_clip": 1.24627018, + "balance_loss_mlp": 1.024472, + "epoch": 0.5916128062528183, + "flos": 17023476594960.0, + "grad_norm": 1.8906841782373796, + "language_loss": 0.66066682, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68465483, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.1315918, + "step": 9840, + "time_per_iteration": 2.7348053455352783 + }, + { + "auxiliary_loss_clip": 0.01361018, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.24706006, + "balance_loss_mlp": 1.01768851, + "epoch": 0.5916729295054862, + "flos": 24759667563120.0, + "grad_norm": 1.6755527560318235, + "language_loss": 0.81950486, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.84343094, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13891602, + "step": 9841, + "time_per_iteration": 2.7739460468292236 + }, + { + "auxiliary_loss_clip": 0.01349859, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.23986554, + "balance_loss_mlp": 1.01687384, + "epoch": 0.5917330527581542, + "flos": 15962101044120.0, + "grad_norm": 1.6715448941069586, + "language_loss": 0.69225425, + "learning_rate": 1.507956080444291e-06, + "loss": 0.7160486, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12695312, + "step": 9842, + "time_per_iteration": 2.7882001399993896 + }, + { + "auxiliary_loss_clip": 0.01355235, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.24206424, + "balance_loss_mlp": 1.02280402, + "epoch": 0.5917931760108222, + "flos": 23805406530720.0, + "grad_norm": 1.959492151644549, + "language_loss": 0.83022094, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85413539, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13397217, + "step": 9843, + "time_per_iteration": 2.835554838180542 + }, + { + "auxiliary_loss_clip": 0.01357056, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.24323249, + "balance_loss_mlp": 1.01493049, + "epoch": 0.5918532992634902, + "flos": 23253712419600.0, + "grad_norm": 2.1488696979780437, + "language_loss": 0.82304597, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84691143, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14569092, + "step": 9844, + "time_per_iteration": 2.8299777507781982 + }, + { + "auxiliary_loss_clip": 0.01361438, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.24832201, + "balance_loss_mlp": 1.02015245, + "epoch": 0.5919134225161581, + "flos": 19504548486720.0, + "grad_norm": 1.740216374038052, + "language_loss": 0.74419588, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76814711, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13525391, + "step": 9845, + "time_per_iteration": 2.832747220993042 + }, + { + "auxiliary_loss_clip": 0.01357118, + "auxiliary_loss_mlp": 0.01027715, + "balance_loss_clip": 1.24228215, + "balance_loss_mlp": 1.01358867, + "epoch": 0.5919735457688261, + "flos": 38808170497440.0, + "grad_norm": 1.8337456171762938, + "language_loss": 0.64111793, + "learning_rate": 1.506446264718213e-06, + "loss": 0.66496623, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14135742, + "step": 9846, + "time_per_iteration": 4.4425108432769775 + }, + { + "auxiliary_loss_clip": 0.01335705, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.23087287, + "balance_loss_mlp": 1.0135318, + "epoch": 0.592033669021494, + "flos": 22169125868040.0, + "grad_norm": 1.6240724392556583, + "language_loss": 0.76351947, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78712487, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.11297607, + "step": 9847, + "time_per_iteration": 2.854799509048462 + }, + { + "auxiliary_loss_clip": 0.01355569, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.24245453, + "balance_loss_mlp": 1.01547503, + "epoch": 0.592093792274162, + "flos": 22716150017760.0, + "grad_norm": 1.6981481716334441, + "language_loss": 0.62892997, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.65278065, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14019775, + "step": 9848, + "time_per_iteration": 2.8450779914855957 + }, + { + "auxiliary_loss_clip": 0.01355334, + "auxiliary_loss_mlp": 0.0103116, + "balance_loss_clip": 1.24230492, + "balance_loss_mlp": 1.01824939, + "epoch": 0.59215391552683, + "flos": 22534147037520.0, + "grad_norm": 1.6030261445572727, + "language_loss": 0.76213413, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.78599906, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.12921143, + "step": 9849, + "time_per_iteration": 4.271022319793701 + }, + { + "auxiliary_loss_clip": 0.01350971, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.23802924, + "balance_loss_mlp": 1.02023578, + "epoch": 0.592214038779498, + "flos": 24504441063840.0, + "grad_norm": 1.799933582535807, + "language_loss": 0.75606537, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77991545, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13793945, + "step": 9850, + "time_per_iteration": 2.857149362564087 + }, + { + "auxiliary_loss_clip": 0.01354334, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.24305844, + "balance_loss_mlp": 1.02205276, + "epoch": 0.592274162032166, + "flos": 21835884063240.0, + "grad_norm": 1.739227575714166, + "language_loss": 0.76126337, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.78515774, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13049316, + "step": 9851, + "time_per_iteration": 4.322427034378052 + }, + { + "auxiliary_loss_clip": 0.013568, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.24315143, + "balance_loss_mlp": 1.01746726, + "epoch": 0.5923342852848339, + "flos": 24613545391920.0, + "grad_norm": 1.7511778076823719, + "language_loss": 0.70521796, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72909373, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13323975, + "step": 9852, + "time_per_iteration": 2.7635602951049805 + }, + { + "auxiliary_loss_clip": 0.01361608, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.24519861, + "balance_loss_mlp": 1.02387261, + "epoch": 0.5923944085375019, + "flos": 19942955608680.0, + "grad_norm": 1.5757909653567919, + "language_loss": 0.80267435, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82667148, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14227295, + "step": 9853, + "time_per_iteration": 2.763698101043701 + }, + { + "auxiliary_loss_clip": 0.01345682, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.23654366, + "balance_loss_mlp": 1.0188477, + "epoch": 0.5924545317901698, + "flos": 28664496631800.0, + "grad_norm": 1.736261326729041, + "language_loss": 0.67595041, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69971675, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12109375, + "step": 9854, + "time_per_iteration": 2.7857112884521484 + }, + { + "auxiliary_loss_clip": 0.01352075, + "auxiliary_loss_mlp": 0.01025497, + "balance_loss_clip": 1.24062324, + "balance_loss_mlp": 1.01147807, + "epoch": 0.5925146550428378, + "flos": 19869650872920.0, + "grad_norm": 1.8859997783471079, + "language_loss": 0.88982129, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.91359705, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14038086, + "step": 9855, + "time_per_iteration": 2.7137794494628906 + }, + { + "auxiliary_loss_clip": 0.01345008, + "auxiliary_loss_mlp": 0.01027051, + "balance_loss_clip": 1.23573661, + "balance_loss_mlp": 1.01406872, + "epoch": 0.5925747782955058, + "flos": 15127299471600.0, + "grad_norm": 3.8004099251418957, + "language_loss": 0.87028724, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.8940078, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12982178, + "step": 9856, + "time_per_iteration": 2.7107203006744385 + }, + { + "auxiliary_loss_clip": 0.01358032, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.24446356, + "balance_loss_mlp": 1.02005196, + "epoch": 0.5926349015481738, + "flos": 18410012886960.0, + "grad_norm": 1.7592282657193046, + "language_loss": 0.77372622, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79764247, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13543701, + "step": 9857, + "time_per_iteration": 4.3518898487091064 + }, + { + "auxiliary_loss_clip": 0.0135678, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.24578381, + "balance_loss_mlp": 1.01715803, + "epoch": 0.5926950248008417, + "flos": 23116199220720.0, + "grad_norm": 2.0478099972950665, + "language_loss": 0.65000117, + "learning_rate": 1.501918617901419e-06, + "loss": 0.67387462, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13409424, + "step": 9858, + "time_per_iteration": 2.740748882293701 + }, + { + "auxiliary_loss_clip": 0.01349293, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.23963261, + "balance_loss_mlp": 1.01716554, + "epoch": 0.5927551480535097, + "flos": 28039254134760.0, + "grad_norm": 1.6789210508656884, + "language_loss": 0.77300787, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79680836, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13592529, + "step": 9859, + "time_per_iteration": 2.8587093353271484 + }, + { + "auxiliary_loss_clip": 0.0135853, + "auxiliary_loss_mlp": 0.01038591, + "balance_loss_clip": 1.2444979, + "balance_loss_mlp": 1.02403545, + "epoch": 0.5928152713061776, + "flos": 21803698614960.0, + "grad_norm": 2.0104989163227107, + "language_loss": 0.75527447, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77924573, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14550781, + "step": 9860, + "time_per_iteration": 2.9212749004364014 + }, + { + "auxiliary_loss_clip": 0.01350278, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.23979497, + "balance_loss_mlp": 1.0201323, + "epoch": 0.5928753945588456, + "flos": 24322275650160.0, + "grad_norm": 1.8977699137748596, + "language_loss": 0.76405454, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78788257, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12390137, + "step": 9861, + "time_per_iteration": 2.865231513977051 + }, + { + "auxiliary_loss_clip": 0.01345099, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.23518586, + "balance_loss_mlp": 1.01713943, + "epoch": 0.5929355178115137, + "flos": 26469699653520.0, + "grad_norm": 1.513638615516511, + "language_loss": 0.71202934, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.73578191, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13037109, + "step": 9862, + "time_per_iteration": 2.815002679824829 + }, + { + "auxiliary_loss_clip": 0.01352765, + "auxiliary_loss_mlp": 0.0102637, + "balance_loss_clip": 1.23998463, + "balance_loss_mlp": 1.01307821, + "epoch": 0.5929956410641816, + "flos": 24970526106120.0, + "grad_norm": 3.3565516956506127, + "language_loss": 0.78233576, + "learning_rate": 1.500032899685832e-06, + "loss": 0.80612707, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.1328125, + "step": 9863, + "time_per_iteration": 2.9036083221435547 + }, + { + "auxiliary_loss_clip": 0.01354528, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.24269986, + "balance_loss_mlp": 1.02057981, + "epoch": 0.5930557643168496, + "flos": 26213254903440.0, + "grad_norm": 1.7839446642272194, + "language_loss": 0.70229304, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72617972, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13549805, + "step": 9864, + "time_per_iteration": 2.84302020072937 + }, + { + "auxiliary_loss_clip": 0.01354245, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.24178672, + "balance_loss_mlp": 1.0186193, + "epoch": 0.5931158875695175, + "flos": 27860702865120.0, + "grad_norm": 1.5580062475610212, + "language_loss": 0.67531598, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69918436, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13977051, + "step": 9865, + "time_per_iteration": 2.8538460731506348 + }, + { + "auxiliary_loss_clip": 0.0135576, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.24196553, + "balance_loss_mlp": 1.01881647, + "epoch": 0.5931760108221855, + "flos": 15417838262880.0, + "grad_norm": 2.002559963935045, + "language_loss": 0.78666675, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.81055117, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13873291, + "step": 9866, + "time_per_iteration": 2.8098411560058594 + }, + { + "auxiliary_loss_clip": 0.01341421, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.23318684, + "balance_loss_mlp": 1.01876807, + "epoch": 0.5932361340748534, + "flos": 30194393726520.0, + "grad_norm": 2.0641114843180475, + "language_loss": 0.72312212, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74685729, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13317871, + "step": 9867, + "time_per_iteration": 2.912449598312378 + }, + { + "auxiliary_loss_clip": 0.01349155, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.23889494, + "balance_loss_mlp": 1.01327586, + "epoch": 0.5932962573275214, + "flos": 20162463732360.0, + "grad_norm": 1.4324106926076048, + "language_loss": 0.66992491, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.6937021, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 1.10302734, + "router_z_loss_mlp": 0.15307617, + "step": 9868, + "time_per_iteration": 2.8985164165496826 + }, + { + "auxiliary_loss_clip": 0.01354256, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.24052596, + "balance_loss_mlp": 1.01984501, + "epoch": 0.5933563805801894, + "flos": 25451230158000.0, + "grad_norm": 1.5180331744883224, + "language_loss": 0.75656688, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.78044969, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14178467, + "step": 9869, + "time_per_iteration": 2.887989044189453 + }, + { + "auxiliary_loss_clip": 0.01356858, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.24323273, + "balance_loss_mlp": 1.01996505, + "epoch": 0.5934165038328574, + "flos": 60005289479760.0, + "grad_norm": 1.5271336604587356, + "language_loss": 0.74185419, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76576638, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.1439209, + "step": 9870, + "time_per_iteration": 3.2970874309539795 + }, + { + "auxiliary_loss_clip": 0.01356495, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.24217701, + "balance_loss_mlp": 1.0147016, + "epoch": 0.5934766270855253, + "flos": 24425248115880.0, + "grad_norm": 2.108615803984566, + "language_loss": 0.72185785, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74570692, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.137146, + "step": 9871, + "time_per_iteration": 2.9188904762268066 + }, + { + "auxiliary_loss_clip": 0.01356591, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.24197638, + "balance_loss_mlp": 1.01749086, + "epoch": 0.5935367503381933, + "flos": 23518360058400.0, + "grad_norm": 1.8159333507960154, + "language_loss": 0.74809861, + "learning_rate": 1.496639802503271e-06, + "loss": 0.7719869, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14764404, + "step": 9872, + "time_per_iteration": 2.9761922359466553 + }, + { + "auxiliary_loss_clip": 0.01360844, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.24547601, + "balance_loss_mlp": 1.01767612, + "epoch": 0.5935968735908612, + "flos": 18952407683640.0, + "grad_norm": 2.067688979526954, + "language_loss": 0.79441869, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.81835306, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14923096, + "step": 9873, + "time_per_iteration": 2.8074889183044434 + }, + { + "auxiliary_loss_clip": 0.01355388, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.24188221, + "balance_loss_mlp": 1.0153625, + "epoch": 0.5936569968435292, + "flos": 25488897734880.0, + "grad_norm": 1.5082485909128887, + "language_loss": 0.85090256, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87475884, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14880371, + "step": 9874, + "time_per_iteration": 2.8294014930725098 + }, + { + "auxiliary_loss_clip": 0.01174875, + "auxiliary_loss_mlp": 0.01004916, + "balance_loss_clip": 1.12832665, + "balance_loss_mlp": 1.00204265, + "epoch": 0.5937171200961973, + "flos": 66393197231040.0, + "grad_norm": 0.714756319761272, + "language_loss": 0.60052264, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62232059, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.02868652, + "step": 9875, + "time_per_iteration": 3.382884979248047 + }, + { + "auxiliary_loss_clip": 0.01362279, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.24460936, + "balance_loss_mlp": 1.0154376, + "epoch": 0.5937772433488652, + "flos": 14907669522840.0, + "grad_norm": 1.8624593931736737, + "language_loss": 0.77790278, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.80182981, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.14978027, + "step": 9876, + "time_per_iteration": 2.814619302749634 + }, + { + "auxiliary_loss_clip": 0.01346487, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.23825192, + "balance_loss_mlp": 1.01681113, + "epoch": 0.5938373666015332, + "flos": 22566170052360.0, + "grad_norm": 1.5354621106914321, + "language_loss": 0.76007903, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78384358, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13153076, + "step": 9877, + "time_per_iteration": 2.8087682723999023 + }, + { + "auxiliary_loss_clip": 0.01353588, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.24001777, + "balance_loss_mlp": 1.01665783, + "epoch": 0.5938974898542011, + "flos": 18445325178960.0, + "grad_norm": 2.1966179655842555, + "language_loss": 0.81474066, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83858722, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.14416504, + "step": 9878, + "time_per_iteration": 2.849851131439209 + }, + { + "auxiliary_loss_clip": 0.01359887, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.24552011, + "balance_loss_mlp": 1.02039492, + "epoch": 0.5939576131068691, + "flos": 45594770394600.0, + "grad_norm": 1.6201552486061384, + "language_loss": 0.71364331, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73759097, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14477539, + "step": 9879, + "time_per_iteration": 3.029824733734131 + }, + { + "auxiliary_loss_clip": 0.01354079, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.24300909, + "balance_loss_mlp": 1.01801121, + "epoch": 0.594017736359537, + "flos": 23593126695120.0, + "grad_norm": 1.4804652439977122, + "language_loss": 0.58056957, + "learning_rate": 1.493625013742401e-06, + "loss": 0.60442811, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13757324, + "step": 9880, + "time_per_iteration": 2.87577223777771 + }, + { + "auxiliary_loss_clip": 0.01352042, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.2399646, + "balance_loss_mlp": 1.02251172, + "epoch": 0.594077859612205, + "flos": 29462808269880.0, + "grad_norm": 1.9863936017365635, + "language_loss": 0.77457094, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79845405, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13781738, + "step": 9881, + "time_per_iteration": 3.0349864959716797 + }, + { + "auxiliary_loss_clip": 0.01356313, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.24253106, + "balance_loss_mlp": 1.01709926, + "epoch": 0.594137982864873, + "flos": 16804455771600.0, + "grad_norm": 2.0488812543679953, + "language_loss": 0.82871425, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.85259265, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14440918, + "step": 9882, + "time_per_iteration": 2.9666032791137695 + }, + { + "auxiliary_loss_clip": 0.01355085, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.24303913, + "balance_loss_mlp": 1.0250119, + "epoch": 0.594198106117541, + "flos": 12754113657120.0, + "grad_norm": 2.292328243110231, + "language_loss": 0.79350084, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81743264, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13085938, + "step": 9883, + "time_per_iteration": 2.8507237434387207 + }, + { + "auxiliary_loss_clip": 0.01361337, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.2458787, + "balance_loss_mlp": 1.02123451, + "epoch": 0.5942582293702089, + "flos": 21001529182680.0, + "grad_norm": 1.9631318818306853, + "language_loss": 0.74851298, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.7724908, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.15197754, + "step": 9884, + "time_per_iteration": 2.8428354263305664 + }, + { + "auxiliary_loss_clip": 0.01357784, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.2456243, + "balance_loss_mlp": 1.02024961, + "epoch": 0.5943183526228769, + "flos": 28297079569080.0, + "grad_norm": 1.970666650301992, + "language_loss": 0.66570592, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.6896196, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13354492, + "step": 9885, + "time_per_iteration": 4.391482353210449 + }, + { + "auxiliary_loss_clip": 0.0135686, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.2454102, + "balance_loss_mlp": 1.02068174, + "epoch": 0.5943784758755448, + "flos": 26620248135960.0, + "grad_norm": 2.469996018039179, + "language_loss": 0.77624583, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.80016124, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14013672, + "step": 9886, + "time_per_iteration": 2.8524320125579834 + }, + { + "auxiliary_loss_clip": 0.0117696, + "auxiliary_loss_mlp": 0.0101323, + "balance_loss_clip": 1.12975132, + "balance_loss_mlp": 1.01053607, + "epoch": 0.5944385991282128, + "flos": 64205709757560.0, + "grad_norm": 0.8256517105381717, + "language_loss": 0.64560324, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66750509, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.02697754, + "step": 9887, + "time_per_iteration": 3.1795201301574707 + }, + { + "auxiliary_loss_clip": 0.01357912, + "auxiliary_loss_mlp": 0.01038227, + "balance_loss_clip": 1.24595535, + "balance_loss_mlp": 1.0249052, + "epoch": 0.5944987223808808, + "flos": 19576594363320.0, + "grad_norm": 1.7561456964413924, + "language_loss": 0.69519466, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71915609, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13317871, + "step": 9888, + "time_per_iteration": 4.258397579193115 + }, + { + "auxiliary_loss_clip": 0.01356648, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.2447958, + "balance_loss_mlp": 1.02027345, + "epoch": 0.5945588456335488, + "flos": 26183262306600.0, + "grad_norm": 1.6353156514639384, + "language_loss": 0.79430389, + "learning_rate": 1.490234845687366e-06, + "loss": 0.818205, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13208008, + "step": 9889, + "time_per_iteration": 2.8818721771240234 + }, + { + "auxiliary_loss_clip": 0.0135432, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.24365747, + "balance_loss_mlp": 1.01904535, + "epoch": 0.5946189688862168, + "flos": 20451053322360.0, + "grad_norm": 1.6101142804684971, + "language_loss": 0.70920521, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.73306382, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12493896, + "step": 9890, + "time_per_iteration": 4.38239860534668 + }, + { + "auxiliary_loss_clip": 0.01361683, + "auxiliary_loss_mlp": 0.01040324, + "balance_loss_clip": 1.24825835, + "balance_loss_mlp": 1.0260663, + "epoch": 0.5946790921388847, + "flos": 13440681423720.0, + "grad_norm": 1.8331943780759101, + "language_loss": 0.69538736, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71940744, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.14257812, + "step": 9891, + "time_per_iteration": 2.8128480911254883 + }, + { + "auxiliary_loss_clip": 0.01357473, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.24668145, + "balance_loss_mlp": 1.0233779, + "epoch": 0.5947392153915527, + "flos": 20417121714600.0, + "grad_norm": 1.846317982820989, + "language_loss": 0.53673601, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.56067538, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13079834, + "step": 9892, + "time_per_iteration": 2.878460168838501 + }, + { + "auxiliary_loss_clip": 0.01177675, + "auxiliary_loss_mlp": 0.0101783, + "balance_loss_clip": 1.13100767, + "balance_loss_mlp": 1.0153861, + "epoch": 0.5947993386442206, + "flos": 65634786629640.0, + "grad_norm": 0.6985093554224664, + "language_loss": 0.54549026, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56744528, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.02441406, + "step": 9893, + "time_per_iteration": 3.3733584880828857 + }, + { + "auxiliary_loss_clip": 0.01354036, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_clip": 1.24485648, + "balance_loss_mlp": 1.02865911, + "epoch": 0.5948594618968887, + "flos": 23188042055520.0, + "grad_norm": 1.7225943734457496, + "language_loss": 0.74374497, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76770222, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13024902, + "step": 9894, + "time_per_iteration": 2.8428359031677246 + }, + { + "auxiliary_loss_clip": 0.01357987, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.24614215, + "balance_loss_mlp": 1.02331424, + "epoch": 0.5949195851495566, + "flos": 13630846684320.0, + "grad_norm": 1.6746462439841658, + "language_loss": 0.78121281, + "learning_rate": 1.487975602873434e-06, + "loss": 0.80515492, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12890625, + "step": 9895, + "time_per_iteration": 4.379376411437988 + }, + { + "auxiliary_loss_clip": 0.01367041, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.25265503, + "balance_loss_mlp": 1.02301455, + "epoch": 0.5949797084022246, + "flos": 19755186241320.0, + "grad_norm": 1.7406947923079854, + "language_loss": 0.79702801, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.82106793, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13934326, + "step": 9896, + "time_per_iteration": 2.833313226699829 + }, + { + "auxiliary_loss_clip": 0.01362091, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.24869132, + "balance_loss_mlp": 1.02006853, + "epoch": 0.5950398316548925, + "flos": 25779192876000.0, + "grad_norm": 1.5094546601334733, + "language_loss": 0.83493465, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.8588953, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13916016, + "step": 9897, + "time_per_iteration": 2.9584333896636963 + }, + { + "auxiliary_loss_clip": 0.01361935, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.24828565, + "balance_loss_mlp": 1.02370405, + "epoch": 0.5950999549075605, + "flos": 23044193952480.0, + "grad_norm": 1.6973765993625225, + "language_loss": 0.71087337, + "learning_rate": 1.486846243389939e-06, + "loss": 0.73485506, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.12536621, + "step": 9898, + "time_per_iteration": 2.8256587982177734 + }, + { + "auxiliary_loss_clip": 0.01371681, + "auxiliary_loss_mlp": 0.01041017, + "balance_loss_clip": 1.25381446, + "balance_loss_mlp": 1.02573395, + "epoch": 0.5951600781602284, + "flos": 32451977875320.0, + "grad_norm": 2.130460455580024, + "language_loss": 0.64312136, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66724831, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.15264893, + "step": 9899, + "time_per_iteration": 2.9239447116851807 + }, + { + "auxiliary_loss_clip": 0.01359202, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.24811506, + "balance_loss_mlp": 1.02185202, + "epoch": 0.5952202014128964, + "flos": 23805568964160.0, + "grad_norm": 1.7761999162174749, + "language_loss": 0.72279972, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74673641, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12609863, + "step": 9900, + "time_per_iteration": 2.86686372756958 + }, + { + "auxiliary_loss_clip": 0.01350483, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.24093676, + "balance_loss_mlp": 1.01462519, + "epoch": 0.5952803246655644, + "flos": 22497332236200.0, + "grad_norm": 1.8938865233589635, + "language_loss": 0.84915203, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.87294269, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13964844, + "step": 9901, + "time_per_iteration": 2.8290271759033203 + }, + { + "auxiliary_loss_clip": 0.01179815, + "auxiliary_loss_mlp": 0.0100287, + "balance_loss_clip": 1.13210547, + "balance_loss_mlp": 0.9997471, + "epoch": 0.5953404479182324, + "flos": 51247275503400.0, + "grad_norm": 0.8214272997913843, + "language_loss": 0.58227444, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.6041013, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.03125, + "step": 9902, + "time_per_iteration": 3.156104326248169 + }, + { + "auxiliary_loss_clip": 0.01362202, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.2491765, + "balance_loss_mlp": 1.02194929, + "epoch": 0.5954005711709004, + "flos": 23117945380200.0, + "grad_norm": 1.65682498755191, + "language_loss": 0.77362603, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79760396, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13653564, + "step": 9903, + "time_per_iteration": 3.0412580966949463 + }, + { + "auxiliary_loss_clip": 0.01361466, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.24902892, + "balance_loss_mlp": 1.01934385, + "epoch": 0.5954606944235683, + "flos": 35961549527520.0, + "grad_norm": 2.146554814191254, + "language_loss": 0.7847017, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.80863917, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12945557, + "step": 9904, + "time_per_iteration": 3.080148935317993 + }, + { + "auxiliary_loss_clip": 0.01367458, + "auxiliary_loss_mlp": 0.01037466, + "balance_loss_clip": 1.25145102, + "balance_loss_mlp": 1.02385783, + "epoch": 0.5955208176762363, + "flos": 30449539009080.0, + "grad_norm": 1.3520642186338037, + "language_loss": 0.72929525, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75334454, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.13604736, + "step": 9905, + "time_per_iteration": 2.9151382446289062 + }, + { + "auxiliary_loss_clip": 0.01355321, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.24227667, + "balance_loss_mlp": 1.01858759, + "epoch": 0.5955809409289042, + "flos": 17644698864360.0, + "grad_norm": 1.7859303849082404, + "language_loss": 0.70501924, + "learning_rate": 1.483835475336295e-06, + "loss": 0.72889608, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13769531, + "step": 9906, + "time_per_iteration": 2.8029303550720215 + }, + { + "auxiliary_loss_clip": 0.01358397, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.2449429, + "balance_loss_mlp": 1.02210414, + "epoch": 0.5956410641815723, + "flos": 24285257807040.0, + "grad_norm": 1.6724386243739817, + "language_loss": 0.75005406, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77399963, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14038086, + "step": 9907, + "time_per_iteration": 2.967186450958252 + }, + { + "auxiliary_loss_clip": 0.01359842, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.24731135, + "balance_loss_mlp": 1.0237968, + "epoch": 0.5957011874342402, + "flos": 35740863761400.0, + "grad_norm": 2.7512535553738124, + "language_loss": 0.6743778, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69834483, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13067627, + "step": 9908, + "time_per_iteration": 2.97047758102417 + }, + { + "auxiliary_loss_clip": 0.0135693, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.24514008, + "balance_loss_mlp": 1.01829839, + "epoch": 0.5957613106869082, + "flos": 21249121310280.0, + "grad_norm": 3.252656582368551, + "language_loss": 0.7616955, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78557992, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13232422, + "step": 9909, + "time_per_iteration": 2.872568130493164 + }, + { + "auxiliary_loss_clip": 0.01178681, + "auxiliary_loss_mlp": 0.01001727, + "balance_loss_clip": 1.13076758, + "balance_loss_mlp": 0.99872333, + "epoch": 0.5958214339395761, + "flos": 65955805318080.0, + "grad_norm": 0.9265230498319552, + "language_loss": 0.73485672, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75666082, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.0300293, + "step": 9910, + "time_per_iteration": 3.352889060974121 + }, + { + "auxiliary_loss_clip": 0.01360467, + "auxiliary_loss_mlp": 0.01029209, + "balance_loss_clip": 1.24687159, + "balance_loss_mlp": 1.01537466, + "epoch": 0.5958815571922441, + "flos": 23223516780960.0, + "grad_norm": 1.7182781973493733, + "language_loss": 0.69928312, + "learning_rate": 1.481954380961799e-06, + "loss": 0.72317994, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.1383667, + "step": 9911, + "time_per_iteration": 2.85166335105896 + }, + { + "auxiliary_loss_clip": 0.0137643, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.25873435, + "balance_loss_mlp": 1.02109683, + "epoch": 0.595941680444912, + "flos": 16542163417680.0, + "grad_norm": 2.0431271128695703, + "language_loss": 0.6577664, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.68188095, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.13934326, + "step": 9912, + "time_per_iteration": 2.8515102863311768 + }, + { + "auxiliary_loss_clip": 0.01360118, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.24773955, + "balance_loss_mlp": 1.01928663, + "epoch": 0.59600180369758, + "flos": 27824700231000.0, + "grad_norm": 2.279394002920519, + "language_loss": 0.72980022, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75373065, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13641357, + "step": 9913, + "time_per_iteration": 2.883687973022461 + }, + { + "auxiliary_loss_clip": 0.01367835, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.25031352, + "balance_loss_mlp": 1.0189935, + "epoch": 0.596061926950248, + "flos": 29496861702720.0, + "grad_norm": 1.9942222636439921, + "language_loss": 0.79671508, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.82072341, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.14019775, + "step": 9914, + "time_per_iteration": 2.9064462184906006 + }, + { + "auxiliary_loss_clip": 0.01353251, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.24193168, + "balance_loss_mlp": 1.02152276, + "epoch": 0.596122050202916, + "flos": 16841189356200.0, + "grad_norm": 1.7695235856137708, + "language_loss": 0.67863727, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.70251596, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13116455, + "step": 9915, + "time_per_iteration": 2.839869260787964 + }, + { + "auxiliary_loss_clip": 0.01356164, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.24278736, + "balance_loss_mlp": 1.02529192, + "epoch": 0.596182173455584, + "flos": 21001651007760.0, + "grad_norm": 1.755494804441325, + "language_loss": 0.78557026, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.80951166, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.12701416, + "step": 9916, + "time_per_iteration": 2.89208722114563 + }, + { + "auxiliary_loss_clip": 0.01361725, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.24711442, + "balance_loss_mlp": 1.01995766, + "epoch": 0.5962422967082519, + "flos": 16069337387640.0, + "grad_norm": 1.7683965516134117, + "language_loss": 0.83072686, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.85468054, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13659668, + "step": 9917, + "time_per_iteration": 2.865149974822998 + }, + { + "auxiliary_loss_clip": 0.01360357, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.24879336, + "balance_loss_mlp": 1.0220468, + "epoch": 0.5963024199609199, + "flos": 12170802614760.0, + "grad_norm": 1.7733630422824, + "language_loss": 0.77356434, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79751301, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12475586, + "step": 9918, + "time_per_iteration": 2.87119197845459 + }, + { + "auxiliary_loss_clip": 0.01354744, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.24281847, + "balance_loss_mlp": 1.02917004, + "epoch": 0.5963625432135878, + "flos": 28080129772080.0, + "grad_norm": 1.517396097296116, + "language_loss": 0.78675783, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81072968, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13262939, + "step": 9919, + "time_per_iteration": 2.8960776329040527 + }, + { + "auxiliary_loss_clip": 0.01357098, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.24384701, + "balance_loss_mlp": 1.01836324, + "epoch": 0.5964226664662559, + "flos": 19864493611200.0, + "grad_norm": 1.789395924418081, + "language_loss": 0.78182042, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.80571908, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.144104, + "step": 9920, + "time_per_iteration": 2.8124983310699463 + }, + { + "auxiliary_loss_clip": 0.01363298, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.24978435, + "balance_loss_mlp": 1.01986015, + "epoch": 0.5964827897189238, + "flos": 12936482112600.0, + "grad_norm": 2.198586532168537, + "language_loss": 0.82556033, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84953016, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13818359, + "step": 9921, + "time_per_iteration": 2.841519355773926 + }, + { + "auxiliary_loss_clip": 0.013507, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.23949838, + "balance_loss_mlp": 1.02250981, + "epoch": 0.5965429129715918, + "flos": 18155436121440.0, + "grad_norm": 6.156423410403676, + "language_loss": 0.80960608, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.83347148, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13342285, + "step": 9922, + "time_per_iteration": 2.827897310256958 + }, + { + "auxiliary_loss_clip": 0.01350208, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.2391274, + "balance_loss_mlp": 1.01787162, + "epoch": 0.5966030362242597, + "flos": 21768386322960.0, + "grad_norm": 2.218265460570061, + "language_loss": 0.76817256, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79198408, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.1307373, + "step": 9923, + "time_per_iteration": 2.7829792499542236 + }, + { + "auxiliary_loss_clip": 0.01366101, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.24961424, + "balance_loss_mlp": 1.02377009, + "epoch": 0.5966631594769277, + "flos": 18811889466120.0, + "grad_norm": 2.1569708001890477, + "language_loss": 0.76044095, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78449005, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.15045166, + "step": 9924, + "time_per_iteration": 3.0539965629577637 + }, + { + "auxiliary_loss_clip": 0.01350452, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.24184287, + "balance_loss_mlp": 1.01905584, + "epoch": 0.5967232827295956, + "flos": 14068238597280.0, + "grad_norm": 1.8747161891535624, + "language_loss": 0.66585338, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68968976, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.14117432, + "step": 9925, + "time_per_iteration": 4.320198059082031 + }, + { + "auxiliary_loss_clip": 0.01350884, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.24190044, + "balance_loss_mlp": 1.01861191, + "epoch": 0.5967834059822636, + "flos": 17242700460120.0, + "grad_norm": 1.7812292604178166, + "language_loss": 0.71712565, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74095374, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13293457, + "step": 9926, + "time_per_iteration": 4.230662107467651 + }, + { + "auxiliary_loss_clip": 0.01356694, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.24310553, + "balance_loss_mlp": 1.01954055, + "epoch": 0.5968435292349316, + "flos": 42527666700360.0, + "grad_norm": 2.1851352369115586, + "language_loss": 0.70380294, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.7277081, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.14282227, + "step": 9927, + "time_per_iteration": 2.9626402854919434 + }, + { + "auxiliary_loss_clip": 0.01363911, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.24598968, + "balance_loss_mlp": 1.02206445, + "epoch": 0.5969036524875996, + "flos": 37637650010160.0, + "grad_norm": 1.5634920285518172, + "language_loss": 0.63887084, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.66288036, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.1496582, + "step": 9928, + "time_per_iteration": 2.9226155281066895 + }, + { + "auxiliary_loss_clip": 0.01351732, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.23993826, + "balance_loss_mlp": 1.02212703, + "epoch": 0.5969637757402676, + "flos": 23153541930720.0, + "grad_norm": 1.4419892379643713, + "language_loss": 0.69675732, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.72062987, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13397217, + "step": 9929, + "time_per_iteration": 4.295178651809692 + }, + { + "auxiliary_loss_clip": 0.013444, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.23610795, + "balance_loss_mlp": 1.02631056, + "epoch": 0.5970238989929355, + "flos": 24025808038320.0, + "grad_norm": 2.1582296205373757, + "language_loss": 0.77056116, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.79439759, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12945557, + "step": 9930, + "time_per_iteration": 2.7583975791931152 + }, + { + "auxiliary_loss_clip": 0.01365056, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.24827945, + "balance_loss_mlp": 1.02096915, + "epoch": 0.5970840222456035, + "flos": 19431568617840.0, + "grad_norm": 1.6152613431377434, + "language_loss": 0.69418955, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.7182039, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.1539917, + "step": 9931, + "time_per_iteration": 2.760708808898926 + }, + { + "auxiliary_loss_clip": 0.01178549, + "auxiliary_loss_mlp": 0.01008481, + "balance_loss_clip": 1.13120282, + "balance_loss_mlp": 1.00624013, + "epoch": 0.5971441454982714, + "flos": 62989887321720.0, + "grad_norm": 0.8563278362533061, + "language_loss": 0.64281714, + "learning_rate": 1.474059168257065e-06, + "loss": 0.6646874, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.02246094, + "step": 9932, + "time_per_iteration": 3.173102378845215 + }, + { + "auxiliary_loss_clip": 0.01354876, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.24286556, + "balance_loss_mlp": 1.02071548, + "epoch": 0.5972042687509395, + "flos": 20271040151760.0, + "grad_norm": 1.7558556284898919, + "language_loss": 0.74237084, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76626843, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14160156, + "step": 9933, + "time_per_iteration": 2.7878715991973877 + }, + { + "auxiliary_loss_clip": 0.01179927, + "auxiliary_loss_mlp": 0.01000782, + "balance_loss_clip": 1.1324228, + "balance_loss_mlp": 0.99808782, + "epoch": 0.5972643920036074, + "flos": 71670529080360.0, + "grad_norm": 0.6768358166266475, + "language_loss": 0.52078325, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54259038, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.02697754, + "step": 9934, + "time_per_iteration": 4.742334604263306 + }, + { + "auxiliary_loss_clip": 0.01179379, + "auxiliary_loss_mlp": 0.01009708, + "balance_loss_clip": 1.13218403, + "balance_loss_mlp": 1.0070852, + "epoch": 0.5973245152562754, + "flos": 56906382793680.0, + "grad_norm": 0.8283298210407584, + "language_loss": 0.54168886, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56357974, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.02624512, + "step": 9935, + "time_per_iteration": 3.130713701248169 + }, + { + "auxiliary_loss_clip": 0.01359722, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.24525082, + "balance_loss_mlp": 1.0217905, + "epoch": 0.5973846385089433, + "flos": 24168925190880.0, + "grad_norm": 1.6442049976945652, + "language_loss": 0.65691495, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68087661, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.14642334, + "step": 9936, + "time_per_iteration": 2.822629451751709 + }, + { + "auxiliary_loss_clip": 0.01364318, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.25027633, + "balance_loss_mlp": 1.0208149, + "epoch": 0.5974447617616113, + "flos": 17674407202680.0, + "grad_norm": 1.9554651231425406, + "language_loss": 0.67570388, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69969523, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13983154, + "step": 9937, + "time_per_iteration": 2.795421838760376 + }, + { + "auxiliary_loss_clip": 0.01361602, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.24543047, + "balance_loss_mlp": 1.02201986, + "epoch": 0.5975048850142792, + "flos": 22897584480960.0, + "grad_norm": 1.9115359888002215, + "language_loss": 0.77872437, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.80270588, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14538574, + "step": 9938, + "time_per_iteration": 2.7638988494873047 + }, + { + "auxiliary_loss_clip": 0.01354188, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.24194813, + "balance_loss_mlp": 1.01867449, + "epoch": 0.5975650082669473, + "flos": 24358521934440.0, + "grad_norm": 1.5367804019168618, + "language_loss": 0.76071382, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78457803, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13555908, + "step": 9939, + "time_per_iteration": 2.8046717643737793 + }, + { + "auxiliary_loss_clip": 0.01362389, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.24358988, + "balance_loss_mlp": 1.01767349, + "epoch": 0.5976251315196152, + "flos": 20928305663640.0, + "grad_norm": 2.116675633115188, + "language_loss": 0.68416846, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70812571, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.15661621, + "step": 9940, + "time_per_iteration": 2.749890089035034 + }, + { + "auxiliary_loss_clip": 0.01348414, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.23863506, + "balance_loss_mlp": 1.01941931, + "epoch": 0.5976852547722832, + "flos": 35851308165360.0, + "grad_norm": 1.2763782438101055, + "language_loss": 0.70237589, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72618228, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.12786865, + "step": 9941, + "time_per_iteration": 2.876840591430664 + }, + { + "auxiliary_loss_clip": 0.01352224, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.24052262, + "balance_loss_mlp": 1.02068055, + "epoch": 0.5977453780249512, + "flos": 12859319582640.0, + "grad_norm": 1.786500403689621, + "language_loss": 0.77887225, + "learning_rate": 1.470302626336386e-06, + "loss": 0.80274189, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14056396, + "step": 9942, + "time_per_iteration": 2.7257277965545654 + }, + { + "auxiliary_loss_clip": 0.01362615, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.24599886, + "balance_loss_mlp": 1.02496266, + "epoch": 0.5978055012776191, + "flos": 20964024039240.0, + "grad_norm": 2.1526242456066043, + "language_loss": 0.75844169, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78246087, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14343262, + "step": 9943, + "time_per_iteration": 2.7887074947357178 + }, + { + "auxiliary_loss_clip": 0.01346069, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.23497224, + "balance_loss_mlp": 1.01537538, + "epoch": 0.5978656245302871, + "flos": 34064641453680.0, + "grad_norm": 1.768623725564068, + "language_loss": 0.61660063, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64034224, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12713623, + "step": 9944, + "time_per_iteration": 2.8480918407440186 + }, + { + "auxiliary_loss_clip": 0.01352487, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.23974407, + "balance_loss_mlp": 1.02094722, + "epoch": 0.597925747782955, + "flos": 37379702750760.0, + "grad_norm": 1.6326482609896382, + "language_loss": 0.72775006, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.75162435, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14001465, + "step": 9945, + "time_per_iteration": 2.8927743434906006 + }, + { + "auxiliary_loss_clip": 0.01349601, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.23754323, + "balance_loss_mlp": 1.02106524, + "epoch": 0.5979858710356231, + "flos": 25380361923840.0, + "grad_norm": 1.97404357139064, + "language_loss": 0.67852497, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.70237243, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14080811, + "step": 9946, + "time_per_iteration": 2.775183916091919 + }, + { + "auxiliary_loss_clip": 0.01364728, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.24783611, + "balance_loss_mlp": 1.02220321, + "epoch": 0.598045994288291, + "flos": 13702324043880.0, + "grad_norm": 2.2112688947954626, + "language_loss": 0.89226115, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91627818, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.14782715, + "step": 9947, + "time_per_iteration": 2.766494035720825 + }, + { + "auxiliary_loss_clip": 0.01343498, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.23545837, + "balance_loss_mlp": 1.01843166, + "epoch": 0.598106117540959, + "flos": 21986432545680.0, + "grad_norm": 1.7339732514450785, + "language_loss": 0.71827996, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74202949, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13018799, + "step": 9948, + "time_per_iteration": 2.8315391540527344 + }, + { + "auxiliary_loss_clip": 0.01355663, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.2410574, + "balance_loss_mlp": 1.02151585, + "epoch": 0.5981662407936269, + "flos": 20564380919880.0, + "grad_norm": 1.7469097929837747, + "language_loss": 0.89606905, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91998839, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14752197, + "step": 9949, + "time_per_iteration": 2.748568534851074 + }, + { + "auxiliary_loss_clip": 0.01355765, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.24385595, + "balance_loss_mlp": 1.02116156, + "epoch": 0.5982263640462949, + "flos": 14067873122040.0, + "grad_norm": 1.7444866763495506, + "language_loss": 0.70801461, + "learning_rate": 1.467298838320673e-06, + "loss": 0.73191679, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13299561, + "step": 9950, + "time_per_iteration": 2.7726571559906006 + }, + { + "auxiliary_loss_clip": 0.01357576, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.24398255, + "balance_loss_mlp": 1.01981926, + "epoch": 0.5982864872989628, + "flos": 17711059570560.0, + "grad_norm": 1.538690108361813, + "language_loss": 0.78301424, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.8069244, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13616943, + "step": 9951, + "time_per_iteration": 2.7272377014160156 + }, + { + "auxiliary_loss_clip": 0.01354239, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_clip": 1.24000716, + "balance_loss_mlp": 1.02914071, + "epoch": 0.5983466105516309, + "flos": 16769305913040.0, + "grad_norm": 1.2901680208263382, + "language_loss": 0.7372824, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76127148, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.15533447, + "step": 9952, + "time_per_iteration": 2.793287754058838 + }, + { + "auxiliary_loss_clip": 0.01354345, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.24000156, + "balance_loss_mlp": 1.01595271, + "epoch": 0.5984067338042988, + "flos": 20046902675040.0, + "grad_norm": 2.2695637384773555, + "language_loss": 0.79189092, + "learning_rate": 1.466172750724613e-06, + "loss": 0.81574535, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.15148926, + "step": 9953, + "time_per_iteration": 2.7315943241119385 + }, + { + "auxiliary_loss_clip": 0.01348769, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.23743498, + "balance_loss_mlp": 1.02018142, + "epoch": 0.5984668570569668, + "flos": 26325120600000.0, + "grad_norm": 1.422403243991515, + "language_loss": 0.70010936, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.72393501, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.1362915, + "step": 9954, + "time_per_iteration": 2.791168451309204 + }, + { + "auxiliary_loss_clip": 0.01355059, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.2415272, + "balance_loss_mlp": 1.01544428, + "epoch": 0.5985269803096348, + "flos": 20598474961080.0, + "grad_norm": 2.5838465917685665, + "language_loss": 0.7334612, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75730371, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.1373291, + "step": 9955, + "time_per_iteration": 2.7355153560638428 + }, + { + "auxiliary_loss_clip": 0.01354292, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.24187338, + "balance_loss_mlp": 1.01896632, + "epoch": 0.5985871035623027, + "flos": 26870439198600.0, + "grad_norm": 1.4470969603750772, + "language_loss": 0.68581045, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70968479, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.1418457, + "step": 9956, + "time_per_iteration": 2.8055567741394043 + }, + { + "auxiliary_loss_clip": 0.01354663, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.24137497, + "balance_loss_mlp": 1.01773274, + "epoch": 0.5986472268149707, + "flos": 19614464982000.0, + "grad_norm": 2.166963699874265, + "language_loss": 0.73931944, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.76318818, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14477539, + "step": 9957, + "time_per_iteration": 2.755347490310669 + }, + { + "auxiliary_loss_clip": 0.01337814, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.23005152, + "balance_loss_mlp": 1.01745939, + "epoch": 0.5987073500676386, + "flos": 21799109870280.0, + "grad_norm": 1.9962745436329201, + "language_loss": 0.85161811, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.87530351, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13275146, + "step": 9958, + "time_per_iteration": 2.7471976280212402 + }, + { + "auxiliary_loss_clip": 0.01358097, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.24356651, + "balance_loss_mlp": 1.01814222, + "epoch": 0.5987674733203067, + "flos": 24319473673320.0, + "grad_norm": 1.7310241620261049, + "language_loss": 0.6680311, + "learning_rate": 1.463921122471864e-06, + "loss": 0.69193733, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14385986, + "step": 9959, + "time_per_iteration": 2.8808960914611816 + }, + { + "auxiliary_loss_clip": 0.01351518, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.23982406, + "balance_loss_mlp": 1.01613307, + "epoch": 0.5988275965729746, + "flos": 21324618897480.0, + "grad_norm": 1.6017961842872637, + "language_loss": 0.83763254, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.86143941, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13024902, + "step": 9960, + "time_per_iteration": 2.7363572120666504 + }, + { + "auxiliary_loss_clip": 0.01348617, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.2376647, + "balance_loss_mlp": 1.01585579, + "epoch": 0.5988877198256426, + "flos": 25123389265080.0, + "grad_norm": 1.5043049960126897, + "language_loss": 0.79680353, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.82058489, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13684082, + "step": 9961, + "time_per_iteration": 2.839287519454956 + }, + { + "auxiliary_loss_clip": 0.01350297, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.23880482, + "balance_loss_mlp": 1.01663184, + "epoch": 0.5989478430783105, + "flos": 26434306144800.0, + "grad_norm": 1.709117931048147, + "language_loss": 0.67154884, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69535154, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13354492, + "step": 9962, + "time_per_iteration": 2.7740771770477295 + }, + { + "auxiliary_loss_clip": 0.013496, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.23786426, + "balance_loss_mlp": 1.01652181, + "epoch": 0.5990079663309785, + "flos": 25784350137720.0, + "grad_norm": 1.324516093760274, + "language_loss": 0.74052119, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76432478, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14245605, + "step": 9963, + "time_per_iteration": 2.8251781463623047 + }, + { + "auxiliary_loss_clip": 0.01346309, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.23677754, + "balance_loss_mlp": 1.0136832, + "epoch": 0.5990680895836464, + "flos": 36838891680120.0, + "grad_norm": 1.5349784336320285, + "language_loss": 0.68196702, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70570225, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13531494, + "step": 9964, + "time_per_iteration": 4.382431268692017 + }, + { + "auxiliary_loss_clip": 0.01340025, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.23302531, + "balance_loss_mlp": 1.01607001, + "epoch": 0.5991282128363145, + "flos": 24138851377320.0, + "grad_norm": 2.2471149350928474, + "language_loss": 0.77076507, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79446286, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13684082, + "step": 9965, + "time_per_iteration": 2.811126470565796 + }, + { + "auxiliary_loss_clip": 0.01348748, + "auxiliary_loss_mlp": 0.0102737, + "balance_loss_clip": 1.23619378, + "balance_loss_mlp": 1.01438856, + "epoch": 0.5991883360889824, + "flos": 10307379456720.0, + "grad_norm": 1.8008322391648464, + "language_loss": 0.77260029, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79636145, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12982178, + "step": 9966, + "time_per_iteration": 4.088757753372192 + }, + { + "auxiliary_loss_clip": 0.01345492, + "auxiliary_loss_mlp": 0.01026298, + "balance_loss_clip": 1.23562789, + "balance_loss_mlp": 1.0127852, + "epoch": 0.5992484593416504, + "flos": 23956482921840.0, + "grad_norm": 1.4660316580432695, + "language_loss": 0.73709524, + "learning_rate": 1.460920090376422e-06, + "loss": 0.76081312, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1350708, + "step": 9967, + "time_per_iteration": 4.189153671264648 + }, + { + "auxiliary_loss_clip": 0.01363347, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.24604535, + "balance_loss_mlp": 1.02149856, + "epoch": 0.5993085825943184, + "flos": 11947396088520.0, + "grad_norm": 2.0836367514190908, + "language_loss": 0.6853931, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70939046, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.14886475, + "step": 9968, + "time_per_iteration": 2.711808204650879 + }, + { + "auxiliary_loss_clip": 0.01355823, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.24261928, + "balance_loss_mlp": 1.02008331, + "epoch": 0.5993687058469863, + "flos": 19031641239960.0, + "grad_norm": 1.9499475914020847, + "language_loss": 0.7992425, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.82314116, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13964844, + "step": 9969, + "time_per_iteration": 2.7756378650665283 + }, + { + "auxiliary_loss_clip": 0.01347291, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.23485065, + "balance_loss_mlp": 1.01488543, + "epoch": 0.5994288290996543, + "flos": 14287056378840.0, + "grad_norm": 1.536635211227129, + "language_loss": 0.81200004, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83576202, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14044189, + "step": 9970, + "time_per_iteration": 2.7802138328552246 + }, + { + "auxiliary_loss_clip": 0.01351349, + "auxiliary_loss_mlp": 0.01036627, + "balance_loss_clip": 1.23732495, + "balance_loss_mlp": 1.02070677, + "epoch": 0.5994889523523222, + "flos": 19211126501880.0, + "grad_norm": 2.5347472387370473, + "language_loss": 0.62360501, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64748478, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.15930176, + "step": 9971, + "time_per_iteration": 2.7199230194091797 + }, + { + "auxiliary_loss_clip": 0.01339412, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.23169851, + "balance_loss_mlp": 1.01535177, + "epoch": 0.5995490756049903, + "flos": 28042787062080.0, + "grad_norm": 1.6510271872495546, + "language_loss": 0.79519534, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81887376, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.1307373, + "step": 9972, + "time_per_iteration": 4.284715175628662 + }, + { + "auxiliary_loss_clip": 0.01363643, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.24535608, + "balance_loss_mlp": 1.02577376, + "epoch": 0.5996091988576582, + "flos": 29058089105520.0, + "grad_norm": 1.989396489658073, + "language_loss": 0.76434952, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78838861, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14489746, + "step": 9973, + "time_per_iteration": 2.79805064201355 + }, + { + "auxiliary_loss_clip": 0.01349813, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.23768604, + "balance_loss_mlp": 1.01707602, + "epoch": 0.5996693221103262, + "flos": 20818917077040.0, + "grad_norm": 6.553128948645843, + "language_loss": 0.65983552, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.68365031, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14587402, + "step": 9974, + "time_per_iteration": 2.7306368350982666 + }, + { + "auxiliary_loss_clip": 0.01348527, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.23700702, + "balance_loss_mlp": 1.0170939, + "epoch": 0.5997294453629941, + "flos": 23774195683080.0, + "grad_norm": 1.3990601249575967, + "language_loss": 0.74496204, + "learning_rate": 1.457920366566428e-06, + "loss": 0.768749, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.1307373, + "step": 9975, + "time_per_iteration": 2.8537774085998535 + }, + { + "auxiliary_loss_clip": 0.01347579, + "auxiliary_loss_mlp": 0.01027285, + "balance_loss_clip": 1.23607123, + "balance_loss_mlp": 1.01278973, + "epoch": 0.5997895686156621, + "flos": 20964998639880.0, + "grad_norm": 2.310728357384463, + "language_loss": 0.77604455, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79979318, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14501953, + "step": 9976, + "time_per_iteration": 2.765552043914795 + }, + { + "auxiliary_loss_clip": 0.01346729, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.23562384, + "balance_loss_mlp": 1.01636839, + "epoch": 0.59984969186833, + "flos": 28371237080400.0, + "grad_norm": 2.371199831115175, + "language_loss": 0.74883771, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.7726106, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14172363, + "step": 9977, + "time_per_iteration": 2.827488660812378 + }, + { + "auxiliary_loss_clip": 0.01350285, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.23782349, + "balance_loss_mlp": 1.01739883, + "epoch": 0.5999098151209981, + "flos": 22571164880640.0, + "grad_norm": 1.643057503650222, + "language_loss": 0.69084907, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.7146616, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13555908, + "step": 9978, + "time_per_iteration": 2.7847824096679688 + }, + { + "auxiliary_loss_clip": 0.01358652, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.24497676, + "balance_loss_mlp": 1.0189724, + "epoch": 0.599969938373666, + "flos": 18773125463520.0, + "grad_norm": 2.2735218392179726, + "language_loss": 0.8195982, + "learning_rate": 1.456420997543594e-06, + "loss": 0.84352767, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.15307617, + "step": 9979, + "time_per_iteration": 2.799598455429077 + }, + { + "auxiliary_loss_clip": 0.013356, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.22845244, + "balance_loss_mlp": 1.02069855, + "epoch": 0.600030061626334, + "flos": 11330884388880.0, + "grad_norm": 1.7258565096110543, + "language_loss": 0.69763529, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72133279, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.13452148, + "step": 9980, + "time_per_iteration": 2.7586817741394043 + }, + { + "auxiliary_loss_clip": 0.01356732, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.2415657, + "balance_loss_mlp": 1.01740539, + "epoch": 0.600090184879002, + "flos": 16582673579760.0, + "grad_norm": 2.4036414241239608, + "language_loss": 0.69248343, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.71637499, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.15020752, + "step": 9981, + "time_per_iteration": 2.7398622035980225 + }, + { + "auxiliary_loss_clip": 0.01348003, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.23756492, + "balance_loss_mlp": 1.0196178, + "epoch": 0.6001503081316699, + "flos": 23623322333760.0, + "grad_norm": 1.9108334230426725, + "language_loss": 0.7867952, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.81059879, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12744141, + "step": 9982, + "time_per_iteration": 2.798017740249634 + }, + { + "auxiliary_loss_clip": 0.01342688, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.23293638, + "balance_loss_mlp": 1.01949739, + "epoch": 0.6002104313843379, + "flos": 20672551255680.0, + "grad_norm": 1.89305708145999, + "language_loss": 0.73169875, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.75546074, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.14007568, + "step": 9983, + "time_per_iteration": 2.788996934890747 + }, + { + "auxiliary_loss_clip": 0.01350625, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.23870909, + "balance_loss_mlp": 1.02100468, + "epoch": 0.6002705546370058, + "flos": 22460314393080.0, + "grad_norm": 2.5700070898489815, + "language_loss": 0.78950214, + "learning_rate": 1.454547250154447e-06, + "loss": 0.81335276, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13421631, + "step": 9984, + "time_per_iteration": 2.795560359954834 + }, + { + "auxiliary_loss_clip": 0.01348394, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.23673749, + "balance_loss_mlp": 1.01756179, + "epoch": 0.6003306778896739, + "flos": 25197465559680.0, + "grad_norm": 1.5596279408529123, + "language_loss": 0.82703823, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85083091, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13311768, + "step": 9985, + "time_per_iteration": 3.0085952281951904 + }, + { + "auxiliary_loss_clip": 0.01346449, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.23528898, + "balance_loss_mlp": 1.02051473, + "epoch": 0.6003908011423418, + "flos": 26693390438280.0, + "grad_norm": 1.5792578076649033, + "language_loss": 0.7161184, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73992091, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 1.11181641, + "router_z_loss_mlp": 0.13287354, + "step": 9986, + "time_per_iteration": 2.955381155014038 + }, + { + "auxiliary_loss_clip": 0.01348041, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.23591995, + "balance_loss_mlp": 1.01531994, + "epoch": 0.6004509243950098, + "flos": 22570230888360.0, + "grad_norm": 1.460373632371263, + "language_loss": 0.7196331, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.74341106, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14422607, + "step": 9987, + "time_per_iteration": 2.835319757461548 + }, + { + "auxiliary_loss_clip": 0.01344037, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.23352027, + "balance_loss_mlp": 1.02353013, + "epoch": 0.6005110476476777, + "flos": 19724097218760.0, + "grad_norm": 1.616077136993336, + "language_loss": 0.85197043, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.87578011, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13372803, + "step": 9988, + "time_per_iteration": 2.888162851333618 + }, + { + "auxiliary_loss_clip": 0.01347593, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.236166, + "balance_loss_mlp": 1.02328038, + "epoch": 0.6005711709003457, + "flos": 17717069607840.0, + "grad_norm": 2.596693307558426, + "language_loss": 0.65926713, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.68311167, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 1.11572266, + "router_z_loss_mlp": 0.13586426, + "step": 9989, + "time_per_iteration": 2.8106420040130615 + }, + { + "auxiliary_loss_clip": 0.01348155, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.23705745, + "balance_loss_mlp": 1.02071166, + "epoch": 0.6006312941530136, + "flos": 18518995389960.0, + "grad_norm": 1.4456078243518344, + "language_loss": 0.80633694, + "learning_rate": 1.452299436003257e-06, + "loss": 0.83015841, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13287354, + "step": 9990, + "time_per_iteration": 2.7852466106414795 + }, + { + "auxiliary_loss_clip": 0.0135498, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_clip": 1.24062622, + "balance_loss_mlp": 1.02738011, + "epoch": 0.6006914174056817, + "flos": 21394959222960.0, + "grad_norm": 2.031093388696831, + "language_loss": 0.82495481, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84891987, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.14154053, + "step": 9991, + "time_per_iteration": 2.82442307472229 + }, + { + "auxiliary_loss_clip": 0.01342988, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.23400664, + "balance_loss_mlp": 1.02112126, + "epoch": 0.6007515406583496, + "flos": 12754397915640.0, + "grad_norm": 1.7012704050971807, + "language_loss": 0.82969606, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.85347265, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13549805, + "step": 9992, + "time_per_iteration": 2.798535108566284 + }, + { + "auxiliary_loss_clip": 0.01348415, + "auxiliary_loss_mlp": 0.01040277, + "balance_loss_clip": 1.23796952, + "balance_loss_mlp": 1.02535176, + "epoch": 0.6008116639110176, + "flos": 19211410760400.0, + "grad_norm": 1.9901193985895926, + "language_loss": 0.66547906, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.68936598, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14923096, + "step": 9993, + "time_per_iteration": 2.914584159851074 + }, + { + "auxiliary_loss_clip": 0.01348542, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.23639989, + "balance_loss_mlp": 1.02277255, + "epoch": 0.6008717871636855, + "flos": 17059722879240.0, + "grad_norm": 2.486377792049452, + "language_loss": 0.81195998, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83581311, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13995361, + "step": 9994, + "time_per_iteration": 2.846543788909912 + }, + { + "auxiliary_loss_clip": 0.01335835, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.22944474, + "balance_loss_mlp": 1.01567197, + "epoch": 0.6009319104163535, + "flos": 20302981949880.0, + "grad_norm": 1.8013542888334693, + "language_loss": 0.72647041, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.75010478, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11932373, + "step": 9995, + "time_per_iteration": 2.837574005126953 + }, + { + "auxiliary_loss_clip": 0.0135006, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.23766601, + "balance_loss_mlp": 1.01973891, + "epoch": 0.6009920336690215, + "flos": 21842015925600.0, + "grad_norm": 2.291954381074853, + "language_loss": 0.81129515, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83512527, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13214111, + "step": 9996, + "time_per_iteration": 2.8581960201263428 + }, + { + "auxiliary_loss_clip": 0.01347697, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.23673081, + "balance_loss_mlp": 1.02365255, + "epoch": 0.6010521569216895, + "flos": 22600548352080.0, + "grad_norm": 2.0822052037364407, + "language_loss": 0.78600693, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80985963, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13909912, + "step": 9997, + "time_per_iteration": 2.867875337600708 + }, + { + "auxiliary_loss_clip": 0.01354114, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.23968554, + "balance_loss_mlp": 1.02198863, + "epoch": 0.6011122801743575, + "flos": 19176382726920.0, + "grad_norm": 1.6652210393838731, + "language_loss": 0.73146117, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75536048, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13842773, + "step": 9998, + "time_per_iteration": 3.015138626098633 + }, + { + "auxiliary_loss_clip": 0.01345577, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.23545623, + "balance_loss_mlp": 1.02189374, + "epoch": 0.6011724034270254, + "flos": 25015543796160.0, + "grad_norm": 1.488369866974953, + "language_loss": 0.72430599, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74811715, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13659668, + "step": 9999, + "time_per_iteration": 2.8911521434783936 + }, + { + "auxiliary_loss_clip": 0.01354569, + "auxiliary_loss_mlp": 0.01038165, + "balance_loss_clip": 1.2390759, + "balance_loss_mlp": 1.02371669, + "epoch": 0.6012325266796934, + "flos": 21802561580880.0, + "grad_norm": 1.565736892470484, + "language_loss": 0.78294933, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.8068766, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14465332, + "step": 10000, + "time_per_iteration": 2.9031245708465576 + }, + { + "auxiliary_loss_clip": 0.01359196, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.24343288, + "balance_loss_mlp": 1.02132678, + "epoch": 0.6012926499323613, + "flos": 19577731397400.0, + "grad_norm": 2.0644835149013887, + "language_loss": 0.77977979, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.80372995, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14501953, + "step": 10001, + "time_per_iteration": 2.87479567527771 + }, + { + "auxiliary_loss_clip": 0.01357722, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.24293423, + "balance_loss_mlp": 1.01968503, + "epoch": 0.6013527731850293, + "flos": 34867541836440.0, + "grad_norm": 1.7477089422490513, + "language_loss": 0.58976817, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.61368418, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14202881, + "step": 10002, + "time_per_iteration": 5.190438985824585 + }, + { + "auxiliary_loss_clip": 0.01359808, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.24532783, + "balance_loss_mlp": 1.01896191, + "epoch": 0.6014128964376972, + "flos": 23296821516720.0, + "grad_norm": 1.9237711057504017, + "language_loss": 0.78188133, + "learning_rate": 1.447431741055314e-06, + "loss": 0.80581272, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.14379883, + "step": 10003, + "time_per_iteration": 4.307927846908569 + }, + { + "auxiliary_loss_clip": 0.01359893, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.24617028, + "balance_loss_mlp": 1.02454031, + "epoch": 0.6014730196903653, + "flos": 24825540969000.0, + "grad_norm": 2.3819241291142044, + "language_loss": 0.78206682, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.80605638, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14526367, + "step": 10004, + "time_per_iteration": 2.8355698585510254 + }, + { + "auxiliary_loss_clip": 0.01355074, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.24172258, + "balance_loss_mlp": 1.02330172, + "epoch": 0.6015331429430332, + "flos": 23117579904960.0, + "grad_norm": 1.6298852138095334, + "language_loss": 0.7245912, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74851048, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13574219, + "step": 10005, + "time_per_iteration": 2.907972812652588 + }, + { + "auxiliary_loss_clip": 0.01346095, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.23819602, + "balance_loss_mlp": 1.02400327, + "epoch": 0.6015932661957012, + "flos": 19205157072960.0, + "grad_norm": 2.1257122369195747, + "language_loss": 0.74953002, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77336335, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13232422, + "step": 10006, + "time_per_iteration": 4.234566926956177 + }, + { + "auxiliary_loss_clip": 0.01357841, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.24432063, + "balance_loss_mlp": 1.02298808, + "epoch": 0.6016533894483691, + "flos": 18118174628160.0, + "grad_norm": 1.8415705725140723, + "language_loss": 0.74105954, + "learning_rate": 1.445934699732685e-06, + "loss": 0.76500648, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13879395, + "step": 10007, + "time_per_iteration": 2.804908037185669 + }, + { + "auxiliary_loss_clip": 0.01350284, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.23950005, + "balance_loss_mlp": 1.02023292, + "epoch": 0.6017135127010371, + "flos": 16221266554320.0, + "grad_norm": 2.1110057428977345, + "language_loss": 0.70546842, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.7293005, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12689209, + "step": 10008, + "time_per_iteration": 2.9713237285614014 + }, + { + "auxiliary_loss_clip": 0.01358039, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.24673939, + "balance_loss_mlp": 1.02139139, + "epoch": 0.6017736359537051, + "flos": 23449968934200.0, + "grad_norm": 1.6457008739145185, + "language_loss": 0.76360869, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78753424, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13146973, + "step": 10009, + "time_per_iteration": 3.030144691467285 + }, + { + "auxiliary_loss_clip": 0.01352826, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.24143243, + "balance_loss_mlp": 1.02358401, + "epoch": 0.601833759206373, + "flos": 23519375267400.0, + "grad_norm": 2.10026279239764, + "language_loss": 0.74520719, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76910841, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13720703, + "step": 10010, + "time_per_iteration": 2.8656723499298096 + }, + { + "auxiliary_loss_clip": 0.01179069, + "auxiliary_loss_mlp": 0.01001457, + "balance_loss_clip": 1.13092053, + "balance_loss_mlp": 0.99842864, + "epoch": 0.6018938824590411, + "flos": 64008982923840.0, + "grad_norm": 0.8050738730271253, + "language_loss": 0.55074388, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57254916, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.03027344, + "step": 10011, + "time_per_iteration": 4.8751795291900635 + }, + { + "auxiliary_loss_clip": 0.0135469, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.24224484, + "balance_loss_mlp": 1.02484179, + "epoch": 0.601954005711709, + "flos": 34646490595080.0, + "grad_norm": 1.355412388766423, + "language_loss": 0.62001491, + "learning_rate": 1.44406387091556e-06, + "loss": 0.6439361, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.1260376, + "step": 10012, + "time_per_iteration": 2.936920404434204 + }, + { + "auxiliary_loss_clip": 0.01352667, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.24241054, + "balance_loss_mlp": 1.01818502, + "epoch": 0.602014128964377, + "flos": 19431771659640.0, + "grad_norm": 1.5749540770669765, + "language_loss": 0.75053471, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77436835, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12518311, + "step": 10013, + "time_per_iteration": 2.8524270057678223 + }, + { + "auxiliary_loss_clip": 0.01348243, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.24077928, + "balance_loss_mlp": 1.0264169, + "epoch": 0.6020742522170449, + "flos": 28335234446280.0, + "grad_norm": 2.59200029902443, + "language_loss": 0.81504035, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83891118, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12414551, + "step": 10014, + "time_per_iteration": 2.890270948410034 + }, + { + "auxiliary_loss_clip": 0.0134351, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.23692751, + "balance_loss_mlp": 1.02087831, + "epoch": 0.6021343754697129, + "flos": 22752152651880.0, + "grad_norm": 1.3790659587059557, + "language_loss": 0.72328919, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74706137, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12841797, + "step": 10015, + "time_per_iteration": 2.858846664428711 + }, + { + "auxiliary_loss_clip": 0.01178627, + "auxiliary_loss_mlp": 0.01005093, + "balance_loss_clip": 1.13154626, + "balance_loss_mlp": 1.00204134, + "epoch": 0.6021944987223808, + "flos": 65765964924360.0, + "grad_norm": 0.8450983379977922, + "language_loss": 0.54835695, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.57019418, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.03051758, + "step": 10016, + "time_per_iteration": 3.2642929553985596 + }, + { + "auxiliary_loss_clip": 0.01351263, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.24115479, + "balance_loss_mlp": 1.01517653, + "epoch": 0.6022546219750489, + "flos": 16109238424320.0, + "grad_norm": 1.4986402373551218, + "language_loss": 0.82975066, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85354853, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13354492, + "step": 10017, + "time_per_iteration": 2.8004374504089355 + }, + { + "auxiliary_loss_clip": 0.01346932, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.23954439, + "balance_loss_mlp": 1.0202564, + "epoch": 0.6023147452277168, + "flos": 25517062955520.0, + "grad_norm": 1.8438066481746855, + "language_loss": 0.83982587, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.86362565, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12805176, + "step": 10018, + "time_per_iteration": 2.9110758304595947 + }, + { + "auxiliary_loss_clip": 0.0136498, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.2490077, + "balance_loss_mlp": 1.02222562, + "epoch": 0.6023748684803848, + "flos": 22640977297440.0, + "grad_norm": 1.4825932044894616, + "language_loss": 0.78123051, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80524755, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.1449585, + "step": 10019, + "time_per_iteration": 3.021836996078491 + }, + { + "auxiliary_loss_clip": 0.01352368, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.24063993, + "balance_loss_mlp": 1.01848972, + "epoch": 0.6024349917330527, + "flos": 26215650796680.0, + "grad_norm": 1.402436972704574, + "language_loss": 0.7414391, + "learning_rate": 1.441071641765681e-06, + "loss": 0.76527953, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13189697, + "step": 10020, + "time_per_iteration": 2.9269442558288574 + }, + { + "auxiliary_loss_clip": 0.01354833, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.24338579, + "balance_loss_mlp": 1.01963782, + "epoch": 0.6024951149857207, + "flos": 21256755681960.0, + "grad_norm": 3.8803028171956373, + "language_loss": 0.640522, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66440207, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13537598, + "step": 10021, + "time_per_iteration": 2.8639938831329346 + }, + { + "auxiliary_loss_clip": 0.01358346, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.24694121, + "balance_loss_mlp": 1.01676822, + "epoch": 0.6025552382383887, + "flos": 26949185454600.0, + "grad_norm": 1.4075564103667484, + "language_loss": 0.80658287, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.83046633, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13250732, + "step": 10022, + "time_per_iteration": 2.865009307861328 + }, + { + "auxiliary_loss_clip": 0.01365899, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.25030565, + "balance_loss_mlp": 1.01941407, + "epoch": 0.6026153614910567, + "flos": 31690440430200.0, + "grad_norm": 1.472424893699132, + "language_loss": 0.66828859, + "learning_rate": 1.439949905155693e-06, + "loss": 0.69227159, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.12982178, + "step": 10023, + "time_per_iteration": 2.941295862197876 + }, + { + "auxiliary_loss_clip": 0.01357926, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.24442339, + "balance_loss_mlp": 1.01865125, + "epoch": 0.6026754847437247, + "flos": 29318472866520.0, + "grad_norm": 2.185396064951229, + "language_loss": 0.74322122, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76712334, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13635254, + "step": 10024, + "time_per_iteration": 2.844611167907715 + }, + { + "auxiliary_loss_clip": 0.01352163, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.24122286, + "balance_loss_mlp": 1.02284634, + "epoch": 0.6027356079963926, + "flos": 23592070877760.0, + "grad_norm": 1.6195801728944992, + "language_loss": 0.7296139, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75349164, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12780762, + "step": 10025, + "time_per_iteration": 2.8868603706359863 + }, + { + "auxiliary_loss_clip": 0.01367947, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.25043142, + "balance_loss_mlp": 1.02398252, + "epoch": 0.6027957312490606, + "flos": 20818673426880.0, + "grad_norm": 2.286435889868336, + "language_loss": 0.67539769, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69946134, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.14447021, + "step": 10026, + "time_per_iteration": 2.9615182876586914 + }, + { + "auxiliary_loss_clip": 0.01344884, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.23684478, + "balance_loss_mlp": 1.01960707, + "epoch": 0.6028558545017285, + "flos": 19940275456920.0, + "grad_norm": 2.5788704886281044, + "language_loss": 0.80246025, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82623255, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12738037, + "step": 10027, + "time_per_iteration": 2.827172040939331 + }, + { + "auxiliary_loss_clip": 0.0136425, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.24952865, + "balance_loss_mlp": 1.02368021, + "epoch": 0.6029159777543965, + "flos": 22826066513040.0, + "grad_norm": 2.00480731736119, + "language_loss": 0.71383095, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73785174, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14141846, + "step": 10028, + "time_per_iteration": 2.8750691413879395 + }, + { + "auxiliary_loss_clip": 0.01359104, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_clip": 1.2453444, + "balance_loss_mlp": 1.02774727, + "epoch": 0.6029761010070644, + "flos": 23592598786440.0, + "grad_norm": 1.590128616679744, + "language_loss": 0.84446985, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86848378, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14538574, + "step": 10029, + "time_per_iteration": 2.8905324935913086 + }, + { + "auxiliary_loss_clip": 0.01348981, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.23892641, + "balance_loss_mlp": 1.0248425, + "epoch": 0.6030362242597325, + "flos": 13666402626480.0, + "grad_norm": 1.6945022509472305, + "language_loss": 0.79826552, + "learning_rate": 1.437333263694373e-06, + "loss": 0.82213414, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13049316, + "step": 10030, + "time_per_iteration": 2.9247944355010986 + }, + { + "auxiliary_loss_clip": 0.01359472, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.24591255, + "balance_loss_mlp": 1.01941013, + "epoch": 0.6030963475124004, + "flos": 24427887659280.0, + "grad_norm": 1.6237569792404303, + "language_loss": 0.7174772, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.74140418, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13800049, + "step": 10031, + "time_per_iteration": 3.0792253017425537 + }, + { + "auxiliary_loss_clip": 0.01367315, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.25099277, + "balance_loss_mlp": 1.01970637, + "epoch": 0.6031564707650684, + "flos": 29650212162000.0, + "grad_norm": 1.5197608679923302, + "language_loss": 0.73637521, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.76038742, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14208984, + "step": 10032, + "time_per_iteration": 2.949895143508911 + }, + { + "auxiliary_loss_clip": 0.01361688, + "auxiliary_loss_mlp": 0.01032664, + "balance_loss_clip": 1.24665141, + "balance_loss_mlp": 1.01761389, + "epoch": 0.6032165940177363, + "flos": 16623264958560.0, + "grad_norm": 1.703968535989996, + "language_loss": 0.68689382, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.71083736, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.1505127, + "step": 10033, + "time_per_iteration": 2.892164945602417 + }, + { + "auxiliary_loss_clip": 0.01353309, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.2424109, + "balance_loss_mlp": 1.01869464, + "epoch": 0.6032767172704043, + "flos": 17491957530480.0, + "grad_norm": 1.8521032256372894, + "language_loss": 0.76237881, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78624094, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14215088, + "step": 10034, + "time_per_iteration": 2.9288289546966553 + }, + { + "auxiliary_loss_clip": 0.01358778, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.24465728, + "balance_loss_mlp": 1.01824403, + "epoch": 0.6033368405230723, + "flos": 26838538008840.0, + "grad_norm": 1.9853341151963286, + "language_loss": 0.74567461, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76958859, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14361572, + "step": 10035, + "time_per_iteration": 2.982273578643799 + }, + { + "auxiliary_loss_clip": 0.01349735, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.24027824, + "balance_loss_mlp": 1.01677084, + "epoch": 0.6033969637757403, + "flos": 16914331658520.0, + "grad_norm": 1.6275005376987255, + "language_loss": 0.86917078, + "learning_rate": 1.435091260090536e-06, + "loss": 0.8929652, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.1293335, + "step": 10036, + "time_per_iteration": 2.826845645904541 + }, + { + "auxiliary_loss_clip": 0.01360224, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.24541569, + "balance_loss_mlp": 1.02260852, + "epoch": 0.6034570870284083, + "flos": 22935292666200.0, + "grad_norm": 1.9005506311827438, + "language_loss": 0.70244116, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72641152, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14215088, + "step": 10037, + "time_per_iteration": 2.8912158012390137 + }, + { + "auxiliary_loss_clip": 0.01356495, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.24601746, + "balance_loss_mlp": 1.01291883, + "epoch": 0.6035172102810762, + "flos": 23371385111640.0, + "grad_norm": 1.7916943404592744, + "language_loss": 0.85128343, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.8751145, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13690186, + "step": 10038, + "time_per_iteration": 2.9214491844177246 + }, + { + "auxiliary_loss_clip": 0.0135832, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.24267673, + "balance_loss_mlp": 1.01991487, + "epoch": 0.6035773335337442, + "flos": 20892059379360.0, + "grad_norm": 2.3643967429956434, + "language_loss": 0.76900148, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.79292774, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14398193, + "step": 10039, + "time_per_iteration": 2.857719898223877 + }, + { + "auxiliary_loss_clip": 0.013498, + "auxiliary_loss_mlp": 0.01027315, + "balance_loss_clip": 1.23921466, + "balance_loss_mlp": 1.01428556, + "epoch": 0.6036374567864121, + "flos": 24941711151720.0, + "grad_norm": 1.6392188801030143, + "language_loss": 0.71147013, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73524129, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13031006, + "step": 10040, + "time_per_iteration": 2.9589109420776367 + }, + { + "auxiliary_loss_clip": 0.01362764, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.24733746, + "balance_loss_mlp": 1.01641011, + "epoch": 0.6036975800390801, + "flos": 23153420105640.0, + "grad_norm": 4.73920282787662, + "language_loss": 0.78421068, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80815572, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.15332031, + "step": 10041, + "time_per_iteration": 4.415592670440674 + }, + { + "auxiliary_loss_clip": 0.01353232, + "auxiliary_loss_mlp": 0.01028198, + "balance_loss_clip": 1.24144542, + "balance_loss_mlp": 1.0140779, + "epoch": 0.603757703291748, + "flos": 18665320602960.0, + "grad_norm": 1.6917440521329812, + "language_loss": 0.76061296, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.78442723, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14135742, + "step": 10042, + "time_per_iteration": 2.8717949390411377 + }, + { + "auxiliary_loss_clip": 0.01348799, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.23744345, + "balance_loss_mlp": 1.01648355, + "epoch": 0.6038178265444161, + "flos": 19687363634160.0, + "grad_norm": 1.7951829366676537, + "language_loss": 0.85058612, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.87437063, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 1.11376953, + "router_z_loss_mlp": 0.1317749, + "step": 10043, + "time_per_iteration": 4.313758134841919 + }, + { + "auxiliary_loss_clip": 0.01366253, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.24977994, + "balance_loss_mlp": 1.02052045, + "epoch": 0.603877949797084, + "flos": 22643413799040.0, + "grad_norm": 1.7632182409747097, + "language_loss": 0.69520319, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71922016, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14916992, + "step": 10044, + "time_per_iteration": 2.9921860694885254 + }, + { + "auxiliary_loss_clip": 0.01360228, + "auxiliary_loss_mlp": 0.01030018, + "balance_loss_clip": 1.24630404, + "balance_loss_mlp": 1.01470006, + "epoch": 0.603938073049752, + "flos": 25453463617800.0, + "grad_norm": 1.6251768623324054, + "language_loss": 0.78244638, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80634886, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.15332031, + "step": 10045, + "time_per_iteration": 2.8745100498199463 + }, + { + "auxiliary_loss_clip": 0.01350881, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.23985696, + "balance_loss_mlp": 1.01850271, + "epoch": 0.6039981963024199, + "flos": 22344469077240.0, + "grad_norm": 1.6374935534828776, + "language_loss": 0.77033818, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79417145, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13964844, + "step": 10046, + "time_per_iteration": 2.9290390014648438 + }, + { + "auxiliary_loss_clip": 0.01354699, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.24122906, + "balance_loss_mlp": 1.01445878, + "epoch": 0.6040583195550879, + "flos": 20707904156040.0, + "grad_norm": 1.596701667575185, + "language_loss": 0.87595093, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89977348, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13104248, + "step": 10047, + "time_per_iteration": 2.8909382820129395 + }, + { + "auxiliary_loss_clip": 0.01346488, + "auxiliary_loss_mlp": 0.01027275, + "balance_loss_clip": 1.23756039, + "balance_loss_mlp": 1.0141325, + "epoch": 0.604118442807756, + "flos": 27168977836800.0, + "grad_norm": 1.461745226430548, + "language_loss": 0.75445282, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77819049, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.13128662, + "step": 10048, + "time_per_iteration": 2.963470458984375 + }, + { + "auxiliary_loss_clip": 0.01372321, + "auxiliary_loss_mlp": 0.0104139, + "balance_loss_clip": 1.25289702, + "balance_loss_mlp": 1.02458191, + "epoch": 0.6041785660604239, + "flos": 30887458830720.0, + "grad_norm": 1.8201263702757282, + "language_loss": 0.66044617, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68458325, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.16796875, + "step": 10049, + "time_per_iteration": 2.990816116333008 + }, + { + "auxiliary_loss_clip": 0.01352801, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.24154294, + "balance_loss_mlp": 1.01654983, + "epoch": 0.6042386893130919, + "flos": 19943036825400.0, + "grad_norm": 1.4568067400928943, + "language_loss": 0.67042851, + "learning_rate": 1.429862922631336e-06, + "loss": 0.69426048, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13861084, + "step": 10050, + "time_per_iteration": 4.399128437042236 + }, + { + "auxiliary_loss_clip": 0.01355829, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.24254346, + "balance_loss_mlp": 1.02248716, + "epoch": 0.6042988125657598, + "flos": 32422310145360.0, + "grad_norm": 1.6953246544299767, + "language_loss": 0.70342135, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.72734833, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14379883, + "step": 10051, + "time_per_iteration": 3.0380287170410156 + }, + { + "auxiliary_loss_clip": 0.0134808, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.23689425, + "balance_loss_mlp": 1.0159508, + "epoch": 0.6043589358184278, + "flos": 17425312565760.0, + "grad_norm": 1.8694811596466547, + "language_loss": 0.64821613, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.67198533, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12890625, + "step": 10052, + "time_per_iteration": 2.8878610134124756 + }, + { + "auxiliary_loss_clip": 0.01356504, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.24383712, + "balance_loss_mlp": 1.01900351, + "epoch": 0.6044190590710957, + "flos": 27678618668160.0, + "grad_norm": 1.6159850317967541, + "language_loss": 0.69167161, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71556985, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.14318848, + "step": 10053, + "time_per_iteration": 2.991589307785034 + }, + { + "auxiliary_loss_clip": 0.0118012, + "auxiliary_loss_mlp": 0.01004091, + "balance_loss_clip": 1.1284368, + "balance_loss_mlp": 0.99996597, + "epoch": 0.6044791823237637, + "flos": 65330440995960.0, + "grad_norm": 0.7506045346447694, + "language_loss": 0.60493886, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62678093, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.04125977, + "step": 10054, + "time_per_iteration": 3.4487826824188232 + }, + { + "auxiliary_loss_clip": 0.01344408, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.23441529, + "balance_loss_mlp": 1.01809335, + "epoch": 0.6045393055764317, + "flos": 24496928517240.0, + "grad_norm": 1.5709236539228901, + "language_loss": 0.85754323, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.88130891, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14074707, + "step": 10055, + "time_per_iteration": 2.9957733154296875 + }, + { + "auxiliary_loss_clip": 0.01356454, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.24408889, + "balance_loss_mlp": 1.02270985, + "epoch": 0.6045994288290997, + "flos": 19057979084400.0, + "grad_norm": 2.3331541005592435, + "language_loss": 0.73420978, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75814414, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14257812, + "step": 10056, + "time_per_iteration": 2.816537857055664 + }, + { + "auxiliary_loss_clip": 0.0134616, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.23771667, + "balance_loss_mlp": 1.01984835, + "epoch": 0.6046595520817676, + "flos": 26582133867120.0, + "grad_norm": 1.6124772604289432, + "language_loss": 0.80311525, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82690376, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12841797, + "step": 10057, + "time_per_iteration": 2.9217593669891357 + }, + { + "auxiliary_loss_clip": 0.01349027, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.23973179, + "balance_loss_mlp": 1.01810443, + "epoch": 0.6047196753344356, + "flos": 13585301085600.0, + "grad_norm": 7.848454851880992, + "language_loss": 0.76468086, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.78849822, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.14593506, + "step": 10058, + "time_per_iteration": 2.7131311893463135 + }, + { + "auxiliary_loss_clip": 0.01347257, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.23744917, + "balance_loss_mlp": 1.01614535, + "epoch": 0.6047797985871035, + "flos": 25525996794720.0, + "grad_norm": 1.7091230465109586, + "language_loss": 0.71294051, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73670995, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13555908, + "step": 10059, + "time_per_iteration": 2.876521587371826 + }, + { + "auxiliary_loss_clip": 0.01355082, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.2415601, + "balance_loss_mlp": 1.02086735, + "epoch": 0.6048399218397715, + "flos": 20524804750080.0, + "grad_norm": 1.466550963489376, + "language_loss": 0.76576531, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78966808, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.14324951, + "step": 10060, + "time_per_iteration": 2.8285439014434814 + }, + { + "auxiliary_loss_clip": 0.01353472, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.24319506, + "balance_loss_mlp": 1.02015471, + "epoch": 0.6049000450924396, + "flos": 20413182703680.0, + "grad_norm": 2.108963346266568, + "language_loss": 0.73888296, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.7627548, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13574219, + "step": 10061, + "time_per_iteration": 2.8166615962982178 + }, + { + "auxiliary_loss_clip": 0.01355058, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.24173462, + "balance_loss_mlp": 1.01925874, + "epoch": 0.6049601683451075, + "flos": 20746343291760.0, + "grad_norm": 1.8556146849114885, + "language_loss": 0.67621136, + "learning_rate": 1.425384861715639e-06, + "loss": 0.70008016, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.12561035, + "step": 10062, + "time_per_iteration": 2.7916486263275146 + }, + { + "auxiliary_loss_clip": 0.01349011, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.23743534, + "balance_loss_mlp": 1.02524316, + "epoch": 0.6050202915977755, + "flos": 20087818920720.0, + "grad_norm": 2.1676549810048193, + "language_loss": 0.72262824, + "learning_rate": 1.425011831266978e-06, + "loss": 0.74650401, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13317871, + "step": 10063, + "time_per_iteration": 2.769583225250244 + }, + { + "auxiliary_loss_clip": 0.01347988, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.23882437, + "balance_loss_mlp": 1.01967931, + "epoch": 0.6050804148504434, + "flos": 15965065454400.0, + "grad_norm": 1.5860305122683949, + "language_loss": 0.84846878, + "learning_rate": 1.424638822621926e-06, + "loss": 0.87227571, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13037109, + "step": 10064, + "time_per_iteration": 2.7659597396850586 + }, + { + "auxiliary_loss_clip": 0.01350225, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.24017191, + "balance_loss_mlp": 1.02262712, + "epoch": 0.6051405381031114, + "flos": 17461071549720.0, + "grad_norm": 2.4600556021975866, + "language_loss": 0.79982436, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.82368672, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13372803, + "step": 10065, + "time_per_iteration": 2.8342971801757812 + }, + { + "auxiliary_loss_clip": 0.01366083, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.25095975, + "balance_loss_mlp": 1.02199161, + "epoch": 0.6052006613557793, + "flos": 11403783041040.0, + "grad_norm": 2.461501489756801, + "language_loss": 0.78630483, + "learning_rate": 1.423892870799226e-06, + "loss": 0.81033432, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14868164, + "step": 10066, + "time_per_iteration": 2.7670023441314697 + }, + { + "auxiliary_loss_clip": 0.01355806, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.24476194, + "balance_loss_mlp": 1.0217452, + "epoch": 0.6052607846084473, + "flos": 24756012810720.0, + "grad_norm": 1.7353398468892007, + "language_loss": 0.73363733, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75755239, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13952637, + "step": 10067, + "time_per_iteration": 2.827845811843872 + }, + { + "auxiliary_loss_clip": 0.0135173, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.24111223, + "balance_loss_mlp": 1.02708304, + "epoch": 0.6053209078611153, + "flos": 20746099641600.0, + "grad_norm": 1.3255271116923093, + "language_loss": 0.6930474, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.71696854, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 1.10693359, + "router_z_loss_mlp": 0.13311768, + "step": 10068, + "time_per_iteration": 2.88771915435791 + }, + { + "auxiliary_loss_clip": 0.0135128, + "auxiliary_loss_mlp": 0.01034618, + "balance_loss_clip": 1.23804021, + "balance_loss_mlp": 1.02169538, + "epoch": 0.6053810311137833, + "flos": 18957849203880.0, + "grad_norm": 1.9208254551898543, + "language_loss": 0.87325013, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89710903, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12921143, + "step": 10069, + "time_per_iteration": 2.880570888519287 + }, + { + "auxiliary_loss_clip": 0.01349738, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.24013805, + "balance_loss_mlp": 1.02148509, + "epoch": 0.6054411543664512, + "flos": 23956401705120.0, + "grad_norm": 1.4335529765205008, + "language_loss": 0.83372426, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85756457, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12811279, + "step": 10070, + "time_per_iteration": 2.8946852684020996 + }, + { + "auxiliary_loss_clip": 0.01353276, + "auxiliary_loss_mlp": 0.01040232, + "balance_loss_clip": 1.2407825, + "balance_loss_mlp": 1.02674389, + "epoch": 0.6055012776191192, + "flos": 20598759219600.0, + "grad_norm": 1.581433645847547, + "language_loss": 0.86734521, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.89128029, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13494873, + "step": 10071, + "time_per_iteration": 2.882476568222046 + }, + { + "auxiliary_loss_clip": 0.01362305, + "auxiliary_loss_mlp": 0.01035867, + "balance_loss_clip": 1.24752223, + "balance_loss_mlp": 1.02187157, + "epoch": 0.6055614008717871, + "flos": 30304107180000.0, + "grad_norm": 1.5003093129974765, + "language_loss": 0.7745831, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79856485, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14001465, + "step": 10072, + "time_per_iteration": 2.8456268310546875 + }, + { + "auxiliary_loss_clip": 0.01357821, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.24449205, + "balance_loss_mlp": 1.02108955, + "epoch": 0.6056215241244551, + "flos": 27130660526160.0, + "grad_norm": 1.5456416392125127, + "language_loss": 0.74446464, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76839513, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14141846, + "step": 10073, + "time_per_iteration": 2.853868007659912 + }, + { + "auxiliary_loss_clip": 0.01195727, + "auxiliary_loss_mlp": 0.01020644, + "balance_loss_clip": 1.14381385, + "balance_loss_mlp": 1.01656711, + "epoch": 0.6056816473771232, + "flos": 56019619501560.0, + "grad_norm": 0.8733144788836944, + "language_loss": 0.55229962, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57446337, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.04077148, + "step": 10074, + "time_per_iteration": 3.3136837482452393 + }, + { + "auxiliary_loss_clip": 0.01349982, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.24013877, + "balance_loss_mlp": 1.0207994, + "epoch": 0.6057417706297911, + "flos": 23554687559400.0, + "grad_norm": 1.7842678488820751, + "language_loss": 0.82131231, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.8451466, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12658691, + "step": 10075, + "time_per_iteration": 2.898383617401123 + }, + { + "auxiliary_loss_clip": 0.01352226, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.24017513, + "balance_loss_mlp": 1.01324677, + "epoch": 0.6058018938824591, + "flos": 27749892985920.0, + "grad_norm": 1.9884826918746834, + "language_loss": 0.78098464, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80477899, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13946533, + "step": 10076, + "time_per_iteration": 2.8836870193481445 + }, + { + "auxiliary_loss_clip": 0.01352865, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.23861158, + "balance_loss_mlp": 1.0192287, + "epoch": 0.605862017135127, + "flos": 22788480152880.0, + "grad_norm": 1.729225596600637, + "language_loss": 0.72557974, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74943388, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13311768, + "step": 10077, + "time_per_iteration": 2.9411659240722656 + }, + { + "auxiliary_loss_clip": 0.01357445, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.24472451, + "balance_loss_mlp": 1.02025723, + "epoch": 0.605922140387795, + "flos": 21219981489000.0, + "grad_norm": 1.534643989481989, + "language_loss": 0.56275612, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.58666921, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.1361084, + "step": 10078, + "time_per_iteration": 2.792522430419922 + }, + { + "auxiliary_loss_clip": 0.01360509, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.2451731, + "balance_loss_mlp": 1.02353907, + "epoch": 0.6059822636404629, + "flos": 27273696462000.0, + "grad_norm": 2.2375973702838694, + "language_loss": 0.70531076, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72928911, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.13769531, + "step": 10079, + "time_per_iteration": 4.501824378967285 + }, + { + "auxiliary_loss_clip": 0.01350482, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.23969042, + "balance_loss_mlp": 1.02408278, + "epoch": 0.606042386893131, + "flos": 20636223754680.0, + "grad_norm": 1.8009654404138702, + "language_loss": 0.62302196, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64690775, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14001465, + "step": 10080, + "time_per_iteration": 2.786390542984009 + }, + { + "auxiliary_loss_clip": 0.01357538, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.24460006, + "balance_loss_mlp": 1.01616657, + "epoch": 0.6061025101457989, + "flos": 23007419759520.0, + "grad_norm": 1.739194575082357, + "language_loss": 0.71224415, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73612237, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14123535, + "step": 10081, + "time_per_iteration": 4.211940050125122 + }, + { + "auxiliary_loss_clip": 0.0135734, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.2454257, + "balance_loss_mlp": 1.01837254, + "epoch": 0.6061626333984669, + "flos": 29905073186040.0, + "grad_norm": 1.5664311126986135, + "language_loss": 0.69107002, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71496427, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13726807, + "step": 10082, + "time_per_iteration": 2.837339162826538 + }, + { + "auxiliary_loss_clip": 0.01353885, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.24214339, + "balance_loss_mlp": 1.02130604, + "epoch": 0.6062227566511348, + "flos": 25014609803880.0, + "grad_norm": 1.483948439579018, + "language_loss": 0.66327548, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.68716156, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13415527, + "step": 10083, + "time_per_iteration": 4.236896753311157 + }, + { + "auxiliary_loss_clip": 0.01355889, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.24392498, + "balance_loss_mlp": 1.0202558, + "epoch": 0.6062828799038028, + "flos": 19468708286040.0, + "grad_norm": 1.8291845409399896, + "language_loss": 0.74435949, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76824969, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12878418, + "step": 10084, + "time_per_iteration": 2.774230718612671 + }, + { + "auxiliary_loss_clip": 0.01349316, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.2377032, + "balance_loss_mlp": 1.01837075, + "epoch": 0.6063430031564707, + "flos": 13593544582680.0, + "grad_norm": 3.0594522061287948, + "language_loss": 0.73241037, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.7562263, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13909912, + "step": 10085, + "time_per_iteration": 2.750645160675049 + }, + { + "auxiliary_loss_clip": 0.013476, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.23783612, + "balance_loss_mlp": 1.01557946, + "epoch": 0.6064031264091387, + "flos": 23260372190640.0, + "grad_norm": 3.1847923332022208, + "language_loss": 0.76687527, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.79063666, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.12945557, + "step": 10086, + "time_per_iteration": 2.7553067207336426 + }, + { + "auxiliary_loss_clip": 0.01347007, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.23785985, + "balance_loss_mlp": 1.01889086, + "epoch": 0.6064632496618068, + "flos": 22463969145480.0, + "grad_norm": 1.3247412506169565, + "language_loss": 0.72882712, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.7526226, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13635254, + "step": 10087, + "time_per_iteration": 2.8309874534606934 + }, + { + "auxiliary_loss_clip": 0.01346946, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.23820186, + "balance_loss_mlp": 1.01551569, + "epoch": 0.6065233729144747, + "flos": 25124079607200.0, + "grad_norm": 1.5824055714475218, + "language_loss": 0.83744276, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86119312, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12579346, + "step": 10088, + "time_per_iteration": 4.517136335372925 + }, + { + "auxiliary_loss_clip": 0.01349476, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.23905027, + "balance_loss_mlp": 1.01098418, + "epoch": 0.6065834961671427, + "flos": 23482641682800.0, + "grad_norm": 1.855645546336947, + "language_loss": 0.71557665, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.7393235, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.14227295, + "step": 10089, + "time_per_iteration": 2.848039388656616 + }, + { + "auxiliary_loss_clip": 0.01351429, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.24087429, + "balance_loss_mlp": 1.01775801, + "epoch": 0.6066436194198106, + "flos": 17023842070200.0, + "grad_norm": 1.96666991192297, + "language_loss": 0.82690275, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.85072368, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12902832, + "step": 10090, + "time_per_iteration": 2.9794344902038574 + }, + { + "auxiliary_loss_clip": 0.01369984, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.25093329, + "balance_loss_mlp": 1.02567625, + "epoch": 0.6067037426724786, + "flos": 18519320256840.0, + "grad_norm": 2.3374678224181764, + "language_loss": 0.75524783, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77935386, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.14929199, + "step": 10091, + "time_per_iteration": 2.860506057739258 + }, + { + "auxiliary_loss_clip": 0.01347085, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.23800898, + "balance_loss_mlp": 1.01753759, + "epoch": 0.6067638659251465, + "flos": 22351088239920.0, + "grad_norm": 1.5594355250436636, + "language_loss": 0.79727668, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.82106149, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13848877, + "step": 10092, + "time_per_iteration": 2.891754150390625 + }, + { + "auxiliary_loss_clip": 0.01351508, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.2386688, + "balance_loss_mlp": 1.01814008, + "epoch": 0.6068239891778145, + "flos": 12453869467800.0, + "grad_norm": 1.7674125158877083, + "language_loss": 0.76294547, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78678399, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14208984, + "step": 10093, + "time_per_iteration": 2.785320520401001 + }, + { + "auxiliary_loss_clip": 0.01342511, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.23481965, + "balance_loss_mlp": 1.01550531, + "epoch": 0.6068841124304825, + "flos": 23191168899240.0, + "grad_norm": 1.88700284308075, + "language_loss": 0.87657434, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.90028256, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12805176, + "step": 10094, + "time_per_iteration": 2.8666462898254395 + }, + { + "auxiliary_loss_clip": 0.01349555, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.23932409, + "balance_loss_mlp": 1.0168488, + "epoch": 0.6069442356831505, + "flos": 18592503167520.0, + "grad_norm": 1.6514894855152342, + "language_loss": 0.72566324, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74946189, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13470459, + "step": 10095, + "time_per_iteration": 2.9125537872314453 + }, + { + "auxiliary_loss_clip": 0.01353033, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.24182558, + "balance_loss_mlp": 1.01438773, + "epoch": 0.6070043589358184, + "flos": 20965364115120.0, + "grad_norm": 1.6169499259397184, + "language_loss": 0.76790017, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.79170054, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12615967, + "step": 10096, + "time_per_iteration": 2.9138216972351074 + }, + { + "auxiliary_loss_clip": 0.01354261, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.24181521, + "balance_loss_mlp": 1.0211699, + "epoch": 0.6070644821884864, + "flos": 11696189816880.0, + "grad_norm": 1.9882021558697551, + "language_loss": 0.80091715, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82480592, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13433838, + "step": 10097, + "time_per_iteration": 2.742826223373413 + }, + { + "auxiliary_loss_clip": 0.01348831, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.24027419, + "balance_loss_mlp": 1.01512957, + "epoch": 0.6071246054411543, + "flos": 19314342617760.0, + "grad_norm": 1.465112149730139, + "language_loss": 0.67750496, + "learning_rate": 1.411969602780478e-06, + "loss": 0.70126778, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12335205, + "step": 10098, + "time_per_iteration": 2.7029078006744385 + }, + { + "auxiliary_loss_clip": 0.01349195, + "auxiliary_loss_mlp": 0.01026416, + "balance_loss_clip": 1.2395469, + "balance_loss_mlp": 1.0131247, + "epoch": 0.6071847286938223, + "flos": 17753559542280.0, + "grad_norm": 1.7595582242564805, + "language_loss": 0.80812734, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.83188343, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13299561, + "step": 10099, + "time_per_iteration": 2.751370906829834 + }, + { + "auxiliary_loss_clip": 0.01358258, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.24170732, + "balance_loss_mlp": 1.01623213, + "epoch": 0.6072448519464904, + "flos": 22642601631840.0, + "grad_norm": 1.8498230670034805, + "language_loss": 0.71225965, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.73615849, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.15386963, + "step": 10100, + "time_per_iteration": 2.90303635597229 + }, + { + "auxiliary_loss_clip": 0.01359667, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.24778378, + "balance_loss_mlp": 1.01955771, + "epoch": 0.6073049751991583, + "flos": 19542134846880.0, + "grad_norm": 1.7845182250669633, + "language_loss": 0.71297348, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.73690414, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13842773, + "step": 10101, + "time_per_iteration": 2.8544411659240723 + }, + { + "auxiliary_loss_clip": 0.01346514, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.2374177, + "balance_loss_mlp": 1.01560748, + "epoch": 0.6073650984518263, + "flos": 28300247021160.0, + "grad_norm": 2.200000876931124, + "language_loss": 0.69384533, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71759969, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13305664, + "step": 10102, + "time_per_iteration": 2.963007688522339 + }, + { + "auxiliary_loss_clip": 0.01356205, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.24429476, + "balance_loss_mlp": 1.01690483, + "epoch": 0.6074252217044942, + "flos": 25669723072680.0, + "grad_norm": 1.9673313145477889, + "language_loss": 0.73688805, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.76075292, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13378906, + "step": 10103, + "time_per_iteration": 2.8386552333831787 + }, + { + "auxiliary_loss_clip": 0.01367889, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.25112653, + "balance_loss_mlp": 1.02134418, + "epoch": 0.6074853449571622, + "flos": 22862312797320.0, + "grad_norm": 2.360957920008533, + "language_loss": 0.77205819, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.79609609, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14569092, + "step": 10104, + "time_per_iteration": 2.7916295528411865 + }, + { + "auxiliary_loss_clip": 0.0118639, + "auxiliary_loss_mlp": 0.01006727, + "balance_loss_clip": 1.13549423, + "balance_loss_mlp": 1.00329423, + "epoch": 0.6075454682098301, + "flos": 67126772497320.0, + "grad_norm": 0.7779182438018823, + "language_loss": 0.55998921, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58192039, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.03442383, + "step": 10105, + "time_per_iteration": 3.2381532192230225 + }, + { + "auxiliary_loss_clip": 0.01186919, + "auxiliary_loss_mlp": 0.0100707, + "balance_loss_clip": 1.13476825, + "balance_loss_mlp": 1.00335026, + "epoch": 0.6076055914624982, + "flos": 70726362548400.0, + "grad_norm": 0.7565799413107644, + "language_loss": 0.56900477, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.59094465, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.03710938, + "step": 10106, + "time_per_iteration": 3.172487735748291 + }, + { + "auxiliary_loss_clip": 0.01345075, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.23561084, + "balance_loss_mlp": 1.01756287, + "epoch": 0.6076657147151661, + "flos": 28370221871400.0, + "grad_norm": 1.5258609321587866, + "language_loss": 0.69046634, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.714221, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12841797, + "step": 10107, + "time_per_iteration": 2.8852319717407227 + }, + { + "auxiliary_loss_clip": 0.01361948, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.24734616, + "balance_loss_mlp": 1.01698709, + "epoch": 0.6077258379678341, + "flos": 15054035344200.0, + "grad_norm": 2.8976381707834067, + "language_loss": 0.81410867, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83803117, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13305664, + "step": 10108, + "time_per_iteration": 2.7614927291870117 + }, + { + "auxiliary_loss_clip": 0.01362495, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.2473141, + "balance_loss_mlp": 1.0229677, + "epoch": 0.607785961220502, + "flos": 36173260846080.0, + "grad_norm": 1.8710725400600694, + "language_loss": 0.71439505, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73839039, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14086914, + "step": 10109, + "time_per_iteration": 2.907360315322876 + }, + { + "auxiliary_loss_clip": 0.01343221, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.23610163, + "balance_loss_mlp": 1.01905489, + "epoch": 0.60784608447317, + "flos": 22528827342360.0, + "grad_norm": 1.5739700945913728, + "language_loss": 0.80465114, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82839906, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12512207, + "step": 10110, + "time_per_iteration": 2.7869627475738525 + }, + { + "auxiliary_loss_clip": 0.01354197, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.24264348, + "balance_loss_mlp": 1.01994014, + "epoch": 0.6079062077258379, + "flos": 23846363384760.0, + "grad_norm": 2.336988073531836, + "language_loss": 0.71308774, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.73696542, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13623047, + "step": 10111, + "time_per_iteration": 2.8897948265075684 + }, + { + "auxiliary_loss_clip": 0.0135945, + "auxiliary_loss_mlp": 0.01042411, + "balance_loss_clip": 1.245803, + "balance_loss_mlp": 1.02831483, + "epoch": 0.6079663309785059, + "flos": 23372197278840.0, + "grad_norm": 1.9277063866542445, + "language_loss": 0.65410149, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67812014, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 1.13818359, + "router_z_loss_mlp": 0.14099121, + "step": 10112, + "time_per_iteration": 2.9416697025299072 + }, + { + "auxiliary_loss_clip": 0.01182801, + "auxiliary_loss_mlp": 0.01004106, + "balance_loss_clip": 1.13113356, + "balance_loss_mlp": 1.00007725, + "epoch": 0.6080264542311739, + "flos": 71399140453800.0, + "grad_norm": 0.639821461446926, + "language_loss": 0.49605405, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51792312, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.0402832, + "step": 10113, + "time_per_iteration": 3.3354156017303467 + }, + { + "auxiliary_loss_clip": 0.01182666, + "auxiliary_loss_mlp": 0.01002909, + "balance_loss_clip": 1.13101053, + "balance_loss_mlp": 0.9992134, + "epoch": 0.6080865774838419, + "flos": 66545410656240.0, + "grad_norm": 0.8533319399668577, + "language_loss": 0.57032824, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59218395, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.03686523, + "step": 10114, + "time_per_iteration": 3.2001945972442627 + }, + { + "auxiliary_loss_clip": 0.01356893, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.24410534, + "balance_loss_mlp": 1.02123189, + "epoch": 0.6081467007365099, + "flos": 19212588402840.0, + "grad_norm": 1.7801720974352873, + "language_loss": 0.70173258, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72564864, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13476562, + "step": 10115, + "time_per_iteration": 2.818593740463257 + }, + { + "auxiliary_loss_clip": 0.01351861, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.24091125, + "balance_loss_mlp": 1.02036309, + "epoch": 0.6082068239891778, + "flos": 24172742376720.0, + "grad_norm": 1.7070441473133657, + "language_loss": 0.72673064, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.75058377, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13098145, + "step": 10116, + "time_per_iteration": 2.8314971923828125 + }, + { + "auxiliary_loss_clip": 0.01360471, + "auxiliary_loss_mlp": 0.01036272, + "balance_loss_clip": 1.24750161, + "balance_loss_mlp": 1.02183628, + "epoch": 0.6082669472418458, + "flos": 37420212912840.0, + "grad_norm": 9.875413926722171, + "language_loss": 0.54303169, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.56699914, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14422607, + "step": 10117, + "time_per_iteration": 2.9612984657287598 + }, + { + "auxiliary_loss_clip": 0.0136084, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.24826598, + "balance_loss_mlp": 1.02076495, + "epoch": 0.6083270704945137, + "flos": 15089713111440.0, + "grad_norm": 2.2391156212816585, + "language_loss": 0.70765936, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.73160499, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12976074, + "step": 10118, + "time_per_iteration": 4.242502689361572 + }, + { + "auxiliary_loss_clip": 0.01358401, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.24646986, + "balance_loss_mlp": 1.02418733, + "epoch": 0.6083871937471818, + "flos": 20673444639600.0, + "grad_norm": 1.5535848247770427, + "language_loss": 0.74628806, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.77024823, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13427734, + "step": 10119, + "time_per_iteration": 4.135087251663208 + }, + { + "auxiliary_loss_clip": 0.01355994, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.24518991, + "balance_loss_mlp": 1.0232482, + "epoch": 0.6084473169998497, + "flos": 21511900964520.0, + "grad_norm": 1.921698645996598, + "language_loss": 0.67308438, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69700587, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 1.10693359, + "router_z_loss_mlp": 0.12915039, + "step": 10120, + "time_per_iteration": 2.7608795166015625 + }, + { + "auxiliary_loss_clip": 0.01364682, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.24984157, + "balance_loss_mlp": 1.02496302, + "epoch": 0.6085074402525177, + "flos": 26875718285400.0, + "grad_norm": 1.7864004997827032, + "language_loss": 0.74631125, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.77034283, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 1.14697266, + "router_z_loss_mlp": 0.1350708, + "step": 10121, + "time_per_iteration": 4.287592887878418 + }, + { + "auxiliary_loss_clip": 0.01348009, + "auxiliary_loss_mlp": 0.01030669, + "balance_loss_clip": 1.23797035, + "balance_loss_mlp": 1.01834285, + "epoch": 0.6085675635051856, + "flos": 10893492475920.0, + "grad_norm": 1.8448847002029523, + "language_loss": 0.80507147, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82885826, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12316895, + "step": 10122, + "time_per_iteration": 2.835744857788086 + }, + { + "auxiliary_loss_clip": 0.01355642, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.24401653, + "balance_loss_mlp": 1.0224781, + "epoch": 0.6086276867578536, + "flos": 34869125562480.0, + "grad_norm": 1.4521215314253393, + "language_loss": 0.56098473, + "learning_rate": 1.402670413578284e-06, + "loss": 0.58490717, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14123535, + "step": 10123, + "time_per_iteration": 2.878952980041504 + }, + { + "auxiliary_loss_clip": 0.01346929, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.23802137, + "balance_loss_mlp": 1.02710199, + "epoch": 0.6086878100105215, + "flos": 20052547237080.0, + "grad_norm": 1.7888075239554249, + "language_loss": 0.74584866, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76971978, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13085938, + "step": 10124, + "time_per_iteration": 2.7546045780181885 + }, + { + "auxiliary_loss_clip": 0.01354324, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.24062181, + "balance_loss_mlp": 1.02739024, + "epoch": 0.6087479332631895, + "flos": 18337033018080.0, + "grad_norm": 2.0919113756367937, + "language_loss": 0.65608215, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.68003583, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13671875, + "step": 10125, + "time_per_iteration": 2.8377411365509033 + }, + { + "auxiliary_loss_clip": 0.013458, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.23641634, + "balance_loss_mlp": 1.01981831, + "epoch": 0.6088080565158575, + "flos": 24498106159680.0, + "grad_norm": 7.1814554499528125, + "language_loss": 0.76859128, + "learning_rate": 1.40155545786479e-06, + "loss": 0.79238051, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13317871, + "step": 10126, + "time_per_iteration": 2.818281888961792 + }, + { + "auxiliary_loss_clip": 0.01355809, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.24126279, + "balance_loss_mlp": 1.01411819, + "epoch": 0.6088681797685255, + "flos": 10272513856680.0, + "grad_norm": 2.435049888002765, + "language_loss": 0.71766412, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.74150407, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14074707, + "step": 10127, + "time_per_iteration": 4.316412925720215 + }, + { + "auxiliary_loss_clip": 0.01363656, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.24897039, + "balance_loss_mlp": 1.02079022, + "epoch": 0.6089283030211935, + "flos": 21977782965000.0, + "grad_norm": 2.3946541165365156, + "language_loss": 0.73442924, + "learning_rate": 1.400812267497691e-06, + "loss": 0.75841486, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14099121, + "step": 10128, + "time_per_iteration": 2.7893593311309814 + }, + { + "auxiliary_loss_clip": 0.01350789, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.24038458, + "balance_loss_mlp": 1.02314389, + "epoch": 0.6089884262738614, + "flos": 17789684001480.0, + "grad_norm": 2.2873851042242586, + "language_loss": 0.73651588, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.76038337, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12817383, + "step": 10129, + "time_per_iteration": 2.743147850036621 + }, + { + "auxiliary_loss_clip": 0.01352608, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.24150276, + "balance_loss_mlp": 1.01835108, + "epoch": 0.6090485495265294, + "flos": 36918287669880.0, + "grad_norm": 1.4130422728829768, + "language_loss": 0.66045475, + "learning_rate": 1.400069168015626e-06, + "loss": 0.68429601, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13146973, + "step": 10130, + "time_per_iteration": 3.013915777206421 + }, + { + "auxiliary_loss_clip": 0.01348038, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.23977995, + "balance_loss_mlp": 1.01749229, + "epoch": 0.6091086727791973, + "flos": 19903501263960.0, + "grad_norm": 1.7058668964630985, + "language_loss": 0.77191883, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.7956959, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12182617, + "step": 10131, + "time_per_iteration": 2.815077543258667 + }, + { + "auxiliary_loss_clip": 0.01344945, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.23508501, + "balance_loss_mlp": 1.01919758, + "epoch": 0.6091687960318654, + "flos": 22168719784440.0, + "grad_norm": 1.6969935378446277, + "language_loss": 0.77444768, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79821444, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12536621, + "step": 10132, + "time_per_iteration": 2.756962537765503 + }, + { + "auxiliary_loss_clip": 0.01343647, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.23662889, + "balance_loss_mlp": 1.02097511, + "epoch": 0.6092289192845333, + "flos": 21469604034600.0, + "grad_norm": 1.7143807219824496, + "language_loss": 0.75779128, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.78156304, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12579346, + "step": 10133, + "time_per_iteration": 2.8685309886932373 + }, + { + "auxiliary_loss_clip": 0.01353464, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.24223018, + "balance_loss_mlp": 1.01767683, + "epoch": 0.6092890425372013, + "flos": 28700905349520.0, + "grad_norm": 1.7418297910119382, + "language_loss": 0.64166307, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.66551328, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13867188, + "step": 10134, + "time_per_iteration": 2.990967273712158 + }, + { + "auxiliary_loss_clip": 0.01342648, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.23323321, + "balance_loss_mlp": 1.01694393, + "epoch": 0.6093491657898692, + "flos": 20818023693120.0, + "grad_norm": 1.7259108433306176, + "language_loss": 0.79063028, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.81435847, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13232422, + "step": 10135, + "time_per_iteration": 2.78595232963562 + }, + { + "auxiliary_loss_clip": 0.01348051, + "auxiliary_loss_mlp": 0.01029915, + "balance_loss_clip": 1.23525441, + "balance_loss_mlp": 1.0169276, + "epoch": 0.6094092890425372, + "flos": 25452448408800.0, + "grad_norm": 1.9624216525188223, + "language_loss": 0.72471666, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74849629, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 1.12841797, + "router_z_loss_mlp": 0.12988281, + "step": 10136, + "time_per_iteration": 2.7537600994110107 + }, + { + "auxiliary_loss_clip": 0.01347877, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.23704576, + "balance_loss_mlp": 1.01650631, + "epoch": 0.6094694122952051, + "flos": 35627373730440.0, + "grad_norm": 1.6673884980246891, + "language_loss": 0.74878979, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.7725597, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12615967, + "step": 10137, + "time_per_iteration": 2.875546932220459 + }, + { + "auxiliary_loss_clip": 0.01352117, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.23908484, + "balance_loss_mlp": 1.01774716, + "epoch": 0.6095295355478731, + "flos": 24461656833600.0, + "grad_norm": 1.6099638605830826, + "language_loss": 0.8057121, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82954872, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13787842, + "step": 10138, + "time_per_iteration": 2.8602168560028076 + }, + { + "auxiliary_loss_clip": 0.01342545, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.23526561, + "balance_loss_mlp": 1.01895356, + "epoch": 0.6095896588005411, + "flos": 15637508820000.0, + "grad_norm": 1.5919302120314194, + "language_loss": 0.81500798, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83875209, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12902832, + "step": 10139, + "time_per_iteration": 2.755492925643921 + }, + { + "auxiliary_loss_clip": 0.01355495, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.24293017, + "balance_loss_mlp": 1.01937377, + "epoch": 0.6096497820532091, + "flos": 15553605302280.0, + "grad_norm": 1.9193091359774486, + "language_loss": 0.83846414, + "learning_rate": 1.396355037825315e-06, + "loss": 0.86234987, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13720703, + "step": 10140, + "time_per_iteration": 2.880300760269165 + }, + { + "auxiliary_loss_clip": 0.01347307, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.23522937, + "balance_loss_mlp": 1.0160954, + "epoch": 0.6097099053058771, + "flos": 24209516569680.0, + "grad_norm": 1.690710998152391, + "language_loss": 0.75769424, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.7814703, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14208984, + "step": 10141, + "time_per_iteration": 2.809326648712158 + }, + { + "auxiliary_loss_clip": 0.01342982, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.23329651, + "balance_loss_mlp": 1.01561821, + "epoch": 0.609770028558545, + "flos": 19574604553680.0, + "grad_norm": 1.9587828837386942, + "language_loss": 0.76998317, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.79370165, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13262939, + "step": 10142, + "time_per_iteration": 2.755605459213257 + }, + { + "auxiliary_loss_clip": 0.01347482, + "auxiliary_loss_mlp": 0.01026361, + "balance_loss_clip": 1.23639262, + "balance_loss_mlp": 1.01225233, + "epoch": 0.609830151811213, + "flos": 23954208853680.0, + "grad_norm": 1.6752802897061299, + "language_loss": 0.76840329, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.79214174, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.14099121, + "step": 10143, + "time_per_iteration": 2.7816414833068848 + }, + { + "auxiliary_loss_clip": 0.01346695, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.23647058, + "balance_loss_mlp": 1.01799822, + "epoch": 0.6098902750638809, + "flos": 16184005061040.0, + "grad_norm": 1.646779107337069, + "language_loss": 0.75539428, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77917832, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13708496, + "step": 10144, + "time_per_iteration": 2.7604708671569824 + }, + { + "auxiliary_loss_clip": 0.01355023, + "auxiliary_loss_mlp": 0.01027279, + "balance_loss_clip": 1.24255824, + "balance_loss_mlp": 1.01433325, + "epoch": 0.609950398316549, + "flos": 44534003969160.0, + "grad_norm": 1.6621690253635584, + "language_loss": 0.73259956, + "learning_rate": 1.394498830235383e-06, + "loss": 0.75642258, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12957764, + "step": 10145, + "time_per_iteration": 3.0054678916931152 + }, + { + "auxiliary_loss_clip": 0.01348605, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.23797059, + "balance_loss_mlp": 1.01653421, + "epoch": 0.6100105215692169, + "flos": 23226968491560.0, + "grad_norm": 1.5448296677031965, + "language_loss": 0.69122499, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.715011, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 1.10595703, + "router_z_loss_mlp": 0.13458252, + "step": 10146, + "time_per_iteration": 2.964125633239746 + }, + { + "auxiliary_loss_clip": 0.01343613, + "auxiliary_loss_mlp": 0.01026162, + "balance_loss_clip": 1.2363894, + "balance_loss_mlp": 1.01364493, + "epoch": 0.6100706448218849, + "flos": 15016733242560.0, + "grad_norm": 1.6017112921148542, + "language_loss": 0.76767218, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.79136986, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12518311, + "step": 10147, + "time_per_iteration": 2.7896413803100586 + }, + { + "auxiliary_loss_clip": 0.013456, + "auxiliary_loss_mlp": 0.01028191, + "balance_loss_clip": 1.23517919, + "balance_loss_mlp": 1.01496482, + "epoch": 0.6101307680745528, + "flos": 19643929670160.0, + "grad_norm": 1.875392260050073, + "language_loss": 0.79139441, + "learning_rate": 1.393385381096786e-06, + "loss": 0.81513238, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13226318, + "step": 10148, + "time_per_iteration": 2.7724313735961914 + }, + { + "auxiliary_loss_clip": 0.01355388, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.23983526, + "balance_loss_mlp": 1.01731479, + "epoch": 0.6101908913272208, + "flos": 29941725553920.0, + "grad_norm": 2.3100079071852853, + "language_loss": 0.54103374, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56491274, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.15185547, + "step": 10149, + "time_per_iteration": 2.871032953262329 + }, + { + "auxiliary_loss_clip": 0.01338864, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.23276699, + "balance_loss_mlp": 1.01658881, + "epoch": 0.6102510145798887, + "flos": 21803658006600.0, + "grad_norm": 2.0155126212781447, + "language_loss": 0.80757833, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.83126652, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13372803, + "step": 10150, + "time_per_iteration": 2.7990798950195312 + }, + { + "auxiliary_loss_clip": 0.0135217, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.23893356, + "balance_loss_mlp": 1.02255857, + "epoch": 0.6103111378325567, + "flos": 20711274649920.0, + "grad_norm": 1.494209715497746, + "language_loss": 0.69338328, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71727848, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14807129, + "step": 10151, + "time_per_iteration": 2.851511001586914 + }, + { + "auxiliary_loss_clip": 0.01343956, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.23396397, + "balance_loss_mlp": 1.01538801, + "epoch": 0.6103712610852247, + "flos": 29386863990720.0, + "grad_norm": 1.6054070541897958, + "language_loss": 0.71049684, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.73421979, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12957764, + "step": 10152, + "time_per_iteration": 2.8774330615997314 + }, + { + "auxiliary_loss_clip": 0.01350138, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.23811245, + "balance_loss_mlp": 1.02075648, + "epoch": 0.6104313843378927, + "flos": 20817861259680.0, + "grad_norm": 1.6560146600806223, + "language_loss": 0.78231573, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80616552, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14093018, + "step": 10153, + "time_per_iteration": 2.819387912750244 + }, + { + "auxiliary_loss_clip": 0.01349222, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.23834896, + "balance_loss_mlp": 1.01636899, + "epoch": 0.6104915075905607, + "flos": 26584123676760.0, + "grad_norm": 1.5908965822711572, + "language_loss": 0.79790664, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.82169867, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13616943, + "step": 10154, + "time_per_iteration": 2.8438539505004883 + }, + { + "auxiliary_loss_clip": 0.01344434, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.23541999, + "balance_loss_mlp": 1.0152905, + "epoch": 0.6105516308432286, + "flos": 23921211238200.0, + "grad_norm": 1.493227696609083, + "language_loss": 0.70746362, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.73118663, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12554932, + "step": 10155, + "time_per_iteration": 2.7539401054382324 + }, + { + "auxiliary_loss_clip": 0.01346267, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.23543406, + "balance_loss_mlp": 1.01756537, + "epoch": 0.6106117540958966, + "flos": 31583731995360.0, + "grad_norm": 1.5087661901491816, + "language_loss": 0.71847773, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.74225068, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13476562, + "step": 10156, + "time_per_iteration": 4.255848407745361 + }, + { + "auxiliary_loss_clip": 0.01333721, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.22724938, + "balance_loss_mlp": 1.02030301, + "epoch": 0.6106718773485645, + "flos": 19612718822520.0, + "grad_norm": 1.5540301024160978, + "language_loss": 0.67368674, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69736183, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13476562, + "step": 10157, + "time_per_iteration": 4.288233995437622 + }, + { + "auxiliary_loss_clip": 0.01347504, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.23557568, + "balance_loss_mlp": 1.01570749, + "epoch": 0.6107320006012326, + "flos": 17127707919840.0, + "grad_norm": 4.005775692994193, + "language_loss": 0.73141164, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.75517714, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13342285, + "step": 10158, + "time_per_iteration": 2.843979597091675 + }, + { + "auxiliary_loss_clip": 0.01350974, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.23937511, + "balance_loss_mlp": 1.01803565, + "epoch": 0.6107921238539005, + "flos": 30153599305920.0, + "grad_norm": 1.6175424494517057, + "language_loss": 0.69556057, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71938109, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13043213, + "step": 10159, + "time_per_iteration": 2.8722400665283203 + }, + { + "auxiliary_loss_clip": 0.01345894, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.23525882, + "balance_loss_mlp": 1.02110279, + "epoch": 0.6108522471065685, + "flos": 18444634836840.0, + "grad_norm": 2.0165569336250178, + "language_loss": 0.78911227, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.8129161, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13409424, + "step": 10160, + "time_per_iteration": 4.198394298553467 + }, + { + "auxiliary_loss_clip": 0.01190771, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 1.14160776, + "balance_loss_mlp": 0.99765188, + "epoch": 0.6109123703592364, + "flos": 64152506160000.0, + "grad_norm": 0.827476322327156, + "language_loss": 0.61510152, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63701838, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.03271484, + "step": 10161, + "time_per_iteration": 3.4575083255767822 + }, + { + "auxiliary_loss_clip": 0.01353661, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.24141073, + "balance_loss_mlp": 1.02255261, + "epoch": 0.6109724936119044, + "flos": 20672754297480.0, + "grad_norm": 1.7090200480064885, + "language_loss": 0.76622462, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.7901268, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13995361, + "step": 10162, + "time_per_iteration": 2.763784885406494 + }, + { + "auxiliary_loss_clip": 0.01347534, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.23722243, + "balance_loss_mlp": 1.02152753, + "epoch": 0.6110326168645723, + "flos": 31357361058840.0, + "grad_norm": 1.8885084867873307, + "language_loss": 0.72308582, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.74691439, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13812256, + "step": 10163, + "time_per_iteration": 2.878390312194824 + }, + { + "auxiliary_loss_clip": 0.01337465, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.22967243, + "balance_loss_mlp": 1.01627994, + "epoch": 0.6110927401172404, + "flos": 25008031249560.0, + "grad_norm": 1.7433636864772537, + "language_loss": 0.59621978, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61987758, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12030029, + "step": 10164, + "time_per_iteration": 2.8172709941864014 + }, + { + "auxiliary_loss_clip": 0.01339448, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.23113906, + "balance_loss_mlp": 1.01939321, + "epoch": 0.6111528633699083, + "flos": 26253440198640.0, + "grad_norm": 1.807100902309758, + "language_loss": 0.75811374, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.78182948, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12738037, + "step": 10165, + "time_per_iteration": 4.345120906829834 + }, + { + "auxiliary_loss_clip": 0.01341914, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.23503959, + "balance_loss_mlp": 1.01536798, + "epoch": 0.6112129866225763, + "flos": 22387578174360.0, + "grad_norm": 1.6557897418691383, + "language_loss": 0.7962575, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81995904, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12866211, + "step": 10166, + "time_per_iteration": 2.8393471240997314 + }, + { + "auxiliary_loss_clip": 0.01348094, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.23762894, + "balance_loss_mlp": 1.01884353, + "epoch": 0.6112731098752443, + "flos": 25233305760360.0, + "grad_norm": 1.7033636551275497, + "language_loss": 0.67573303, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69953513, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.1328125, + "step": 10167, + "time_per_iteration": 2.821692943572998 + }, + { + "auxiliary_loss_clip": 0.01341552, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.23379993, + "balance_loss_mlp": 1.02005148, + "epoch": 0.6113332331279122, + "flos": 22898315431440.0, + "grad_norm": 1.4907618962224454, + "language_loss": 0.79121858, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81495601, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12133789, + "step": 10168, + "time_per_iteration": 2.7928099632263184 + }, + { + "auxiliary_loss_clip": 0.01360923, + "auxiliary_loss_mlp": 0.01040766, + "balance_loss_clip": 1.24284673, + "balance_loss_mlp": 1.02600837, + "epoch": 0.6113933563805802, + "flos": 18624404357280.0, + "grad_norm": 2.6407180561283137, + "language_loss": 0.86376119, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.88777804, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.14758301, + "step": 10169, + "time_per_iteration": 2.868746042251587 + }, + { + "auxiliary_loss_clip": 0.01343692, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.23510861, + "balance_loss_mlp": 1.0184902, + "epoch": 0.6114534796332481, + "flos": 41873974724160.0, + "grad_norm": 1.8399624969990631, + "language_loss": 0.78960073, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81334156, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.11901855, + "step": 10170, + "time_per_iteration": 3.0107622146606445 + }, + { + "auxiliary_loss_clip": 0.01357912, + "auxiliary_loss_mlp": 0.0103831, + "balance_loss_clip": 1.24161863, + "balance_loss_mlp": 1.02334881, + "epoch": 0.6115136028859162, + "flos": 21913533893520.0, + "grad_norm": 1.8004465659798157, + "language_loss": 0.69014609, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71410829, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.1496582, + "step": 10171, + "time_per_iteration": 2.868015766143799 + }, + { + "auxiliary_loss_clip": 0.01347412, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.23514366, + "balance_loss_mlp": 1.02526784, + "epoch": 0.6115737261385841, + "flos": 28810903061520.0, + "grad_norm": 1.8232001876277832, + "language_loss": 0.79785192, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.82171929, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14056396, + "step": 10172, + "time_per_iteration": 2.947918176651001 + }, + { + "auxiliary_loss_clip": 0.01354034, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.2383827, + "balance_loss_mlp": 1.02043045, + "epoch": 0.6116338493912521, + "flos": 21256430815080.0, + "grad_norm": 1.9176546695220853, + "language_loss": 0.6723454, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69623077, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.14068604, + "step": 10173, + "time_per_iteration": 2.8373947143554688 + }, + { + "auxiliary_loss_clip": 0.01351161, + "auxiliary_loss_mlp": 0.0103835, + "balance_loss_clip": 1.23929214, + "balance_loss_mlp": 1.02458096, + "epoch": 0.61169397264392, + "flos": 17534701152360.0, + "grad_norm": 1.7515454460966153, + "language_loss": 0.56221694, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.58611202, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13763428, + "step": 10174, + "time_per_iteration": 2.7669031620025635 + }, + { + "auxiliary_loss_clip": 0.01344713, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.23278046, + "balance_loss_mlp": 1.02136827, + "epoch": 0.611754095896588, + "flos": 23956889005440.0, + "grad_norm": 1.8787847202169146, + "language_loss": 0.67440021, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.69820148, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14044189, + "step": 10175, + "time_per_iteration": 2.809565305709839 + }, + { + "auxiliary_loss_clip": 0.01343504, + "auxiliary_loss_mlp": 0.0102442, + "balance_loss_clip": 1.23229623, + "balance_loss_mlp": 1.01248074, + "epoch": 0.6118142191492559, + "flos": 26000487767520.0, + "grad_norm": 2.0322395097847803, + "language_loss": 0.82524425, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84892356, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.11950684, + "step": 10176, + "time_per_iteration": 2.7948362827301025 + }, + { + "auxiliary_loss_clip": 0.01344432, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.23366916, + "balance_loss_mlp": 1.02103543, + "epoch": 0.611874342401924, + "flos": 24607941438240.0, + "grad_norm": 1.991162904087573, + "language_loss": 0.77402753, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79782569, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14349365, + "step": 10177, + "time_per_iteration": 2.7660841941833496 + }, + { + "auxiliary_loss_clip": 0.01345531, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.23502207, + "balance_loss_mlp": 1.01865935, + "epoch": 0.6119344656545919, + "flos": 15891273418320.0, + "grad_norm": 1.9065491911771468, + "language_loss": 0.75994766, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.78371525, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.12585449, + "step": 10178, + "time_per_iteration": 2.8957252502441406 + }, + { + "auxiliary_loss_clip": 0.01350865, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.23856533, + "balance_loss_mlp": 1.02146745, + "epoch": 0.6119945889072599, + "flos": 21657901310640.0, + "grad_norm": 1.5917150380526883, + "language_loss": 0.67238879, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.6962508, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13861084, + "step": 10179, + "time_per_iteration": 2.771221876144409 + }, + { + "auxiliary_loss_clip": 0.01348055, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.23622155, + "balance_loss_mlp": 1.02159977, + "epoch": 0.6120547121599279, + "flos": 13776603380280.0, + "grad_norm": 1.7020041458690767, + "language_loss": 0.84266424, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86648846, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.12768555, + "step": 10180, + "time_per_iteration": 2.741797685623169 + }, + { + "auxiliary_loss_clip": 0.01346361, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.2362361, + "balance_loss_mlp": 1.01463103, + "epoch": 0.6121148354125958, + "flos": 20083352001120.0, + "grad_norm": 1.5571745124179173, + "language_loss": 0.77787292, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.80162179, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13897705, + "step": 10181, + "time_per_iteration": 2.865734577178955 + }, + { + "auxiliary_loss_clip": 0.01350628, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.23798585, + "balance_loss_mlp": 1.02055573, + "epoch": 0.6121749586652638, + "flos": 13474125731160.0, + "grad_norm": 1.9231860217137204, + "language_loss": 0.80828166, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.83212614, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.1328125, + "step": 10182, + "time_per_iteration": 2.821789026260376 + }, + { + "auxiliary_loss_clip": 0.01331303, + "auxiliary_loss_mlp": 0.01024781, + "balance_loss_clip": 1.22407007, + "balance_loss_mlp": 1.01364076, + "epoch": 0.6122350819179317, + "flos": 20125364672520.0, + "grad_norm": 1.4971014876679587, + "language_loss": 0.83094686, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85450774, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1114502, + "step": 10183, + "time_per_iteration": 2.770456075668335 + }, + { + "auxiliary_loss_clip": 0.01193614, + "auxiliary_loss_mlp": 0.01000935, + "balance_loss_clip": 1.14292216, + "balance_loss_mlp": 0.99709624, + "epoch": 0.6122952051705998, + "flos": 65443890418560.0, + "grad_norm": 0.7090245185002984, + "language_loss": 0.63013983, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.6520853, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.03833008, + "step": 10184, + "time_per_iteration": 3.496032953262329 + }, + { + "auxiliary_loss_clip": 0.01344522, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.23382068, + "balance_loss_mlp": 1.01960111, + "epoch": 0.6123553284232677, + "flos": 20381159688840.0, + "grad_norm": 1.7604819675001468, + "language_loss": 0.82306653, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84683073, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.1229248, + "step": 10185, + "time_per_iteration": 2.9233179092407227 + }, + { + "auxiliary_loss_clip": 0.0135382, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.23865151, + "balance_loss_mlp": 1.02023792, + "epoch": 0.6124154516759357, + "flos": 23992932247920.0, + "grad_norm": 1.788993028101979, + "language_loss": 0.74880159, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.77268374, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14160156, + "step": 10186, + "time_per_iteration": 2.8890364170074463 + }, + { + "auxiliary_loss_clip": 0.01337046, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.22828889, + "balance_loss_mlp": 1.0190407, + "epoch": 0.6124755749286036, + "flos": 21473583653880.0, + "grad_norm": 1.7363392838663017, + "language_loss": 0.78856897, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.81225204, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12237549, + "step": 10187, + "time_per_iteration": 2.8708138465881348 + }, + { + "auxiliary_loss_clip": 0.01344532, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.23390162, + "balance_loss_mlp": 1.01679015, + "epoch": 0.6125356981812716, + "flos": 23884599478680.0, + "grad_norm": 1.7606361464388978, + "language_loss": 0.83228987, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85602915, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.1262207, + "step": 10188, + "time_per_iteration": 2.8025400638580322 + }, + { + "auxiliary_loss_clip": 0.01344287, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.23257554, + "balance_loss_mlp": 1.01931596, + "epoch": 0.6125958214339395, + "flos": 14429280147480.0, + "grad_norm": 1.712961942454645, + "language_loss": 0.75772226, + "learning_rate": 1.378189152155896e-06, + "loss": 0.78149033, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13220215, + "step": 10189, + "time_per_iteration": 2.845616102218628 + }, + { + "auxiliary_loss_clip": 0.01345571, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.23472548, + "balance_loss_mlp": 1.01983178, + "epoch": 0.6126559446866076, + "flos": 23264433026640.0, + "grad_norm": 1.4845866649532027, + "language_loss": 0.74268705, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76647639, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13531494, + "step": 10190, + "time_per_iteration": 2.807708501815796 + }, + { + "auxiliary_loss_clip": 0.01346849, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.23572946, + "balance_loss_mlp": 1.02012253, + "epoch": 0.6127160679392755, + "flos": 26869789464840.0, + "grad_norm": 13.68016384520195, + "language_loss": 0.69003487, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.71384323, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13861084, + "step": 10191, + "time_per_iteration": 2.8967125415802 + }, + { + "auxiliary_loss_clip": 0.01345594, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.23299158, + "balance_loss_mlp": 1.02069831, + "epoch": 0.6127761911919435, + "flos": 26402120696520.0, + "grad_norm": 1.8876308865821314, + "language_loss": 0.7392298, + "learning_rate": 1.377078777445467e-06, + "loss": 0.76302558, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 1.12744141, + "router_z_loss_mlp": 0.1328125, + "step": 10192, + "time_per_iteration": 2.8918020725250244 + }, + { + "auxiliary_loss_clip": 0.0133803, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.22979772, + "balance_loss_mlp": 1.02111161, + "epoch": 0.6128363144446115, + "flos": 22639474788120.0, + "grad_norm": 7.166277324476955, + "language_loss": 0.83347017, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85718465, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12304688, + "step": 10193, + "time_per_iteration": 3.0195767879486084 + }, + { + "auxiliary_loss_clip": 0.01344791, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.23321426, + "balance_loss_mlp": 1.018592, + "epoch": 0.6128964376972794, + "flos": 26764502322600.0, + "grad_norm": 2.0577810623316997, + "language_loss": 0.70677114, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.73053306, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12811279, + "step": 10194, + "time_per_iteration": 2.810671329498291 + }, + { + "auxiliary_loss_clip": 0.01195073, + "auxiliary_loss_mlp": 0.01003715, + "balance_loss_clip": 1.14436769, + "balance_loss_mlp": 1.00004292, + "epoch": 0.6129565609499474, + "flos": 65581647267600.0, + "grad_norm": 0.8270175371269645, + "language_loss": 0.58703446, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60902238, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.03662109, + "step": 10195, + "time_per_iteration": 5.924023389816284 + }, + { + "auxiliary_loss_clip": 0.01342036, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.23218596, + "balance_loss_mlp": 1.01975453, + "epoch": 0.6130166842026153, + "flos": 16366373516520.0, + "grad_norm": 1.7838540061757735, + "language_loss": 0.69594741, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.7197001, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13476562, + "step": 10196, + "time_per_iteration": 2.743096351623535 + }, + { + "auxiliary_loss_clip": 0.01339848, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.23073876, + "balance_loss_mlp": 1.01983726, + "epoch": 0.6130768074552834, + "flos": 23656807249560.0, + "grad_norm": 1.7945076724444164, + "language_loss": 0.7128067, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73653269, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12908936, + "step": 10197, + "time_per_iteration": 2.8615529537200928 + }, + { + "auxiliary_loss_clip": 0.01346559, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.23344433, + "balance_loss_mlp": 1.02287984, + "epoch": 0.6131369307079513, + "flos": 20052262978560.0, + "grad_norm": 1.9689654135202372, + "language_loss": 0.78970212, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81352895, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13238525, + "step": 10198, + "time_per_iteration": 4.338047027587891 + }, + { + "auxiliary_loss_clip": 0.01353933, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.24006248, + "balance_loss_mlp": 1.0190928, + "epoch": 0.6131970539606193, + "flos": 22677020539920.0, + "grad_norm": 1.4216497861559139, + "language_loss": 0.74715018, + "learning_rate": 1.374488730519181e-06, + "loss": 0.77102268, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14221191, + "step": 10199, + "time_per_iteration": 2.832237958908081 + }, + { + "auxiliary_loss_clip": 0.01352766, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.23858237, + "balance_loss_mlp": 1.02320004, + "epoch": 0.6132571772132872, + "flos": 26876692886040.0, + "grad_norm": 1.8890378704731245, + "language_loss": 0.62504679, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64894676, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14013672, + "step": 10200, + "time_per_iteration": 2.850964307785034 + }, + { + "auxiliary_loss_clip": 0.01343698, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.23233998, + "balance_loss_mlp": 1.02166641, + "epoch": 0.6133173004659552, + "flos": 22897746914400.0, + "grad_norm": 1.7870711268559818, + "language_loss": 0.6903367, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.7141223, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13208008, + "step": 10201, + "time_per_iteration": 2.855919122695923 + }, + { + "auxiliary_loss_clip": 0.01340116, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.23066795, + "balance_loss_mlp": 1.01639473, + "epoch": 0.6133774237186231, + "flos": 20489939150040.0, + "grad_norm": 1.8746323351002536, + "language_loss": 0.84555328, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.86925286, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13427734, + "step": 10202, + "time_per_iteration": 2.8573553562164307 + }, + { + "auxiliary_loss_clip": 0.01196473, + "auxiliary_loss_mlp": 0.01006332, + "balance_loss_clip": 1.14475036, + "balance_loss_mlp": 1.00261235, + "epoch": 0.6134375469712912, + "flos": 69429130686000.0, + "grad_norm": 0.891179369527832, + "language_loss": 0.67068136, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69270933, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.03710938, + "step": 10203, + "time_per_iteration": 3.383643627166748 + }, + { + "auxiliary_loss_clip": 0.01346835, + "auxiliary_loss_mlp": 0.01027882, + "balance_loss_clip": 1.23544693, + "balance_loss_mlp": 1.01540089, + "epoch": 0.6134976702239591, + "flos": 41289039347400.0, + "grad_norm": 1.7574015482177807, + "language_loss": 0.612885, + "learning_rate": 1.37263940830327e-06, + "loss": 0.6366322, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12475586, + "step": 10204, + "time_per_iteration": 4.491133213043213 + }, + { + "auxiliary_loss_clip": 0.01337722, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.22895026, + "balance_loss_mlp": 1.01849961, + "epoch": 0.6135577934766271, + "flos": 22351778582040.0, + "grad_norm": 1.9086174728747338, + "language_loss": 0.72834146, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.75203168, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12799072, + "step": 10205, + "time_per_iteration": 2.933649778366089 + }, + { + "auxiliary_loss_clip": 0.01340973, + "auxiliary_loss_mlp": 0.01025139, + "balance_loss_clip": 1.23295546, + "balance_loss_mlp": 1.01174605, + "epoch": 0.6136179167292951, + "flos": 23732873353800.0, + "grad_norm": 1.6391997398722382, + "language_loss": 0.76208937, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78575057, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13397217, + "step": 10206, + "time_per_iteration": 2.851482629776001 + }, + { + "auxiliary_loss_clip": 0.01348158, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.23758435, + "balance_loss_mlp": 1.01763225, + "epoch": 0.613678039981963, + "flos": 26029424547000.0, + "grad_norm": 1.9871104437486664, + "language_loss": 0.75723743, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.78103054, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13531494, + "step": 10207, + "time_per_iteration": 2.873492956161499 + }, + { + "auxiliary_loss_clip": 0.01344868, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.23471975, + "balance_loss_mlp": 1.01873147, + "epoch": 0.613738163234631, + "flos": 9862393780440.0, + "grad_norm": 2.222131049383388, + "language_loss": 0.82446355, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84822679, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 1.10107422, + "router_z_loss_mlp": 0.12731934, + "step": 10208, + "time_per_iteration": 2.9096903800964355 + }, + { + "auxiliary_loss_clip": 0.01350044, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.23702204, + "balance_loss_mlp": 1.01690793, + "epoch": 0.613798286487299, + "flos": 33188558160240.0, + "grad_norm": 1.7511055182509148, + "language_loss": 0.73216045, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.75596786, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13787842, + "step": 10209, + "time_per_iteration": 2.9845566749572754 + }, + { + "auxiliary_loss_clip": 0.01346105, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.23667359, + "balance_loss_mlp": 1.02175963, + "epoch": 0.613858409739967, + "flos": 25633192529880.0, + "grad_norm": 1.6112349488419564, + "language_loss": 0.74141365, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76522231, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.12994385, + "step": 10210, + "time_per_iteration": 2.9789178371429443 + }, + { + "auxiliary_loss_clip": 0.01192875, + "auxiliary_loss_mlp": 0.01008959, + "balance_loss_clip": 1.14118052, + "balance_loss_mlp": 1.00464404, + "epoch": 0.6139185329926349, + "flos": 67208726813760.0, + "grad_norm": 0.8682816666349172, + "language_loss": 0.6509614, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67297971, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.04321289, + "step": 10211, + "time_per_iteration": 3.401550054550171 + }, + { + "auxiliary_loss_clip": 0.0134514, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.23518169, + "balance_loss_mlp": 1.0184114, + "epoch": 0.6139786562453029, + "flos": 21548675157480.0, + "grad_norm": 1.6457723749661908, + "language_loss": 0.76112199, + "learning_rate": 1.369681730544801e-06, + "loss": 0.78489351, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1361084, + "step": 10212, + "time_per_iteration": 2.8500266075134277 + }, + { + "auxiliary_loss_clip": 0.01347222, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.23685408, + "balance_loss_mlp": 1.01794553, + "epoch": 0.6140387794979708, + "flos": 26074198586880.0, + "grad_norm": 1.5173749746539942, + "language_loss": 0.74193335, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76571488, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12982178, + "step": 10213, + "time_per_iteration": 2.8004157543182373 + }, + { + "auxiliary_loss_clip": 0.013569, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.24200499, + "balance_loss_mlp": 1.02122569, + "epoch": 0.6140989027506388, + "flos": 23699713304880.0, + "grad_norm": 1.61919226917107, + "language_loss": 0.73234659, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75626957, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14172363, + "step": 10214, + "time_per_iteration": 2.844085931777954 + }, + { + "auxiliary_loss_clip": 0.0135077, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.23823023, + "balance_loss_mlp": 1.01507807, + "epoch": 0.6141590260033067, + "flos": 22236176916360.0, + "grad_norm": 1.6672487263132096, + "language_loss": 0.74667931, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.77047348, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13580322, + "step": 10215, + "time_per_iteration": 2.8619797229766846 + }, + { + "auxiliary_loss_clip": 0.01342471, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.23303938, + "balance_loss_mlp": 1.01748991, + "epoch": 0.6142191492559748, + "flos": 23876071723080.0, + "grad_norm": 1.6600043274726315, + "language_loss": 0.78763211, + "learning_rate": 1.368203464858542e-06, + "loss": 0.81136882, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.137146, + "step": 10216, + "time_per_iteration": 2.7816786766052246 + }, + { + "auxiliary_loss_clip": 0.01349054, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.23849344, + "balance_loss_mlp": 1.01775622, + "epoch": 0.6142792725086427, + "flos": 15045832455480.0, + "grad_norm": 2.122848266725334, + "language_loss": 0.80175424, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82555926, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13708496, + "step": 10217, + "time_per_iteration": 2.7112739086151123 + }, + { + "auxiliary_loss_clip": 0.01350626, + "auxiliary_loss_mlp": 0.01025037, + "balance_loss_clip": 1.23830366, + "balance_loss_mlp": 1.01123834, + "epoch": 0.6143393957613107, + "flos": 23336154036360.0, + "grad_norm": 2.4052078156109955, + "language_loss": 0.78990877, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.81366539, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13806152, + "step": 10218, + "time_per_iteration": 2.744481325149536 + }, + { + "auxiliary_loss_clip": 0.0134125, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.23162413, + "balance_loss_mlp": 1.01896644, + "epoch": 0.6143995190139786, + "flos": 20121466269960.0, + "grad_norm": 1.588886148257552, + "language_loss": 0.82209611, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84583545, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.137146, + "step": 10219, + "time_per_iteration": 2.7485013008117676 + }, + { + "auxiliary_loss_clip": 0.01354147, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.24103308, + "balance_loss_mlp": 1.01470637, + "epoch": 0.6144596422666466, + "flos": 42311244812040.0, + "grad_norm": 1.986227875151766, + "language_loss": 0.66975373, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.69358563, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14349365, + "step": 10220, + "time_per_iteration": 2.9628705978393555 + }, + { + "auxiliary_loss_clip": 0.01346976, + "auxiliary_loss_mlp": 0.01025058, + "balance_loss_clip": 1.23720431, + "balance_loss_mlp": 1.01195097, + "epoch": 0.6145197655193146, + "flos": 21577814978760.0, + "grad_norm": 1.924975374778231, + "language_loss": 0.71518815, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73890847, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13098145, + "step": 10221, + "time_per_iteration": 2.8388099670410156 + }, + { + "auxiliary_loss_clip": 0.0134375, + "auxiliary_loss_mlp": 0.01025875, + "balance_loss_clip": 1.23441029, + "balance_loss_mlp": 1.01300049, + "epoch": 0.6145798887719826, + "flos": 21476182588920.0, + "grad_norm": 1.6190983526664418, + "language_loss": 0.79422987, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81792611, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12890625, + "step": 10222, + "time_per_iteration": 2.7360498905181885 + }, + { + "auxiliary_loss_clip": 0.01353547, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.24074459, + "balance_loss_mlp": 1.01804245, + "epoch": 0.6146400120246506, + "flos": 20781899233920.0, + "grad_norm": 1.8100162179630763, + "language_loss": 0.76176566, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78561658, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13500977, + "step": 10223, + "time_per_iteration": 2.869455099105835 + }, + { + "auxiliary_loss_clip": 0.01340509, + "auxiliary_loss_mlp": 0.01028979, + "balance_loss_clip": 1.2337445, + "balance_loss_mlp": 1.01528215, + "epoch": 0.6147001352773185, + "flos": 13885301624760.0, + "grad_norm": 1.7787363799082399, + "language_loss": 0.78763235, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.81132728, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.13702393, + "step": 10224, + "time_per_iteration": 2.719374895095825 + }, + { + "auxiliary_loss_clip": 0.01342363, + "auxiliary_loss_mlp": 0.01026718, + "balance_loss_clip": 1.23361313, + "balance_loss_mlp": 1.01404619, + "epoch": 0.6147602585299865, + "flos": 56651261138280.0, + "grad_norm": 1.1788954927183641, + "language_loss": 0.66684687, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.69053769, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12664795, + "step": 10225, + "time_per_iteration": 3.1171798706054688 + }, + { + "auxiliary_loss_clip": 0.01352009, + "auxiliary_loss_mlp": 0.01028985, + "balance_loss_clip": 1.23912179, + "balance_loss_mlp": 1.01513851, + "epoch": 0.6148203817826544, + "flos": 32823658815840.0, + "grad_norm": 3.9800976050707235, + "language_loss": 0.63183546, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65564537, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13842773, + "step": 10226, + "time_per_iteration": 2.865316390991211 + }, + { + "auxiliary_loss_clip": 0.01354914, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.2432375, + "balance_loss_mlp": 1.01775861, + "epoch": 0.6148805050353224, + "flos": 18336545717760.0, + "grad_norm": 1.861884465687413, + "language_loss": 0.75689769, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.78075975, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13549805, + "step": 10227, + "time_per_iteration": 3.139799118041992 + }, + { + "auxiliary_loss_clip": 0.01353616, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.2395221, + "balance_loss_mlp": 1.01679015, + "epoch": 0.6149406282879903, + "flos": 14068035555480.0, + "grad_norm": 2.065335534735629, + "language_loss": 0.6254558, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.64932072, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.16064453, + "step": 10228, + "time_per_iteration": 2.7714476585388184 + }, + { + "auxiliary_loss_clip": 0.01347789, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.2385658, + "balance_loss_mlp": 1.01508951, + "epoch": 0.6150007515406584, + "flos": 25195435141680.0, + "grad_norm": 1.9624473601028747, + "language_loss": 0.74617231, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76993525, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13415527, + "step": 10229, + "time_per_iteration": 2.853893995285034 + }, + { + "auxiliary_loss_clip": 0.01348846, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.23949957, + "balance_loss_mlp": 1.01852882, + "epoch": 0.6150608747933263, + "flos": 21950389303200.0, + "grad_norm": 1.8793766571775132, + "language_loss": 0.78348124, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80729401, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13891602, + "step": 10230, + "time_per_iteration": 2.8194456100463867 + }, + { + "auxiliary_loss_clip": 0.01354908, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.24242258, + "balance_loss_mlp": 1.01693451, + "epoch": 0.6151209980459943, + "flos": 30123566100720.0, + "grad_norm": 1.4700430321455835, + "language_loss": 0.73160172, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75545448, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13439941, + "step": 10231, + "time_per_iteration": 2.834561347961426 + }, + { + "auxiliary_loss_clip": 0.01349221, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.2376411, + "balance_loss_mlp": 1.01871634, + "epoch": 0.6151811212986622, + "flos": 30014502381000.0, + "grad_norm": 1.6220684248498234, + "language_loss": 0.70264566, + "learning_rate": 1.362294244324858e-06, + "loss": 0.72645283, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12786865, + "step": 10232, + "time_per_iteration": 2.8619213104248047 + }, + { + "auxiliary_loss_clip": 0.01340028, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.232723, + "balance_loss_mlp": 1.01584983, + "epoch": 0.6152412445513302, + "flos": 18876625837920.0, + "grad_norm": 1.865809850866683, + "language_loss": 0.92009526, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.94378167, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12768555, + "step": 10233, + "time_per_iteration": 2.79036808013916 + }, + { + "auxiliary_loss_clip": 0.01345669, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.23677027, + "balance_loss_mlp": 1.01803064, + "epoch": 0.6153013678039982, + "flos": 25709258634120.0, + "grad_norm": 1.7186371983828586, + "language_loss": 0.71819544, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.74195576, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12341309, + "step": 10234, + "time_per_iteration": 5.5418994426727295 + }, + { + "auxiliary_loss_clip": 0.01354337, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.24138021, + "balance_loss_mlp": 1.01899016, + "epoch": 0.6153614910566662, + "flos": 28515937959000.0, + "grad_norm": 1.8737292781132502, + "language_loss": 0.66988188, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69375145, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13641357, + "step": 10235, + "time_per_iteration": 2.801945686340332 + }, + { + "auxiliary_loss_clip": 0.0136301, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.24849749, + "balance_loss_mlp": 1.01686525, + "epoch": 0.6154216143093342, + "flos": 23555012426280.0, + "grad_norm": 1.7793180349914695, + "language_loss": 0.81314337, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83707309, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13098145, + "step": 10236, + "time_per_iteration": 2.789764642715454 + }, + { + "auxiliary_loss_clip": 0.01358017, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.24454141, + "balance_loss_mlp": 1.01547527, + "epoch": 0.6154817375620021, + "flos": 22753330294320.0, + "grad_norm": 1.6226313815501923, + "language_loss": 0.80883056, + "learning_rate": 1.360448879760721e-06, + "loss": 0.83269954, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13415527, + "step": 10237, + "time_per_iteration": 4.287235736846924 + }, + { + "auxiliary_loss_clip": 0.01345761, + "auxiliary_loss_mlp": 0.01031972, + "balance_loss_clip": 1.23521733, + "balance_loss_mlp": 1.0184474, + "epoch": 0.6155418608146701, + "flos": 27168815403360.0, + "grad_norm": 1.590764625159999, + "language_loss": 0.76422131, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78799862, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13525391, + "step": 10238, + "time_per_iteration": 2.8525383472442627 + }, + { + "auxiliary_loss_clip": 0.01187361, + "auxiliary_loss_mlp": 0.00999464, + "balance_loss_clip": 1.13701367, + "balance_loss_mlp": 0.99615049, + "epoch": 0.615601984067338, + "flos": 68824784513160.0, + "grad_norm": 0.7660014272402387, + "language_loss": 0.57687747, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.5987457, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.03320312, + "step": 10239, + "time_per_iteration": 3.2728028297424316 + }, + { + "auxiliary_loss_clip": 0.01353073, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.2397933, + "balance_loss_mlp": 1.01966047, + "epoch": 0.615662107320006, + "flos": 15520567078440.0, + "grad_norm": 2.05744531112445, + "language_loss": 0.77985078, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.80371416, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.1361084, + "step": 10240, + "time_per_iteration": 2.7223944664001465 + }, + { + "auxiliary_loss_clip": 0.01356622, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.24465656, + "balance_loss_mlp": 1.02056003, + "epoch": 0.615722230572674, + "flos": 21067930497240.0, + "grad_norm": 30.760724912749122, + "language_loss": 0.72640026, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75031257, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14050293, + "step": 10241, + "time_per_iteration": 2.781280279159546 + }, + { + "auxiliary_loss_clip": 0.01346524, + "auxiliary_loss_mlp": 0.01027497, + "balance_loss_clip": 1.23756242, + "balance_loss_mlp": 1.01474786, + "epoch": 0.615782353825342, + "flos": 23261834091600.0, + "grad_norm": 1.56197112639615, + "language_loss": 0.72551894, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74925911, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12750244, + "step": 10242, + "time_per_iteration": 2.880378484725952 + }, + { + "auxiliary_loss_clip": 0.01353685, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.24255991, + "balance_loss_mlp": 1.01719975, + "epoch": 0.6158424770780099, + "flos": 21108643701120.0, + "grad_norm": 1.7503185822015734, + "language_loss": 0.72475415, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74859238, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12939453, + "step": 10243, + "time_per_iteration": 4.391860485076904 + }, + { + "auxiliary_loss_clip": 0.01184934, + "auxiliary_loss_mlp": 0.01001879, + "balance_loss_clip": 1.13491356, + "balance_loss_mlp": 0.99861234, + "epoch": 0.6159026003306779, + "flos": 70350515928000.0, + "grad_norm": 0.7691108466685466, + "language_loss": 0.56920391, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.59107202, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.03271484, + "step": 10244, + "time_per_iteration": 3.357607126235962 + }, + { + "auxiliary_loss_clip": 0.01350055, + "auxiliary_loss_mlp": 0.01032198, + "balance_loss_clip": 1.23950553, + "balance_loss_mlp": 1.01800609, + "epoch": 0.6159627235833458, + "flos": 33880811097240.0, + "grad_norm": 1.5841366131924444, + "language_loss": 0.63541734, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65923989, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14172363, + "step": 10245, + "time_per_iteration": 2.9494705200195312 + }, + { + "auxiliary_loss_clip": 0.01343337, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.23324788, + "balance_loss_mlp": 1.01904905, + "epoch": 0.6160228468360138, + "flos": 26580712574520.0, + "grad_norm": 1.6381181907913855, + "language_loss": 0.79529583, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81904298, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12335205, + "step": 10246, + "time_per_iteration": 2.84696626663208 + }, + { + "auxiliary_loss_clip": 0.0135846, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.24552453, + "balance_loss_mlp": 1.02679729, + "epoch": 0.6160829700886818, + "flos": 17196626952720.0, + "grad_norm": 2.1823949849293744, + "language_loss": 0.87960613, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.90360153, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14276123, + "step": 10247, + "time_per_iteration": 2.7501776218414307 + }, + { + "auxiliary_loss_clip": 0.01354603, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.24274921, + "balance_loss_mlp": 1.02115536, + "epoch": 0.6161430933413498, + "flos": 23628885679080.0, + "grad_norm": 1.851953022677519, + "language_loss": 0.79924667, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82313609, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13183594, + "step": 10248, + "time_per_iteration": 2.8137359619140625 + }, + { + "auxiliary_loss_clip": 0.01346394, + "auxiliary_loss_mlp": 0.01028085, + "balance_loss_clip": 1.23717439, + "balance_loss_mlp": 1.01508522, + "epoch": 0.6162032165940178, + "flos": 23007582192960.0, + "grad_norm": 1.8572265150434792, + "language_loss": 0.8681134, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89185816, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13000488, + "step": 10249, + "time_per_iteration": 2.924569845199585 + }, + { + "auxiliary_loss_clip": 0.01354696, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.24317312, + "balance_loss_mlp": 1.01484108, + "epoch": 0.6162633398466857, + "flos": 39429392766840.0, + "grad_norm": 2.6158114505010204, + "language_loss": 0.69007009, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.71390599, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14050293, + "step": 10250, + "time_per_iteration": 2.878814458847046 + }, + { + "auxiliary_loss_clip": 0.01339127, + "auxiliary_loss_mlp": 0.01023573, + "balance_loss_clip": 1.23297524, + "balance_loss_mlp": 1.01197958, + "epoch": 0.6163234630993537, + "flos": 19249362595800.0, + "grad_norm": 5.721903408221452, + "language_loss": 0.73985672, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.7634837, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11590576, + "step": 10251, + "time_per_iteration": 2.778773069381714 + }, + { + "auxiliary_loss_clip": 0.01346137, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.2358489, + "balance_loss_mlp": 1.0134716, + "epoch": 0.6163835863520216, + "flos": 15966161880120.0, + "grad_norm": 1.950727406842139, + "language_loss": 0.68440866, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.7081359, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13122559, + "step": 10252, + "time_per_iteration": 2.7527737617492676 + }, + { + "auxiliary_loss_clip": 0.01177894, + "auxiliary_loss_mlp": 0.01003496, + "balance_loss_clip": 1.12784863, + "balance_loss_mlp": 1.00011051, + "epoch": 0.6164437096046896, + "flos": 68120284832640.0, + "grad_norm": 0.8911634453147257, + "language_loss": 0.57912052, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.60093439, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.03393555, + "step": 10253, + "time_per_iteration": 3.334653615951538 + }, + { + "auxiliary_loss_clip": 0.01352322, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.23975587, + "balance_loss_mlp": 1.01646948, + "epoch": 0.6165038328573575, + "flos": 21366225485280.0, + "grad_norm": 3.485571510493238, + "language_loss": 0.80010986, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.82393301, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13519287, + "step": 10254, + "time_per_iteration": 2.8484690189361572 + }, + { + "auxiliary_loss_clip": 0.0135394, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.23955107, + "balance_loss_mlp": 1.01545787, + "epoch": 0.6165639561100256, + "flos": 21106450849680.0, + "grad_norm": 1.6500888670072045, + "language_loss": 0.80591434, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82974064, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13226318, + "step": 10255, + "time_per_iteration": 2.8421270847320557 + }, + { + "auxiliary_loss_clip": 0.01351983, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.23845387, + "balance_loss_mlp": 1.01834369, + "epoch": 0.6166240793626935, + "flos": 25344115639560.0, + "grad_norm": 2.055654417217163, + "language_loss": 0.6583609, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.68220305, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 1.13330078, + "router_z_loss_mlp": 0.13909912, + "step": 10256, + "time_per_iteration": 2.9168524742126465 + }, + { + "auxiliary_loss_clip": 0.01343034, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.2348969, + "balance_loss_mlp": 1.01604795, + "epoch": 0.6166842026153615, + "flos": 19687404242520.0, + "grad_norm": 1.5246035189345224, + "language_loss": 0.72403967, + "learning_rate": 1.353073501949825e-06, + "loss": 0.7477569, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12652588, + "step": 10257, + "time_per_iteration": 2.7513790130615234 + }, + { + "auxiliary_loss_clip": 0.01349273, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.23829734, + "balance_loss_mlp": 1.01563168, + "epoch": 0.6167443258680294, + "flos": 19323154631880.0, + "grad_norm": 1.7209545309665681, + "language_loss": 0.72675574, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.75054085, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.1361084, + "step": 10258, + "time_per_iteration": 2.7480030059814453 + }, + { + "auxiliary_loss_clip": 0.01346926, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.23443067, + "balance_loss_mlp": 1.01793194, + "epoch": 0.6168044491206974, + "flos": 25270770295440.0, + "grad_norm": 2.361458263756384, + "language_loss": 0.64587998, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66966623, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13769531, + "step": 10259, + "time_per_iteration": 2.816617012023926 + }, + { + "auxiliary_loss_clip": 0.01340506, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.23224413, + "balance_loss_mlp": 1.01780236, + "epoch": 0.6168645723733654, + "flos": 13223203718040.0, + "grad_norm": 1.9290285578585933, + "language_loss": 0.71457851, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73829126, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12963867, + "step": 10260, + "time_per_iteration": 2.750159502029419 + }, + { + "auxiliary_loss_clip": 0.01359631, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.24465787, + "balance_loss_mlp": 1.01913285, + "epoch": 0.6169246956260334, + "flos": 26657793887760.0, + "grad_norm": 1.7259136993584232, + "language_loss": 0.68932903, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.71325707, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14025879, + "step": 10261, + "time_per_iteration": 3.0198910236358643 + }, + { + "auxiliary_loss_clip": 0.01347341, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.23653185, + "balance_loss_mlp": 1.02002192, + "epoch": 0.6169848188787014, + "flos": 23153582539080.0, + "grad_norm": 1.6595203775946321, + "language_loss": 0.71444023, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73823619, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12249756, + "step": 10262, + "time_per_iteration": 2.7759876251220703 + }, + { + "auxiliary_loss_clip": 0.01347201, + "auxiliary_loss_mlp": 0.01027708, + "balance_loss_clip": 1.23683786, + "balance_loss_mlp": 1.01455331, + "epoch": 0.6170449421313693, + "flos": 23337250462080.0, + "grad_norm": 1.7281430821213892, + "language_loss": 0.69782948, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72157848, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13146973, + "step": 10263, + "time_per_iteration": 2.9201605319976807 + }, + { + "auxiliary_loss_clip": 0.01355884, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.24110723, + "balance_loss_mlp": 1.02106535, + "epoch": 0.6171050653840373, + "flos": 15855879909600.0, + "grad_norm": 2.364433382303508, + "language_loss": 0.76342958, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78732979, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13061523, + "step": 10264, + "time_per_iteration": 2.7413718700408936 + }, + { + "auxiliary_loss_clip": 0.01342291, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.23234963, + "balance_loss_mlp": 1.01625991, + "epoch": 0.6171651886367052, + "flos": 20050070127120.0, + "grad_norm": 2.667656171971219, + "language_loss": 0.85231602, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87603581, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13433838, + "step": 10265, + "time_per_iteration": 2.7856271266937256 + }, + { + "auxiliary_loss_clip": 0.01340298, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.23141527, + "balance_loss_mlp": 1.01939106, + "epoch": 0.6172253118893732, + "flos": 26438366980800.0, + "grad_norm": 1.8369844111102445, + "language_loss": 0.652031, + "learning_rate": 1.349757776608153e-06, + "loss": 0.67575294, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12506104, + "step": 10266, + "time_per_iteration": 2.875765085220337 + }, + { + "auxiliary_loss_clip": 0.0134602, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.23432398, + "balance_loss_mlp": 1.01948476, + "epoch": 0.6172854351420412, + "flos": 22637363153400.0, + "grad_norm": 2.579134237884224, + "language_loss": 0.76173937, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.78551459, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12030029, + "step": 10267, + "time_per_iteration": 2.8587098121643066 + }, + { + "auxiliary_loss_clip": 0.013531, + "auxiliary_loss_mlp": 0.01026248, + "balance_loss_clip": 1.23897815, + "balance_loss_mlp": 1.01258087, + "epoch": 0.6173455583947092, + "flos": 21217626204120.0, + "grad_norm": 1.7425086481107404, + "language_loss": 0.75550079, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77929431, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13647461, + "step": 10268, + "time_per_iteration": 2.9648420810699463 + }, + { + "auxiliary_loss_clip": 0.01355029, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.23957849, + "balance_loss_mlp": 1.01804948, + "epoch": 0.6174056816473771, + "flos": 19505035787040.0, + "grad_norm": 1.599073173247932, + "language_loss": 0.75914949, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.78301251, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13226318, + "step": 10269, + "time_per_iteration": 2.8010902404785156 + }, + { + "auxiliary_loss_clip": 0.01346406, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.23486722, + "balance_loss_mlp": 1.0154047, + "epoch": 0.6174658049000451, + "flos": 16001595997200.0, + "grad_norm": 1.832673970329259, + "language_loss": 0.76402563, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78777301, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12939453, + "step": 10270, + "time_per_iteration": 2.8065457344055176 + }, + { + "auxiliary_loss_clip": 0.01345915, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.23383665, + "balance_loss_mlp": 1.01998711, + "epoch": 0.617525928152713, + "flos": 21908376631800.0, + "grad_norm": 1.6395086443171074, + "language_loss": 0.82710117, + "learning_rate": 1.347916569325736e-06, + "loss": 0.85089421, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.1338501, + "step": 10271, + "time_per_iteration": 4.267320394515991 + }, + { + "auxiliary_loss_clip": 0.01350588, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.23743081, + "balance_loss_mlp": 1.01736355, + "epoch": 0.617586051405381, + "flos": 21110958377640.0, + "grad_norm": 1.573858129515018, + "language_loss": 0.77064764, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79445791, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13098145, + "step": 10272, + "time_per_iteration": 2.7406253814697266 + }, + { + "auxiliary_loss_clip": 0.01177222, + "auxiliary_loss_mlp": 0.01000759, + "balance_loss_clip": 1.12812293, + "balance_loss_mlp": 0.99786186, + "epoch": 0.617646174658049, + "flos": 58625105073120.0, + "grad_norm": 0.8787772536180432, + "language_loss": 0.59173042, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61351019, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.02893066, + "step": 10273, + "time_per_iteration": 4.768226861953735 + }, + { + "auxiliary_loss_clip": 0.01338776, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.22873867, + "balance_loss_mlp": 1.01738679, + "epoch": 0.617706297910717, + "flos": 13882337214480.0, + "grad_norm": 2.2693724967283875, + "language_loss": 0.73571134, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75941062, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13751221, + "step": 10274, + "time_per_iteration": 4.25204873085022 + }, + { + "auxiliary_loss_clip": 0.01343099, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.23262048, + "balance_loss_mlp": 1.01567149, + "epoch": 0.617766421163385, + "flos": 19213156919880.0, + "grad_norm": 2.362256013333018, + "language_loss": 0.77924335, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.80295396, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12286377, + "step": 10275, + "time_per_iteration": 2.764035701751709 + }, + { + "auxiliary_loss_clip": 0.01340536, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.22992659, + "balance_loss_mlp": 1.01917672, + "epoch": 0.6178265444160529, + "flos": 22571246097360.0, + "grad_norm": 1.6353024315643543, + "language_loss": 0.79784596, + "learning_rate": 1.346075980219998e-06, + "loss": 0.82157713, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13391113, + "step": 10276, + "time_per_iteration": 2.7812438011169434 + }, + { + "auxiliary_loss_clip": 0.0135176, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.23900235, + "balance_loss_mlp": 1.01921678, + "epoch": 0.6178866676687209, + "flos": 11988068684040.0, + "grad_norm": 2.184862096537278, + "language_loss": 0.81200612, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83585155, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 1.12744141, + "router_z_loss_mlp": 0.13543701, + "step": 10277, + "time_per_iteration": 2.8177146911621094 + }, + { + "auxiliary_loss_clip": 0.01348039, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.23516345, + "balance_loss_mlp": 1.01503837, + "epoch": 0.6179467909213888, + "flos": 20995844012280.0, + "grad_norm": 1.800488601352376, + "language_loss": 0.82071769, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.84448373, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13537598, + "step": 10278, + "time_per_iteration": 2.7892613410949707 + }, + { + "auxiliary_loss_clip": 0.01341695, + "auxiliary_loss_mlp": 0.0102759, + "balance_loss_clip": 1.23078632, + "balance_loss_mlp": 1.01507318, + "epoch": 0.6180069141740568, + "flos": 25343628339240.0, + "grad_norm": 1.4616596559784618, + "language_loss": 0.74204147, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.76573426, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12524414, + "step": 10279, + "time_per_iteration": 2.8960113525390625 + }, + { + "auxiliary_loss_clip": 0.0134287, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.23196077, + "balance_loss_mlp": 1.0199306, + "epoch": 0.6180670374267248, + "flos": 19650427007760.0, + "grad_norm": 1.5326505884441368, + "language_loss": 0.71096647, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.73472357, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12902832, + "step": 10280, + "time_per_iteration": 2.751013994216919 + }, + { + "auxiliary_loss_clip": 0.01346195, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.23418367, + "balance_loss_mlp": 1.02321613, + "epoch": 0.6181271606793928, + "flos": 19469764103400.0, + "grad_norm": 1.3784043726252233, + "language_loss": 0.73273253, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75656116, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13458252, + "step": 10281, + "time_per_iteration": 2.7986345291137695 + }, + { + "auxiliary_loss_clip": 0.01336668, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.22870731, + "balance_loss_mlp": 1.01902664, + "epoch": 0.6181872839320607, + "flos": 25599951264240.0, + "grad_norm": 2.0067356650942827, + "language_loss": 0.7706331, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.79431027, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12036133, + "step": 10282, + "time_per_iteration": 4.414547681808472 + }, + { + "auxiliary_loss_clip": 0.01347648, + "auxiliary_loss_mlp": 0.01029261, + "balance_loss_clip": 1.23401356, + "balance_loss_mlp": 1.01403832, + "epoch": 0.6182474071847287, + "flos": 25556679733680.0, + "grad_norm": 1.8089018712535554, + "language_loss": 0.6902892, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71405828, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.15228271, + "step": 10283, + "time_per_iteration": 3.008922576904297 + }, + { + "auxiliary_loss_clip": 0.01358488, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.24031353, + "balance_loss_mlp": 1.01501918, + "epoch": 0.6183075304373966, + "flos": 22128372055800.0, + "grad_norm": 1.5965643225334265, + "language_loss": 0.75367546, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77754796, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.13751221, + "step": 10284, + "time_per_iteration": 2.843027353286743 + }, + { + "auxiliary_loss_clip": 0.01328609, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.22380877, + "balance_loss_mlp": 1.02283669, + "epoch": 0.6183676536900646, + "flos": 22460720476680.0, + "grad_norm": 1.4272126909190925, + "language_loss": 0.75707811, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.78071409, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12139893, + "step": 10285, + "time_per_iteration": 2.8978521823883057 + }, + { + "auxiliary_loss_clip": 0.01339934, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.22868133, + "balance_loss_mlp": 1.02182162, + "epoch": 0.6184277769427327, + "flos": 23368908001680.0, + "grad_norm": 1.6217082344589808, + "language_loss": 0.72767186, + "learning_rate": 1.342396663517503e-06, + "loss": 0.75142324, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.1338501, + "step": 10286, + "time_per_iteration": 2.784075975418091 + }, + { + "auxiliary_loss_clip": 0.01340837, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.2312814, + "balance_loss_mlp": 1.0181818, + "epoch": 0.6184879001954006, + "flos": 22716231234480.0, + "grad_norm": 1.5267654965747774, + "language_loss": 0.75851822, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78223717, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12896729, + "step": 10287, + "time_per_iteration": 2.7942967414855957 + }, + { + "auxiliary_loss_clip": 0.01341505, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.23135889, + "balance_loss_mlp": 1.01846504, + "epoch": 0.6185480234480686, + "flos": 23847216160320.0, + "grad_norm": 1.5997397024252986, + "language_loss": 0.72837961, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.7521081, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.12890625, + "step": 10288, + "time_per_iteration": 2.7889416217803955 + }, + { + "auxiliary_loss_clip": 0.01329553, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.22311938, + "balance_loss_mlp": 1.01784229, + "epoch": 0.6186081467007365, + "flos": 45485666066520.0, + "grad_norm": 1.4289945160037534, + "language_loss": 0.73056221, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.75415421, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11804199, + "step": 10289, + "time_per_iteration": 2.992384195327759 + }, + { + "auxiliary_loss_clip": 0.01346596, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.23271775, + "balance_loss_mlp": 1.01708674, + "epoch": 0.6186682699534045, + "flos": 23556230677080.0, + "grad_norm": 1.4596351832458572, + "language_loss": 0.79840773, + "learning_rate": 1.340925634274056e-06, + "loss": 0.82218266, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13806152, + "step": 10290, + "time_per_iteration": 2.7795016765594482 + }, + { + "auxiliary_loss_clip": 0.01347064, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.23369944, + "balance_loss_mlp": 1.01906896, + "epoch": 0.6187283932060724, + "flos": 25779558351240.0, + "grad_norm": 1.5834138995304827, + "language_loss": 0.8165558, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.84035063, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13342285, + "step": 10291, + "time_per_iteration": 2.8455755710601807 + }, + { + "auxiliary_loss_clip": 0.01338952, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.22799528, + "balance_loss_mlp": 1.01835823, + "epoch": 0.6187885164587404, + "flos": 25270729687080.0, + "grad_norm": 1.9359407108341433, + "language_loss": 0.77653408, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.80023086, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12371826, + "step": 10292, + "time_per_iteration": 2.89259672164917 + }, + { + "auxiliary_loss_clip": 0.01354837, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.23822093, + "balance_loss_mlp": 1.02211952, + "epoch": 0.6188486397114084, + "flos": 26256485825640.0, + "grad_norm": 2.280369811174453, + "language_loss": 0.73611003, + "learning_rate": 1.339822624710401e-06, + "loss": 0.76003259, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.152771, + "step": 10293, + "time_per_iteration": 2.797203779220581 + }, + { + "auxiliary_loss_clip": 0.01345573, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.23515618, + "balance_loss_mlp": 1.02147675, + "epoch": 0.6189087629640764, + "flos": 20928346272000.0, + "grad_norm": 1.5619336306017404, + "language_loss": 0.83317113, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.85697716, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13549805, + "step": 10294, + "time_per_iteration": 2.8454384803771973 + }, + { + "auxiliary_loss_clip": 0.01341987, + "auxiliary_loss_mlp": 0.01025998, + "balance_loss_clip": 1.23079872, + "balance_loss_mlp": 1.01296234, + "epoch": 0.6189688862167443, + "flos": 14833918095120.0, + "grad_norm": 1.9756748431968691, + "language_loss": 0.70698881, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.73066866, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13031006, + "step": 10295, + "time_per_iteration": 2.709268093109131 + }, + { + "auxiliary_loss_clip": 0.01337678, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.22712076, + "balance_loss_mlp": 1.02038801, + "epoch": 0.6190290094694123, + "flos": 24291105410880.0, + "grad_norm": 1.4883729460739872, + "language_loss": 0.70247149, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72618794, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13568115, + "step": 10296, + "time_per_iteration": 2.945875406265259 + }, + { + "auxiliary_loss_clip": 0.01349257, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.23797572, + "balance_loss_mlp": 1.01540828, + "epoch": 0.6190891327220802, + "flos": 22534715554560.0, + "grad_norm": 2.3440891976400158, + "language_loss": 0.72255325, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.74634206, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.14227295, + "step": 10297, + "time_per_iteration": 2.792837619781494 + }, + { + "auxiliary_loss_clip": 0.01170969, + "auxiliary_loss_mlp": 0.0100932, + "balance_loss_clip": 1.12265944, + "balance_loss_mlp": 1.00632787, + "epoch": 0.6191492559747482, + "flos": 67744502447760.0, + "grad_norm": 0.8820623638625252, + "language_loss": 0.64300227, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66480517, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02990723, + "step": 10298, + "time_per_iteration": 3.209702968597412 + }, + { + "auxiliary_loss_clip": 0.01343569, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.23201454, + "balance_loss_mlp": 1.01962054, + "epoch": 0.6192093792274163, + "flos": 22351859798760.0, + "grad_norm": 1.6202611433630734, + "language_loss": 0.74118245, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76494157, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12750244, + "step": 10299, + "time_per_iteration": 2.867492914199829 + }, + { + "auxiliary_loss_clip": 0.01355532, + "auxiliary_loss_mlp": 0.01025228, + "balance_loss_clip": 1.24000502, + "balance_loss_mlp": 1.01191807, + "epoch": 0.6192695024800842, + "flos": 13558029248880.0, + "grad_norm": 1.7636303778730873, + "language_loss": 0.68596488, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70977253, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13323975, + "step": 10300, + "time_per_iteration": 2.7769289016723633 + }, + { + "auxiliary_loss_clip": 0.01347547, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.23596668, + "balance_loss_mlp": 1.02063799, + "epoch": 0.6193296257327522, + "flos": 17419099486680.0, + "grad_norm": 1.8623400392859952, + "language_loss": 0.67188203, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69569713, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13305664, + "step": 10301, + "time_per_iteration": 2.799694299697876 + }, + { + "auxiliary_loss_clip": 0.01339911, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.22750044, + "balance_loss_mlp": 1.01426864, + "epoch": 0.6193897489854201, + "flos": 31107169996200.0, + "grad_norm": 1.6805862776058682, + "language_loss": 0.73750782, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.76118672, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13720703, + "step": 10302, + "time_per_iteration": 2.857891798019409 + }, + { + "auxiliary_loss_clip": 0.01342681, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.23107445, + "balance_loss_mlp": 1.01419973, + "epoch": 0.6194498722380881, + "flos": 19138552716600.0, + "grad_norm": 1.63681261196878, + "language_loss": 0.80760479, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.83131278, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13922119, + "step": 10303, + "time_per_iteration": 2.8676345348358154 + }, + { + "auxiliary_loss_clip": 0.01352303, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.23728526, + "balance_loss_mlp": 1.01530933, + "epoch": 0.619509995490756, + "flos": 21840107332680.0, + "grad_norm": 1.5324680859832793, + "language_loss": 0.7702589, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.79408169, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14654541, + "step": 10304, + "time_per_iteration": 2.9031789302825928 + }, + { + "auxiliary_loss_clip": 0.01351505, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.2361331, + "balance_loss_mlp": 1.01786256, + "epoch": 0.619570118743424, + "flos": 23811985085040.0, + "grad_norm": 1.8406067730562583, + "language_loss": 0.76999247, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79382491, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13867188, + "step": 10305, + "time_per_iteration": 2.831766128540039 + }, + { + "auxiliary_loss_clip": 0.0135096, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.23571062, + "balance_loss_mlp": 1.01684022, + "epoch": 0.619630241996092, + "flos": 21106044766080.0, + "grad_norm": 1.6761713594617207, + "language_loss": 0.79435486, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81818247, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.1496582, + "step": 10306, + "time_per_iteration": 2.809481620788574 + }, + { + "auxiliary_loss_clip": 0.0133675, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.2277534, + "balance_loss_mlp": 1.01425219, + "epoch": 0.61969036524876, + "flos": 27314044190640.0, + "grad_norm": 1.676228801353245, + "language_loss": 0.80523485, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82886386, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.11901855, + "step": 10307, + "time_per_iteration": 2.7753946781158447 + }, + { + "auxiliary_loss_clip": 0.01169346, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.12177348, + "balance_loss_mlp": 0.99899334, + "epoch": 0.6197504885014279, + "flos": 51663283791840.0, + "grad_norm": 0.8715536340841574, + "language_loss": 0.59499252, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61670554, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.02966309, + "step": 10308, + "time_per_iteration": 3.342238664627075 + }, + { + "auxiliary_loss_clip": 0.01335938, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.22740531, + "balance_loss_mlp": 1.01676583, + "epoch": 0.6198106117540959, + "flos": 30563516340360.0, + "grad_norm": 1.7137786180453742, + "language_loss": 0.68198931, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70563519, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.11889648, + "step": 10309, + "time_per_iteration": 2.9136717319488525 + }, + { + "auxiliary_loss_clip": 0.01333635, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.22454894, + "balance_loss_mlp": 1.01706624, + "epoch": 0.6198707350067638, + "flos": 18913521855960.0, + "grad_norm": 1.4280979731411256, + "language_loss": 0.7220794, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74571693, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13067627, + "step": 10310, + "time_per_iteration": 4.144188165664673 + }, + { + "auxiliary_loss_clip": 0.01350818, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.23846686, + "balance_loss_mlp": 1.01741266, + "epoch": 0.6199308582594318, + "flos": 21438839878920.0, + "grad_norm": 1.8916087705202225, + "language_loss": 0.79403174, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81785905, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14508057, + "step": 10311, + "time_per_iteration": 2.8607287406921387 + }, + { + "auxiliary_loss_clip": 0.01346349, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.23287809, + "balance_loss_mlp": 1.01762772, + "epoch": 0.6199909815120999, + "flos": 18412205738400.0, + "grad_norm": 1.7341613807039786, + "language_loss": 0.72607821, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74985594, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13806152, + "step": 10312, + "time_per_iteration": 4.202470779418945 + }, + { + "auxiliary_loss_clip": 0.01348803, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.23493958, + "balance_loss_mlp": 1.01902556, + "epoch": 0.6200511047647678, + "flos": 21471390802440.0, + "grad_norm": 1.8022009866927189, + "language_loss": 0.72508824, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.74890184, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13531494, + "step": 10313, + "time_per_iteration": 4.3386290073394775 + }, + { + "auxiliary_loss_clip": 0.0135233, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.23711514, + "balance_loss_mlp": 1.01643348, + "epoch": 0.6201112280174358, + "flos": 18219522759480.0, + "grad_norm": 1.9323858710682378, + "language_loss": 0.78700888, + "learning_rate": 1.332107887401416e-06, + "loss": 0.81083691, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14019775, + "step": 10314, + "time_per_iteration": 2.861985683441162 + }, + { + "auxiliary_loss_clip": 0.01346048, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.23263383, + "balance_loss_mlp": 1.01877546, + "epoch": 0.6201713512701037, + "flos": 20016016694280.0, + "grad_norm": 2.1258754765466032, + "language_loss": 0.78034955, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80413038, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13262939, + "step": 10315, + "time_per_iteration": 2.7497024536132812 + }, + { + "auxiliary_loss_clip": 0.01359859, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.24422479, + "balance_loss_mlp": 1.02182007, + "epoch": 0.6202314745227717, + "flos": 22492256191200.0, + "grad_norm": 2.39399339299998, + "language_loss": 0.76078629, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.7847352, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.13226318, + "step": 10316, + "time_per_iteration": 2.738940715789795 + }, + { + "auxiliary_loss_clip": 0.01346338, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.23216641, + "balance_loss_mlp": 1.02070212, + "epoch": 0.6202915977754396, + "flos": 26833705614000.0, + "grad_norm": 2.2310166466109065, + "language_loss": 0.77780855, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.80161953, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.140625, + "step": 10317, + "time_per_iteration": 2.7883057594299316 + }, + { + "auxiliary_loss_clip": 0.01169919, + "auxiliary_loss_mlp": 0.01011686, + "balance_loss_clip": 1.12322938, + "balance_loss_mlp": 1.0088253, + "epoch": 0.6203517210281076, + "flos": 62758741579920.0, + "grad_norm": 0.6886579890997031, + "language_loss": 0.590626, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61244214, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.02856445, + "step": 10318, + "time_per_iteration": 3.3862106800079346 + }, + { + "auxiliary_loss_clip": 0.01348041, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.23630285, + "balance_loss_mlp": 1.02136886, + "epoch": 0.6204118442807756, + "flos": 23409539988840.0, + "grad_norm": 1.6428184769946392, + "language_loss": 0.78240871, + "learning_rate": 1.330272686582143e-06, + "loss": 0.80623734, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13452148, + "step": 10319, + "time_per_iteration": 2.79237699508667 + }, + { + "auxiliary_loss_clip": 0.01340233, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.23073339, + "balance_loss_mlp": 1.01814473, + "epoch": 0.6204719675334436, + "flos": 20198506974840.0, + "grad_norm": 1.798559929779499, + "language_loss": 0.66414863, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68786114, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12884521, + "step": 10320, + "time_per_iteration": 4.274611949920654 + }, + { + "auxiliary_loss_clip": 0.01337305, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.22879887, + "balance_loss_mlp": 1.01504326, + "epoch": 0.6205320907861115, + "flos": 13191952262040.0, + "grad_norm": 1.6180543218402748, + "language_loss": 0.76276708, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78641546, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12487793, + "step": 10321, + "time_per_iteration": 2.709904909133911 + }, + { + "auxiliary_loss_clip": 0.013388, + "auxiliary_loss_mlp": 0.01025589, + "balance_loss_clip": 1.2291224, + "balance_loss_mlp": 1.01289916, + "epoch": 0.6205922140387795, + "flos": 20673363422880.0, + "grad_norm": 1.6414148238430057, + "language_loss": 0.73490119, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75854504, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12689209, + "step": 10322, + "time_per_iteration": 2.7603845596313477 + }, + { + "auxiliary_loss_clip": 0.01341473, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.23053479, + "balance_loss_mlp": 1.01408219, + "epoch": 0.6206523372914474, + "flos": 23883056361000.0, + "grad_norm": 1.742506866845638, + "language_loss": 0.73477501, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.75845325, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.1227417, + "step": 10323, + "time_per_iteration": 2.774914026260376 + }, + { + "auxiliary_loss_clip": 0.01355219, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.23669291, + "balance_loss_mlp": 1.01912761, + "epoch": 0.6207124605441154, + "flos": 13410485785080.0, + "grad_norm": 2.4145863642254146, + "language_loss": 0.58957303, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.61344957, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.13311768, + "step": 10324, + "time_per_iteration": 2.809353828430176 + }, + { + "auxiliary_loss_clip": 0.01352827, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.23891902, + "balance_loss_mlp": 1.01886463, + "epoch": 0.6207725837967835, + "flos": 18920993794200.0, + "grad_norm": 1.7324534450821585, + "language_loss": 0.76759183, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.79144895, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14025879, + "step": 10325, + "time_per_iteration": 2.808152675628662 + }, + { + "auxiliary_loss_clip": 0.01352034, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.23700619, + "balance_loss_mlp": 1.0157007, + "epoch": 0.6208327070494514, + "flos": 23984526317400.0, + "grad_norm": 2.997960295220004, + "language_loss": 0.72772312, + "learning_rate": 1.327704472462003e-06, + "loss": 0.75153852, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13812256, + "step": 10326, + "time_per_iteration": 2.780654191970825 + }, + { + "auxiliary_loss_clip": 0.01353875, + "auxiliary_loss_mlp": 0.01035869, + "balance_loss_clip": 1.23855186, + "balance_loss_mlp": 1.02137303, + "epoch": 0.6208928303021194, + "flos": 22825579212720.0, + "grad_norm": 2.635992267386886, + "language_loss": 0.7464273, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.77032471, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14489746, + "step": 10327, + "time_per_iteration": 2.831747531890869 + }, + { + "auxiliary_loss_clip": 0.01350357, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.23516703, + "balance_loss_mlp": 1.01732016, + "epoch": 0.6209529535547873, + "flos": 17568754585200.0, + "grad_norm": 2.0394309748484085, + "language_loss": 0.80235106, + "learning_rate": 1.326970926232066e-06, + "loss": 0.82617438, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14660645, + "step": 10328, + "time_per_iteration": 2.699054002761841 + }, + { + "auxiliary_loss_clip": 0.01350845, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.23708868, + "balance_loss_mlp": 1.02534795, + "epoch": 0.6210130768074553, + "flos": 22016222100720.0, + "grad_norm": 1.7120552972422245, + "language_loss": 0.78343022, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.80733216, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13989258, + "step": 10329, + "time_per_iteration": 2.8771886825561523 + }, + { + "auxiliary_loss_clip": 0.0117282, + "auxiliary_loss_mlp": 0.01002132, + "balance_loss_clip": 1.12613809, + "balance_loss_mlp": 0.99909192, + "epoch": 0.6210732000601232, + "flos": 63691155314640.0, + "grad_norm": 0.8328332864170922, + "language_loss": 0.62250388, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64425337, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.03039551, + "step": 10330, + "time_per_iteration": 3.2933385372161865 + }, + { + "auxiliary_loss_clip": 0.01358109, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.24155188, + "balance_loss_mlp": 1.01545036, + "epoch": 0.6211333233127913, + "flos": 24248727264240.0, + "grad_norm": 1.8943910926068046, + "language_loss": 0.77871877, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.80259889, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14459229, + "step": 10331, + "time_per_iteration": 2.766484498977661 + }, + { + "auxiliary_loss_clip": 0.01354473, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.23865783, + "balance_loss_mlp": 1.01885355, + "epoch": 0.6211934465654592, + "flos": 16947897791040.0, + "grad_norm": 1.9609558662499789, + "language_loss": 0.67856514, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.70243895, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14068604, + "step": 10332, + "time_per_iteration": 2.7404072284698486 + }, + { + "auxiliary_loss_clip": 0.01349049, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.23467278, + "balance_loss_mlp": 1.01921928, + "epoch": 0.6212535698181272, + "flos": 15271513049880.0, + "grad_norm": 1.52315238146167, + "language_loss": 0.76519048, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78900826, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13500977, + "step": 10333, + "time_per_iteration": 2.7047860622406006 + }, + { + "auxiliary_loss_clip": 0.01342313, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.23259163, + "balance_loss_mlp": 1.01816225, + "epoch": 0.6213136930707951, + "flos": 13447950320160.0, + "grad_norm": 2.283272180151584, + "language_loss": 0.69659078, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.72032499, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12945557, + "step": 10334, + "time_per_iteration": 2.79390549659729 + }, + { + "auxiliary_loss_clip": 0.01341368, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.23092067, + "balance_loss_mlp": 1.01656675, + "epoch": 0.6213738163234631, + "flos": 18115453868040.0, + "grad_norm": 1.7125876770589112, + "language_loss": 0.70265603, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72636449, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12908936, + "step": 10335, + "time_per_iteration": 2.8085410594940186 + }, + { + "auxiliary_loss_clip": 0.01344286, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.2346468, + "balance_loss_mlp": 1.01803637, + "epoch": 0.621433939576131, + "flos": 25342775563680.0, + "grad_norm": 1.598194593642662, + "language_loss": 0.80623984, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82999229, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12927246, + "step": 10336, + "time_per_iteration": 2.944645404815674 + }, + { + "auxiliary_loss_clip": 0.01343625, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.23407602, + "balance_loss_mlp": 1.01556504, + "epoch": 0.621494062828799, + "flos": 22570840013760.0, + "grad_norm": 1.750985541608721, + "language_loss": 0.73467976, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75840443, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13269043, + "step": 10337, + "time_per_iteration": 2.7912495136260986 + }, + { + "auxiliary_loss_clip": 0.01354571, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.23865652, + "balance_loss_mlp": 1.01716959, + "epoch": 0.621554186081467, + "flos": 27423798252480.0, + "grad_norm": 1.939804065310674, + "language_loss": 0.63076651, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65462887, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.1449585, + "step": 10338, + "time_per_iteration": 2.919660806655884 + }, + { + "auxiliary_loss_clip": 0.01351114, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.23910153, + "balance_loss_mlp": 1.01534009, + "epoch": 0.621614309334135, + "flos": 22352022232200.0, + "grad_norm": 1.526593678215274, + "language_loss": 0.71685064, + "learning_rate": 1.322938249724991e-06, + "loss": 0.74064887, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13342285, + "step": 10339, + "time_per_iteration": 2.828395366668701 + }, + { + "auxiliary_loss_clip": 0.0133904, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.23076141, + "balance_loss_mlp": 1.01777256, + "epoch": 0.621674432586803, + "flos": 19285933746960.0, + "grad_norm": 1.8872555673872486, + "language_loss": 0.69542086, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71912378, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13476562, + "step": 10340, + "time_per_iteration": 2.77634596824646 + }, + { + "auxiliary_loss_clip": 0.01337455, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.22775221, + "balance_loss_mlp": 1.01526666, + "epoch": 0.6217345558394709, + "flos": 21613655179440.0, + "grad_norm": 2.0485295918704045, + "language_loss": 0.69491494, + "learning_rate": 1.322205369037788e-06, + "loss": 0.7185722, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13018799, + "step": 10341, + "time_per_iteration": 2.8467586040496826 + }, + { + "auxiliary_loss_clip": 0.01353967, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.23991084, + "balance_loss_mlp": 1.01404476, + "epoch": 0.6217946790921389, + "flos": 18008623608120.0, + "grad_norm": 1.821309774365017, + "language_loss": 0.81069148, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83451927, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14764404, + "step": 10342, + "time_per_iteration": 2.827212333679199 + }, + { + "auxiliary_loss_clip": 0.01173318, + "auxiliary_loss_mlp": 0.01020863, + "balance_loss_clip": 1.12653375, + "balance_loss_mlp": 1.01750135, + "epoch": 0.6218548023448068, + "flos": 61989204287880.0, + "grad_norm": 0.7814080968347521, + "language_loss": 0.5737797, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59572148, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.03369141, + "step": 10343, + "time_per_iteration": 3.2689921855926514 + }, + { + "auxiliary_loss_clip": 0.0134379, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.23485959, + "balance_loss_mlp": 1.0164057, + "epoch": 0.6219149255974749, + "flos": 25744692751200.0, + "grad_norm": 1.831956095664534, + "language_loss": 0.7294668, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75319242, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12365723, + "step": 10344, + "time_per_iteration": 2.9146199226379395 + }, + { + "auxiliary_loss_clip": 0.01347374, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.2356261, + "balance_loss_mlp": 1.02108347, + "epoch": 0.6219750488501428, + "flos": 25416608208120.0, + "grad_norm": 1.753618883453442, + "language_loss": 0.60447872, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62829554, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13214111, + "step": 10345, + "time_per_iteration": 2.8720545768737793 + }, + { + "auxiliary_loss_clip": 0.01349577, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.23659742, + "balance_loss_mlp": 1.02045178, + "epoch": 0.6220351721028108, + "flos": 20052181761840.0, + "grad_norm": 2.369135597527914, + "language_loss": 0.78249264, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80633366, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14074707, + "step": 10346, + "time_per_iteration": 2.7715795040130615 + }, + { + "auxiliary_loss_clip": 0.01355539, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.23885775, + "balance_loss_mlp": 1.02517033, + "epoch": 0.6220952953554787, + "flos": 27493894927800.0, + "grad_norm": 1.8358271607277996, + "language_loss": 0.71641397, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.74036855, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14770508, + "step": 10347, + "time_per_iteration": 2.789685010910034 + }, + { + "auxiliary_loss_clip": 0.01345677, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.23393154, + "balance_loss_mlp": 1.01511478, + "epoch": 0.6221554186081467, + "flos": 19212222927600.0, + "grad_norm": 1.6846980651867294, + "language_loss": 0.72140783, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74514735, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.1315918, + "step": 10348, + "time_per_iteration": 4.137695789337158 + }, + { + "auxiliary_loss_clip": 0.01175387, + "auxiliary_loss_mlp": 0.01013521, + "balance_loss_clip": 1.12869549, + "balance_loss_mlp": 1.01006401, + "epoch": 0.6222155418608146, + "flos": 62965620503640.0, + "grad_norm": 0.8114408079674911, + "language_loss": 0.54176044, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56364954, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.03466797, + "step": 10349, + "time_per_iteration": 3.2440173625946045 + }, + { + "auxiliary_loss_clip": 0.01361295, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.24850082, + "balance_loss_mlp": 1.01853216, + "epoch": 0.6222756651134826, + "flos": 22606192914120.0, + "grad_norm": 1.927879132075995, + "language_loss": 0.70050895, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.7244488, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.14172363, + "step": 10350, + "time_per_iteration": 2.8226265907287598 + }, + { + "auxiliary_loss_clip": 0.01354219, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.24115932, + "balance_loss_mlp": 1.01753306, + "epoch": 0.6223357883661506, + "flos": 21147204661920.0, + "grad_norm": 2.037360366896374, + "language_loss": 0.56781387, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59167588, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14453125, + "step": 10351, + "time_per_iteration": 4.29375147819519 + }, + { + "auxiliary_loss_clip": 0.01167593, + "auxiliary_loss_mlp": 0.01005996, + "balance_loss_clip": 1.12104189, + "balance_loss_mlp": 1.0028491, + "epoch": 0.6223959116188186, + "flos": 63781353128160.0, + "grad_norm": 0.8070304298590718, + "language_loss": 0.61275381, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63448972, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.03149414, + "step": 10352, + "time_per_iteration": 3.1772475242614746 + }, + { + "auxiliary_loss_clip": 0.01346026, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.23663831, + "balance_loss_mlp": 1.02091575, + "epoch": 0.6224560348714866, + "flos": 22570961838840.0, + "grad_norm": 1.9464146081818878, + "language_loss": 0.81835598, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84216225, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13684082, + "step": 10353, + "time_per_iteration": 4.325258731842041 + }, + { + "auxiliary_loss_clip": 0.01340386, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.23244226, + "balance_loss_mlp": 1.02256155, + "epoch": 0.6225161581241545, + "flos": 24103133001720.0, + "grad_norm": 1.4282258378470343, + "language_loss": 0.75791276, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.7816751, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13293457, + "step": 10354, + "time_per_iteration": 2.833116292953491 + }, + { + "auxiliary_loss_clip": 0.01348726, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.23779559, + "balance_loss_mlp": 1.02068174, + "epoch": 0.6225762813768225, + "flos": 20447682828480.0, + "grad_norm": 1.5046752130832994, + "language_loss": 0.78842717, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.81225771, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13653564, + "step": 10355, + "time_per_iteration": 2.764091968536377 + }, + { + "auxiliary_loss_clip": 0.01353846, + "auxiliary_loss_mlp": 0.01038574, + "balance_loss_clip": 1.24045885, + "balance_loss_mlp": 1.02544308, + "epoch": 0.6226364046294904, + "flos": 27203802828480.0, + "grad_norm": 1.5443484275976476, + "language_loss": 0.78673047, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.8106547, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13122559, + "step": 10356, + "time_per_iteration": 2.814069986343384 + }, + { + "auxiliary_loss_clip": 0.01365588, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.24812341, + "balance_loss_mlp": 1.0195992, + "epoch": 0.6226965278821585, + "flos": 20450241155160.0, + "grad_norm": 1.7681175051665006, + "language_loss": 0.67833644, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.7023285, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.14031982, + "step": 10357, + "time_per_iteration": 2.7784016132354736 + }, + { + "auxiliary_loss_clip": 0.01362559, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.24544978, + "balance_loss_mlp": 1.01435256, + "epoch": 0.6227566511348264, + "flos": 22167704575440.0, + "grad_norm": 2.7330552219705013, + "language_loss": 0.77273566, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.79665667, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.15197754, + "step": 10358, + "time_per_iteration": 2.907505512237549 + }, + { + "auxiliary_loss_clip": 0.01356889, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.24220932, + "balance_loss_mlp": 1.0173614, + "epoch": 0.6228167743874944, + "flos": 18045438409440.0, + "grad_norm": 2.0990200689401126, + "language_loss": 0.82414007, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13287354, + "step": 10359, + "time_per_iteration": 4.295156240463257 + }, + { + "auxiliary_loss_clip": 0.01345282, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.2362566, + "balance_loss_mlp": 1.02744889, + "epoch": 0.6228768976401623, + "flos": 17746574904360.0, + "grad_norm": 2.088392611873594, + "language_loss": 0.73285472, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75671607, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13397217, + "step": 10360, + "time_per_iteration": 2.843803644180298 + }, + { + "auxiliary_loss_clip": 0.01352363, + "auxiliary_loss_mlp": 0.01040923, + "balance_loss_clip": 1.23906279, + "balance_loss_mlp": 1.02729177, + "epoch": 0.6229370208928303, + "flos": 17899478671680.0, + "grad_norm": 1.9814011994865017, + "language_loss": 0.77669299, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.8006258, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.1362915, + "step": 10361, + "time_per_iteration": 2.777686834335327 + }, + { + "auxiliary_loss_clip": 0.01347641, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.23669982, + "balance_loss_mlp": 1.01996112, + "epoch": 0.6229971441454982, + "flos": 17352251480160.0, + "grad_norm": 1.5384743640684249, + "language_loss": 0.67761672, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.70141733, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12451172, + "step": 10362, + "time_per_iteration": 2.8543496131896973 + }, + { + "auxiliary_loss_clip": 0.01353517, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.24153686, + "balance_loss_mlp": 1.01862955, + "epoch": 0.6230572673981662, + "flos": 29247320373840.0, + "grad_norm": 2.0075861444617167, + "language_loss": 0.68128747, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.70515227, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14355469, + "step": 10363, + "time_per_iteration": 2.8237316608428955 + }, + { + "auxiliary_loss_clip": 0.01362394, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.24561346, + "balance_loss_mlp": 1.01689279, + "epoch": 0.6231173906508342, + "flos": 16330289665680.0, + "grad_norm": 1.9639317278528539, + "language_loss": 0.86964881, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.89358473, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14306641, + "step": 10364, + "time_per_iteration": 2.718636989593506 + }, + { + "auxiliary_loss_clip": 0.01166113, + "auxiliary_loss_mlp": 0.01013357, + "balance_loss_clip": 1.11999011, + "balance_loss_mlp": 1.01059151, + "epoch": 0.6231775139035022, + "flos": 68715680185080.0, + "grad_norm": 0.8857211984646765, + "language_loss": 0.60867536, + "learning_rate": 1.313418851605015e-06, + "loss": 0.6304701, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02770996, + "step": 10365, + "time_per_iteration": 3.3728129863739014 + }, + { + "auxiliary_loss_clip": 0.01363717, + "auxiliary_loss_mlp": 0.01038696, + "balance_loss_clip": 1.24631929, + "balance_loss_mlp": 1.02409291, + "epoch": 0.6232376371561702, + "flos": 19824836224680.0, + "grad_norm": 3.6837991280206963, + "language_loss": 0.7572037, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.78122783, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.14611816, + "step": 10366, + "time_per_iteration": 2.7676076889038086 + }, + { + "auxiliary_loss_clip": 0.01359321, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.24409759, + "balance_loss_mlp": 1.02323437, + "epoch": 0.6232977604088381, + "flos": 23263499034360.0, + "grad_norm": 1.8903255841460749, + "language_loss": 0.76485276, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78882241, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14385986, + "step": 10367, + "time_per_iteration": 2.792757511138916 + }, + { + "auxiliary_loss_clip": 0.01343982, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.23491812, + "balance_loss_mlp": 1.0211308, + "epoch": 0.6233578836615061, + "flos": 21111486286320.0, + "grad_norm": 1.4226037239925968, + "language_loss": 0.78662443, + "learning_rate": 1.312321587418457e-06, + "loss": 0.81040823, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13262939, + "step": 10368, + "time_per_iteration": 2.747325897216797 + }, + { + "auxiliary_loss_clip": 0.01354388, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.23987639, + "balance_loss_mlp": 1.0185343, + "epoch": 0.623418006914174, + "flos": 23774764200120.0, + "grad_norm": 2.837796451807112, + "language_loss": 0.68340844, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70727539, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13781738, + "step": 10369, + "time_per_iteration": 2.7817249298095703 + }, + { + "auxiliary_loss_clip": 0.01356681, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.24453068, + "balance_loss_mlp": 1.02160668, + "epoch": 0.6234781301668421, + "flos": 17894443235040.0, + "grad_norm": 1.9899070979035545, + "language_loss": 0.88043904, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90436888, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14685059, + "step": 10370, + "time_per_iteration": 2.746295213699341 + }, + { + "auxiliary_loss_clip": 0.01351328, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.24047172, + "balance_loss_mlp": 1.01619697, + "epoch": 0.62353825341951, + "flos": 26180907021720.0, + "grad_norm": 1.445263258767965, + "language_loss": 0.66407996, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68788755, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13238525, + "step": 10371, + "time_per_iteration": 2.7796761989593506 + }, + { + "auxiliary_loss_clip": 0.01336677, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.23083544, + "balance_loss_mlp": 1.01546264, + "epoch": 0.623598376672178, + "flos": 31146258865680.0, + "grad_norm": 1.3173391874836307, + "language_loss": 0.77678537, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.80042505, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11810303, + "step": 10372, + "time_per_iteration": 2.9061477184295654 + }, + { + "auxiliary_loss_clip": 0.01352416, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.23826277, + "balance_loss_mlp": 1.01630521, + "epoch": 0.6236584999248459, + "flos": 23735188030320.0, + "grad_norm": 1.5528084650119665, + "language_loss": 0.77749872, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.80132437, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13848877, + "step": 10373, + "time_per_iteration": 2.9043424129486084 + }, + { + "auxiliary_loss_clip": 0.0133953, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.23057568, + "balance_loss_mlp": 1.01351845, + "epoch": 0.6237186231775139, + "flos": 21767858414280.0, + "grad_norm": 1.5803510622127324, + "language_loss": 0.69608563, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71974695, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1307373, + "step": 10374, + "time_per_iteration": 2.843475341796875 + }, + { + "auxiliary_loss_clip": 0.01351882, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.23864603, + "balance_loss_mlp": 1.01657176, + "epoch": 0.6237787464301818, + "flos": 14943631548600.0, + "grad_norm": 1.7441036352733523, + "language_loss": 0.76966083, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79347706, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.1315918, + "step": 10375, + "time_per_iteration": 2.8257486820220947 + }, + { + "auxiliary_loss_clip": 0.01345503, + "auxiliary_loss_mlp": 0.01028338, + "balance_loss_clip": 1.23635912, + "balance_loss_mlp": 1.01543951, + "epoch": 0.6238388696828499, + "flos": 35597137483440.0, + "grad_norm": 1.51457110281383, + "language_loss": 0.70044976, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72418821, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12902832, + "step": 10376, + "time_per_iteration": 2.9002513885498047 + }, + { + "auxiliary_loss_clip": 0.01359582, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.24483228, + "balance_loss_mlp": 1.01706982, + "epoch": 0.6238989929355178, + "flos": 23628845070720.0, + "grad_norm": 1.6003636017289429, + "language_loss": 0.76771778, + "learning_rate": 1.309031204505301e-06, + "loss": 0.79162955, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14526367, + "step": 10377, + "time_per_iteration": 2.9865236282348633 + }, + { + "auxiliary_loss_clip": 0.01351258, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.23877454, + "balance_loss_mlp": 1.01432109, + "epoch": 0.6239591161881858, + "flos": 22092125771520.0, + "grad_norm": 1.5501332688797482, + "language_loss": 0.68646705, + "learning_rate": 1.308665737227052e-06, + "loss": 0.71024919, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12646484, + "step": 10378, + "time_per_iteration": 2.819409132003784 + }, + { + "auxiliary_loss_clip": 0.01347725, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.2364428, + "balance_loss_mlp": 1.01850295, + "epoch": 0.6240192394408538, + "flos": 24541418298600.0, + "grad_norm": 1.8933360701926827, + "language_loss": 0.76404941, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78784716, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13549805, + "step": 10379, + "time_per_iteration": 2.853355884552002 + }, + { + "auxiliary_loss_clip": 0.01353684, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.24084163, + "balance_loss_mlp": 1.01367581, + "epoch": 0.6240793626935217, + "flos": 27938230870320.0, + "grad_norm": 1.2767539673521544, + "language_loss": 0.79383564, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81764168, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13238525, + "step": 10380, + "time_per_iteration": 2.8703293800354004 + }, + { + "auxiliary_loss_clip": 0.01340712, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.23183143, + "balance_loss_mlp": 1.01928258, + "epoch": 0.6241394859461897, + "flos": 22897584480960.0, + "grad_norm": 1.5723984027457676, + "language_loss": 0.80335903, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82708216, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12310791, + "step": 10381, + "time_per_iteration": 2.9295570850372314 + }, + { + "auxiliary_loss_clip": 0.01348979, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.23635232, + "balance_loss_mlp": 1.016891, + "epoch": 0.6241996091988576, + "flos": 12754682174160.0, + "grad_norm": 1.9486710647375536, + "language_loss": 0.74806947, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.7718581, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.12988281, + "step": 10382, + "time_per_iteration": 2.7646985054016113 + }, + { + "auxiliary_loss_clip": 0.01346265, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.237059, + "balance_loss_mlp": 1.01467538, + "epoch": 0.6242597324515257, + "flos": 25857208181520.0, + "grad_norm": 1.489074232302383, + "language_loss": 0.78512478, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80886281, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12860107, + "step": 10383, + "time_per_iteration": 2.974246025085449 + }, + { + "auxiliary_loss_clip": 0.0134685, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.23510361, + "balance_loss_mlp": 1.0149554, + "epoch": 0.6243198557041936, + "flos": 19942468308360.0, + "grad_norm": 1.8359692414495332, + "language_loss": 0.75564075, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77938187, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12310791, + "step": 10384, + "time_per_iteration": 2.841308355331421 + }, + { + "auxiliary_loss_clip": 0.01352914, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.23958206, + "balance_loss_mlp": 1.01501369, + "epoch": 0.6243799789568616, + "flos": 18410865662520.0, + "grad_norm": 2.8957975194399164, + "language_loss": 0.66501325, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68882912, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13659668, + "step": 10385, + "time_per_iteration": 2.8442728519439697 + }, + { + "auxiliary_loss_clip": 0.01163402, + "auxiliary_loss_mlp": 0.01004285, + "balance_loss_clip": 1.11601305, + "balance_loss_mlp": 1.00136423, + "epoch": 0.6244401022095295, + "flos": 66044216363760.0, + "grad_norm": 0.762221467528392, + "language_loss": 0.61999297, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64166987, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.0291748, + "step": 10386, + "time_per_iteration": 3.3604214191436768 + }, + { + "auxiliary_loss_clip": 0.01347134, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.23532248, + "balance_loss_mlp": 1.01807618, + "epoch": 0.6245002254621975, + "flos": 24576608765520.0, + "grad_norm": 2.5065058742871202, + "language_loss": 0.72213256, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74592543, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14068604, + "step": 10387, + "time_per_iteration": 4.145484924316406 + }, + { + "auxiliary_loss_clip": 0.01360685, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.24282861, + "balance_loss_mlp": 1.02133298, + "epoch": 0.6245603487148654, + "flos": 29174827805280.0, + "grad_norm": 2.3749871630423454, + "language_loss": 0.6553055, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67927343, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14770508, + "step": 10388, + "time_per_iteration": 4.47648024559021 + }, + { + "auxiliary_loss_clip": 0.01350039, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.23782325, + "balance_loss_mlp": 1.02138424, + "epoch": 0.6246204719675335, + "flos": 14793773408280.0, + "grad_norm": 2.1086897428996223, + "language_loss": 0.79360163, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81744075, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.12481689, + "step": 10389, + "time_per_iteration": 2.6947999000549316 + }, + { + "auxiliary_loss_clip": 0.01350371, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.2388494, + "balance_loss_mlp": 1.0161525, + "epoch": 0.6246805952202014, + "flos": 12497181606720.0, + "grad_norm": 1.7381615830363553, + "language_loss": 0.60394222, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62774402, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13671875, + "step": 10390, + "time_per_iteration": 2.7234535217285156 + }, + { + "auxiliary_loss_clip": 0.01357014, + "auxiliary_loss_mlp": 0.01034075, + "balance_loss_clip": 1.24191403, + "balance_loss_mlp": 1.02033007, + "epoch": 0.6247407184728694, + "flos": 12790522374840.0, + "grad_norm": 2.32143362071782, + "language_loss": 0.77937591, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.80328685, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13739014, + "step": 10391, + "time_per_iteration": 2.6974024772644043 + }, + { + "auxiliary_loss_clip": 0.01355272, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.24194658, + "balance_loss_mlp": 1.02310705, + "epoch": 0.6248008417255374, + "flos": 40637174747400.0, + "grad_norm": 1.4093014723853399, + "language_loss": 0.64659476, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.67051435, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13568115, + "step": 10392, + "time_per_iteration": 4.38269305229187 + }, + { + "auxiliary_loss_clip": 0.01356967, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.24280357, + "balance_loss_mlp": 1.01987195, + "epoch": 0.6248609649782053, + "flos": 19906871757840.0, + "grad_norm": 1.6764372715593236, + "language_loss": 0.76584613, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78975689, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14251709, + "step": 10393, + "time_per_iteration": 2.785413980484009 + }, + { + "auxiliary_loss_clip": 0.01353938, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.23948479, + "balance_loss_mlp": 1.02201676, + "epoch": 0.6249210882308733, + "flos": 19687526067600.0, + "grad_norm": 1.8576395244549355, + "language_loss": 0.82699722, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.85091156, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.15484619, + "step": 10394, + "time_per_iteration": 2.7637782096862793 + }, + { + "auxiliary_loss_clip": 0.01354479, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.23896503, + "balance_loss_mlp": 1.02224874, + "epoch": 0.6249812114835412, + "flos": 13994730819720.0, + "grad_norm": 1.6493785702983577, + "language_loss": 0.755445, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77935719, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.14501953, + "step": 10395, + "time_per_iteration": 2.819423198699951 + }, + { + "auxiliary_loss_clip": 0.0136163, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.24552464, + "balance_loss_mlp": 1.02233469, + "epoch": 0.6250413347362093, + "flos": 14533064780400.0, + "grad_norm": 2.131955352255797, + "language_loss": 0.7240541, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74803138, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13775635, + "step": 10396, + "time_per_iteration": 2.75266170501709 + }, + { + "auxiliary_loss_clip": 0.01351559, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.23989081, + "balance_loss_mlp": 1.02466881, + "epoch": 0.6251014579888772, + "flos": 22967762373000.0, + "grad_norm": 2.1029958027652755, + "language_loss": 0.76142228, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.78531629, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13183594, + "step": 10397, + "time_per_iteration": 2.845381736755371 + }, + { + "auxiliary_loss_clip": 0.01349497, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.23646724, + "balance_loss_mlp": 1.02113855, + "epoch": 0.6251615812415452, + "flos": 28117147615200.0, + "grad_norm": 3.4622878827118493, + "language_loss": 0.75577956, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77962613, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14031982, + "step": 10398, + "time_per_iteration": 4.329890727996826 + }, + { + "auxiliary_loss_clip": 0.0135614, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.23977196, + "balance_loss_mlp": 1.02019882, + "epoch": 0.6252217044942131, + "flos": 26730164631240.0, + "grad_norm": 1.7710633625494372, + "language_loss": 0.74891776, + "learning_rate": 1.300997001489483e-06, + "loss": 0.77282804, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14697266, + "step": 10399, + "time_per_iteration": 2.924799680709839 + }, + { + "auxiliary_loss_clip": 0.0135153, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.2383852, + "balance_loss_mlp": 1.02378035, + "epoch": 0.6252818277468811, + "flos": 20011021866000.0, + "grad_norm": 1.5403076462191394, + "language_loss": 0.74420154, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76809388, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13928223, + "step": 10400, + "time_per_iteration": 2.780618667602539 + }, + { + "auxiliary_loss_clip": 0.01160894, + "auxiliary_loss_mlp": 0.01004036, + "balance_loss_clip": 1.11406052, + "balance_loss_mlp": 1.00110364, + "epoch": 0.625341950999549, + "flos": 59294106401040.0, + "grad_norm": 0.8582858607511084, + "language_loss": 0.56572139, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58737063, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.02929688, + "step": 10401, + "time_per_iteration": 3.3359615802764893 + }, + { + "auxiliary_loss_clip": 0.01353941, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.24004722, + "balance_loss_mlp": 1.01886094, + "epoch": 0.625402074252217, + "flos": 20161935823680.0, + "grad_norm": 2.6567535378179987, + "language_loss": 0.83178037, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85564625, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13800049, + "step": 10402, + "time_per_iteration": 2.868415117263794 + }, + { + "auxiliary_loss_clip": 0.0133763, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.22573948, + "balance_loss_mlp": 1.01828456, + "epoch": 0.625462197504885, + "flos": 29138540912640.0, + "grad_norm": 1.8017654883874228, + "language_loss": 0.69638515, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.72007191, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12774658, + "step": 10403, + "time_per_iteration": 2.831132650375366 + }, + { + "auxiliary_loss_clip": 0.01352399, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.23803747, + "balance_loss_mlp": 1.02052402, + "epoch": 0.625522320757553, + "flos": 26110079395920.0, + "grad_norm": 1.5235961286633897, + "language_loss": 0.72010922, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.74399227, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.15374756, + "step": 10404, + "time_per_iteration": 2.774254083633423 + }, + { + "auxiliary_loss_clip": 0.01352786, + "auxiliary_loss_mlp": 0.01038266, + "balance_loss_clip": 1.23980522, + "balance_loss_mlp": 1.02425933, + "epoch": 0.625582444010221, + "flos": 20635817671080.0, + "grad_norm": 1.7737485983235752, + "language_loss": 0.69768524, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.72159588, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14019775, + "step": 10405, + "time_per_iteration": 2.7880167961120605 + }, + { + "auxiliary_loss_clip": 0.01346712, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.2356441, + "balance_loss_mlp": 1.02118826, + "epoch": 0.6256425672628889, + "flos": 20526347867760.0, + "grad_norm": 1.5212316741505099, + "language_loss": 0.79080302, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81461984, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13775635, + "step": 10406, + "time_per_iteration": 2.7631640434265137 + }, + { + "auxiliary_loss_clip": 0.01347411, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.23418796, + "balance_loss_mlp": 1.02089119, + "epoch": 0.6257026905155569, + "flos": 29534651104680.0, + "grad_norm": 1.655006069816943, + "language_loss": 0.68747175, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71128589, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13110352, + "step": 10407, + "time_per_iteration": 2.80629301071167 + }, + { + "auxiliary_loss_clip": 0.01339276, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.23159432, + "balance_loss_mlp": 1.01985765, + "epoch": 0.6257628137682248, + "flos": 24030274957920.0, + "grad_norm": 6.101228342625464, + "language_loss": 0.85665166, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.88036549, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12261963, + "step": 10408, + "time_per_iteration": 2.8538475036621094 + }, + { + "auxiliary_loss_clip": 0.01341568, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.23105609, + "balance_loss_mlp": 1.02683258, + "epoch": 0.6258229370208929, + "flos": 20855975528520.0, + "grad_norm": 1.6208174335031609, + "language_loss": 0.79836857, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.8221792, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12670898, + "step": 10409, + "time_per_iteration": 2.8752784729003906 + }, + { + "auxiliary_loss_clip": 0.01343015, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.23318863, + "balance_loss_mlp": 1.01772261, + "epoch": 0.6258830602735608, + "flos": 22235973874560.0, + "grad_norm": 2.2131700909978353, + "language_loss": 0.69825423, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.72199333, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1317749, + "step": 10410, + "time_per_iteration": 2.806992530822754 + }, + { + "auxiliary_loss_clip": 0.01342971, + "auxiliary_loss_mlp": 0.01027127, + "balance_loss_clip": 1.23552871, + "balance_loss_mlp": 1.01464558, + "epoch": 0.6259431835262288, + "flos": 25081620243840.0, + "grad_norm": 1.5586265786062836, + "language_loss": 0.67949855, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.70319951, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12481689, + "step": 10411, + "time_per_iteration": 2.9920566082000732 + }, + { + "auxiliary_loss_clip": 0.01347848, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.23522568, + "balance_loss_mlp": 1.02154744, + "epoch": 0.6260033067788967, + "flos": 28257584616000.0, + "grad_norm": 1.6960477440905686, + "language_loss": 0.6984055, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.72223276, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13336182, + "step": 10412, + "time_per_iteration": 2.929664134979248 + }, + { + "auxiliary_loss_clip": 0.01341359, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.23248267, + "balance_loss_mlp": 1.01797664, + "epoch": 0.6260634300315647, + "flos": 23372643970800.0, + "grad_norm": 1.3740597292764425, + "language_loss": 0.69580317, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71952391, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12731934, + "step": 10413, + "time_per_iteration": 2.8693013191223145 + }, + { + "auxiliary_loss_clip": 0.01356146, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.23854899, + "balance_loss_mlp": 1.0185262, + "epoch": 0.6261235532842326, + "flos": 18039144113640.0, + "grad_norm": 2.4814788347644807, + "language_loss": 0.80796206, + "learning_rate": 1.295526482316796e-06, + "loss": 0.83185661, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.14782715, + "step": 10414, + "time_per_iteration": 2.900513172149658 + }, + { + "auxiliary_loss_clip": 0.01344691, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.23336256, + "balance_loss_mlp": 1.01870775, + "epoch": 0.6261836765369007, + "flos": 22015491150240.0, + "grad_norm": 1.625050453847863, + "language_loss": 0.7485404, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.77230394, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12945557, + "step": 10415, + "time_per_iteration": 2.889946222305298 + }, + { + "auxiliary_loss_clip": 0.01345867, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.23590994, + "balance_loss_mlp": 1.01418161, + "epoch": 0.6262437997895686, + "flos": 24941386284840.0, + "grad_norm": 2.19912138084765, + "language_loss": 0.74720085, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.77093762, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1362915, + "step": 10416, + "time_per_iteration": 3.0463449954986572 + }, + { + "auxiliary_loss_clip": 0.01338774, + "auxiliary_loss_mlp": 0.0102534, + "balance_loss_clip": 1.23112595, + "balance_loss_mlp": 1.01242995, + "epoch": 0.6263039230422366, + "flos": 31614090067440.0, + "grad_norm": 1.5903253758214586, + "language_loss": 0.84345376, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86709499, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12927246, + "step": 10417, + "time_per_iteration": 2.9701850414276123 + }, + { + "auxiliary_loss_clip": 0.01344866, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.23329306, + "balance_loss_mlp": 1.01520252, + "epoch": 0.6263640462949046, + "flos": 17643967913880.0, + "grad_norm": 3.044882886692875, + "language_loss": 0.5761131, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59985161, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13793945, + "step": 10418, + "time_per_iteration": 2.818850517272949 + }, + { + "auxiliary_loss_clip": 0.0135502, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.23901057, + "balance_loss_mlp": 1.0192368, + "epoch": 0.6264241695475725, + "flos": 19979811018360.0, + "grad_norm": 1.7938550576221284, + "language_loss": 0.84749454, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.8713786, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.14147949, + "step": 10419, + "time_per_iteration": 2.954530954360962 + }, + { + "auxiliary_loss_clip": 0.01347687, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.23543298, + "balance_loss_mlp": 1.02254891, + "epoch": 0.6264842928002405, + "flos": 27350046824760.0, + "grad_norm": 1.484156984167766, + "language_loss": 0.64878452, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.67261773, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13104248, + "step": 10420, + "time_per_iteration": 3.042818069458008 + }, + { + "auxiliary_loss_clip": 0.01348915, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.23525894, + "balance_loss_mlp": 1.01909709, + "epoch": 0.6265444160529084, + "flos": 23001734589120.0, + "grad_norm": 1.838063613365811, + "language_loss": 0.86380458, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88763082, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 1.13623047, + "router_z_loss_mlp": 0.1461792, + "step": 10421, + "time_per_iteration": 2.8639237880706787 + }, + { + "auxiliary_loss_clip": 0.01343393, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.23235393, + "balance_loss_mlp": 1.01969147, + "epoch": 0.6266045393055765, + "flos": 19943158650480.0, + "grad_norm": 2.3182350661149784, + "language_loss": 0.79321337, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81697375, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.1295166, + "step": 10422, + "time_per_iteration": 2.8539607524871826 + }, + { + "auxiliary_loss_clip": 0.01341133, + "auxiliary_loss_mlp": 0.01027136, + "balance_loss_clip": 1.22956955, + "balance_loss_mlp": 1.01316476, + "epoch": 0.6266646625582444, + "flos": 24394443351840.0, + "grad_norm": 1.6679774805912866, + "language_loss": 0.75194281, + "learning_rate": 1.292247052906389e-06, + "loss": 0.77562547, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 1.11572266, + "router_z_loss_mlp": 0.13970947, + "step": 10423, + "time_per_iteration": 3.041656017303467 + }, + { + "auxiliary_loss_clip": 0.01343114, + "auxiliary_loss_mlp": 0.01025497, + "balance_loss_clip": 1.23261321, + "balance_loss_mlp": 1.0123421, + "epoch": 0.6267247858109124, + "flos": 14687877140640.0, + "grad_norm": 1.821095483020176, + "language_loss": 0.78119385, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.8048799, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.1315918, + "step": 10424, + "time_per_iteration": 2.8724782466888428 + }, + { + "auxiliary_loss_clip": 0.01341599, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.23168755, + "balance_loss_mlp": 1.01371121, + "epoch": 0.6267849090635803, + "flos": 24934564080360.0, + "grad_norm": 1.803825080410893, + "language_loss": 0.69610107, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71980047, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.14624023, + "step": 10425, + "time_per_iteration": 4.28348445892334 + }, + { + "auxiliary_loss_clip": 0.01330626, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.22525835, + "balance_loss_mlp": 1.01560044, + "epoch": 0.6268450323162483, + "flos": 25343344080720.0, + "grad_norm": 1.4497379921988258, + "language_loss": 0.74404252, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76762664, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.1217041, + "step": 10426, + "time_per_iteration": 2.8088693618774414 + }, + { + "auxiliary_loss_clip": 0.01344996, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.23227441, + "balance_loss_mlp": 1.01743245, + "epoch": 0.6269051555689162, + "flos": 26182937439720.0, + "grad_norm": 1.6612948142598694, + "language_loss": 0.80637598, + "learning_rate": 1.290790225914929e-06, + "loss": 0.83013231, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13214111, + "step": 10427, + "time_per_iteration": 4.251619815826416 + }, + { + "auxiliary_loss_clip": 0.0135127, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.23913634, + "balance_loss_mlp": 1.02185869, + "epoch": 0.6269652788215843, + "flos": 18261088738920.0, + "grad_norm": 1.7877837235434313, + "language_loss": 0.69379991, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.71766394, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13262939, + "step": 10428, + "time_per_iteration": 2.7633447647094727 + }, + { + "auxiliary_loss_clip": 0.01340273, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.2297585, + "balance_loss_mlp": 1.02071762, + "epoch": 0.6270254020742522, + "flos": 11769169685760.0, + "grad_norm": 4.319366153048797, + "language_loss": 0.72041142, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.74414742, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.1260376, + "step": 10429, + "time_per_iteration": 2.721693754196167 + }, + { + "auxiliary_loss_clip": 0.01354243, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.24087381, + "balance_loss_mlp": 1.02542603, + "epoch": 0.6270855253269202, + "flos": 23480692481520.0, + "grad_norm": 1.5507379540533281, + "language_loss": 0.80259728, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82653385, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14001465, + "step": 10430, + "time_per_iteration": 4.269777536392212 + }, + { + "auxiliary_loss_clip": 0.01165691, + "auxiliary_loss_mlp": 0.01017761, + "balance_loss_clip": 1.11895096, + "balance_loss_mlp": 1.01468563, + "epoch": 0.6271456485795882, + "flos": 70079980077000.0, + "grad_norm": 0.7674844495827684, + "language_loss": 0.59128749, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61312199, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.03063965, + "step": 10431, + "time_per_iteration": 3.4158260822296143 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01010654, + "balance_loss_clip": 1.11902952, + "balance_loss_mlp": 1.00757849, + "epoch": 0.6272057718322561, + "flos": 65173858849080.0, + "grad_norm": 0.8713945815796948, + "language_loss": 0.63781106, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65957713, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.03063965, + "step": 10432, + "time_per_iteration": 3.281491756439209 + }, + { + "auxiliary_loss_clip": 0.01340752, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.23158574, + "balance_loss_mlp": 1.01480377, + "epoch": 0.6272658950849241, + "flos": 24394727610360.0, + "grad_norm": 1.7065797259506634, + "language_loss": 0.64874971, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67243022, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12506104, + "step": 10433, + "time_per_iteration": 2.8594424724578857 + }, + { + "auxiliary_loss_clip": 0.01352018, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.23762536, + "balance_loss_mlp": 1.01992476, + "epoch": 0.627326018337592, + "flos": 17970306297480.0, + "grad_norm": 2.6802010495045767, + "language_loss": 0.61894709, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64280957, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14306641, + "step": 10434, + "time_per_iteration": 2.826040029525757 + }, + { + "auxiliary_loss_clip": 0.01343573, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.23167205, + "balance_loss_mlp": 1.01768589, + "epoch": 0.6273861415902601, + "flos": 20234956300920.0, + "grad_norm": 1.563132074634123, + "language_loss": 0.84886241, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.87260884, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13391113, + "step": 10435, + "time_per_iteration": 2.7565953731536865 + }, + { + "auxiliary_loss_clip": 0.01164934, + "auxiliary_loss_mlp": 0.01011066, + "balance_loss_clip": 1.11896706, + "balance_loss_mlp": 1.00809765, + "epoch": 0.627446264842928, + "flos": 64967572069560.0, + "grad_norm": 1.2084302755441922, + "language_loss": 0.6157254, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63748538, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.02966309, + "step": 10436, + "time_per_iteration": 4.688935279846191 + }, + { + "auxiliary_loss_clip": 0.01346836, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.23594165, + "balance_loss_mlp": 1.02374268, + "epoch": 0.627506388095596, + "flos": 23589228292560.0, + "grad_norm": 1.4616043793461946, + "language_loss": 0.77900386, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.80284601, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13635254, + "step": 10437, + "time_per_iteration": 2.845559597015381 + }, + { + "auxiliary_loss_clip": 0.01163952, + "auxiliary_loss_mlp": 0.01016996, + "balance_loss_clip": 1.1171118, + "balance_loss_mlp": 1.01389635, + "epoch": 0.6275665113482639, + "flos": 67598786360160.0, + "grad_norm": 0.7176001616531754, + "language_loss": 0.54327369, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56508321, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.03088379, + "step": 10438, + "time_per_iteration": 3.1681418418884277 + }, + { + "auxiliary_loss_clip": 0.01339352, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.22862267, + "balance_loss_mlp": 1.02549005, + "epoch": 0.6276266346009319, + "flos": 27642778467480.0, + "grad_norm": 1.6380678702898057, + "language_loss": 0.84467554, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86845803, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13427734, + "step": 10439, + "time_per_iteration": 2.850208282470703 + }, + { + "auxiliary_loss_clip": 0.01350123, + "auxiliary_loss_mlp": 0.01041845, + "balance_loss_clip": 1.23670483, + "balance_loss_mlp": 1.02777863, + "epoch": 0.6276867578535998, + "flos": 22751137442880.0, + "grad_norm": 2.360277179009486, + "language_loss": 0.80829108, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.83221072, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.140625, + "step": 10440, + "time_per_iteration": 2.7153873443603516 + }, + { + "auxiliary_loss_clip": 0.01328848, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.22350717, + "balance_loss_mlp": 1.01749635, + "epoch": 0.6277468811062679, + "flos": 24649548026040.0, + "grad_norm": 1.3734859321495436, + "language_loss": 0.74525034, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76883084, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.11694336, + "step": 10441, + "time_per_iteration": 2.828342914581299 + }, + { + "auxiliary_loss_clip": 0.01338663, + "auxiliary_loss_mlp": 0.01026269, + "balance_loss_clip": 1.23031366, + "balance_loss_mlp": 1.01329327, + "epoch": 0.6278070043589358, + "flos": 19683262189800.0, + "grad_norm": 2.183726387486029, + "language_loss": 0.72543919, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74908853, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12988281, + "step": 10442, + "time_per_iteration": 2.739531993865967 + }, + { + "auxiliary_loss_clip": 0.01344483, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.2330296, + "balance_loss_mlp": 1.0173595, + "epoch": 0.6278671276116038, + "flos": 22126219812720.0, + "grad_norm": 1.5066471817795672, + "language_loss": 0.71801609, + "learning_rate": 1.284967229712762e-06, + "loss": 0.7417621, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12768555, + "step": 10443, + "time_per_iteration": 2.829185724258423 + }, + { + "auxiliary_loss_clip": 0.01344675, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.23357236, + "balance_loss_mlp": 1.01996613, + "epoch": 0.6279272508642717, + "flos": 23043666043800.0, + "grad_norm": 1.7883383464906923, + "language_loss": 0.73234189, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75612307, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13482666, + "step": 10444, + "time_per_iteration": 2.8015732765197754 + }, + { + "auxiliary_loss_clip": 0.01343055, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.23338711, + "balance_loss_mlp": 1.01496959, + "epoch": 0.6279873741169397, + "flos": 19828287935280.0, + "grad_norm": 1.8184949891634765, + "language_loss": 0.72783667, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.75155234, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13543701, + "step": 10445, + "time_per_iteration": 2.8727304935455322 + }, + { + "auxiliary_loss_clip": 0.01341356, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.23092186, + "balance_loss_mlp": 1.01643658, + "epoch": 0.6280474973696077, + "flos": 23920764546240.0, + "grad_norm": 1.58043080722217, + "language_loss": 0.69352949, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71723902, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13171387, + "step": 10446, + "time_per_iteration": 2.958148241043091 + }, + { + "auxiliary_loss_clip": 0.01352094, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.2364502, + "balance_loss_mlp": 1.01891446, + "epoch": 0.6281076206222757, + "flos": 17972539757280.0, + "grad_norm": 3.561841881742587, + "language_loss": 0.74097395, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.76483035, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14654541, + "step": 10447, + "time_per_iteration": 2.8159096240997314 + }, + { + "auxiliary_loss_clip": 0.01164292, + "auxiliary_loss_mlp": 0.01006035, + "balance_loss_clip": 1.11729562, + "balance_loss_mlp": 1.00264919, + "epoch": 0.6281677438749437, + "flos": 66793084000560.0, + "grad_norm": 0.7084083971834038, + "language_loss": 0.52383792, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54554117, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.03393555, + "step": 10448, + "time_per_iteration": 3.148083209991455 + }, + { + "auxiliary_loss_clip": 0.01349584, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.23688531, + "balance_loss_mlp": 1.02808619, + "epoch": 0.6282278671276116, + "flos": 11659943532600.0, + "grad_norm": 2.3565172219082258, + "language_loss": 0.91836727, + "learning_rate": 1.282785392633079e-06, + "loss": 0.94227886, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13494873, + "step": 10449, + "time_per_iteration": 2.7523388862609863 + }, + { + "auxiliary_loss_clip": 0.013375, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.22785163, + "balance_loss_mlp": 1.01680243, + "epoch": 0.6282879903802796, + "flos": 42749367675480.0, + "grad_norm": 1.4882101193027668, + "language_loss": 0.60513616, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62880105, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12207031, + "step": 10450, + "time_per_iteration": 2.9399285316467285 + }, + { + "auxiliary_loss_clip": 0.0133198, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.22479129, + "balance_loss_mlp": 1.01849067, + "epoch": 0.6283481136329475, + "flos": 20013783234480.0, + "grad_norm": 1.476748737687717, + "language_loss": 0.76904076, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79267371, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12823486, + "step": 10451, + "time_per_iteration": 2.7711992263793945 + }, + { + "auxiliary_loss_clip": 0.01347146, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.23409033, + "balance_loss_mlp": 1.0201714, + "epoch": 0.6284082368856155, + "flos": 21908985757200.0, + "grad_norm": 1.5156249800740793, + "language_loss": 0.7804451, + "learning_rate": 1.281694841064566e-06, + "loss": 0.80425978, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14160156, + "step": 10452, + "time_per_iteration": 2.7254600524902344 + }, + { + "auxiliary_loss_clip": 0.01344881, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.23405743, + "balance_loss_mlp": 1.02297163, + "epoch": 0.6284683601382834, + "flos": 25489750510440.0, + "grad_norm": 1.6256522781504756, + "language_loss": 0.7308163, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.75462866, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13391113, + "step": 10453, + "time_per_iteration": 2.8249106407165527 + }, + { + "auxiliary_loss_clip": 0.01342535, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.2304672, + "balance_loss_mlp": 1.01657939, + "epoch": 0.6285284833909515, + "flos": 16542244634400.0, + "grad_norm": 1.6726516517687764, + "language_loss": 0.80750656, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.83123124, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13330078, + "step": 10454, + "time_per_iteration": 2.7279884815216064 + }, + { + "auxiliary_loss_clip": 0.01337584, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.23027706, + "balance_loss_mlp": 1.01887345, + "epoch": 0.6285886066436194, + "flos": 22825904079600.0, + "grad_norm": 1.819177955465924, + "language_loss": 0.81900936, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84269881, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12475586, + "step": 10455, + "time_per_iteration": 2.8691799640655518 + }, + { + "auxiliary_loss_clip": 0.01337285, + "auxiliary_loss_mlp": 0.01025255, + "balance_loss_clip": 1.22735953, + "balance_loss_mlp": 1.01252389, + "epoch": 0.6286487298962874, + "flos": 24720903560520.0, + "grad_norm": 1.577282415934176, + "language_loss": 0.82168591, + "learning_rate": 1.280241153705706e-06, + "loss": 0.8453114, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12738037, + "step": 10456, + "time_per_iteration": 2.841829299926758 + }, + { + "auxiliary_loss_clip": 0.01350685, + "auxiliary_loss_mlp": 0.01033932, + "balance_loss_clip": 1.23702753, + "balance_loss_mlp": 1.0201869, + "epoch": 0.6287088531489553, + "flos": 20745734166360.0, + "grad_norm": 1.5193288933781424, + "language_loss": 0.72014356, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74398977, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 1.13720703, + "router_z_loss_mlp": 0.13757324, + "step": 10457, + "time_per_iteration": 3.0550336837768555 + }, + { + "auxiliary_loss_clip": 0.01353613, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.2368691, + "balance_loss_mlp": 1.02136135, + "epoch": 0.6287689764016233, + "flos": 23075161149960.0, + "grad_norm": 3.462055211990056, + "language_loss": 0.80658787, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.8304714, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.13372803, + "step": 10458, + "time_per_iteration": 2.85896897315979 + }, + { + "auxiliary_loss_clip": 0.01353122, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.23955715, + "balance_loss_mlp": 1.02230883, + "epoch": 0.6288290996542913, + "flos": 32240063514960.0, + "grad_norm": 1.5006229756687472, + "language_loss": 0.6126439, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63653308, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13500977, + "step": 10459, + "time_per_iteration": 2.970957040786743 + }, + { + "auxiliary_loss_clip": 0.01347975, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.23610651, + "balance_loss_mlp": 1.02442849, + "epoch": 0.6288892229069593, + "flos": 24646299357240.0, + "grad_norm": 1.6155120494614954, + "language_loss": 0.7924211, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81626886, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.12371826, + "step": 10460, + "time_per_iteration": 2.8929355144500732 + }, + { + "auxiliary_loss_clip": 0.01340698, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.23085749, + "balance_loss_mlp": 1.0156343, + "epoch": 0.6289493461596273, + "flos": 17862866912160.0, + "grad_norm": 1.749056354264317, + "language_loss": 0.74467498, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76837349, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13513184, + "step": 10461, + "time_per_iteration": 2.7514700889587402 + }, + { + "auxiliary_loss_clip": 0.01341826, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.23327279, + "balance_loss_mlp": 1.01877296, + "epoch": 0.6290094694122952, + "flos": 22350397897800.0, + "grad_norm": 1.7516705139533653, + "language_loss": 0.70649445, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.73022509, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12469482, + "step": 10462, + "time_per_iteration": 2.8376784324645996 + }, + { + "auxiliary_loss_clip": 0.0133028, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.22557533, + "balance_loss_mlp": 1.01949477, + "epoch": 0.6290695926649632, + "flos": 28408295531880.0, + "grad_norm": 1.7081219127050518, + "language_loss": 0.72213072, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74574059, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11206055, + "step": 10463, + "time_per_iteration": 2.8756442070007324 + }, + { + "auxiliary_loss_clip": 0.01340578, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.23406029, + "balance_loss_mlp": 1.02296376, + "epoch": 0.6291297159176311, + "flos": 21510276630120.0, + "grad_norm": 1.7640000501125044, + "language_loss": 0.72350669, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74727225, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13012695, + "step": 10464, + "time_per_iteration": 4.280630588531494 + }, + { + "auxiliary_loss_clip": 0.01343541, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.23574615, + "balance_loss_mlp": 1.01648211, + "epoch": 0.6291898391702991, + "flos": 12207130115760.0, + "grad_norm": 1.6330257167984938, + "language_loss": 0.69848627, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.72221148, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12506104, + "step": 10465, + "time_per_iteration": 2.7993016242980957 + }, + { + "auxiliary_loss_clip": 0.01168206, + "auxiliary_loss_mlp": 0.01001991, + "balance_loss_clip": 1.1222986, + "balance_loss_mlp": 0.99942797, + "epoch": 0.629249962422967, + "flos": 69314990921280.0, + "grad_norm": 0.6744649206089827, + "language_loss": 0.59787548, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61957741, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.02563477, + "step": 10466, + "time_per_iteration": 4.908999681472778 + }, + { + "auxiliary_loss_clip": 0.01342201, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.23251891, + "balance_loss_mlp": 1.01558673, + "epoch": 0.6293100856756351, + "flos": 40085277594480.0, + "grad_norm": 1.8823370984148189, + "language_loss": 0.65452838, + "learning_rate": 1.276245767820154e-06, + "loss": 0.67822993, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12365723, + "step": 10467, + "time_per_iteration": 3.0217478275299072 + }, + { + "auxiliary_loss_clip": 0.01167128, + "auxiliary_loss_mlp": 0.01003354, + "balance_loss_clip": 1.12085366, + "balance_loss_mlp": 1.00067163, + "epoch": 0.629370208928303, + "flos": 67515004667520.0, + "grad_norm": 0.8085454126661011, + "language_loss": 0.5703963, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.59210116, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.02685547, + "step": 10468, + "time_per_iteration": 3.1381709575653076 + }, + { + "auxiliary_loss_clip": 0.01166671, + "auxiliary_loss_mlp": 0.01003053, + "balance_loss_clip": 1.12061334, + "balance_loss_mlp": 1.0003947, + "epoch": 0.629430332180971, + "flos": 60675891514920.0, + "grad_norm": 0.75551175547428, + "language_loss": 0.58051199, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.60220921, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.02661133, + "step": 10469, + "time_per_iteration": 4.620561838150024 + }, + { + "auxiliary_loss_clip": 0.01167568, + "auxiliary_loss_mlp": 0.01000398, + "balance_loss_clip": 1.12160254, + "balance_loss_mlp": 0.99747747, + "epoch": 0.6294904554336389, + "flos": 66886895958120.0, + "grad_norm": 0.676771238768182, + "language_loss": 0.52176267, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54344237, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.0291748, + "step": 10470, + "time_per_iteration": 3.2627477645874023 + }, + { + "auxiliary_loss_clip": 0.01333444, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.22677231, + "balance_loss_mlp": 1.02128494, + "epoch": 0.6295505786863069, + "flos": 42530549893920.0, + "grad_norm": 1.7633360338803132, + "language_loss": 0.74925363, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.77292711, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12628174, + "step": 10471, + "time_per_iteration": 3.075209379196167 + }, + { + "auxiliary_loss_clip": 0.01346837, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.23631942, + "balance_loss_mlp": 1.01738024, + "epoch": 0.629610701938975, + "flos": 17388619589520.0, + "grad_norm": 1.8561533035172213, + "language_loss": 0.63108981, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65485114, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.11914062, + "step": 10472, + "time_per_iteration": 2.882136821746826 + }, + { + "auxiliary_loss_clip": 0.01353162, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.24021125, + "balance_loss_mlp": 1.02108479, + "epoch": 0.6296708251916429, + "flos": 24248239963920.0, + "grad_norm": 2.0857078868675423, + "language_loss": 0.69510883, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71898878, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13769531, + "step": 10473, + "time_per_iteration": 2.847123384475708 + }, + { + "auxiliary_loss_clip": 0.01340582, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.23164535, + "balance_loss_mlp": 1.02082384, + "epoch": 0.6297309484443109, + "flos": 19282888119960.0, + "grad_norm": 1.7363763019464757, + "language_loss": 0.7476294, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.77136445, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.12091064, + "step": 10474, + "time_per_iteration": 2.8938469886779785 + }, + { + "auxiliary_loss_clip": 0.0134703, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.23676836, + "balance_loss_mlp": 1.02110374, + "epoch": 0.6297910716969788, + "flos": 30668600440800.0, + "grad_norm": 1.5633023357903444, + "language_loss": 0.66426599, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68807763, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 1.10302734, + "router_z_loss_mlp": 0.13024902, + "step": 10475, + "time_per_iteration": 4.557210683822632 + }, + { + "auxiliary_loss_clip": 0.01333197, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.22715807, + "balance_loss_mlp": 1.01970196, + "epoch": 0.6298511949496468, + "flos": 14426356345560.0, + "grad_norm": 1.798362846792173, + "language_loss": 0.90383083, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92748177, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12200928, + "step": 10476, + "time_per_iteration": 2.847926139831543 + }, + { + "auxiliary_loss_clip": 0.01344745, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.23603415, + "balance_loss_mlp": 1.02629018, + "epoch": 0.6299113182023147, + "flos": 23519497092480.0, + "grad_norm": 1.6952978654083755, + "language_loss": 0.75787401, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.7817018, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.11743164, + "step": 10477, + "time_per_iteration": 2.8281683921813965 + }, + { + "auxiliary_loss_clip": 0.01349551, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.23844945, + "balance_loss_mlp": 1.02140212, + "epoch": 0.6299714414549827, + "flos": 22679660083320.0, + "grad_norm": 2.0993468993800146, + "language_loss": 0.70788676, + "learning_rate": 1.272253702758138e-06, + "loss": 0.7317338, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13757324, + "step": 10478, + "time_per_iteration": 2.7717342376708984 + }, + { + "auxiliary_loss_clip": 0.01353638, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.23879921, + "balance_loss_mlp": 1.02144134, + "epoch": 0.6300315647076506, + "flos": 14505833552040.0, + "grad_norm": 2.4182054162516615, + "language_loss": 0.67827958, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.70216608, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13586426, + "step": 10479, + "time_per_iteration": 2.856663465499878 + }, + { + "auxiliary_loss_clip": 0.01343123, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.23442483, + "balance_loss_mlp": 1.01896358, + "epoch": 0.6300916879603187, + "flos": 21876719092200.0, + "grad_norm": 2.0217367551606276, + "language_loss": 0.73744822, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.76120257, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13348389, + "step": 10480, + "time_per_iteration": 2.778400182723999 + }, + { + "auxiliary_loss_clip": 0.01354039, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.24190474, + "balance_loss_mlp": 1.01985276, + "epoch": 0.6301518112129866, + "flos": 21838929690240.0, + "grad_norm": 3.1560900262461176, + "language_loss": 0.7891323, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.8130095, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13842773, + "step": 10481, + "time_per_iteration": 2.7716329097747803 + }, + { + "auxiliary_loss_clip": 0.01169446, + "auxiliary_loss_mlp": 0.01012991, + "balance_loss_clip": 1.12369466, + "balance_loss_mlp": 1.01045179, + "epoch": 0.6302119344656546, + "flos": 44345317590720.0, + "grad_norm": 0.8854716181610129, + "language_loss": 0.6186921, + "learning_rate": 1.2708028696588e-06, + "loss": 0.64051646, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02539062, + "step": 10482, + "time_per_iteration": 3.0124995708465576 + }, + { + "auxiliary_loss_clip": 0.01359557, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.2428993, + "balance_loss_mlp": 1.02515137, + "epoch": 0.6302720577183225, + "flos": 11221577019000.0, + "grad_norm": 2.0340958080217946, + "language_loss": 0.8296361, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85362339, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.14013672, + "step": 10483, + "time_per_iteration": 2.723320484161377 + }, + { + "auxiliary_loss_clip": 0.01334256, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.22964215, + "balance_loss_mlp": 1.02273226, + "epoch": 0.6303321809709905, + "flos": 27970781793840.0, + "grad_norm": 1.8808350929549587, + "language_loss": 0.72849166, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75218785, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12628174, + "step": 10484, + "time_per_iteration": 2.8238489627838135 + }, + { + "auxiliary_loss_clip": 0.0134807, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.23685467, + "balance_loss_mlp": 1.01648557, + "epoch": 0.6303923042236586, + "flos": 28226698635240.0, + "grad_norm": 1.6209316874917636, + "language_loss": 0.74867088, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.77244556, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12896729, + "step": 10485, + "time_per_iteration": 2.872274160385132 + }, + { + "auxiliary_loss_clip": 0.01355768, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.24067831, + "balance_loss_mlp": 1.02333498, + "epoch": 0.6304524274763265, + "flos": 27636118696440.0, + "grad_norm": 1.942569906886168, + "language_loss": 0.81864429, + "learning_rate": 1.269352478979093e-06, + "loss": 0.84257489, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13970947, + "step": 10486, + "time_per_iteration": 2.867535352706909 + }, + { + "auxiliary_loss_clip": 0.01347113, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.23734546, + "balance_loss_mlp": 1.02559268, + "epoch": 0.6305125507289945, + "flos": 17315802154080.0, + "grad_norm": 1.6491506984299198, + "language_loss": 0.64145422, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.66530788, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12652588, + "step": 10487, + "time_per_iteration": 2.846315383911133 + }, + { + "auxiliary_loss_clip": 0.01341814, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.23280036, + "balance_loss_mlp": 1.03251457, + "epoch": 0.6305726739816624, + "flos": 25813327525560.0, + "grad_norm": 1.8919478349563579, + "language_loss": 0.67275411, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69662535, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12805176, + "step": 10488, + "time_per_iteration": 2.858379364013672 + }, + { + "auxiliary_loss_clip": 0.01350481, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.23812342, + "balance_loss_mlp": 1.01790166, + "epoch": 0.6306327972343304, + "flos": 21802317930720.0, + "grad_norm": 1.7329327972815953, + "language_loss": 0.67718863, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.70100027, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12774658, + "step": 10489, + "time_per_iteration": 3.009289264678955 + }, + { + "auxiliary_loss_clip": 0.01369278, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.25032568, + "balance_loss_mlp": 1.02765679, + "epoch": 0.6306929204869983, + "flos": 20782305317520.0, + "grad_norm": 1.935982542651545, + "language_loss": 0.69790685, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.72201431, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.13830566, + "step": 10490, + "time_per_iteration": 2.824455738067627 + }, + { + "auxiliary_loss_clip": 0.01347712, + "auxiliary_loss_mlp": 0.01039936, + "balance_loss_clip": 1.23554969, + "balance_loss_mlp": 1.02598298, + "epoch": 0.6307530437396663, + "flos": 23658797059200.0, + "grad_norm": 1.8692012540314273, + "language_loss": 0.78462613, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80850255, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13964844, + "step": 10491, + "time_per_iteration": 2.9664719104766846 + }, + { + "auxiliary_loss_clip": 0.01345799, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.23678493, + "balance_loss_mlp": 1.0220356, + "epoch": 0.6308131669923343, + "flos": 24724598921280.0, + "grad_norm": 4.466446438738341, + "language_loss": 0.55942202, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58322608, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12573242, + "step": 10492, + "time_per_iteration": 2.9007632732391357 + }, + { + "auxiliary_loss_clip": 0.01348608, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.23663318, + "balance_loss_mlp": 1.02237916, + "epoch": 0.6308732902450023, + "flos": 22570596363600.0, + "grad_norm": 3.859448418575084, + "language_loss": 0.6439712, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66782147, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14044189, + "step": 10493, + "time_per_iteration": 2.826029062271118 + }, + { + "auxiliary_loss_clip": 0.01347924, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.23749781, + "balance_loss_mlp": 1.0168339, + "epoch": 0.6309334134976702, + "flos": 24649548026040.0, + "grad_norm": 1.7436793149780547, + "language_loss": 0.82946336, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.85324395, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13317871, + "step": 10494, + "time_per_iteration": 2.8958241939544678 + }, + { + "auxiliary_loss_clip": 0.01351251, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.24017894, + "balance_loss_mlp": 1.02108002, + "epoch": 0.6309935367503382, + "flos": 41435526993840.0, + "grad_norm": 2.0576055471847314, + "language_loss": 0.79703021, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.82088667, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13317871, + "step": 10495, + "time_per_iteration": 2.9365291595458984 + }, + { + "auxiliary_loss_clip": 0.01352237, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.24090052, + "balance_loss_mlp": 1.019804, + "epoch": 0.6310536600030061, + "flos": 15122710726920.0, + "grad_norm": 1.7696084339197724, + "language_loss": 0.70906788, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.73293018, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14196777, + "step": 10496, + "time_per_iteration": 2.7719478607177734 + }, + { + "auxiliary_loss_clip": 0.01349779, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.23818457, + "balance_loss_mlp": 1.02034819, + "epoch": 0.6311137832556741, + "flos": 15235835282640.0, + "grad_norm": 2.220749456666236, + "language_loss": 0.80907607, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.83291221, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13470459, + "step": 10497, + "time_per_iteration": 2.8808414936065674 + }, + { + "auxiliary_loss_clip": 0.01343535, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.2347008, + "balance_loss_mlp": 1.02111173, + "epoch": 0.6311739065083422, + "flos": 22023856472400.0, + "grad_norm": 2.1228487157362705, + "language_loss": 0.74406755, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76783442, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12036133, + "step": 10498, + "time_per_iteration": 2.8010339736938477 + }, + { + "auxiliary_loss_clip": 0.01344712, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.23450363, + "balance_loss_mlp": 1.02194214, + "epoch": 0.6312340297610101, + "flos": 22715906367600.0, + "grad_norm": 1.9503972894138988, + "language_loss": 0.70225322, + "learning_rate": 1.264641775364217e-06, + "loss": 0.72605199, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13226318, + "step": 10499, + "time_per_iteration": 2.847012519836426 + }, + { + "auxiliary_loss_clip": 0.01341403, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.23485446, + "balance_loss_mlp": 1.02383995, + "epoch": 0.6312941530136781, + "flos": 24285501457200.0, + "grad_norm": 1.8602314231395292, + "language_loss": 0.69844538, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72222841, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13061523, + "step": 10500, + "time_per_iteration": 2.7862398624420166 + }, + { + "auxiliary_loss_clip": 0.01345729, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.23579335, + "balance_loss_mlp": 1.02006173, + "epoch": 0.631354276266346, + "flos": 21731003004600.0, + "grad_norm": 1.927354816644316, + "language_loss": 0.74458027, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76836109, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12304688, + "step": 10501, + "time_per_iteration": 2.82900071144104 + }, + { + "auxiliary_loss_clip": 0.01339658, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.23088789, + "balance_loss_mlp": 1.01852083, + "epoch": 0.631414399519014, + "flos": 24030518608080.0, + "grad_norm": 3.325096617986055, + "language_loss": 0.75789213, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.78161067, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13684082, + "step": 10502, + "time_per_iteration": 2.9339795112609863 + }, + { + "auxiliary_loss_clip": 0.01355168, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_clip": 1.24074292, + "balance_loss_mlp": 1.0348562, + "epoch": 0.6314745227716819, + "flos": 24321098007720.0, + "grad_norm": 1.8468109338875711, + "language_loss": 0.85778892, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.88182861, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13946533, + "step": 10503, + "time_per_iteration": 4.313825845718384 + }, + { + "auxiliary_loss_clip": 0.01347837, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.23709416, + "balance_loss_mlp": 1.02315259, + "epoch": 0.6315346460243499, + "flos": 23371466328360.0, + "grad_norm": 1.7889777821796582, + "language_loss": 0.86454296, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88838762, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13482666, + "step": 10504, + "time_per_iteration": 4.32183575630188 + }, + { + "auxiliary_loss_clip": 0.01360052, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.24514198, + "balance_loss_mlp": 1.02315772, + "epoch": 0.6315947692770179, + "flos": 20264380380720.0, + "grad_norm": 1.74776037728368, + "language_loss": 0.76927686, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.79324692, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13793945, + "step": 10505, + "time_per_iteration": 2.8315069675445557 + }, + { + "auxiliary_loss_clip": 0.01350961, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.23948526, + "balance_loss_mlp": 1.0207144, + "epoch": 0.6316548925296859, + "flos": 25271460637560.0, + "grad_norm": 1.86558080931604, + "language_loss": 0.82037294, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.8442297, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14013672, + "step": 10506, + "time_per_iteration": 2.8945696353912354 + }, + { + "auxiliary_loss_clip": 0.01349226, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.23867369, + "balance_loss_mlp": 1.02011752, + "epoch": 0.6317150157823538, + "flos": 22935861183240.0, + "grad_norm": 1.667734288137375, + "language_loss": 0.74340785, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76723206, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13085938, + "step": 10507, + "time_per_iteration": 4.269730567932129 + }, + { + "auxiliary_loss_clip": 0.01359118, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.246176, + "balance_loss_mlp": 1.02620077, + "epoch": 0.6317751390350218, + "flos": 22531994794440.0, + "grad_norm": 1.6460501396850096, + "language_loss": 0.68534547, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.70933449, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 1.12744141, + "router_z_loss_mlp": 0.13580322, + "step": 10508, + "time_per_iteration": 2.834163188934326 + }, + { + "auxiliary_loss_clip": 0.0135006, + "auxiliary_loss_mlp": 0.01037624, + "balance_loss_clip": 1.2398771, + "balance_loss_mlp": 1.02488065, + "epoch": 0.6318352622876897, + "flos": 23299988968800.0, + "grad_norm": 1.58023690573731, + "language_loss": 0.71122319, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73510003, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12738037, + "step": 10509, + "time_per_iteration": 2.727243185043335 + }, + { + "auxiliary_loss_clip": 0.01349166, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.23927665, + "balance_loss_mlp": 1.01801515, + "epoch": 0.6318953855403577, + "flos": 20708959973400.0, + "grad_norm": 1.5963989481786358, + "language_loss": 0.79573476, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.8195374, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13079834, + "step": 10510, + "time_per_iteration": 2.7778663635253906 + }, + { + "auxiliary_loss_clip": 0.01353751, + "auxiliary_loss_mlp": 0.01037025, + "balance_loss_clip": 1.23983586, + "balance_loss_mlp": 1.02375066, + "epoch": 0.6319555087930258, + "flos": 22825173129120.0, + "grad_norm": 1.4258139652250723, + "language_loss": 0.7053948, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72930259, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1328125, + "step": 10511, + "time_per_iteration": 2.7871875762939453 + }, + { + "auxiliary_loss_clip": 0.01347819, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.23967385, + "balance_loss_mlp": 1.0229429, + "epoch": 0.6320156320456937, + "flos": 19975262882040.0, + "grad_norm": 2.657838157265844, + "language_loss": 0.80036271, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.8241958, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12548828, + "step": 10512, + "time_per_iteration": 2.8355703353881836 + }, + { + "auxiliary_loss_clip": 0.01353089, + "auxiliary_loss_mlp": 0.01038528, + "balance_loss_clip": 1.24236917, + "balance_loss_mlp": 1.02511644, + "epoch": 0.6320757552983617, + "flos": 27018754221240.0, + "grad_norm": 1.8118571087677378, + "language_loss": 0.70780337, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.73171955, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13427734, + "step": 10513, + "time_per_iteration": 2.8817036151885986 + }, + { + "auxiliary_loss_clip": 0.01356676, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.24264526, + "balance_loss_mlp": 1.01925445, + "epoch": 0.6321358785510296, + "flos": 23701215814200.0, + "grad_norm": 1.4563073260585495, + "language_loss": 0.66689169, + "learning_rate": 1.259212205855459e-06, + "loss": 0.69078517, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13415527, + "step": 10514, + "time_per_iteration": 4.396746873855591 + }, + { + "auxiliary_loss_clip": 0.01344809, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.23599243, + "balance_loss_mlp": 1.02434492, + "epoch": 0.6321960018036976, + "flos": 26000893851120.0, + "grad_norm": 1.8053096192371283, + "language_loss": 0.73846138, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76227939, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12652588, + "step": 10515, + "time_per_iteration": 2.7713515758514404 + }, + { + "auxiliary_loss_clip": 0.01338172, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.23100138, + "balance_loss_mlp": 1.01487136, + "epoch": 0.6322561250563655, + "flos": 22826269554840.0, + "grad_norm": 1.7026330092514304, + "language_loss": 0.90103918, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.9246937, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12414551, + "step": 10516, + "time_per_iteration": 2.7598519325256348 + }, + { + "auxiliary_loss_clip": 0.01367477, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.25051212, + "balance_loss_mlp": 1.01768541, + "epoch": 0.6323162483090335, + "flos": 18992958454080.0, + "grad_norm": 1.9923691740042715, + "language_loss": 0.82009524, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84409487, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14801025, + "step": 10517, + "time_per_iteration": 2.7503836154937744 + }, + { + "auxiliary_loss_clip": 0.01350134, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.24016595, + "balance_loss_mlp": 1.02728486, + "epoch": 0.6323763715617015, + "flos": 19869975739800.0, + "grad_norm": 1.585481340030698, + "language_loss": 0.78012902, + "learning_rate": 1.257765386189541e-06, + "loss": 0.80403256, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12939453, + "step": 10518, + "time_per_iteration": 2.773691177368164 + }, + { + "auxiliary_loss_clip": 0.01344847, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.2362771, + "balance_loss_mlp": 1.02096903, + "epoch": 0.6324364948143695, + "flos": 22787708594040.0, + "grad_norm": 1.825965433164384, + "language_loss": 0.8513577, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.8751381, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12219238, + "step": 10519, + "time_per_iteration": 2.7526321411132812 + }, + { + "auxiliary_loss_clip": 0.01337989, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.23140121, + "balance_loss_mlp": 1.02496815, + "epoch": 0.6324966180670374, + "flos": 22240928094480.0, + "grad_norm": 1.5217049832350948, + "language_loss": 0.72148919, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.7452451, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12634277, + "step": 10520, + "time_per_iteration": 2.765183210372925 + }, + { + "auxiliary_loss_clip": 0.01349143, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.23845959, + "balance_loss_mlp": 1.01779056, + "epoch": 0.6325567413197054, + "flos": 21694188203280.0, + "grad_norm": 1.6790558141365581, + "language_loss": 0.71612799, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73992395, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12664795, + "step": 10521, + "time_per_iteration": 2.725442409515381 + }, + { + "auxiliary_loss_clip": 0.01355181, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.2426585, + "balance_loss_mlp": 1.01951802, + "epoch": 0.6326168645723733, + "flos": 19941818574600.0, + "grad_norm": 1.7394725961125332, + "language_loss": 0.72201735, + "learning_rate": 1.256319016853377e-06, + "loss": 0.74590659, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14215088, + "step": 10522, + "time_per_iteration": 2.7539467811584473 + }, + { + "auxiliary_loss_clip": 0.01348507, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.23872495, + "balance_loss_mlp": 1.01962447, + "epoch": 0.6326769878250413, + "flos": 20235240559440.0, + "grad_norm": 1.7068676868550618, + "language_loss": 0.81753445, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.84134287, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12738037, + "step": 10523, + "time_per_iteration": 2.709071636199951 + }, + { + "auxiliary_loss_clip": 0.0134745, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.23777044, + "balance_loss_mlp": 1.01573908, + "epoch": 0.6327371110777094, + "flos": 20780477941320.0, + "grad_norm": 1.8541914188996227, + "language_loss": 0.73552287, + "learning_rate": 1.255596001333195e-06, + "loss": 0.7592845, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12994385, + "step": 10524, + "time_per_iteration": 2.805837631225586 + }, + { + "auxiliary_loss_clip": 0.01364691, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.24858248, + "balance_loss_mlp": 1.02071667, + "epoch": 0.6327972343303773, + "flos": 30342708749160.0, + "grad_norm": 2.2293631172083352, + "language_loss": 0.84288597, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86688364, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.14355469, + "step": 10525, + "time_per_iteration": 2.876854419708252 + }, + { + "auxiliary_loss_clip": 0.01347404, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.2384665, + "balance_loss_mlp": 1.01208282, + "epoch": 0.6328573575830453, + "flos": 17096700114000.0, + "grad_norm": 1.5798576017504058, + "language_loss": 0.66689658, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.6906147, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12335205, + "step": 10526, + "time_per_iteration": 2.9147443771362305 + }, + { + "auxiliary_loss_clip": 0.01361592, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.24640274, + "balance_loss_mlp": 1.01959443, + "epoch": 0.6329174808357132, + "flos": 25052764681080.0, + "grad_norm": 1.7076747030393906, + "language_loss": 0.73290569, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75686073, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14324951, + "step": 10527, + "time_per_iteration": 2.9432644844055176 + }, + { + "auxiliary_loss_clip": 0.01350554, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.24251282, + "balance_loss_mlp": 1.01740873, + "epoch": 0.6329776040883812, + "flos": 16841229964560.0, + "grad_norm": 2.1017419941796853, + "language_loss": 0.71759325, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.7413969, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12414551, + "step": 10528, + "time_per_iteration": 2.728973388671875 + }, + { + "auxiliary_loss_clip": 0.01350104, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.23929727, + "balance_loss_mlp": 1.01700258, + "epoch": 0.6330377273410491, + "flos": 13520686538880.0, + "grad_norm": 2.484489385069776, + "language_loss": 0.66576684, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68957102, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 1.10791016, + "router_z_loss_mlp": 0.13323975, + "step": 10529, + "time_per_iteration": 2.718386650085449 + }, + { + "auxiliary_loss_clip": 0.01359903, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.24576509, + "balance_loss_mlp": 1.01668561, + "epoch": 0.6330978505937171, + "flos": 21543233637240.0, + "grad_norm": 1.857720090537137, + "language_loss": 0.7618137, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.78571868, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13928223, + "step": 10530, + "time_per_iteration": 2.7262587547302246 + }, + { + "auxiliary_loss_clip": 0.01362495, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.24897027, + "balance_loss_mlp": 1.01908004, + "epoch": 0.6331579738463851, + "flos": 25014934670760.0, + "grad_norm": 1.4792852676469506, + "language_loss": 0.73715448, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.76110351, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13330078, + "step": 10531, + "time_per_iteration": 2.7992231845855713 + }, + { + "auxiliary_loss_clip": 0.01339938, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.23265851, + "balance_loss_mlp": 1.01731873, + "epoch": 0.6332180970990531, + "flos": 14980162091400.0, + "grad_norm": 2.305550538181376, + "language_loss": 0.79684722, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.8205508, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13092041, + "step": 10532, + "time_per_iteration": 2.7319533824920654 + }, + { + "auxiliary_loss_clip": 0.01343979, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.23559856, + "balance_loss_mlp": 1.01950479, + "epoch": 0.633278220351721, + "flos": 22711601881440.0, + "grad_norm": 1.5878855294735583, + "language_loss": 0.7506054, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.7743597, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1194458, + "step": 10533, + "time_per_iteration": 2.797456741333008 + }, + { + "auxiliary_loss_clip": 0.01356439, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.24022889, + "balance_loss_mlp": 1.02160382, + "epoch": 0.633338343604389, + "flos": 12606570193320.0, + "grad_norm": 2.261968850154801, + "language_loss": 0.77294105, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.7968632, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.1416626, + "step": 10534, + "time_per_iteration": 2.804323196411133 + }, + { + "auxiliary_loss_clip": 0.01353058, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.24337482, + "balance_loss_mlp": 1.0172981, + "epoch": 0.6333984668570569, + "flos": 25965947034360.0, + "grad_norm": 1.5162980970106614, + "language_loss": 0.85921377, + "learning_rate": 1.251621437204777e-06, + "loss": 0.88304675, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.1295166, + "step": 10535, + "time_per_iteration": 2.7921173572540283 + }, + { + "auxiliary_loss_clip": 0.01352912, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.2418282, + "balance_loss_mlp": 1.01763427, + "epoch": 0.6334585901097249, + "flos": 23664482229600.0, + "grad_norm": 1.649418446712891, + "language_loss": 0.76240468, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78624463, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13439941, + "step": 10536, + "time_per_iteration": 2.755706548690796 + }, + { + "auxiliary_loss_clip": 0.01355741, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.24643326, + "balance_loss_mlp": 1.01575589, + "epoch": 0.633518713362393, + "flos": 28765519896240.0, + "grad_norm": 1.8404107017810616, + "language_loss": 0.60810548, + "learning_rate": 1.250899157568855e-06, + "loss": 0.63195211, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13165283, + "step": 10537, + "time_per_iteration": 2.7660951614379883 + }, + { + "auxiliary_loss_clip": 0.01164064, + "auxiliary_loss_mlp": 0.01015599, + "balance_loss_clip": 1.11897314, + "balance_loss_mlp": 1.01295304, + "epoch": 0.6335788366150609, + "flos": 70434646114680.0, + "grad_norm": 0.7767750208038263, + "language_loss": 0.52442431, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54622096, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.02648926, + "step": 10538, + "time_per_iteration": 3.4003114700317383 + }, + { + "auxiliary_loss_clip": 0.01363131, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.24814939, + "balance_loss_mlp": 1.01950157, + "epoch": 0.6336389598677289, + "flos": 23737299665040.0, + "grad_norm": 1.8450259039795551, + "language_loss": 0.84031421, + "learning_rate": 1.250176991556848e-06, + "loss": 0.86427736, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.13671875, + "step": 10539, + "time_per_iteration": 2.7783901691436768 + }, + { + "auxiliary_loss_clip": 0.01361259, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.24769115, + "balance_loss_mlp": 1.01608014, + "epoch": 0.6336990831203968, + "flos": 29282470232400.0, + "grad_norm": 1.5445396100229871, + "language_loss": 0.86914325, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.89306206, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14532471, + "step": 10540, + "time_per_iteration": 2.786590576171875 + }, + { + "auxiliary_loss_clip": 0.0134758, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.23916674, + "balance_loss_mlp": 1.02241004, + "epoch": 0.6337592063730648, + "flos": 29102944362120.0, + "grad_norm": 2.1019180768573955, + "language_loss": 0.73063725, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.75445962, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12249756, + "step": 10541, + "time_per_iteration": 2.796917676925659 + }, + { + "auxiliary_loss_clip": 0.01365247, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.25017285, + "balance_loss_mlp": 1.0182699, + "epoch": 0.6338193296257327, + "flos": 34712120350800.0, + "grad_norm": 2.145657003685419, + "language_loss": 0.84989107, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.87386447, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13824463, + "step": 10542, + "time_per_iteration": 5.708235740661621 + }, + { + "auxiliary_loss_clip": 0.01358046, + "auxiliary_loss_mlp": 0.0102591, + "balance_loss_clip": 1.24737191, + "balance_loss_mlp": 1.0115869, + "epoch": 0.6338794528784008, + "flos": 16691534257680.0, + "grad_norm": 3.7975299167393595, + "language_loss": 0.78233588, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80617547, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14324951, + "step": 10543, + "time_per_iteration": 2.7050843238830566 + }, + { + "auxiliary_loss_clip": 0.01347389, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.24051082, + "balance_loss_mlp": 1.01595354, + "epoch": 0.6339395761310687, + "flos": 22351778582040.0, + "grad_norm": 1.63882118879448, + "language_loss": 0.73277724, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75653833, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12762451, + "step": 10544, + "time_per_iteration": 2.8180038928985596 + }, + { + "auxiliary_loss_clip": 0.01369359, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.25319099, + "balance_loss_mlp": 1.01730502, + "epoch": 0.6339996993837367, + "flos": 18556459925040.0, + "grad_norm": 1.8846301552922635, + "language_loss": 0.68153477, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70553797, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.13665771, + "step": 10545, + "time_per_iteration": 4.315873384475708 + }, + { + "auxiliary_loss_clip": 0.01345092, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.23682356, + "balance_loss_mlp": 1.01973748, + "epoch": 0.6340598226364046, + "flos": 12973418739000.0, + "grad_norm": 2.3808911497086873, + "language_loss": 0.7093181, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73309928, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13293457, + "step": 10546, + "time_per_iteration": 2.7948899269104004 + }, + { + "auxiliary_loss_clip": 0.01345639, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.23956871, + "balance_loss_mlp": 1.01538014, + "epoch": 0.6341199458890726, + "flos": 26693674696800.0, + "grad_norm": 1.3557562599453539, + "language_loss": 0.78175843, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80549133, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12286377, + "step": 10547, + "time_per_iteration": 2.8736047744750977 + }, + { + "auxiliary_loss_clip": 0.013591, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.24497795, + "balance_loss_mlp": 1.0199194, + "epoch": 0.6341800691417405, + "flos": 18738950205600.0, + "grad_norm": 1.7494529311913707, + "language_loss": 0.63770914, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.66163146, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13226318, + "step": 10548, + "time_per_iteration": 2.7638351917266846 + }, + { + "auxiliary_loss_clip": 0.01354723, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.24403262, + "balance_loss_mlp": 1.01531887, + "epoch": 0.6342401923944085, + "flos": 26254983316320.0, + "grad_norm": 1.6017492803991131, + "language_loss": 0.61951804, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.64334857, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13031006, + "step": 10549, + "time_per_iteration": 2.9879491329193115 + }, + { + "auxiliary_loss_clip": 0.01358481, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.24798286, + "balance_loss_mlp": 1.01763272, + "epoch": 0.6343003156470765, + "flos": 24686322219000.0, + "grad_norm": 1.8093766146194576, + "language_loss": 0.7374146, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.76130784, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 1.10595703, + "router_z_loss_mlp": 0.13220215, + "step": 10550, + "time_per_iteration": 2.854890823364258 + }, + { + "auxiliary_loss_clip": 0.01163358, + "auxiliary_loss_mlp": 0.01001911, + "balance_loss_clip": 1.1184175, + "balance_loss_mlp": 0.99876386, + "epoch": 0.6343604388997445, + "flos": 69820286658120.0, + "grad_norm": 0.6950047065953889, + "language_loss": 0.57779402, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59944671, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.03149414, + "step": 10551, + "time_per_iteration": 3.323164224624634 + }, + { + "auxiliary_loss_clip": 0.01347089, + "auxiliary_loss_mlp": 0.01026208, + "balance_loss_clip": 1.23828268, + "balance_loss_mlp": 1.01386404, + "epoch": 0.6344205621524125, + "flos": 21987650796480.0, + "grad_norm": 1.7015628928234967, + "language_loss": 0.67343903, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69717193, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12353516, + "step": 10552, + "time_per_iteration": 2.8130829334259033 + }, + { + "auxiliary_loss_clip": 0.01363152, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.24924409, + "balance_loss_mlp": 1.01225162, + "epoch": 0.6344806854050804, + "flos": 20453733474120.0, + "grad_norm": 1.763769171336542, + "language_loss": 0.82289803, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84678793, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13604736, + "step": 10553, + "time_per_iteration": 2.702923536300659 + }, + { + "auxiliary_loss_clip": 0.01358857, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.24661756, + "balance_loss_mlp": 1.02015328, + "epoch": 0.6345408086577484, + "flos": 40517390420640.0, + "grad_norm": 1.709864764930552, + "language_loss": 0.55114996, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.57506478, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12481689, + "step": 10554, + "time_per_iteration": 4.559570550918579 + }, + { + "auxiliary_loss_clip": 0.01357046, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.24528241, + "balance_loss_mlp": 1.01677907, + "epoch": 0.6346009319104163, + "flos": 21366347310360.0, + "grad_norm": 1.7658461052652337, + "language_loss": 0.71470165, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.73856485, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.12493896, + "step": 10555, + "time_per_iteration": 2.7672200202941895 + }, + { + "auxiliary_loss_clip": 0.01162984, + "auxiliary_loss_mlp": 0.0100886, + "balance_loss_clip": 1.11853409, + "balance_loss_mlp": 1.00646365, + "epoch": 0.6346610551630844, + "flos": 71379404790840.0, + "grad_norm": 0.7983702297350145, + "language_loss": 0.55418694, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57590544, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02392578, + "step": 10556, + "time_per_iteration": 3.158971071243286 + }, + { + "auxiliary_loss_clip": 0.0136243, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.2483232, + "balance_loss_mlp": 1.01828635, + "epoch": 0.6347211784157523, + "flos": 25417095508440.0, + "grad_norm": 1.7266194352545379, + "language_loss": 0.68628013, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.71022457, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13726807, + "step": 10557, + "time_per_iteration": 2.7855379581451416 + }, + { + "auxiliary_loss_clip": 0.01354096, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.24356937, + "balance_loss_mlp": 1.01992702, + "epoch": 0.6347813016684203, + "flos": 15747059840040.0, + "grad_norm": 1.755775501171739, + "language_loss": 0.70418054, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.7280525, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13171387, + "step": 10558, + "time_per_iteration": 2.754756450653076 + }, + { + "auxiliary_loss_clip": 0.01353056, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.24144387, + "balance_loss_mlp": 1.01560521, + "epoch": 0.6348414249210882, + "flos": 21469035517560.0, + "grad_norm": 1.6084868878070193, + "language_loss": 0.78566545, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80948508, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13299561, + "step": 10559, + "time_per_iteration": 2.7337892055511475 + }, + { + "auxiliary_loss_clip": 0.01359156, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.2465663, + "balance_loss_mlp": 1.0191195, + "epoch": 0.6349015481737562, + "flos": 21658104352440.0, + "grad_norm": 1.7238337227101532, + "language_loss": 0.68477738, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70869577, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13562012, + "step": 10560, + "time_per_iteration": 2.8293449878692627 + }, + { + "auxiliary_loss_clip": 0.01355526, + "auxiliary_loss_mlp": 0.01035312, + "balance_loss_clip": 1.24521673, + "balance_loss_mlp": 1.0220139, + "epoch": 0.6349616714264241, + "flos": 22199402723400.0, + "grad_norm": 1.5979166265562317, + "language_loss": 0.77186978, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79577816, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 1.10400391, + "router_z_loss_mlp": 0.13287354, + "step": 10561, + "time_per_iteration": 2.8471288681030273 + }, + { + "auxiliary_loss_clip": 0.01361021, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.24909163, + "balance_loss_mlp": 1.01935697, + "epoch": 0.6350217946790921, + "flos": 25415796040920.0, + "grad_norm": 2.065809477992007, + "language_loss": 0.72494698, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74888039, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.12976074, + "step": 10562, + "time_per_iteration": 2.855145215988159 + }, + { + "auxiliary_loss_clip": 0.01366237, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.25247657, + "balance_loss_mlp": 1.01803851, + "epoch": 0.63508191793176, + "flos": 19723691135160.0, + "grad_norm": 1.857754077026356, + "language_loss": 0.80337977, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.82736552, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14318848, + "step": 10563, + "time_per_iteration": 2.7884066104888916 + }, + { + "auxiliary_loss_clip": 0.01360794, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.24860597, + "balance_loss_mlp": 1.02021813, + "epoch": 0.6351420411844281, + "flos": 18191519972280.0, + "grad_norm": 2.281233433365636, + "language_loss": 0.81282043, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83676404, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13354492, + "step": 10564, + "time_per_iteration": 2.7917978763580322 + }, + { + "auxiliary_loss_clip": 0.01359166, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.24829853, + "balance_loss_mlp": 1.01677477, + "epoch": 0.6352021644370961, + "flos": 33732699116400.0, + "grad_norm": 1.6002681936830245, + "language_loss": 0.72441697, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74830389, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12756348, + "step": 10565, + "time_per_iteration": 2.873447895050049 + }, + { + "auxiliary_loss_clip": 0.01362083, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.24892068, + "balance_loss_mlp": 1.01823175, + "epoch": 0.635262287689764, + "flos": 20379494746080.0, + "grad_norm": 2.287712843101686, + "language_loss": 0.69462341, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71856457, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13800049, + "step": 10566, + "time_per_iteration": 2.792848825454712 + }, + { + "auxiliary_loss_clip": 0.01348096, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.24096131, + "balance_loss_mlp": 1.020751, + "epoch": 0.635322410942432, + "flos": 27530993987640.0, + "grad_norm": 1.5234372862241685, + "language_loss": 0.6985805, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.72239453, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12548828, + "step": 10567, + "time_per_iteration": 2.846466541290283 + }, + { + "auxiliary_loss_clip": 0.01350748, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.24324822, + "balance_loss_mlp": 1.01826024, + "epoch": 0.6353825341950999, + "flos": 21548634549120.0, + "grad_norm": 1.828957180650039, + "language_loss": 0.847471, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.87128621, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12512207, + "step": 10568, + "time_per_iteration": 2.758676528930664 + }, + { + "auxiliary_loss_clip": 0.01363122, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.25025654, + "balance_loss_mlp": 1.02211154, + "epoch": 0.635442657447768, + "flos": 31766141059200.0, + "grad_norm": 1.6945679541279421, + "language_loss": 0.84478772, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86877692, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 1.12939453, + "router_z_loss_mlp": 0.13708496, + "step": 10569, + "time_per_iteration": 2.9205172061920166 + }, + { + "auxiliary_loss_clip": 0.01350071, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.24090433, + "balance_loss_mlp": 1.0175972, + "epoch": 0.6355027807004359, + "flos": 19833648238800.0, + "grad_norm": 1.5632639267004957, + "language_loss": 0.69508672, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.7188943, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13079834, + "step": 10570, + "time_per_iteration": 2.7398605346679688 + }, + { + "auxiliary_loss_clip": 0.01361947, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.24810338, + "balance_loss_mlp": 1.0172739, + "epoch": 0.6355629039531039, + "flos": 30378467733120.0, + "grad_norm": 4.520264981754232, + "language_loss": 0.66027915, + "learning_rate": 1.2386378775476e-06, + "loss": 0.6842097, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13824463, + "step": 10571, + "time_per_iteration": 2.8926877975463867 + }, + { + "auxiliary_loss_clip": 0.01362992, + "auxiliary_loss_mlp": 0.01033932, + "balance_loss_clip": 1.24897337, + "balance_loss_mlp": 1.01996624, + "epoch": 0.6356230272057718, + "flos": 17936699556600.0, + "grad_norm": 1.6523857632397636, + "language_loss": 0.71601349, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73998272, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13964844, + "step": 10572, + "time_per_iteration": 2.733870506286621 + }, + { + "auxiliary_loss_clip": 0.0135255, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.24331951, + "balance_loss_mlp": 1.01870203, + "epoch": 0.6356831504584398, + "flos": 25381539566280.0, + "grad_norm": 1.3773159835479738, + "language_loss": 0.81218791, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83602232, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12188721, + "step": 10573, + "time_per_iteration": 2.953416585922241 + }, + { + "auxiliary_loss_clip": 0.01365359, + "auxiliary_loss_mlp": 0.01034128, + "balance_loss_clip": 1.25167263, + "balance_loss_mlp": 1.02073526, + "epoch": 0.6357432737111077, + "flos": 46508683698360.0, + "grad_norm": 1.5341517196956114, + "language_loss": 0.6882292, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71222401, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.1338501, + "step": 10574, + "time_per_iteration": 3.026028871536255 + }, + { + "auxiliary_loss_clip": 0.01353635, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.24414849, + "balance_loss_mlp": 1.0206182, + "epoch": 0.6358033969637757, + "flos": 17278784310960.0, + "grad_norm": 2.012795752171049, + "language_loss": 0.86729205, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.89116919, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13446045, + "step": 10575, + "time_per_iteration": 2.7394533157348633 + }, + { + "auxiliary_loss_clip": 0.01356839, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.24764848, + "balance_loss_mlp": 1.02278614, + "epoch": 0.6358635202164437, + "flos": 27131229043200.0, + "grad_norm": 1.5802723328941777, + "language_loss": 0.7253989, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74931967, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12438965, + "step": 10576, + "time_per_iteration": 2.7934048175811768 + }, + { + "auxiliary_loss_clip": 0.01362461, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.2500248, + "balance_loss_mlp": 1.01678801, + "epoch": 0.6359236434691117, + "flos": 27530750337480.0, + "grad_norm": 1.561114752517435, + "language_loss": 0.68888867, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71281278, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.1315918, + "step": 10577, + "time_per_iteration": 2.762664318084717 + }, + { + "auxiliary_loss_clip": 0.01354429, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.24466252, + "balance_loss_mlp": 1.01999331, + "epoch": 0.6359837667217797, + "flos": 39355600730760.0, + "grad_norm": 1.9351329872622738, + "language_loss": 0.72171891, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.745579, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.11584473, + "step": 10578, + "time_per_iteration": 2.946817398071289 + }, + { + "auxiliary_loss_clip": 0.01159989, + "auxiliary_loss_mlp": 0.01011533, + "balance_loss_clip": 1.11508036, + "balance_loss_mlp": 1.00891066, + "epoch": 0.6360438899744476, + "flos": 56426815775880.0, + "grad_norm": 0.7061647109073157, + "language_loss": 0.54531348, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56702876, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02624512, + "step": 10579, + "time_per_iteration": 3.378546953201294 + }, + { + "auxiliary_loss_clip": 0.01353248, + "auxiliary_loss_mlp": 0.01025705, + "balance_loss_clip": 1.24262667, + "balance_loss_mlp": 1.01295602, + "epoch": 0.6361040132271156, + "flos": 24978566561400.0, + "grad_norm": 1.6919007118938307, + "language_loss": 0.77339321, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.7971828, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12750244, + "step": 10580, + "time_per_iteration": 2.787139415740967 + }, + { + "auxiliary_loss_clip": 0.01354312, + "auxiliary_loss_mlp": 0.01024644, + "balance_loss_clip": 1.24443698, + "balance_loss_mlp": 1.01191843, + "epoch": 0.6361641364797835, + "flos": 23264554851720.0, + "grad_norm": 5.593753208673289, + "language_loss": 0.66434419, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68813378, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12731934, + "step": 10581, + "time_per_iteration": 4.222858190536499 + }, + { + "auxiliary_loss_clip": 0.01355672, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.24520993, + "balance_loss_mlp": 1.01856685, + "epoch": 0.6362242597324516, + "flos": 26000081683920.0, + "grad_norm": 1.677063778377578, + "language_loss": 0.68480825, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70868289, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13214111, + "step": 10582, + "time_per_iteration": 2.794339179992676 + }, + { + "auxiliary_loss_clip": 0.01356995, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.24487162, + "balance_loss_mlp": 1.02219355, + "epoch": 0.6362843829851195, + "flos": 25708852550520.0, + "grad_norm": 1.7284510099152428, + "language_loss": 0.84153193, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86545336, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12963867, + "step": 10583, + "time_per_iteration": 2.800532579421997 + }, + { + "auxiliary_loss_clip": 0.01349378, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.2417531, + "balance_loss_mlp": 1.0177983, + "epoch": 0.6363445062377875, + "flos": 20527931593800.0, + "grad_norm": 2.8132296621177515, + "language_loss": 0.75673723, + "learning_rate": 1.233958531908538e-06, + "loss": 0.78053617, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1270752, + "step": 10584, + "time_per_iteration": 2.9363863468170166 + }, + { + "auxiliary_loss_clip": 0.01360984, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.24828768, + "balance_loss_mlp": 1.01583409, + "epoch": 0.6364046294904554, + "flos": 19468545852600.0, + "grad_norm": 1.7977280664232207, + "language_loss": 0.73097718, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.75489414, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.14880371, + "step": 10585, + "time_per_iteration": 4.18530011177063 + }, + { + "auxiliary_loss_clip": 0.01354707, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.24480677, + "balance_loss_mlp": 1.01378608, + "epoch": 0.6364647527431234, + "flos": 21000798232200.0, + "grad_norm": 1.878891901420703, + "language_loss": 0.831038, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.85484803, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12512207, + "step": 10586, + "time_per_iteration": 2.842275857925415 + }, + { + "auxiliary_loss_clip": 0.01351148, + "auxiliary_loss_mlp": 0.01022797, + "balance_loss_clip": 1.24218619, + "balance_loss_mlp": 1.01007748, + "epoch": 0.6365248759957913, + "flos": 25775700557040.0, + "grad_norm": 1.5732411742508245, + "language_loss": 0.7256251, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74936455, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12713623, + "step": 10587, + "time_per_iteration": 2.954836368560791 + }, + { + "auxiliary_loss_clip": 0.01356599, + "auxiliary_loss_mlp": 0.01026638, + "balance_loss_clip": 1.24567962, + "balance_loss_mlp": 1.01385868, + "epoch": 0.6365849992484593, + "flos": 22460720476680.0, + "grad_norm": 2.268654391645489, + "language_loss": 0.77531791, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.79915035, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12792969, + "step": 10588, + "time_per_iteration": 2.7973363399505615 + }, + { + "auxiliary_loss_clip": 0.01348324, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.2423315, + "balance_loss_mlp": 1.01266015, + "epoch": 0.6366451225011273, + "flos": 19030341772440.0, + "grad_norm": 1.380575215219878, + "language_loss": 0.8035785, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82732403, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13562012, + "step": 10589, + "time_per_iteration": 2.882714033126831 + }, + { + "auxiliary_loss_clip": 0.01352487, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.24391961, + "balance_loss_mlp": 1.01746833, + "epoch": 0.6367052457537953, + "flos": 25233874277400.0, + "grad_norm": 2.019356474426623, + "language_loss": 0.67313647, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69697046, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13433838, + "step": 10590, + "time_per_iteration": 2.8505637645721436 + }, + { + "auxiliary_loss_clip": 0.01369024, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.25268865, + "balance_loss_mlp": 1.01688337, + "epoch": 0.6367653690064633, + "flos": 19213156919880.0, + "grad_norm": 1.6509425635806556, + "language_loss": 0.79306483, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81706405, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.14007568, + "step": 10591, + "time_per_iteration": 2.8186862468719482 + }, + { + "auxiliary_loss_clip": 0.01354213, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.24684775, + "balance_loss_mlp": 1.01590562, + "epoch": 0.6368254922591312, + "flos": 23551641932400.0, + "grad_norm": 1.512838966259422, + "language_loss": 0.89064962, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91447413, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12335205, + "step": 10592, + "time_per_iteration": 2.745457649230957 + }, + { + "auxiliary_loss_clip": 0.01347222, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.24029922, + "balance_loss_mlp": 1.01544547, + "epoch": 0.6368856155117992, + "flos": 26472745280520.0, + "grad_norm": 1.32368031989134, + "language_loss": 0.68352044, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70726228, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.11529541, + "step": 10593, + "time_per_iteration": 4.244529485702515 + }, + { + "auxiliary_loss_clip": 0.01349282, + "auxiliary_loss_mlp": 0.01032946, + "balance_loss_clip": 1.24063528, + "balance_loss_mlp": 1.02003574, + "epoch": 0.6369457387644671, + "flos": 33698929942080.0, + "grad_norm": 1.8415986232892403, + "language_loss": 0.63954914, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.66337144, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12915039, + "step": 10594, + "time_per_iteration": 2.873063564300537 + }, + { + "auxiliary_loss_clip": 0.01162615, + "auxiliary_loss_mlp": 0.01002527, + "balance_loss_clip": 1.11819804, + "balance_loss_mlp": 0.99961835, + "epoch": 0.6370058620171352, + "flos": 70924811914440.0, + "grad_norm": 1.3907503418215046, + "language_loss": 0.54608279, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56773424, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02905273, + "step": 10595, + "time_per_iteration": 3.5130858421325684 + }, + { + "auxiliary_loss_clip": 0.01363172, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.25103593, + "balance_loss_mlp": 1.01731634, + "epoch": 0.6370659852698031, + "flos": 21146879795040.0, + "grad_norm": 1.672296914851293, + "language_loss": 0.67101222, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69494784, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13079834, + "step": 10596, + "time_per_iteration": 2.7660458087921143 + }, + { + "auxiliary_loss_clip": 0.0134975, + "auxiliary_loss_mlp": 0.01028176, + "balance_loss_clip": 1.23990989, + "balance_loss_mlp": 1.01569462, + "epoch": 0.6371261085224711, + "flos": 20198100891240.0, + "grad_norm": 2.5108994160324913, + "language_loss": 0.79754961, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.82132888, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12481689, + "step": 10597, + "time_per_iteration": 2.7651124000549316 + }, + { + "auxiliary_loss_clip": 0.0135781, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.2481302, + "balance_loss_mlp": 1.0187068, + "epoch": 0.637186231775139, + "flos": 19688419451520.0, + "grad_norm": 1.6224695864705505, + "language_loss": 0.74921846, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.7731061, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.12249756, + "step": 10598, + "time_per_iteration": 2.784644842147827 + }, + { + "auxiliary_loss_clip": 0.01357697, + "auxiliary_loss_mlp": 0.01027105, + "balance_loss_clip": 1.24684298, + "balance_loss_mlp": 1.01434362, + "epoch": 0.637246355027807, + "flos": 13074117136560.0, + "grad_norm": 1.7481589671883315, + "language_loss": 0.6847856, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70863366, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12774658, + "step": 10599, + "time_per_iteration": 2.792450189590454 + }, + { + "auxiliary_loss_clip": 0.01361761, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.24789739, + "balance_loss_mlp": 1.01967287, + "epoch": 0.6373064782804749, + "flos": 18227157131160.0, + "grad_norm": 2.6274249959425693, + "language_loss": 0.80719644, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.83114928, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13842773, + "step": 10600, + "time_per_iteration": 2.7071704864501953 + }, + { + "auxiliary_loss_clip": 0.01351315, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.24279058, + "balance_loss_mlp": 1.02011085, + "epoch": 0.637366601533143, + "flos": 24503831938440.0, + "grad_norm": 1.4489080477702223, + "language_loss": 0.79783499, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82167304, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1237793, + "step": 10601, + "time_per_iteration": 2.79899263381958 + }, + { + "auxiliary_loss_clip": 0.01353192, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.24245727, + "balance_loss_mlp": 1.01344228, + "epoch": 0.6374267247858109, + "flos": 26364615553080.0, + "grad_norm": 1.9304426847224516, + "language_loss": 0.67086041, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69465309, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12640381, + "step": 10602, + "time_per_iteration": 2.80779767036438 + }, + { + "auxiliary_loss_clip": 0.0134926, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.2404356, + "balance_loss_mlp": 1.01817143, + "epoch": 0.6374868480384789, + "flos": 20376043035480.0, + "grad_norm": 1.8618494703120683, + "language_loss": 0.79889965, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.8226983, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12420654, + "step": 10603, + "time_per_iteration": 2.7843198776245117 + }, + { + "auxiliary_loss_clip": 0.01358915, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.24864697, + "balance_loss_mlp": 1.01354706, + "epoch": 0.6375469712911469, + "flos": 21001204315800.0, + "grad_norm": 2.2673123127596373, + "language_loss": 0.76754224, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.79140043, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13354492, + "step": 10604, + "time_per_iteration": 2.753450393676758 + }, + { + "auxiliary_loss_clip": 0.01362907, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.2483834, + "balance_loss_mlp": 1.01476455, + "epoch": 0.6376070945438148, + "flos": 19720076991120.0, + "grad_norm": 2.0247733622156203, + "language_loss": 0.77231425, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79621959, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.12872314, + "step": 10605, + "time_per_iteration": 2.743204116821289 + }, + { + "auxiliary_loss_clip": 0.01360551, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.24929357, + "balance_loss_mlp": 1.01316929, + "epoch": 0.6376672177964828, + "flos": 21511738531080.0, + "grad_norm": 1.8287130715619264, + "language_loss": 0.66182792, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.68570429, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13928223, + "step": 10606, + "time_per_iteration": 2.7987945079803467 + }, + { + "auxiliary_loss_clip": 0.01343831, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.23714375, + "balance_loss_mlp": 1.02067876, + "epoch": 0.6377273410491507, + "flos": 18848582442360.0, + "grad_norm": 1.8086003596501583, + "language_loss": 0.75453138, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77830064, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12408447, + "step": 10607, + "time_per_iteration": 2.747081995010376 + }, + { + "auxiliary_loss_clip": 0.01359545, + "auxiliary_loss_mlp": 0.01036319, + "balance_loss_clip": 1.24919891, + "balance_loss_mlp": 1.02337933, + "epoch": 0.6377874643018188, + "flos": 53071552202400.0, + "grad_norm": 1.5412402347446936, + "language_loss": 0.65921879, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68317747, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12939453, + "step": 10608, + "time_per_iteration": 3.199333906173706 + }, + { + "auxiliary_loss_clip": 0.0116514, + "auxiliary_loss_mlp": 0.01003925, + "balance_loss_clip": 1.12036002, + "balance_loss_mlp": 1.00109947, + "epoch": 0.6378475875544867, + "flos": 65149331399640.0, + "grad_norm": 0.7195974061796069, + "language_loss": 0.51849878, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.54018939, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02819824, + "step": 10609, + "time_per_iteration": 3.2174172401428223 + }, + { + "auxiliary_loss_clip": 0.01345537, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.2384758, + "balance_loss_mlp": 1.01834166, + "epoch": 0.6379077108071547, + "flos": 23007947668200.0, + "grad_norm": 1.5029344135104499, + "language_loss": 0.74983895, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.7735967, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11895752, + "step": 10610, + "time_per_iteration": 2.8640635013580322 + }, + { + "auxiliary_loss_clip": 0.01164117, + "auxiliary_loss_mlp": 0.00999685, + "balance_loss_clip": 1.11960328, + "balance_loss_mlp": 0.99720544, + "epoch": 0.6379678340598226, + "flos": 67619496623760.0, + "grad_norm": 0.8365442935310027, + "language_loss": 0.63176835, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65340626, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02478027, + "step": 10611, + "time_per_iteration": 3.306424379348755 + }, + { + "auxiliary_loss_clip": 0.01357418, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.24652088, + "balance_loss_mlp": 1.02288699, + "epoch": 0.6380279573124906, + "flos": 29686174187760.0, + "grad_norm": 2.85164493165072, + "language_loss": 0.73115087, + "learning_rate": 1.223896654187282e-06, + "loss": 0.75508243, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12860107, + "step": 10612, + "time_per_iteration": 2.8104968070983887 + }, + { + "auxiliary_loss_clip": 0.01162615, + "auxiliary_loss_mlp": 0.01001001, + "balance_loss_clip": 1.11849916, + "balance_loss_mlp": 0.9984259, + "epoch": 0.6380880805651585, + "flos": 66498403156560.0, + "grad_norm": 0.884580254584195, + "language_loss": 0.57987189, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.60150802, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02575684, + "step": 10613, + "time_per_iteration": 3.187570333480835 + }, + { + "auxiliary_loss_clip": 0.01352919, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.24108028, + "balance_loss_mlp": 1.01771259, + "epoch": 0.6381482038178266, + "flos": 23920317854280.0, + "grad_norm": 1.6880189528030043, + "language_loss": 0.75473213, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77857214, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13378906, + "step": 10614, + "time_per_iteration": 2.7414612770080566 + }, + { + "auxiliary_loss_clip": 0.01353939, + "auxiliary_loss_mlp": 0.01031926, + "balance_loss_clip": 1.24467242, + "balance_loss_mlp": 1.01849079, + "epoch": 0.6382083270704945, + "flos": 24248361789000.0, + "grad_norm": 1.748765316473758, + "language_loss": 0.79370779, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81756639, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.13427734, + "step": 10615, + "time_per_iteration": 2.7743165493011475 + }, + { + "auxiliary_loss_clip": 0.01162681, + "auxiliary_loss_mlp": 0.01004215, + "balance_loss_clip": 1.11892927, + "balance_loss_mlp": 1.001652, + "epoch": 0.6382684503231625, + "flos": 70793430577920.0, + "grad_norm": 0.6753475926578915, + "language_loss": 0.55675709, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.578426, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02563477, + "step": 10616, + "time_per_iteration": 3.261190414428711 + }, + { + "auxiliary_loss_clip": 0.01357338, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.24658668, + "balance_loss_mlp": 1.01984763, + "epoch": 0.6383285735758305, + "flos": 16549188663960.0, + "grad_norm": 1.8251390309401994, + "language_loss": 0.84644008, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.87034655, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13439941, + "step": 10617, + "time_per_iteration": 2.732809543609619 + }, + { + "auxiliary_loss_clip": 0.01357095, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.24512029, + "balance_loss_mlp": 1.02044845, + "epoch": 0.6383886968284984, + "flos": 14431594824000.0, + "grad_norm": 1.883948025076103, + "language_loss": 0.86911833, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89303541, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14154053, + "step": 10618, + "time_per_iteration": 2.738311767578125 + }, + { + "auxiliary_loss_clip": 0.01359937, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.24828374, + "balance_loss_mlp": 1.01913536, + "epoch": 0.6384488200811664, + "flos": 17933978796480.0, + "grad_norm": 1.759963528855474, + "language_loss": 0.7358762, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75979233, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12542725, + "step": 10619, + "time_per_iteration": 4.276333570480347 + }, + { + "auxiliary_loss_clip": 0.01370046, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.25398517, + "balance_loss_mlp": 1.02442169, + "epoch": 0.6385089433338343, + "flos": 18520254249120.0, + "grad_norm": 1.9125676581931994, + "language_loss": 0.76557857, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78966975, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.14660645, + "step": 10620, + "time_per_iteration": 4.103415489196777 + }, + { + "auxiliary_loss_clip": 0.01350823, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.24138904, + "balance_loss_mlp": 1.0132345, + "epoch": 0.6385690665865024, + "flos": 24758977221000.0, + "grad_norm": 2.351161148845956, + "language_loss": 0.71241057, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.73617649, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12542725, + "step": 10621, + "time_per_iteration": 2.7946524620056152 + }, + { + "auxiliary_loss_clip": 0.01339738, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.23349535, + "balance_loss_mlp": 1.01474309, + "epoch": 0.6386291898391703, + "flos": 20125405280880.0, + "grad_norm": 1.5497023907762253, + "language_loss": 0.77575386, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79942214, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12359619, + "step": 10622, + "time_per_iteration": 4.283202409744263 + }, + { + "auxiliary_loss_clip": 0.01348895, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.24044251, + "balance_loss_mlp": 1.01511884, + "epoch": 0.6386893130918383, + "flos": 16870897694520.0, + "grad_norm": 1.8808003091307166, + "language_loss": 0.74780047, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77156734, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12670898, + "step": 10623, + "time_per_iteration": 2.730634927749634 + }, + { + "auxiliary_loss_clip": 0.01341744, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.23517084, + "balance_loss_mlp": 1.01516581, + "epoch": 0.6387494363445062, + "flos": 22971457733760.0, + "grad_norm": 1.397473235778818, + "language_loss": 0.76787758, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.7915628, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.1161499, + "step": 10624, + "time_per_iteration": 2.7991783618927 + }, + { + "auxiliary_loss_clip": 0.01351849, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.24240017, + "balance_loss_mlp": 1.01860809, + "epoch": 0.6388095595971742, + "flos": 22863003139440.0, + "grad_norm": 2.6290797882266186, + "language_loss": 0.80717552, + "learning_rate": 1.21923289302382e-06, + "loss": 0.83101273, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13269043, + "step": 10625, + "time_per_iteration": 2.7821638584136963 + }, + { + "auxiliary_loss_clip": 0.01355811, + "auxiliary_loss_mlp": 0.01033679, + "balance_loss_clip": 1.24478042, + "balance_loss_mlp": 1.02012527, + "epoch": 0.6388696828498421, + "flos": 17316045804240.0, + "grad_norm": 1.9216325645971037, + "language_loss": 0.72819805, + "learning_rate": 1.218874349031654e-06, + "loss": 0.75209296, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13555908, + "step": 10626, + "time_per_iteration": 2.79215669631958 + }, + { + "auxiliary_loss_clip": 0.01359771, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.24855149, + "balance_loss_mlp": 1.01739085, + "epoch": 0.6389298061025102, + "flos": 17133068223360.0, + "grad_norm": 1.611919603162562, + "language_loss": 0.73473346, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.75863826, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13336182, + "step": 10627, + "time_per_iteration": 2.8417770862579346 + }, + { + "auxiliary_loss_clip": 0.01371882, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.25565362, + "balance_loss_mlp": 1.01685905, + "epoch": 0.6389899293551781, + "flos": 27716976587160.0, + "grad_norm": 2.16341454823253, + "language_loss": 0.67707717, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.70111501, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.1505127, + "step": 10628, + "time_per_iteration": 2.840491771697998 + }, + { + "auxiliary_loss_clip": 0.01343965, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.23739219, + "balance_loss_mlp": 1.01558304, + "epoch": 0.6390500526078461, + "flos": 21220753047840.0, + "grad_norm": 1.557972861422966, + "language_loss": 0.68773532, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.71145546, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12469482, + "step": 10629, + "time_per_iteration": 2.852583646774292 + }, + { + "auxiliary_loss_clip": 0.01365175, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.24795747, + "balance_loss_mlp": 1.0240221, + "epoch": 0.6391101758605141, + "flos": 21586220909280.0, + "grad_norm": 1.4257372873127654, + "language_loss": 0.75328839, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77732873, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.14831543, + "step": 10630, + "time_per_iteration": 2.8785390853881836 + }, + { + "auxiliary_loss_clip": 0.01351597, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.24293864, + "balance_loss_mlp": 1.01932526, + "epoch": 0.639170299113182, + "flos": 19905125598360.0, + "grad_norm": 1.458060560382921, + "language_loss": 0.70678711, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.73061997, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12359619, + "step": 10631, + "time_per_iteration": 4.396689176559448 + }, + { + "auxiliary_loss_clip": 0.01161678, + "auxiliary_loss_mlp": 0.01001227, + "balance_loss_clip": 1.11697102, + "balance_loss_mlp": 0.99837828, + "epoch": 0.63923042236585, + "flos": 69892454359800.0, + "grad_norm": 0.7730266352426536, + "language_loss": 0.6301167, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.6517458, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02844238, + "step": 10632, + "time_per_iteration": 3.2951579093933105 + }, + { + "auxiliary_loss_clip": 0.01350206, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.24083352, + "balance_loss_mlp": 1.0142225, + "epoch": 0.639290545618518, + "flos": 22680025558560.0, + "grad_norm": 1.9205067337453696, + "language_loss": 0.67002475, + "learning_rate": 1.216365371217893e-06, + "loss": 0.6937983, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1293335, + "step": 10633, + "time_per_iteration": 2.899981737136841 + }, + { + "auxiliary_loss_clip": 0.01352319, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.24260473, + "balance_loss_mlp": 1.01538515, + "epoch": 0.639350668871186, + "flos": 19834054322400.0, + "grad_norm": 1.952626428533829, + "language_loss": 0.82023692, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84403765, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.12347412, + "step": 10634, + "time_per_iteration": 2.7694733142852783 + }, + { + "auxiliary_loss_clip": 0.01355633, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.24578393, + "balance_loss_mlp": 1.01605463, + "epoch": 0.6394107921238539, + "flos": 20557558715400.0, + "grad_norm": 1.5145885099660175, + "language_loss": 0.75166583, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77551436, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.13183594, + "step": 10635, + "time_per_iteration": 2.7606985569000244 + }, + { + "auxiliary_loss_clip": 0.01355095, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.24454606, + "balance_loss_mlp": 1.01581919, + "epoch": 0.6394709153765219, + "flos": 25780451735160.0, + "grad_norm": 1.6290461904391897, + "language_loss": 0.72152579, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.74536967, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13476562, + "step": 10636, + "time_per_iteration": 2.7549571990966797 + }, + { + "auxiliary_loss_clip": 0.01363705, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.24989986, + "balance_loss_mlp": 1.01905537, + "epoch": 0.6395310386291898, + "flos": 17534660544000.0, + "grad_norm": 1.6759353505550953, + "language_loss": 0.7350508, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75901377, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13531494, + "step": 10637, + "time_per_iteration": 2.7551705837249756 + }, + { + "auxiliary_loss_clip": 0.01359849, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.24789822, + "balance_loss_mlp": 1.01532388, + "epoch": 0.6395911618818578, + "flos": 18592868642760.0, + "grad_norm": 1.7345714481785346, + "language_loss": 0.78233171, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.80621755, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.1340332, + "step": 10638, + "time_per_iteration": 2.6997108459472656 + }, + { + "auxiliary_loss_clip": 0.01353033, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.2429148, + "balance_loss_mlp": 1.01476002, + "epoch": 0.6396512851345257, + "flos": 28372495939560.0, + "grad_norm": 1.4972171335778715, + "language_loss": 0.81858706, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.84239972, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13470459, + "step": 10639, + "time_per_iteration": 2.809044599533081 + }, + { + "auxiliary_loss_clip": 0.01161393, + "auxiliary_loss_mlp": 0.0099956, + "balance_loss_clip": 1.11729741, + "balance_loss_mlp": 0.99728292, + "epoch": 0.6397114083871938, + "flos": 70740250607520.0, + "grad_norm": 0.8092103345275168, + "language_loss": 0.59087729, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61248684, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02282715, + "step": 10640, + "time_per_iteration": 3.2520594596862793 + }, + { + "auxiliary_loss_clip": 0.01347048, + "auxiliary_loss_mlp": 0.01026477, + "balance_loss_clip": 1.23987913, + "balance_loss_mlp": 1.01425791, + "epoch": 0.6397715316398617, + "flos": 18146096198640.0, + "grad_norm": 1.8678917795204182, + "language_loss": 0.78998703, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.81372225, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12231445, + "step": 10641, + "time_per_iteration": 2.7395477294921875 + }, + { + "auxiliary_loss_clip": 0.01370218, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.2536397, + "balance_loss_mlp": 1.01864254, + "epoch": 0.6398316548925297, + "flos": 25745098834800.0, + "grad_norm": 1.926144453572142, + "language_loss": 0.63519979, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65922642, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.13806152, + "step": 10642, + "time_per_iteration": 2.9972381591796875 + }, + { + "auxiliary_loss_clip": 0.01161304, + "auxiliary_loss_mlp": 0.01001772, + "balance_loss_clip": 1.11756837, + "balance_loss_mlp": 0.99954301, + "epoch": 0.6398917781451977, + "flos": 71227516232520.0, + "grad_norm": 0.9180795917734262, + "language_loss": 0.56029791, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58192861, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02233887, + "step": 10643, + "time_per_iteration": 3.1724305152893066 + }, + { + "auxiliary_loss_clip": 0.01363549, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.24884188, + "balance_loss_mlp": 1.01517582, + "epoch": 0.6399519013978656, + "flos": 20526713343000.0, + "grad_norm": 1.9001948364072874, + "language_loss": 0.76951164, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79343152, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13269043, + "step": 10644, + "time_per_iteration": 2.8422625064849854 + }, + { + "auxiliary_loss_clip": 0.01351062, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.2425859, + "balance_loss_mlp": 1.01914489, + "epoch": 0.6400120246505336, + "flos": 24466042536480.0, + "grad_norm": 1.421808190026938, + "language_loss": 0.82552814, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84937137, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.14123535, + "step": 10645, + "time_per_iteration": 2.826129913330078 + }, + { + "auxiliary_loss_clip": 0.01363141, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.24748945, + "balance_loss_mlp": 1.02124608, + "epoch": 0.6400721479032015, + "flos": 28371968030880.0, + "grad_norm": 1.9552184320734405, + "language_loss": 0.73453039, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75852603, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.1517334, + "step": 10646, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0135696, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.24522233, + "balance_loss_mlp": 1.02129805, + "epoch": 0.6401322711558696, + "flos": 17820366940440.0, + "grad_norm": 3.131419543399471, + "language_loss": 0.80665672, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.83057296, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13372803, + "step": 10647, + "time_per_iteration": 2.7156450748443604 + }, + { + "auxiliary_loss_clip": 0.01348382, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.24067223, + "balance_loss_mlp": 1.01866794, + "epoch": 0.6401923944085375, + "flos": 26036409184920.0, + "grad_norm": 1.7255374996360762, + "language_loss": 0.75957727, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.78337067, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12304688, + "step": 10648, + "time_per_iteration": 2.7926571369171143 + }, + { + "auxiliary_loss_clip": 0.01354186, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.24311471, + "balance_loss_mlp": 1.01373982, + "epoch": 0.6402525176612055, + "flos": 23591705402520.0, + "grad_norm": 1.7543347858604115, + "language_loss": 0.79109895, + "learning_rate": 1.210636039936138e-06, + "loss": 0.8149122, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13397217, + "step": 10649, + "time_per_iteration": 2.7699103355407715 + }, + { + "auxiliary_loss_clip": 0.01353398, + "auxiliary_loss_mlp": 0.01037963, + "balance_loss_clip": 1.24280119, + "balance_loss_mlp": 1.0239737, + "epoch": 0.6403126409138734, + "flos": 18046047534840.0, + "grad_norm": 1.7438314468666725, + "language_loss": 0.75492984, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.7788434, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14007568, + "step": 10650, + "time_per_iteration": 2.747735023498535 + }, + { + "auxiliary_loss_clip": 0.01356073, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.2463764, + "balance_loss_mlp": 1.01744175, + "epoch": 0.6403727641665414, + "flos": 21984320910960.0, + "grad_norm": 1.4185480373285504, + "language_loss": 0.7089386, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.73280883, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1350708, + "step": 10651, + "time_per_iteration": 2.887928009033203 + }, + { + "auxiliary_loss_clip": 0.01357934, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.24625015, + "balance_loss_mlp": 1.02387369, + "epoch": 0.6404328874192093, + "flos": 24900388822440.0, + "grad_norm": 3.040010175430107, + "language_loss": 0.64199156, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.66594762, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13793945, + "step": 10652, + "time_per_iteration": 2.779810667037964 + }, + { + "auxiliary_loss_clip": 0.01354412, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.24386811, + "balance_loss_mlp": 1.01764297, + "epoch": 0.6404930106718774, + "flos": 17600940033480.0, + "grad_norm": 1.8696609709189207, + "language_loss": 0.79371643, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.8175689, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13214111, + "step": 10653, + "time_per_iteration": 2.7043352127075195 + }, + { + "auxiliary_loss_clip": 0.01374891, + "auxiliary_loss_mlp": 0.01042485, + "balance_loss_clip": 1.25503755, + "balance_loss_mlp": 1.02698803, + "epoch": 0.6405531339245453, + "flos": 20162829207600.0, + "grad_norm": 2.0986327344694997, + "language_loss": 0.70909202, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.73326576, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.15515137, + "step": 10654, + "time_per_iteration": 2.863192319869995 + }, + { + "auxiliary_loss_clip": 0.01369442, + "auxiliary_loss_mlp": 0.01041864, + "balance_loss_clip": 1.25263906, + "balance_loss_mlp": 1.02703452, + "epoch": 0.6406132571772133, + "flos": 21947018809320.0, + "grad_norm": 1.712275758751415, + "language_loss": 0.73138165, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75549471, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.14813232, + "step": 10655, + "time_per_iteration": 2.7733469009399414 + }, + { + "auxiliary_loss_clip": 0.01360877, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.24797392, + "balance_loss_mlp": 1.02653205, + "epoch": 0.6406733804298813, + "flos": 28774007043480.0, + "grad_norm": 1.5404258134953515, + "language_loss": 0.82990026, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.85391355, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13903809, + "step": 10656, + "time_per_iteration": 2.831052303314209 + }, + { + "auxiliary_loss_clip": 0.01350883, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.23923421, + "balance_loss_mlp": 1.02342582, + "epoch": 0.6407335036825492, + "flos": 17461680675120.0, + "grad_norm": 2.634426478048162, + "language_loss": 0.72801739, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.7518878, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12731934, + "step": 10657, + "time_per_iteration": 2.7551183700561523 + }, + { + "auxiliary_loss_clip": 0.01361182, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.2498467, + "balance_loss_mlp": 1.0237658, + "epoch": 0.6407936269352172, + "flos": 22130118215280.0, + "grad_norm": 1.8489747544957198, + "language_loss": 0.77379715, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79777753, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13098145, + "step": 10658, + "time_per_iteration": 4.208292007446289 + }, + { + "auxiliary_loss_clip": 0.0136211, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.24687433, + "balance_loss_mlp": 1.02805471, + "epoch": 0.6408537501878852, + "flos": 23115630703680.0, + "grad_norm": 2.4306429907883955, + "language_loss": 0.76179677, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78585219, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.15368652, + "step": 10659, + "time_per_iteration": 4.327193021774292 + }, + { + "auxiliary_loss_clip": 0.01355551, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.2432133, + "balance_loss_mlp": 1.02055252, + "epoch": 0.6409138734405532, + "flos": 16476858528840.0, + "grad_norm": 2.007238454034685, + "language_loss": 0.78183913, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80573839, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13824463, + "step": 10660, + "time_per_iteration": 2.684070110321045 + }, + { + "auxiliary_loss_clip": 0.01364791, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.24842393, + "balance_loss_mlp": 1.02238131, + "epoch": 0.6409739966932211, + "flos": 22782307682160.0, + "grad_norm": 1.7341819988640703, + "language_loss": 0.68951291, + "learning_rate": 1.206344067135727e-06, + "loss": 0.71352744, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14282227, + "step": 10661, + "time_per_iteration": 4.218266725540161 + }, + { + "auxiliary_loss_clip": 0.01348188, + "auxiliary_loss_mlp": 0.01036732, + "balance_loss_clip": 1.24022198, + "balance_loss_mlp": 1.02433395, + "epoch": 0.6410341199458891, + "flos": 25157077222680.0, + "grad_norm": 1.4887288658263482, + "language_loss": 0.75833869, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78218788, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.12390137, + "step": 10662, + "time_per_iteration": 2.8074092864990234 + }, + { + "auxiliary_loss_clip": 0.01356115, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.2443397, + "balance_loss_mlp": 1.01713109, + "epoch": 0.641094243198557, + "flos": 27051061494600.0, + "grad_norm": 2.0932337390135776, + "language_loss": 0.69976985, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.7236315, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.12902832, + "step": 10663, + "time_per_iteration": 2.77955961227417 + }, + { + "auxiliary_loss_clip": 0.0135614, + "auxiliary_loss_mlp": 0.01043978, + "balance_loss_clip": 1.24379873, + "balance_loss_mlp": 1.02901125, + "epoch": 0.641154366451225, + "flos": 25380077665320.0, + "grad_norm": 1.914925967040984, + "language_loss": 0.68398386, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70798504, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14959717, + "step": 10664, + "time_per_iteration": 2.842956781387329 + }, + { + "auxiliary_loss_clip": 0.01352177, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.24282098, + "balance_loss_mlp": 1.01683187, + "epoch": 0.6412144897038929, + "flos": 25158539123640.0, + "grad_norm": 1.939999408862076, + "language_loss": 0.66542125, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68923908, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12774658, + "step": 10665, + "time_per_iteration": 2.9255318641662598 + }, + { + "auxiliary_loss_clip": 0.01353405, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.24369287, + "balance_loss_mlp": 1.01744485, + "epoch": 0.641274612956561, + "flos": 23446029923280.0, + "grad_norm": 1.6101047029499806, + "language_loss": 0.64764321, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.67148626, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13446045, + "step": 10666, + "time_per_iteration": 2.821183443069458 + }, + { + "auxiliary_loss_clip": 0.01357168, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.24415743, + "balance_loss_mlp": 1.02030504, + "epoch": 0.6413347362092289, + "flos": 19432258959960.0, + "grad_norm": 1.7496327682910517, + "language_loss": 0.71295142, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73685646, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13049316, + "step": 10667, + "time_per_iteration": 2.978696584701538 + }, + { + "auxiliary_loss_clip": 0.01379666, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.25828969, + "balance_loss_mlp": 1.02769399, + "epoch": 0.6413948594618969, + "flos": 17200362921840.0, + "grad_norm": 6.691023208144421, + "language_loss": 0.77622604, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.80045545, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.15588379, + "step": 10668, + "time_per_iteration": 2.7474470138549805 + }, + { + "auxiliary_loss_clip": 0.01351525, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.24106729, + "balance_loss_mlp": 1.02002358, + "epoch": 0.6414549827145648, + "flos": 22274453618640.0, + "grad_norm": 1.7095918090465627, + "language_loss": 0.67877471, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.7026273, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13702393, + "step": 10669, + "time_per_iteration": 4.322773694992065 + }, + { + "auxiliary_loss_clip": 0.01372941, + "auxiliary_loss_mlp": 0.01038996, + "balance_loss_clip": 1.25506806, + "balance_loss_mlp": 1.02469718, + "epoch": 0.6415151059672328, + "flos": 19644173320320.0, + "grad_norm": 1.62471263411468, + "language_loss": 0.79080015, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.81491953, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.14312744, + "step": 10670, + "time_per_iteration": 2.7813849449157715 + }, + { + "auxiliary_loss_clip": 0.01358678, + "auxiliary_loss_mlp": 0.01036438, + "balance_loss_clip": 1.24353909, + "balance_loss_mlp": 1.0219537, + "epoch": 0.6415752292199008, + "flos": 14869880120880.0, + "grad_norm": 3.137826487874089, + "language_loss": 0.89564562, + "learning_rate": 1.20277073264638e-06, + "loss": 0.91959679, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14489746, + "step": 10671, + "time_per_iteration": 2.7656338214874268 + }, + { + "auxiliary_loss_clip": 0.013486, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.24049306, + "balance_loss_mlp": 1.02103865, + "epoch": 0.6416353524725688, + "flos": 13739585537160.0, + "grad_norm": 1.406340453372806, + "language_loss": 0.69193286, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71575373, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12457275, + "step": 10672, + "time_per_iteration": 2.737579107284546 + }, + { + "auxiliary_loss_clip": 0.01368862, + "auxiliary_loss_mlp": 0.01034184, + "balance_loss_clip": 1.25175095, + "balance_loss_mlp": 1.01853144, + "epoch": 0.6416954757252368, + "flos": 24540281264520.0, + "grad_norm": 2.0078566417628667, + "language_loss": 0.74782234, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.77185279, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.15649414, + "step": 10673, + "time_per_iteration": 2.753392457962036 + }, + { + "auxiliary_loss_clip": 0.01360007, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.24741268, + "balance_loss_mlp": 1.02250934, + "epoch": 0.6417555989779047, + "flos": 27715880161440.0, + "grad_norm": 1.8965507504565549, + "language_loss": 0.70026463, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.72423482, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.1451416, + "step": 10674, + "time_per_iteration": 2.821840763092041 + }, + { + "auxiliary_loss_clip": 0.01371427, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.25360394, + "balance_loss_mlp": 1.01765776, + "epoch": 0.6418157222305727, + "flos": 20561294684520.0, + "grad_norm": 1.850993524586745, + "language_loss": 0.67397231, + "learning_rate": 1.201342244560338e-06, + "loss": 0.69801271, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.1494751, + "step": 10675, + "time_per_iteration": 2.717611789703369 + }, + { + "auxiliary_loss_clip": 0.01358862, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.24896681, + "balance_loss_mlp": 1.02392793, + "epoch": 0.6418758454832406, + "flos": 22606680214440.0, + "grad_norm": 1.7639411416660216, + "language_loss": 0.66879308, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.69275922, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13824463, + "step": 10676, + "time_per_iteration": 2.7780039310455322 + }, + { + "auxiliary_loss_clip": 0.01360127, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.24684238, + "balance_loss_mlp": 1.02135396, + "epoch": 0.6419359687359086, + "flos": 27380080029960.0, + "grad_norm": 1.7905525893242775, + "language_loss": 0.75252241, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77648592, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 1.13232422, + "router_z_loss_mlp": 0.14868164, + "step": 10677, + "time_per_iteration": 2.9781277179718018 + }, + { + "auxiliary_loss_clip": 0.01163548, + "auxiliary_loss_mlp": 0.01011695, + "balance_loss_clip": 1.11926007, + "balance_loss_mlp": 1.00913239, + "epoch": 0.6419960919885765, + "flos": 67266600372720.0, + "grad_norm": 0.7668336040047057, + "language_loss": 0.60708642, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62883884, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02563477, + "step": 10678, + "time_per_iteration": 3.326572895050049 + }, + { + "auxiliary_loss_clip": 0.01346759, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.23810601, + "balance_loss_mlp": 1.02256465, + "epoch": 0.6420562152412446, + "flos": 19906709324400.0, + "grad_norm": 1.667376114131593, + "language_loss": 0.6768595, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.70067942, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12670898, + "step": 10679, + "time_per_iteration": 2.766986131668091 + }, + { + "auxiliary_loss_clip": 0.013622, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.24820757, + "balance_loss_mlp": 1.01931942, + "epoch": 0.6421163384939125, + "flos": 24795548372160.0, + "grad_norm": 1.5501607941325062, + "language_loss": 0.73023981, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75419605, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14099121, + "step": 10680, + "time_per_iteration": 2.757636785507202 + }, + { + "auxiliary_loss_clip": 0.01356603, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.24508572, + "balance_loss_mlp": 1.01885986, + "epoch": 0.6421764617465805, + "flos": 25598286321480.0, + "grad_norm": 1.6763167082199295, + "language_loss": 0.68009949, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70397782, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12371826, + "step": 10681, + "time_per_iteration": 2.9169137477874756 + }, + { + "auxiliary_loss_clip": 0.01356373, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.24525523, + "balance_loss_mlp": 1.01831198, + "epoch": 0.6422365849992484, + "flos": 14138497706040.0, + "grad_norm": 1.7868282864062688, + "language_loss": 0.75157958, + "learning_rate": 1.198843556910427e-06, + "loss": 0.77545547, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12884521, + "step": 10682, + "time_per_iteration": 2.736520290374756 + }, + { + "auxiliary_loss_clip": 0.01345971, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.23944592, + "balance_loss_mlp": 1.01915848, + "epoch": 0.6422967082519164, + "flos": 22389202508760.0, + "grad_norm": 1.5430031405534859, + "language_loss": 0.79238379, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81615597, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12109375, + "step": 10683, + "time_per_iteration": 2.8239781856536865 + }, + { + "auxiliary_loss_clip": 0.01359747, + "auxiliary_loss_mlp": 0.01039858, + "balance_loss_clip": 1.24722874, + "balance_loss_mlp": 1.02546954, + "epoch": 0.6423568315045844, + "flos": 14652077548320.0, + "grad_norm": 1.6355232561758442, + "language_loss": 0.67571843, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69971448, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.1439209, + "step": 10684, + "time_per_iteration": 2.735934257507324 + }, + { + "auxiliary_loss_clip": 0.01358132, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.24543536, + "balance_loss_mlp": 1.01852965, + "epoch": 0.6424169547572524, + "flos": 26839634434560.0, + "grad_norm": 2.0341623304038428, + "language_loss": 0.71993601, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.74383163, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12890625, + "step": 10685, + "time_per_iteration": 2.794727087020874 + }, + { + "auxiliary_loss_clip": 0.01353248, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.24379981, + "balance_loss_mlp": 1.0229609, + "epoch": 0.6424770780099204, + "flos": 22712129790120.0, + "grad_norm": 1.6283349966154785, + "language_loss": 0.7517066, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77559757, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12884521, + "step": 10686, + "time_per_iteration": 2.7776057720184326 + }, + { + "auxiliary_loss_clip": 0.01366144, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.25026309, + "balance_loss_mlp": 1.02051091, + "epoch": 0.6425372012625883, + "flos": 28474615629720.0, + "grad_norm": 1.9971534253611376, + "language_loss": 0.68637335, + "learning_rate": 1.197059691144867e-06, + "loss": 0.71039009, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.15014648, + "step": 10687, + "time_per_iteration": 2.8000078201293945 + }, + { + "auxiliary_loss_clip": 0.01365008, + "auxiliary_loss_mlp": 0.01038284, + "balance_loss_clip": 1.25102663, + "balance_loss_mlp": 1.0244441, + "epoch": 0.6425973245152563, + "flos": 29357764777800.0, + "grad_norm": 1.8781192193771368, + "language_loss": 0.66685563, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.69088852, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1385498, + "step": 10688, + "time_per_iteration": 2.8563947677612305 + }, + { + "auxiliary_loss_clip": 0.01357645, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.24575436, + "balance_loss_mlp": 1.01905262, + "epoch": 0.6426574477679242, + "flos": 16434074298600.0, + "grad_norm": 1.5491972267378573, + "language_loss": 0.7354393, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75933933, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13311768, + "step": 10689, + "time_per_iteration": 2.9284231662750244 + }, + { + "auxiliary_loss_clip": 0.01350767, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.2415967, + "balance_loss_mlp": 1.02010965, + "epoch": 0.6427175710205922, + "flos": 21841041324960.0, + "grad_norm": 2.6419462400103337, + "language_loss": 0.7247833, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74862152, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1295166, + "step": 10690, + "time_per_iteration": 2.7943923473358154 + }, + { + "auxiliary_loss_clip": 0.01351764, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.24273694, + "balance_loss_mlp": 1.01436472, + "epoch": 0.6427776942732601, + "flos": 17791308335880.0, + "grad_norm": 1.7386749844292722, + "language_loss": 0.77726203, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80105692, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13372803, + "step": 10691, + "time_per_iteration": 2.70832896232605 + }, + { + "auxiliary_loss_clip": 0.01363103, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.24901807, + "balance_loss_mlp": 1.01993334, + "epoch": 0.6428378175259282, + "flos": 15090119195040.0, + "grad_norm": 2.033615414837541, + "language_loss": 0.7489152, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.77287638, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13098145, + "step": 10692, + "time_per_iteration": 2.7377190589904785 + }, + { + "auxiliary_loss_clip": 0.01355654, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.24358106, + "balance_loss_mlp": 1.02479243, + "epoch": 0.6428979407785961, + "flos": 23847053726880.0, + "grad_norm": 1.807318423408228, + "language_loss": 0.61430287, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63824546, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13812256, + "step": 10693, + "time_per_iteration": 2.797762155532837 + }, + { + "auxiliary_loss_clip": 0.01364203, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.24855328, + "balance_loss_mlp": 1.01842999, + "epoch": 0.6429580640312641, + "flos": 32933331660960.0, + "grad_norm": 2.2860522635557157, + "language_loss": 0.60233521, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.62630284, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14129639, + "step": 10694, + "time_per_iteration": 2.8778772354125977 + }, + { + "auxiliary_loss_clip": 0.01360723, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.24754834, + "balance_loss_mlp": 1.02043128, + "epoch": 0.643018187283932, + "flos": 21073209584040.0, + "grad_norm": 1.3874408424070577, + "language_loss": 0.80200893, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82595646, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.1361084, + "step": 10695, + "time_per_iteration": 2.7365245819091797 + }, + { + "auxiliary_loss_clip": 0.01357421, + "auxiliary_loss_mlp": 0.01042784, + "balance_loss_clip": 1.24498701, + "balance_loss_mlp": 1.02869391, + "epoch": 0.6430783105366, + "flos": 26730367673040.0, + "grad_norm": 1.888654830669153, + "language_loss": 0.73704982, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.76105189, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14086914, + "step": 10696, + "time_per_iteration": 4.247150897979736 + }, + { + "auxiliary_loss_clip": 0.01346522, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.23786306, + "balance_loss_mlp": 1.01502383, + "epoch": 0.643138433789268, + "flos": 23702231023200.0, + "grad_norm": 1.6973983976913631, + "language_loss": 0.75862628, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.78238106, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13922119, + "step": 10697, + "time_per_iteration": 2.735638380050659 + }, + { + "auxiliary_loss_clip": 0.0135178, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.24307895, + "balance_loss_mlp": 1.02510917, + "epoch": 0.643198557041936, + "flos": 34208611381800.0, + "grad_norm": 1.4052393535872327, + "language_loss": 0.66386074, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68775356, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12402344, + "step": 10698, + "time_per_iteration": 4.3288843631744385 + }, + { + "auxiliary_loss_clip": 0.01167737, + "auxiliary_loss_mlp": 0.01004137, + "balance_loss_clip": 1.12333965, + "balance_loss_mlp": 1.00178838, + "epoch": 0.643258680294604, + "flos": 67642463974320.0, + "grad_norm": 0.8164900643559462, + "language_loss": 0.63504058, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65675932, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.0234375, + "step": 10699, + "time_per_iteration": 3.2349774837493896 + }, + { + "auxiliary_loss_clip": 0.01347599, + "auxiliary_loss_mlp": 0.0102931, + "balance_loss_clip": 1.24058282, + "balance_loss_mlp": 1.01702511, + "epoch": 0.6433188035472719, + "flos": 25190683963560.0, + "grad_norm": 1.472257749382422, + "language_loss": 0.69503641, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71880549, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12286377, + "step": 10700, + "time_per_iteration": 4.352006435394287 + }, + { + "auxiliary_loss_clip": 0.01359353, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.246382, + "balance_loss_mlp": 1.01935649, + "epoch": 0.6433789267999399, + "flos": 24979094470080.0, + "grad_norm": 1.577889432473572, + "language_loss": 0.73692191, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.76084799, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13897705, + "step": 10701, + "time_per_iteration": 2.8307242393493652 + }, + { + "auxiliary_loss_clip": 0.01362493, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.24628174, + "balance_loss_mlp": 1.01834285, + "epoch": 0.6434390500526078, + "flos": 17570460136320.0, + "grad_norm": 2.372398203457844, + "language_loss": 0.81839037, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84234619, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14764404, + "step": 10702, + "time_per_iteration": 2.779186487197876 + }, + { + "auxiliary_loss_clip": 0.01352197, + "auxiliary_loss_mlp": 0.01037221, + "balance_loss_clip": 1.24226367, + "balance_loss_mlp": 1.02463245, + "epoch": 0.6434991733052758, + "flos": 20847650814720.0, + "grad_norm": 1.7535457653496684, + "language_loss": 0.74561429, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76950848, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12585449, + "step": 10703, + "time_per_iteration": 2.7444241046905518 + }, + { + "auxiliary_loss_clip": 0.01165811, + "auxiliary_loss_mlp": 0.01006169, + "balance_loss_clip": 1.12179482, + "balance_loss_mlp": 1.00351036, + "epoch": 0.6435592965579437, + "flos": 66110455244880.0, + "grad_norm": 0.6561866668155383, + "language_loss": 0.54668045, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56840026, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02661133, + "step": 10704, + "time_per_iteration": 3.3003523349761963 + }, + { + "auxiliary_loss_clip": 0.01356681, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.2453295, + "balance_loss_mlp": 1.01583719, + "epoch": 0.6436194198106118, + "flos": 23774114466360.0, + "grad_norm": 1.6163004634653493, + "language_loss": 0.77245843, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79630953, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12609863, + "step": 10705, + "time_per_iteration": 2.836555004119873 + }, + { + "auxiliary_loss_clip": 0.01354493, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.24322617, + "balance_loss_mlp": 1.02149057, + "epoch": 0.6436795430632797, + "flos": 20235199951080.0, + "grad_norm": 1.755409231039793, + "language_loss": 0.7931062, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81699461, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12860107, + "step": 10706, + "time_per_iteration": 2.751575469970703 + }, + { + "auxiliary_loss_clip": 0.01354526, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.2429024, + "balance_loss_mlp": 1.01731741, + "epoch": 0.6437396663159477, + "flos": 20306555485560.0, + "grad_norm": 1.8281556696843606, + "language_loss": 0.8062228, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.83008659, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14538574, + "step": 10707, + "time_per_iteration": 2.741276264190674 + }, + { + "auxiliary_loss_clip": 0.01356673, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.24552763, + "balance_loss_mlp": 1.01609683, + "epoch": 0.6437997895686156, + "flos": 23883909136560.0, + "grad_norm": 1.6583234582875672, + "language_loss": 0.8594538, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.88331264, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13104248, + "step": 10708, + "time_per_iteration": 4.363473653793335 + }, + { + "auxiliary_loss_clip": 0.01372055, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.25433087, + "balance_loss_mlp": 1.02767158, + "epoch": 0.6438599128212836, + "flos": 18993933054720.0, + "grad_norm": 3.461712448313982, + "language_loss": 0.65587533, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.68002081, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.14825439, + "step": 10709, + "time_per_iteration": 2.7848258018493652 + }, + { + "auxiliary_loss_clip": 0.01352802, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.24331319, + "balance_loss_mlp": 1.02201736, + "epoch": 0.6439200360739517, + "flos": 24101549275680.0, + "grad_norm": 1.787883363089578, + "language_loss": 0.80636442, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.83024019, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12756348, + "step": 10710, + "time_per_iteration": 2.7409627437591553 + }, + { + "auxiliary_loss_clip": 0.01354428, + "auxiliary_loss_mlp": 0.01025037, + "balance_loss_clip": 1.24356055, + "balance_loss_mlp": 1.01168597, + "epoch": 0.6439801593266196, + "flos": 31908080569320.0, + "grad_norm": 1.9453011778288896, + "language_loss": 0.66278046, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68657511, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13360596, + "step": 10711, + "time_per_iteration": 2.9092602729797363 + }, + { + "auxiliary_loss_clip": 0.01355308, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.24268246, + "balance_loss_mlp": 1.01958656, + "epoch": 0.6440402825792876, + "flos": 27131635126800.0, + "grad_norm": 1.6186862829104802, + "language_loss": 0.78984046, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.81372815, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13879395, + "step": 10712, + "time_per_iteration": 2.939335584640503 + }, + { + "auxiliary_loss_clip": 0.01364323, + "auxiliary_loss_mlp": 0.01036873, + "balance_loss_clip": 1.24930978, + "balance_loss_mlp": 1.02288365, + "epoch": 0.6441004058319555, + "flos": 20672591864040.0, + "grad_norm": 1.5615667234844481, + "language_loss": 0.83039534, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.85440725, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13995361, + "step": 10713, + "time_per_iteration": 2.73612117767334 + }, + { + "auxiliary_loss_clip": 0.01340701, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.23489499, + "balance_loss_mlp": 1.02180576, + "epoch": 0.6441605290846235, + "flos": 26030927056320.0, + "grad_norm": 1.3391868450862028, + "language_loss": 0.78340393, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80716538, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13647461, + "step": 10714, + "time_per_iteration": 2.8033640384674072 + }, + { + "auxiliary_loss_clip": 0.01349741, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.23985565, + "balance_loss_mlp": 1.01596177, + "epoch": 0.6442206523372914, + "flos": 24904855742040.0, + "grad_norm": 1.407854807502162, + "language_loss": 0.82002139, + "learning_rate": 1.187084157517583e-06, + "loss": 0.84380466, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1262207, + "step": 10715, + "time_per_iteration": 2.7531447410583496 + }, + { + "auxiliary_loss_clip": 0.01356787, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.24367952, + "balance_loss_mlp": 1.01644373, + "epoch": 0.6442807755899594, + "flos": 25161909617520.0, + "grad_norm": 1.9108561572081741, + "language_loss": 0.81503475, + "learning_rate": 1.186728333672332e-06, + "loss": 0.8389014, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13439941, + "step": 10716, + "time_per_iteration": 2.8501739501953125 + }, + { + "auxiliary_loss_clip": 0.01363302, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.24753416, + "balance_loss_mlp": 1.01912081, + "epoch": 0.6443408988426274, + "flos": 27350249866560.0, + "grad_norm": 1.7166866601754927, + "language_loss": 0.78575599, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80972552, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14544678, + "step": 10717, + "time_per_iteration": 2.784006357192993 + }, + { + "auxiliary_loss_clip": 0.01340822, + "auxiliary_loss_mlp": 0.01028276, + "balance_loss_clip": 1.23396611, + "balance_loss_mlp": 1.01553845, + "epoch": 0.6444010220952954, + "flos": 27934170034320.0, + "grad_norm": 1.4917492827636032, + "language_loss": 0.68234199, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70603299, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12744141, + "step": 10718, + "time_per_iteration": 2.8037149906158447 + }, + { + "auxiliary_loss_clip": 0.01163547, + "auxiliary_loss_mlp": 0.01003453, + "balance_loss_clip": 1.11935675, + "balance_loss_mlp": 1.00096118, + "epoch": 0.6444611453479633, + "flos": 71228409616440.0, + "grad_norm": 0.7514225233306713, + "language_loss": 0.49674276, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51841271, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02490234, + "step": 10719, + "time_per_iteration": 3.3712501525878906 + }, + { + "auxiliary_loss_clip": 0.01354898, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.24221194, + "balance_loss_mlp": 1.01909244, + "epoch": 0.6445212686006313, + "flos": 22709530855080.0, + "grad_norm": 1.938497672477976, + "language_loss": 0.78239441, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80627847, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14440918, + "step": 10720, + "time_per_iteration": 2.7721853256225586 + }, + { + "auxiliary_loss_clip": 0.01347481, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.23721123, + "balance_loss_mlp": 1.01830029, + "epoch": 0.6445813918532992, + "flos": 21183816421440.0, + "grad_norm": 1.7583318348787127, + "language_loss": 0.77282727, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79662704, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.14196777, + "step": 10721, + "time_per_iteration": 2.9040863513946533 + }, + { + "auxiliary_loss_clip": 0.01354431, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.24205613, + "balance_loss_mlp": 1.01611459, + "epoch": 0.6446415151059672, + "flos": 25197384342960.0, + "grad_norm": 1.8462301858141283, + "language_loss": 0.73432362, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.7581768, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14782715, + "step": 10722, + "time_per_iteration": 2.783967971801758 + }, + { + "auxiliary_loss_clip": 0.0134836, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.23820758, + "balance_loss_mlp": 1.01430786, + "epoch": 0.6447016383586353, + "flos": 25307828746920.0, + "grad_norm": 1.4387901717135758, + "language_loss": 0.78079587, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80454642, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.1237793, + "step": 10723, + "time_per_iteration": 2.8304479122161865 + }, + { + "auxiliary_loss_clip": 0.0136089, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.24702549, + "balance_loss_mlp": 1.01712453, + "epoch": 0.6447617616113032, + "flos": 27708123964680.0, + "grad_norm": 1.6791330338983848, + "language_loss": 0.58283269, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60675013, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1373291, + "step": 10724, + "time_per_iteration": 2.939073324203491 + }, + { + "auxiliary_loss_clip": 0.01340955, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.23384953, + "balance_loss_mlp": 1.01643169, + "epoch": 0.6448218848639712, + "flos": 23044275169200.0, + "grad_norm": 1.80725048833514, + "language_loss": 0.83746529, + "learning_rate": 1.183527308454271e-06, + "loss": 0.86116707, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12805176, + "step": 10725, + "time_per_iteration": 2.8072493076324463 + }, + { + "auxiliary_loss_clip": 0.01349621, + "auxiliary_loss_mlp": 0.01034183, + "balance_loss_clip": 1.23938251, + "balance_loss_mlp": 1.02022386, + "epoch": 0.6448820081166391, + "flos": 24501151786680.0, + "grad_norm": 1.7742132179548888, + "language_loss": 0.81944096, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.843279, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13970947, + "step": 10726, + "time_per_iteration": 2.8205957412719727 + }, + { + "auxiliary_loss_clip": 0.01360381, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.24538457, + "balance_loss_mlp": 1.01678658, + "epoch": 0.6449421313693071, + "flos": 22424433584040.0, + "grad_norm": 1.800599837783353, + "language_loss": 0.81522822, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.8391366, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.13665771, + "step": 10727, + "time_per_iteration": 2.77182936668396 + }, + { + "auxiliary_loss_clip": 0.01366487, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.24825168, + "balance_loss_mlp": 1.01767552, + "epoch": 0.645002254621975, + "flos": 20230002081000.0, + "grad_norm": 1.8143828183853326, + "language_loss": 0.79537082, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81936038, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.14801025, + "step": 10728, + "time_per_iteration": 2.868298053741455 + }, + { + "auxiliary_loss_clip": 0.01351645, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.23899257, + "balance_loss_mlp": 1.01746345, + "epoch": 0.645062377874643, + "flos": 27861271382160.0, + "grad_norm": 1.916721310632994, + "language_loss": 0.74526346, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76909363, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13909912, + "step": 10729, + "time_per_iteration": 2.8479931354522705 + }, + { + "auxiliary_loss_clip": 0.0135532, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.24264216, + "balance_loss_mlp": 1.01385593, + "epoch": 0.645122501127311, + "flos": 25307016579720.0, + "grad_norm": 1.52131847499437, + "language_loss": 0.66653121, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.69036013, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.137146, + "step": 10730, + "time_per_iteration": 2.822313070297241 + }, + { + "auxiliary_loss_clip": 0.01352382, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.24038565, + "balance_loss_mlp": 1.01605439, + "epoch": 0.645182624379979, + "flos": 18812051899560.0, + "grad_norm": 1.5672440117290267, + "language_loss": 0.64258313, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.66641307, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14562988, + "step": 10731, + "time_per_iteration": 2.6998331546783447 + }, + { + "auxiliary_loss_clip": 0.01344129, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.2348485, + "balance_loss_mlp": 1.01669109, + "epoch": 0.6452427476326469, + "flos": 18336708151200.0, + "grad_norm": 1.6185249567853768, + "language_loss": 0.68474537, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70848656, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13311768, + "step": 10732, + "time_per_iteration": 2.727989673614502 + }, + { + "auxiliary_loss_clip": 0.01341647, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.23359895, + "balance_loss_mlp": 1.02098095, + "epoch": 0.6453028708853149, + "flos": 22790023270560.0, + "grad_norm": 1.6936437854930773, + "language_loss": 0.75956619, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.78332347, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13110352, + "step": 10733, + "time_per_iteration": 2.734663248062134 + }, + { + "auxiliary_loss_clip": 0.01364061, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.2489934, + "balance_loss_mlp": 1.02642488, + "epoch": 0.6453629941379828, + "flos": 23950594709640.0, + "grad_norm": 2.949129807272734, + "language_loss": 0.66358244, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.68763119, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14385986, + "step": 10734, + "time_per_iteration": 2.7725729942321777 + }, + { + "auxiliary_loss_clip": 0.01336606, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.23174143, + "balance_loss_mlp": 1.02167356, + "epoch": 0.6454231173906508, + "flos": 17680945148640.0, + "grad_norm": 1.8127108367472584, + "language_loss": 0.74103922, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76475024, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12817383, + "step": 10735, + "time_per_iteration": 4.117919921875 + }, + { + "auxiliary_loss_clip": 0.01346046, + "auxiliary_loss_mlp": 0.0103645, + "balance_loss_clip": 1.23544776, + "balance_loss_mlp": 1.02223444, + "epoch": 0.6454832406433189, + "flos": 23297755509000.0, + "grad_norm": 1.7169691938913363, + "language_loss": 0.75091642, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.77474141, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14208984, + "step": 10736, + "time_per_iteration": 4.3449554443359375 + }, + { + "auxiliary_loss_clip": 0.01364297, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.24921, + "balance_loss_mlp": 1.01642656, + "epoch": 0.6455433638959868, + "flos": 20162057648760.0, + "grad_norm": 2.093408025695877, + "language_loss": 0.71119821, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.73514509, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13977051, + "step": 10737, + "time_per_iteration": 2.719625949859619 + }, + { + "auxiliary_loss_clip": 0.01161551, + "auxiliary_loss_mlp": 0.01010252, + "balance_loss_clip": 1.11498833, + "balance_loss_mlp": 1.00741482, + "epoch": 0.6456034871486548, + "flos": 66548902975200.0, + "grad_norm": 0.7736385114259824, + "language_loss": 0.58587646, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60759449, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.02832031, + "step": 10738, + "time_per_iteration": 4.7537291049957275 + }, + { + "auxiliary_loss_clip": 0.01345359, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.23581994, + "balance_loss_mlp": 1.01624656, + "epoch": 0.6456636104013227, + "flos": 24211343945880.0, + "grad_norm": 1.6632006722961834, + "language_loss": 0.74716568, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.77090728, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12573242, + "step": 10739, + "time_per_iteration": 2.879488706588745 + }, + { + "auxiliary_loss_clip": 0.01356174, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.24257243, + "balance_loss_mlp": 1.0183363, + "epoch": 0.6457237336539907, + "flos": 23629494804480.0, + "grad_norm": 1.8454562464069841, + "language_loss": 0.71630424, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.74018544, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13598633, + "step": 10740, + "time_per_iteration": 2.819314956665039 + }, + { + "auxiliary_loss_clip": 0.01164781, + "auxiliary_loss_mlp": 0.01004382, + "balance_loss_clip": 1.11828184, + "balance_loss_mlp": 1.00189078, + "epoch": 0.6457838569066586, + "flos": 65862741292200.0, + "grad_norm": 0.6687773463790021, + "language_loss": 0.55424011, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57593179, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.02490234, + "step": 10741, + "time_per_iteration": 3.2386341094970703 + }, + { + "auxiliary_loss_clip": 0.01347484, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.23968077, + "balance_loss_mlp": 1.01878774, + "epoch": 0.6458439801593266, + "flos": 22387009657320.0, + "grad_norm": 1.902953211707136, + "language_loss": 0.80453932, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82833171, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12982178, + "step": 10742, + "time_per_iteration": 2.793093681335449 + }, + { + "auxiliary_loss_clip": 0.01336454, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.23045027, + "balance_loss_mlp": 1.01626706, + "epoch": 0.6459041034119946, + "flos": 24794492554800.0, + "grad_norm": 1.4947745589951775, + "language_loss": 0.8214891, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84514534, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12908936, + "step": 10743, + "time_per_iteration": 2.814846992492676 + }, + { + "auxiliary_loss_clip": 0.01347755, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.23866987, + "balance_loss_mlp": 1.01878476, + "epoch": 0.6459642266646626, + "flos": 18328992562800.0, + "grad_norm": 2.5477222314701637, + "language_loss": 0.71773291, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74152899, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13085938, + "step": 10744, + "time_per_iteration": 2.7905564308166504 + }, + { + "auxiliary_loss_clip": 0.01348392, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.23790479, + "balance_loss_mlp": 1.01931572, + "epoch": 0.6460243499173305, + "flos": 43589326509720.0, + "grad_norm": 1.7356880283062637, + "language_loss": 0.6674571, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.69126546, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13128662, + "step": 10745, + "time_per_iteration": 2.9223482608795166 + }, + { + "auxiliary_loss_clip": 0.01345618, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.23599911, + "balance_loss_mlp": 1.01944983, + "epoch": 0.6460844731699985, + "flos": 19248631645320.0, + "grad_norm": 2.765030766298261, + "language_loss": 0.74421805, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.76800346, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13464355, + "step": 10746, + "time_per_iteration": 2.7795534133911133 + }, + { + "auxiliary_loss_clip": 0.01352808, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.24038696, + "balance_loss_mlp": 1.01935089, + "epoch": 0.6461445964226664, + "flos": 27459029327760.0, + "grad_norm": 1.4314267905978195, + "language_loss": 0.67010731, + "learning_rate": 1.175713157660413e-06, + "loss": 0.69395715, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12817383, + "step": 10747, + "time_per_iteration": 4.517723560333252 + }, + { + "auxiliary_loss_clip": 0.01348084, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.23680496, + "balance_loss_mlp": 1.01892066, + "epoch": 0.6462047196753344, + "flos": 20299367805840.0, + "grad_norm": 1.5536150969469478, + "language_loss": 0.67349362, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69729352, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12988281, + "step": 10748, + "time_per_iteration": 2.7723562717437744 + }, + { + "auxiliary_loss_clip": 0.01360679, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.24649215, + "balance_loss_mlp": 1.02885556, + "epoch": 0.6462648429280025, + "flos": 22023490997160.0, + "grad_norm": 1.628001149750947, + "language_loss": 0.75815886, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78219378, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.1395874, + "step": 10749, + "time_per_iteration": 2.7524759769439697 + }, + { + "auxiliary_loss_clip": 0.01355805, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.24217343, + "balance_loss_mlp": 1.02010369, + "epoch": 0.6463249661806704, + "flos": 27786789003960.0, + "grad_norm": 1.4695635117670318, + "language_loss": 0.77131426, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79520833, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.1348877, + "step": 10750, + "time_per_iteration": 2.770986557006836 + }, + { + "auxiliary_loss_clip": 0.01361816, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.24712849, + "balance_loss_mlp": 1.01796949, + "epoch": 0.6463850894333384, + "flos": 22055189145120.0, + "grad_norm": 2.130654168810296, + "language_loss": 0.69010067, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.71404618, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.14764404, + "step": 10751, + "time_per_iteration": 2.7907462120056152 + }, + { + "auxiliary_loss_clip": 0.01359564, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.24596357, + "balance_loss_mlp": 1.01866078, + "epoch": 0.6464452126860063, + "flos": 21111283244520.0, + "grad_norm": 1.8983383005363328, + "language_loss": 0.71453464, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73845327, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13641357, + "step": 10752, + "time_per_iteration": 2.804051399230957 + }, + { + "auxiliary_loss_clip": 0.01351753, + "auxiliary_loss_mlp": 0.01035686, + "balance_loss_clip": 1.23861098, + "balance_loss_mlp": 1.02074885, + "epoch": 0.6465053359386743, + "flos": 16031547985680.0, + "grad_norm": 1.7087180437376766, + "language_loss": 0.77958316, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.8034575, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14923096, + "step": 10753, + "time_per_iteration": 2.7664098739624023 + }, + { + "auxiliary_loss_clip": 0.01345626, + "auxiliary_loss_mlp": 0.01038058, + "balance_loss_clip": 1.23558867, + "balance_loss_mlp": 1.02376425, + "epoch": 0.6465654591913422, + "flos": 23403205084680.0, + "grad_norm": 1.6318301674480002, + "language_loss": 0.85401553, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87785238, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.14300537, + "step": 10754, + "time_per_iteration": 2.761291980743408 + }, + { + "auxiliary_loss_clip": 0.01350601, + "auxiliary_loss_mlp": 0.01040132, + "balance_loss_clip": 1.23960686, + "balance_loss_mlp": 1.02686405, + "epoch": 0.6466255824440102, + "flos": 15381470153520.0, + "grad_norm": 2.056774195556948, + "language_loss": 0.60627723, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.63018453, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13250732, + "step": 10755, + "time_per_iteration": 2.775643825531006 + }, + { + "auxiliary_loss_clip": 0.01343418, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.23315227, + "balance_loss_mlp": 1.01721275, + "epoch": 0.6466857056966782, + "flos": 16257147363360.0, + "grad_norm": 2.5534562440837942, + "language_loss": 0.68423176, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70797646, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13824463, + "step": 10756, + "time_per_iteration": 2.742309331893921 + }, + { + "auxiliary_loss_clip": 0.01365282, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.24912846, + "balance_loss_mlp": 1.02251911, + "epoch": 0.6467458289493462, + "flos": 21183166687680.0, + "grad_norm": 2.3316042563156962, + "language_loss": 0.75597543, + "learning_rate": 1.172166263444844e-06, + "loss": 0.7799902, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.13671875, + "step": 10757, + "time_per_iteration": 2.734112024307251 + }, + { + "auxiliary_loss_clip": 0.01340801, + "auxiliary_loss_mlp": 0.01026183, + "balance_loss_clip": 1.23344076, + "balance_loss_mlp": 1.01358259, + "epoch": 0.6468059522020141, + "flos": 17972905232520.0, + "grad_norm": 1.3809051326634452, + "language_loss": 0.74407375, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76774365, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1262207, + "step": 10758, + "time_per_iteration": 2.79327654838562 + }, + { + "auxiliary_loss_clip": 0.0135512, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.24301267, + "balance_loss_mlp": 1.02048576, + "epoch": 0.6468660754546821, + "flos": 17894077759800.0, + "grad_norm": 1.5475779842355482, + "language_loss": 0.68197942, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.7058807, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14520264, + "step": 10759, + "time_per_iteration": 2.8503851890563965 + }, + { + "auxiliary_loss_clip": 0.0136131, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.24638855, + "balance_loss_mlp": 1.01890516, + "epoch": 0.64692619870735, + "flos": 22606071089040.0, + "grad_norm": 1.4811579300739328, + "language_loss": 0.75697327, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.78091264, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.1373291, + "step": 10760, + "time_per_iteration": 2.7899258136749268 + }, + { + "auxiliary_loss_clip": 0.01342922, + "auxiliary_loss_mlp": 0.0102634, + "balance_loss_clip": 1.23370707, + "balance_loss_mlp": 1.01351893, + "epoch": 0.646986321960018, + "flos": 49609434741840.0, + "grad_norm": 1.5381261011772684, + "language_loss": 0.65184218, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67553478, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1282959, + "step": 10761, + "time_per_iteration": 3.0548300743103027 + }, + { + "auxiliary_loss_clip": 0.01351222, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.2393539, + "balance_loss_mlp": 1.0135622, + "epoch": 0.6470464452126861, + "flos": 21913533893520.0, + "grad_norm": 2.9573603323402478, + "language_loss": 0.69955635, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.7233488, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14465332, + "step": 10762, + "time_per_iteration": 2.7852141857147217 + }, + { + "auxiliary_loss_clip": 0.0136317, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.24889803, + "balance_loss_mlp": 1.02067375, + "epoch": 0.647106568465354, + "flos": 18109687480920.0, + "grad_norm": 4.549728787054815, + "language_loss": 0.82838929, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.85236877, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.14111328, + "step": 10763, + "time_per_iteration": 2.801403522491455 + }, + { + "auxiliary_loss_clip": 0.01166599, + "auxiliary_loss_mlp": 0.01010298, + "balance_loss_clip": 1.12099135, + "balance_loss_mlp": 1.00736511, + "epoch": 0.647166691718022, + "flos": 69495328958760.0, + "grad_norm": 0.7146887191631764, + "language_loss": 0.57763976, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59940875, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02929688, + "step": 10764, + "time_per_iteration": 3.4878463745117188 + }, + { + "auxiliary_loss_clip": 0.01346655, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.23597622, + "balance_loss_mlp": 1.02351832, + "epoch": 0.6472268149706899, + "flos": 34101902946960.0, + "grad_norm": 1.7330175050686247, + "language_loss": 0.6044302, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62825978, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12799072, + "step": 10765, + "time_per_iteration": 2.975099563598633 + }, + { + "auxiliary_loss_clip": 0.01341406, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.23315406, + "balance_loss_mlp": 1.01613891, + "epoch": 0.6472869382233579, + "flos": 28117147615200.0, + "grad_norm": 1.8072035822927095, + "language_loss": 0.63611996, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65982276, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12744141, + "step": 10766, + "time_per_iteration": 2.906033754348755 + }, + { + "auxiliary_loss_clip": 0.01352451, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.242208, + "balance_loss_mlp": 1.01507163, + "epoch": 0.6473470614760258, + "flos": 22497291627840.0, + "grad_norm": 1.5981452560098814, + "language_loss": 0.76172352, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.78553879, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14007568, + "step": 10767, + "time_per_iteration": 2.8019747734069824 + }, + { + "auxiliary_loss_clip": 0.01349815, + "auxiliary_loss_mlp": 0.01026789, + "balance_loss_clip": 1.2390306, + "balance_loss_mlp": 1.01327682, + "epoch": 0.6474071847286939, + "flos": 14543135653680.0, + "grad_norm": 11.417911478462782, + "language_loss": 0.78140503, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.80517107, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13525391, + "step": 10768, + "time_per_iteration": 2.8482425212860107 + }, + { + "auxiliary_loss_clip": 0.01341912, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.23181617, + "balance_loss_mlp": 1.01548314, + "epoch": 0.6474673079813618, + "flos": 24103660910400.0, + "grad_norm": 1.6309938206639965, + "language_loss": 0.72111022, + "learning_rate": 1.167914135250663e-06, + "loss": 0.7448163, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13201904, + "step": 10769, + "time_per_iteration": 2.830296516418457 + }, + { + "auxiliary_loss_clip": 0.01339762, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.23210192, + "balance_loss_mlp": 1.01688766, + "epoch": 0.6475274312340298, + "flos": 14980365133200.0, + "grad_norm": 1.846888955593434, + "language_loss": 0.72152293, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74522197, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.13275146, + "step": 10770, + "time_per_iteration": 2.7234106063842773 + }, + { + "auxiliary_loss_clip": 0.0136033, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.24329972, + "balance_loss_mlp": 1.02096248, + "epoch": 0.6475875544866977, + "flos": 25050693654720.0, + "grad_norm": 1.844406141808682, + "language_loss": 0.73509181, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75905526, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.15045166, + "step": 10771, + "time_per_iteration": 2.885103225708008 + }, + { + "auxiliary_loss_clip": 0.01343526, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.23406184, + "balance_loss_mlp": 1.01992989, + "epoch": 0.6476476777393657, + "flos": 16476533661960.0, + "grad_norm": 1.8286818614145446, + "language_loss": 0.73727596, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76104337, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13293457, + "step": 10772, + "time_per_iteration": 2.736210346221924 + }, + { + "auxiliary_loss_clip": 0.01339252, + "auxiliary_loss_mlp": 0.01029014, + "balance_loss_clip": 1.2315135, + "balance_loss_mlp": 1.01675892, + "epoch": 0.6477078009920336, + "flos": 25817835053520.0, + "grad_norm": 1.4801748688392513, + "language_loss": 0.83014935, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85383201, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12261963, + "step": 10773, + "time_per_iteration": 4.285508155822754 + }, + { + "auxiliary_loss_clip": 0.01335412, + "auxiliary_loss_mlp": 0.01026066, + "balance_loss_clip": 1.227898, + "balance_loss_mlp": 1.01356101, + "epoch": 0.6477679242447016, + "flos": 17680660890120.0, + "grad_norm": 1.5309735443177794, + "language_loss": 0.79064447, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.81425929, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12506104, + "step": 10774, + "time_per_iteration": 4.163000822067261 + }, + { + "auxiliary_loss_clip": 0.01355915, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.24299204, + "balance_loss_mlp": 1.02039576, + "epoch": 0.6478280474973696, + "flos": 21037206949920.0, + "grad_norm": 3.160876964557225, + "language_loss": 0.6919595, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71585894, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.1362915, + "step": 10775, + "time_per_iteration": 2.761923313140869 + }, + { + "auxiliary_loss_clip": 0.01358755, + "auxiliary_loss_mlp": 0.0103503, + "balance_loss_clip": 1.24340796, + "balance_loss_mlp": 1.02181005, + "epoch": 0.6478881707500376, + "flos": 21622020501600.0, + "grad_norm": 1.7718763554485908, + "language_loss": 0.65888917, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.682827, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 1.15478516, + "router_z_loss_mlp": 0.13220215, + "step": 10776, + "time_per_iteration": 2.816770553588867 + }, + { + "auxiliary_loss_clip": 0.01350336, + "auxiliary_loss_mlp": 0.01033752, + "balance_loss_clip": 1.2376411, + "balance_loss_mlp": 1.01966143, + "epoch": 0.6479482940027056, + "flos": 18447233771880.0, + "grad_norm": 3.2092368366591972, + "language_loss": 0.79539138, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81923223, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14074707, + "step": 10777, + "time_per_iteration": 4.223371744155884 + }, + { + "auxiliary_loss_clip": 0.01345246, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.23475885, + "balance_loss_mlp": 1.01682556, + "epoch": 0.6480084172553735, + "flos": 22169207084760.0, + "grad_norm": 2.0035926342426755, + "language_loss": 0.73579913, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75956202, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.1418457, + "step": 10778, + "time_per_iteration": 2.7340002059936523 + }, + { + "auxiliary_loss_clip": 0.01342258, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.23349071, + "balance_loss_mlp": 1.01466405, + "epoch": 0.6480685405080415, + "flos": 24321585308040.0, + "grad_norm": 1.3685069779514167, + "language_loss": 0.7813223, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80502278, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13140869, + "step": 10779, + "time_per_iteration": 2.7969348430633545 + }, + { + "auxiliary_loss_clip": 0.01165585, + "auxiliary_loss_mlp": 0.01002182, + "balance_loss_clip": 1.11887276, + "balance_loss_mlp": 0.99908274, + "epoch": 0.6481286637607094, + "flos": 59906151181080.0, + "grad_norm": 0.7314313422107351, + "language_loss": 0.59427392, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.6159516, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.03088379, + "step": 10780, + "time_per_iteration": 3.231821060180664 + }, + { + "auxiliary_loss_clip": 0.01342847, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.23284173, + "balance_loss_mlp": 1.01934493, + "epoch": 0.6481887870133775, + "flos": 25489750510440.0, + "grad_norm": 1.7433452827614795, + "language_loss": 0.79543394, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81918371, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12774658, + "step": 10781, + "time_per_iteration": 3.011535167694092 + }, + { + "auxiliary_loss_clip": 0.01357717, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.2441591, + "balance_loss_mlp": 1.01766157, + "epoch": 0.6482489102660454, + "flos": 19933981161120.0, + "grad_norm": 2.2979830052267847, + "language_loss": 0.79218131, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81608486, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 1.13525391, + "router_z_loss_mlp": 0.1496582, + "step": 10782, + "time_per_iteration": 2.7453689575195312 + }, + { + "auxiliary_loss_clip": 0.01347437, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.23464537, + "balance_loss_mlp": 1.02119613, + "epoch": 0.6483090335187134, + "flos": 26985147480360.0, + "grad_norm": 1.9137150672646086, + "language_loss": 0.64110792, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66493607, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14190674, + "step": 10783, + "time_per_iteration": 2.8449995517730713 + }, + { + "auxiliary_loss_clip": 0.01352958, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.23807573, + "balance_loss_mlp": 1.01946783, + "epoch": 0.6483691567713813, + "flos": 25082838494640.0, + "grad_norm": 1.8883805997348262, + "language_loss": 0.89044654, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.91432011, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14929199, + "step": 10784, + "time_per_iteration": 2.893972158432007 + }, + { + "auxiliary_loss_clip": 0.01337135, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.22856593, + "balance_loss_mlp": 1.01421475, + "epoch": 0.6484292800240493, + "flos": 16110131808240.0, + "grad_norm": 1.8656879128346844, + "language_loss": 0.7330606, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75672007, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.14575195, + "step": 10785, + "time_per_iteration": 4.265395164489746 + }, + { + "auxiliary_loss_clip": 0.01341597, + "auxiliary_loss_mlp": 0.01027334, + "balance_loss_clip": 1.2339592, + "balance_loss_mlp": 1.01435173, + "epoch": 0.6484894032767172, + "flos": 28846337178600.0, + "grad_norm": 1.4368758232774976, + "language_loss": 0.69058502, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71427429, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12994385, + "step": 10786, + "time_per_iteration": 2.856299638748169 + }, + { + "auxiliary_loss_clip": 0.01342389, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.23258162, + "balance_loss_mlp": 1.01735568, + "epoch": 0.6485495265293852, + "flos": 30233198337480.0, + "grad_norm": 1.8126933738345647, + "language_loss": 0.71611071, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73984444, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13647461, + "step": 10787, + "time_per_iteration": 2.9622673988342285 + }, + { + "auxiliary_loss_clip": 0.01355274, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.24143243, + "balance_loss_mlp": 1.01771188, + "epoch": 0.6486096497820532, + "flos": 20086763103360.0, + "grad_norm": 2.0488367745823775, + "language_loss": 0.85132015, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.87518919, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13922119, + "step": 10788, + "time_per_iteration": 2.7360963821411133 + }, + { + "auxiliary_loss_clip": 0.01345147, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.23441672, + "balance_loss_mlp": 1.02077746, + "epoch": 0.6486697730347212, + "flos": 17133555523680.0, + "grad_norm": 2.069775153590274, + "language_loss": 0.77781641, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.80161512, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13946533, + "step": 10789, + "time_per_iteration": 2.758124589920044 + }, + { + "auxiliary_loss_clip": 0.01337763, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.2291342, + "balance_loss_mlp": 1.01438355, + "epoch": 0.6487298962873892, + "flos": 38919386460240.0, + "grad_norm": 1.650627210482181, + "language_loss": 0.75905049, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78269494, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12304688, + "step": 10790, + "time_per_iteration": 2.857290744781494 + }, + { + "auxiliary_loss_clip": 0.01342274, + "auxiliary_loss_mlp": 0.01024982, + "balance_loss_clip": 1.23394585, + "balance_loss_mlp": 1.01301932, + "epoch": 0.6487900195400571, + "flos": 11951619357960.0, + "grad_norm": 2.0123023207832937, + "language_loss": 0.6039359, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62760842, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.11968994, + "step": 10791, + "time_per_iteration": 2.748546600341797 + }, + { + "auxiliary_loss_clip": 0.01341185, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.2316097, + "balance_loss_mlp": 1.01518989, + "epoch": 0.6488501427927251, + "flos": 22349342080440.0, + "grad_norm": 1.6639703529858523, + "language_loss": 0.85943544, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88312554, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.1262207, + "step": 10792, + "time_per_iteration": 2.704319477081299 + }, + { + "auxiliary_loss_clip": 0.0135174, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.23764062, + "balance_loss_mlp": 1.01690626, + "epoch": 0.648910266045393, + "flos": 22241334178080.0, + "grad_norm": 2.2053743561267307, + "language_loss": 0.78570199, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80952829, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13983154, + "step": 10793, + "time_per_iteration": 2.8822944164276123 + }, + { + "auxiliary_loss_clip": 0.01348444, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.23589635, + "balance_loss_mlp": 1.01385951, + "epoch": 0.6489703892980611, + "flos": 25306732321200.0, + "grad_norm": 1.8358081573362106, + "language_loss": 0.74564809, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76940763, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13641357, + "step": 10794, + "time_per_iteration": 2.843111038208008 + }, + { + "auxiliary_loss_clip": 0.01345299, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.23490834, + "balance_loss_mlp": 1.0169729, + "epoch": 0.649030512550729, + "flos": 24577339716000.0, + "grad_norm": 1.6691924282080166, + "language_loss": 0.70588577, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72964215, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13372803, + "step": 10795, + "time_per_iteration": 2.763505220413208 + }, + { + "auxiliary_loss_clip": 0.01355666, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.24179578, + "balance_loss_mlp": 1.02012134, + "epoch": 0.649090635803397, + "flos": 26249623012800.0, + "grad_norm": 1.5877954768884857, + "language_loss": 0.54233694, + "learning_rate": 1.158363494676679e-06, + "loss": 0.56624043, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14562988, + "step": 10796, + "time_per_iteration": 2.7844436168670654 + }, + { + "auxiliary_loss_clip": 0.01347732, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.23623431, + "balance_loss_mlp": 1.01880884, + "epoch": 0.6491507590560649, + "flos": 24942929402520.0, + "grad_norm": 1.54890104959382, + "language_loss": 0.77867687, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.80247819, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 1.11474609, + "router_z_loss_mlp": 0.13592529, + "step": 10797, + "time_per_iteration": 2.742464065551758 + }, + { + "auxiliary_loss_clip": 0.01336937, + "auxiliary_loss_mlp": 0.01026237, + "balance_loss_clip": 1.23028886, + "balance_loss_mlp": 1.01371396, + "epoch": 0.6492108823087329, + "flos": 19504467270000.0, + "grad_norm": 1.9013458834888726, + "language_loss": 0.70611429, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72974604, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12530518, + "step": 10798, + "time_per_iteration": 2.7352592945098877 + }, + { + "auxiliary_loss_clip": 0.01343769, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.23276877, + "balance_loss_mlp": 1.01693523, + "epoch": 0.6492710055614008, + "flos": 19724097218760.0, + "grad_norm": 1.9019193478224639, + "language_loss": 0.77521533, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79895127, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12908936, + "step": 10799, + "time_per_iteration": 2.8019673824310303 + }, + { + "auxiliary_loss_clip": 0.01350384, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.23488951, + "balance_loss_mlp": 1.02032244, + "epoch": 0.6493311288140688, + "flos": 24322478691960.0, + "grad_norm": 2.068169129372902, + "language_loss": 0.72289836, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74674994, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.14465332, + "step": 10800, + "time_per_iteration": 2.989837646484375 + }, + { + "auxiliary_loss_clip": 0.0116538, + "auxiliary_loss_mlp": 0.010185, + "balance_loss_clip": 1.11925197, + "balance_loss_mlp": 1.01546061, + "epoch": 0.6493912520667368, + "flos": 70949484816120.0, + "grad_norm": 0.7982594368469546, + "language_loss": 0.60253888, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62437767, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.03039551, + "step": 10801, + "time_per_iteration": 3.3568496704101562 + }, + { + "auxiliary_loss_clip": 0.01351587, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.23748708, + "balance_loss_mlp": 1.02760971, + "epoch": 0.6494513753194048, + "flos": 25343262864000.0, + "grad_norm": 1.7617677045358906, + "language_loss": 0.79112649, + "learning_rate": 1.156244280393614e-06, + "loss": 0.81505817, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13964844, + "step": 10802, + "time_per_iteration": 2.772066116333008 + }, + { + "auxiliary_loss_clip": 0.01346794, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.23585868, + "balance_loss_mlp": 1.02280116, + "epoch": 0.6495114985720728, + "flos": 24687499861440.0, + "grad_norm": 1.6681759799070828, + "language_loss": 0.74902487, + "learning_rate": 1.155891189918541e-06, + "loss": 0.77286106, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14019775, + "step": 10803, + "time_per_iteration": 2.805068016052246 + }, + { + "auxiliary_loss_clip": 0.01347248, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.23554015, + "balance_loss_mlp": 1.01922297, + "epoch": 0.6495716218247407, + "flos": 23655345348600.0, + "grad_norm": 2.133498738852306, + "language_loss": 0.70430547, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72809714, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.12701416, + "step": 10804, + "time_per_iteration": 2.7318778038024902 + }, + { + "auxiliary_loss_clip": 0.01341045, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.23195982, + "balance_loss_mlp": 1.0165422, + "epoch": 0.6496317450774087, + "flos": 22351291281720.0, + "grad_norm": 1.7015003161806075, + "language_loss": 0.73314607, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.75686002, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13812256, + "step": 10805, + "time_per_iteration": 2.94923734664917 + }, + { + "auxiliary_loss_clip": 0.01349187, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.23705649, + "balance_loss_mlp": 1.0183295, + "epoch": 0.6496918683300766, + "flos": 30524792946120.0, + "grad_norm": 2.1647629753375237, + "language_loss": 0.6590454, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68284845, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12792969, + "step": 10806, + "time_per_iteration": 2.817657947540283 + }, + { + "auxiliary_loss_clip": 0.01354426, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.23925948, + "balance_loss_mlp": 1.02019167, + "epoch": 0.6497519915827447, + "flos": 12462884523720.0, + "grad_norm": 2.2230779134084715, + "language_loss": 0.78649575, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.81038225, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14038086, + "step": 10807, + "time_per_iteration": 2.7732295989990234 + }, + { + "auxiliary_loss_clip": 0.01162292, + "auxiliary_loss_mlp": 0.01005686, + "balance_loss_clip": 1.11680365, + "balance_loss_mlp": 1.00261009, + "epoch": 0.6498121148354126, + "flos": 69110954582760.0, + "grad_norm": 0.7851363501456392, + "language_loss": 0.59056395, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.61224371, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.03063965, + "step": 10808, + "time_per_iteration": 3.427842855453491 + }, + { + "auxiliary_loss_clip": 0.01342044, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.23481059, + "balance_loss_mlp": 1.01751041, + "epoch": 0.6498722380880806, + "flos": 36903262576680.0, + "grad_norm": 1.8334954092355569, + "language_loss": 0.63481605, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65854454, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13305664, + "step": 10809, + "time_per_iteration": 2.9141242504119873 + }, + { + "auxiliary_loss_clip": 0.01340134, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.23192751, + "balance_loss_mlp": 1.01814163, + "epoch": 0.6499323613407485, + "flos": 29023061072040.0, + "grad_norm": 1.5549244197984484, + "language_loss": 0.81646365, + "learning_rate": 1.153420453586008e-06, + "loss": 0.84017241, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12591553, + "step": 10810, + "time_per_iteration": 2.8323569297790527 + }, + { + "auxiliary_loss_clip": 0.01335628, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.22903657, + "balance_loss_mlp": 1.01843202, + "epoch": 0.6499924845934165, + "flos": 20123902771560.0, + "grad_norm": 1.5450732717506654, + "language_loss": 0.72179204, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.7454561, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12353516, + "step": 10811, + "time_per_iteration": 2.737332820892334 + }, + { + "auxiliary_loss_clip": 0.01334849, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.22998619, + "balance_loss_mlp": 1.01672888, + "epoch": 0.6500526078460844, + "flos": 24426141499800.0, + "grad_norm": 1.5136908774922677, + "language_loss": 0.78169721, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.80533063, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11767578, + "step": 10812, + "time_per_iteration": 5.7994065284729 + }, + { + "auxiliary_loss_clip": 0.01345089, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.23497462, + "balance_loss_mlp": 1.02470469, + "epoch": 0.6501127310987524, + "flos": 23336235253080.0, + "grad_norm": 1.7671361003445019, + "language_loss": 0.85157377, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87540686, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13519287, + "step": 10813, + "time_per_iteration": 2.719359874725342 + }, + { + "auxiliary_loss_clip": 0.01342104, + "auxiliary_loss_mlp": 0.01039449, + "balance_loss_clip": 1.23271203, + "balance_loss_mlp": 1.02675366, + "epoch": 0.6501728543514204, + "flos": 18702500879520.0, + "grad_norm": 2.3890140247861775, + "language_loss": 0.80176753, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.8255831, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1270752, + "step": 10814, + "time_per_iteration": 2.7905220985412598 + }, + { + "auxiliary_loss_clip": 0.01347472, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.23596621, + "balance_loss_mlp": 1.02679837, + "epoch": 0.6502329776040884, + "flos": 44207665585560.0, + "grad_norm": 1.6705638360538053, + "language_loss": 0.65715832, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.6810351, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13409424, + "step": 10815, + "time_per_iteration": 4.552392244338989 + }, + { + "auxiliary_loss_clip": 0.01356255, + "auxiliary_loss_mlp": 0.01037066, + "balance_loss_clip": 1.24104953, + "balance_loss_mlp": 1.02104473, + "epoch": 0.6502931008567564, + "flos": 14578854029280.0, + "grad_norm": 1.8375596776123273, + "language_loss": 0.7547878, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77872097, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.16015625, + "step": 10816, + "time_per_iteration": 2.8237922191619873 + }, + { + "auxiliary_loss_clip": 0.01338409, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.23076701, + "balance_loss_mlp": 1.01884663, + "epoch": 0.6503532241094243, + "flos": 21399669792720.0, + "grad_norm": 2.348341803667668, + "language_loss": 0.73056591, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75427514, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13653564, + "step": 10817, + "time_per_iteration": 2.7603700160980225 + }, + { + "auxiliary_loss_clip": 0.01344322, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.23333168, + "balance_loss_mlp": 1.02992153, + "epoch": 0.6504133473620923, + "flos": 74752948772280.0, + "grad_norm": 1.554058577188788, + "language_loss": 0.7205019, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74438202, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13775635, + "step": 10818, + "time_per_iteration": 3.1379306316375732 + }, + { + "auxiliary_loss_clip": 0.01348547, + "auxiliary_loss_mlp": 0.01031443, + "balance_loss_clip": 1.23599195, + "balance_loss_mlp": 1.01802039, + "epoch": 0.6504734706147602, + "flos": 19717275014280.0, + "grad_norm": 1.8085417193255728, + "language_loss": 0.65517688, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67897677, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13433838, + "step": 10819, + "time_per_iteration": 2.779304027557373 + }, + { + "auxiliary_loss_clip": 0.01342523, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.23143589, + "balance_loss_mlp": 1.02176738, + "epoch": 0.6505335938674283, + "flos": 25562405512440.0, + "grad_norm": 1.974871229791573, + "language_loss": 0.83668315, + "learning_rate": 1.14989356009286e-06, + "loss": 0.86045611, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13006592, + "step": 10820, + "time_per_iteration": 2.7668464183807373 + }, + { + "auxiliary_loss_clip": 0.01352884, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.24004769, + "balance_loss_mlp": 1.02029991, + "epoch": 0.6505937171200962, + "flos": 17825889677400.0, + "grad_norm": 1.851920079995777, + "language_loss": 0.78589487, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80976987, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14331055, + "step": 10821, + "time_per_iteration": 2.7802927494049072 + }, + { + "auxiliary_loss_clip": 0.01334932, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.22940707, + "balance_loss_mlp": 1.01841998, + "epoch": 0.6506538403727642, + "flos": 20673079164360.0, + "grad_norm": 1.3657450124367148, + "language_loss": 0.79811549, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82176352, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11437988, + "step": 10822, + "time_per_iteration": 2.8521382808685303 + }, + { + "auxiliary_loss_clip": 0.01337294, + "auxiliary_loss_mlp": 0.01030751, + "balance_loss_clip": 1.22845042, + "balance_loss_mlp": 1.01712561, + "epoch": 0.6507139636254321, + "flos": 11724192604080.0, + "grad_norm": 2.4175138296337506, + "language_loss": 0.87765515, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.90133566, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1361084, + "step": 10823, + "time_per_iteration": 2.787315845489502 + }, + { + "auxiliary_loss_clip": 0.01344333, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.23365998, + "balance_loss_mlp": 1.01975965, + "epoch": 0.6507740868781001, + "flos": 26767547949600.0, + "grad_norm": 1.6005832255204229, + "language_loss": 0.6699428, + "learning_rate": 1.148483704558183e-06, + "loss": 0.69371438, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13067627, + "step": 10824, + "time_per_iteration": 4.372370719909668 + }, + { + "auxiliary_loss_clip": 0.01341414, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.22863066, + "balance_loss_mlp": 1.02353144, + "epoch": 0.650834210130768, + "flos": 16476046361640.0, + "grad_norm": 2.3715958358575078, + "language_loss": 0.8832345, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.90701604, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13201904, + "step": 10825, + "time_per_iteration": 2.7743308544158936 + }, + { + "auxiliary_loss_clip": 0.01348989, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.23555183, + "balance_loss_mlp": 1.02008748, + "epoch": 0.650894333383436, + "flos": 17133108831720.0, + "grad_norm": 2.96894457714621, + "language_loss": 0.72971755, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75354785, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13964844, + "step": 10826, + "time_per_iteration": 2.6964492797851562 + }, + { + "auxiliary_loss_clip": 0.0134262, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.23249042, + "balance_loss_mlp": 1.01779985, + "epoch": 0.650954456636104, + "flos": 18738909597240.0, + "grad_norm": 1.7468247569862896, + "language_loss": 0.69421995, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71794784, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12365723, + "step": 10827, + "time_per_iteration": 2.708160161972046 + }, + { + "auxiliary_loss_clip": 0.01342265, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.23105526, + "balance_loss_mlp": 1.01599598, + "epoch": 0.651014579888772, + "flos": 24532728109560.0, + "grad_norm": 1.7806324378209677, + "language_loss": 0.76909983, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79280901, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12658691, + "step": 10828, + "time_per_iteration": 2.8970723152160645 + }, + { + "auxiliary_loss_clip": 0.01339014, + "auxiliary_loss_mlp": 0.01025, + "balance_loss_clip": 1.23111057, + "balance_loss_mlp": 1.01230383, + "epoch": 0.65107470314144, + "flos": 24066440025480.0, + "grad_norm": 1.7786107589307234, + "language_loss": 0.89569116, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91933131, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12695312, + "step": 10829, + "time_per_iteration": 2.7579903602600098 + }, + { + "auxiliary_loss_clip": 0.01167529, + "auxiliary_loss_mlp": 0.01002228, + "balance_loss_clip": 1.11758256, + "balance_loss_mlp": 0.99910432, + "epoch": 0.6511348263941079, + "flos": 72497574456120.0, + "grad_norm": 0.6366069533162452, + "language_loss": 0.55397052, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.5756681, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.03112793, + "step": 10830, + "time_per_iteration": 3.360006332397461 + }, + { + "auxiliary_loss_clip": 0.01351061, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.23700309, + "balance_loss_mlp": 1.01621675, + "epoch": 0.6511949496467759, + "flos": 23372928229320.0, + "grad_norm": 2.0135286768358442, + "language_loss": 0.75599706, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.77979988, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13012695, + "step": 10831, + "time_per_iteration": 2.7588112354278564 + }, + { + "auxiliary_loss_clip": 0.01163415, + "auxiliary_loss_mlp": 0.01004336, + "balance_loss_clip": 1.11468983, + "balance_loss_mlp": 1.00087857, + "epoch": 0.6512550728994438, + "flos": 67348960772760.0, + "grad_norm": 0.8030419831876974, + "language_loss": 0.51133859, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53301615, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.03466797, + "step": 10832, + "time_per_iteration": 3.287766456604004 + }, + { + "auxiliary_loss_clip": 0.01349642, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.23567009, + "balance_loss_mlp": 1.01918364, + "epoch": 0.6513151961521119, + "flos": 21146595536520.0, + "grad_norm": 2.0598148586524445, + "language_loss": 0.83986223, + "learning_rate": 1.145313419848316e-06, + "loss": 0.86369109, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.140625, + "step": 10833, + "time_per_iteration": 2.6989593505859375 + }, + { + "auxiliary_loss_clip": 0.01334459, + "auxiliary_loss_mlp": 0.01031009, + "balance_loss_clip": 1.22544181, + "balance_loss_mlp": 1.0182296, + "epoch": 0.6513753194047798, + "flos": 15163058455560.0, + "grad_norm": 3.2071561573881824, + "language_loss": 0.83725893, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.86091352, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12780762, + "step": 10834, + "time_per_iteration": 2.7353014945983887 + }, + { + "auxiliary_loss_clip": 0.01342945, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.23274207, + "balance_loss_mlp": 1.02147698, + "epoch": 0.6514354426574478, + "flos": 30232711037160.0, + "grad_norm": 1.4222268202409845, + "language_loss": 0.77104771, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79482234, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13049316, + "step": 10835, + "time_per_iteration": 2.8534011840820312 + }, + { + "auxiliary_loss_clip": 0.01345152, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.23472953, + "balance_loss_mlp": 1.01610768, + "epoch": 0.6514955659101157, + "flos": 24210775428840.0, + "grad_norm": 1.4223157538074644, + "language_loss": 0.77732819, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.80106831, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12750244, + "step": 10836, + "time_per_iteration": 2.7961864471435547 + }, + { + "auxiliary_loss_clip": 0.0134275, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.23253739, + "balance_loss_mlp": 1.02147317, + "epoch": 0.6515556891627837, + "flos": 12380564732040.0, + "grad_norm": 1.943875071667369, + "language_loss": 0.82791865, + "learning_rate": 1.143905246497783e-06, + "loss": 0.85168862, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12780762, + "step": 10837, + "time_per_iteration": 2.701836585998535 + }, + { + "auxiliary_loss_clip": 0.01336792, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.23027372, + "balance_loss_mlp": 1.01379573, + "epoch": 0.6516158124154516, + "flos": 49609922042160.0, + "grad_norm": 1.8511214166587426, + "language_loss": 0.59573376, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61938047, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.14086914, + "step": 10838, + "time_per_iteration": 2.9975924491882324 + }, + { + "auxiliary_loss_clip": 0.01162925, + "auxiliary_loss_mlp": 0.00999924, + "balance_loss_clip": 1.11458325, + "balance_loss_mlp": 0.9970625, + "epoch": 0.6516759356681197, + "flos": 59716879304400.0, + "grad_norm": 0.7256636754974803, + "language_loss": 0.60940397, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63103247, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.02856445, + "step": 10839, + "time_per_iteration": 3.3235933780670166 + }, + { + "auxiliary_loss_clip": 0.01337275, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.22949386, + "balance_loss_mlp": 1.01478601, + "epoch": 0.6517360589207876, + "flos": 37457839881360.0, + "grad_norm": 1.5474643087647122, + "language_loss": 0.68410265, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70774829, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12506104, + "step": 10840, + "time_per_iteration": 2.96061372756958 + }, + { + "auxiliary_loss_clip": 0.01336727, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.22695255, + "balance_loss_mlp": 1.01912379, + "epoch": 0.6517961821734556, + "flos": 25380483748920.0, + "grad_norm": 2.78405200996628, + "language_loss": 0.74061537, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.76429588, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12213135, + "step": 10841, + "time_per_iteration": 2.8073647022247314 + }, + { + "auxiliary_loss_clip": 0.01344469, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.23327649, + "balance_loss_mlp": 1.01679134, + "epoch": 0.6518563054261236, + "flos": 28772870009400.0, + "grad_norm": 1.4268443043686045, + "language_loss": 0.6287958, + "learning_rate": 1.142145760331648e-06, + "loss": 0.65253818, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12982178, + "step": 10842, + "time_per_iteration": 2.8115551471710205 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01004225, + "balance_loss_clip": 1.11596084, + "balance_loss_mlp": 1.0009346, + "epoch": 0.6519164286787915, + "flos": 68938924277880.0, + "grad_norm": 0.8156013617901814, + "language_loss": 0.5616293, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58332193, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.03295898, + "step": 10843, + "time_per_iteration": 3.046619176864624 + }, + { + "auxiliary_loss_clip": 0.01356268, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.24100852, + "balance_loss_mlp": 1.01983762, + "epoch": 0.6519765519314595, + "flos": 20445896060640.0, + "grad_norm": 1.895986304814798, + "language_loss": 0.82631361, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.85022044, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.14581299, + "step": 10844, + "time_per_iteration": 2.866443634033203 + }, + { + "auxiliary_loss_clip": 0.01342856, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.23158622, + "balance_loss_mlp": 1.01671159, + "epoch": 0.6520366751841274, + "flos": 28408904657280.0, + "grad_norm": 1.9643766078412936, + "language_loss": 0.60134459, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62508023, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.14013672, + "step": 10845, + "time_per_iteration": 2.875908613204956 + }, + { + "auxiliary_loss_clip": 0.01341614, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.23070765, + "balance_loss_mlp": 1.01836681, + "epoch": 0.6520967984367955, + "flos": 22278555063000.0, + "grad_norm": 1.5529613829112578, + "language_loss": 0.79410994, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81783819, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.128479, + "step": 10846, + "time_per_iteration": 2.843935012817383 + }, + { + "auxiliary_loss_clip": 0.01164797, + "auxiliary_loss_mlp": 0.01009472, + "balance_loss_clip": 1.1158998, + "balance_loss_mlp": 1.0065037, + "epoch": 0.6521569216894634, + "flos": 68935083464880.0, + "grad_norm": 0.7106250255155714, + "language_loss": 0.60273218, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62447476, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.02966309, + "step": 10847, + "time_per_iteration": 3.312190055847168 + }, + { + "auxiliary_loss_clip": 0.01353728, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.24085832, + "balance_loss_mlp": 1.02515674, + "epoch": 0.6522170449421314, + "flos": 29136266844480.0, + "grad_norm": 1.5197118028533565, + "language_loss": 0.81256461, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83648968, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13623047, + "step": 10848, + "time_per_iteration": 2.7855677604675293 + }, + { + "auxiliary_loss_clip": 0.0133795, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.22911525, + "balance_loss_mlp": 1.02636349, + "epoch": 0.6522771681947993, + "flos": 26657672062680.0, + "grad_norm": 2.1569644037728932, + "language_loss": 0.75218678, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.77596056, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13067627, + "step": 10849, + "time_per_iteration": 2.767062187194824 + }, + { + "auxiliary_loss_clip": 0.01338575, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.23155022, + "balance_loss_mlp": 1.01767409, + "epoch": 0.6523372914474673, + "flos": 25745383093320.0, + "grad_norm": 1.487565505572503, + "language_loss": 0.68278545, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70647579, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12780762, + "step": 10850, + "time_per_iteration": 4.200137138366699 + }, + { + "auxiliary_loss_clip": 0.0133516, + "auxiliary_loss_mlp": 0.01029142, + "balance_loss_clip": 1.22825408, + "balance_loss_mlp": 1.01617837, + "epoch": 0.6523974147001352, + "flos": 24832809865440.0, + "grad_norm": 1.8137104366807504, + "language_loss": 0.67088395, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.69452691, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12963867, + "step": 10851, + "time_per_iteration": 4.290473937988281 + }, + { + "auxiliary_loss_clip": 0.01347749, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.23526943, + "balance_loss_mlp": 1.01862907, + "epoch": 0.6524575379528033, + "flos": 26322562273320.0, + "grad_norm": 2.51863415474652, + "language_loss": 0.73787028, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76166809, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13409424, + "step": 10852, + "time_per_iteration": 2.8641064167022705 + }, + { + "auxiliary_loss_clip": 0.01354307, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.24037361, + "balance_loss_mlp": 1.01750541, + "epoch": 0.6525176612054712, + "flos": 19498091757480.0, + "grad_norm": 1.9509628507363435, + "language_loss": 0.66555005, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68940789, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13977051, + "step": 10853, + "time_per_iteration": 4.208316802978516 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.0100873, + "balance_loss_clip": 1.11861157, + "balance_loss_mlp": 1.00548792, + "epoch": 0.6525777844581392, + "flos": 71722636252200.0, + "grad_norm": 0.7215451447474821, + "language_loss": 0.63033068, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65209532, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.0324707, + "step": 10854, + "time_per_iteration": 3.4150454998016357 + }, + { + "auxiliary_loss_clip": 0.01345732, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.23511839, + "balance_loss_mlp": 1.02190888, + "epoch": 0.6526379077108072, + "flos": 26659012138560.0, + "grad_norm": 1.8477619031143462, + "language_loss": 0.7768566, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.8006711, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13793945, + "step": 10855, + "time_per_iteration": 2.8853189945220947 + }, + { + "auxiliary_loss_clip": 0.01330423, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.22419202, + "balance_loss_mlp": 1.01355875, + "epoch": 0.6526980309634751, + "flos": 22825538604360.0, + "grad_norm": 1.812073790009235, + "language_loss": 0.79377353, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81734103, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12780762, + "step": 10856, + "time_per_iteration": 2.7759034633636475 + }, + { + "auxiliary_loss_clip": 0.01348176, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.23783398, + "balance_loss_mlp": 1.01935852, + "epoch": 0.6527581542161431, + "flos": 28370790388440.0, + "grad_norm": 1.6561441816306535, + "language_loss": 0.74002254, + "learning_rate": 1.136872187988815e-06, + "loss": 0.76384515, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14709473, + "step": 10857, + "time_per_iteration": 2.913656234741211 + }, + { + "auxiliary_loss_clip": 0.01341458, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.23164189, + "balance_loss_mlp": 1.01656747, + "epoch": 0.652818277468811, + "flos": 18373807211040.0, + "grad_norm": 2.1498152276198965, + "language_loss": 0.62867713, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.6523838, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12634277, + "step": 10858, + "time_per_iteration": 2.7871854305267334 + }, + { + "auxiliary_loss_clip": 0.01336325, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.22889948, + "balance_loss_mlp": 1.01929033, + "epoch": 0.6528784007214791, + "flos": 18040118714280.0, + "grad_norm": 1.5597580794909442, + "language_loss": 0.78817773, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.81186461, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.13067627, + "step": 10859, + "time_per_iteration": 2.6938698291778564 + }, + { + "auxiliary_loss_clip": 0.01347781, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.2353698, + "balance_loss_mlp": 1.02040458, + "epoch": 0.652938523974147, + "flos": 22387050265680.0, + "grad_norm": 1.588637655287111, + "language_loss": 0.6817826, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70559722, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.1328125, + "step": 10860, + "time_per_iteration": 2.8130886554718018 + }, + { + "auxiliary_loss_clip": 0.01352235, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.23876143, + "balance_loss_mlp": 1.01886106, + "epoch": 0.652998647226815, + "flos": 16768453137480.0, + "grad_norm": 1.9697651008636807, + "language_loss": 0.67287815, + "learning_rate": 1.135467143909712e-06, + "loss": 0.69672024, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13116455, + "step": 10861, + "time_per_iteration": 2.70949649810791 + }, + { + "auxiliary_loss_clip": 0.01349158, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.23599029, + "balance_loss_mlp": 1.01898384, + "epoch": 0.6530587704794829, + "flos": 35779343505480.0, + "grad_norm": 1.5931330922848612, + "language_loss": 0.65315443, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67698342, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14770508, + "step": 10862, + "time_per_iteration": 3.0365149974823 + }, + { + "auxiliary_loss_clip": 0.01340789, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.23153615, + "balance_loss_mlp": 1.0221312, + "epoch": 0.6531188937321509, + "flos": 19320636913560.0, + "grad_norm": 1.5544733199448972, + "language_loss": 0.7707746, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79453146, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.12750244, + "step": 10863, + "time_per_iteration": 4.349055051803589 + }, + { + "auxiliary_loss_clip": 0.0134535, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.23613274, + "balance_loss_mlp": 1.01758158, + "epoch": 0.6531790169848188, + "flos": 22899005773560.0, + "grad_norm": 1.635873513354877, + "language_loss": 0.7467649, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.77051795, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12353516, + "step": 10864, + "time_per_iteration": 2.768115520477295 + }, + { + "auxiliary_loss_clip": 0.01341668, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.23308086, + "balance_loss_mlp": 1.01984596, + "epoch": 0.6532391402374869, + "flos": 29568704537520.0, + "grad_norm": 1.6840303471227998, + "language_loss": 0.86845911, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.89219689, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1227417, + "step": 10865, + "time_per_iteration": 2.8632149696350098 + }, + { + "auxiliary_loss_clip": 0.01352027, + "auxiliary_loss_mlp": 0.01029286, + "balance_loss_clip": 1.23965549, + "balance_loss_mlp": 1.01601851, + "epoch": 0.6532992634901548, + "flos": 23109783099840.0, + "grad_norm": 1.5672485495585304, + "language_loss": 0.81046772, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83428085, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13269043, + "step": 10866, + "time_per_iteration": 2.737739086151123 + }, + { + "auxiliary_loss_clip": 0.01341889, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.23350096, + "balance_loss_mlp": 1.01431084, + "epoch": 0.6533593867428228, + "flos": 26072899119360.0, + "grad_norm": 1.429493174321846, + "language_loss": 0.82424122, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84793806, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13494873, + "step": 10867, + "time_per_iteration": 2.7746787071228027 + }, + { + "auxiliary_loss_clip": 0.01340167, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.22932553, + "balance_loss_mlp": 1.01513171, + "epoch": 0.6534195099954908, + "flos": 21217098295440.0, + "grad_norm": 1.7997552677926472, + "language_loss": 0.81531447, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.83899748, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12994385, + "step": 10868, + "time_per_iteration": 2.7406463623046875 + }, + { + "auxiliary_loss_clip": 0.01342145, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.23017669, + "balance_loss_mlp": 1.01570809, + "epoch": 0.6534796332481587, + "flos": 19651401608400.0, + "grad_norm": 2.122733311950964, + "language_loss": 0.79586589, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81958199, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13751221, + "step": 10869, + "time_per_iteration": 2.7813727855682373 + }, + { + "auxiliary_loss_clip": 0.01341725, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.23165298, + "balance_loss_mlp": 1.01667202, + "epoch": 0.6535397565008267, + "flos": 24027554197800.0, + "grad_norm": 4.037961425086493, + "language_loss": 0.72430122, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74801326, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12817383, + "step": 10870, + "time_per_iteration": 2.804802417755127 + }, + { + "auxiliary_loss_clip": 0.01341027, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.23160601, + "balance_loss_mlp": 1.01871371, + "epoch": 0.6535998797534947, + "flos": 24607454137920.0, + "grad_norm": 2.1519518617439637, + "language_loss": 0.74729204, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.77102077, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13134766, + "step": 10871, + "time_per_iteration": 2.7483246326446533 + }, + { + "auxiliary_loss_clip": 0.0133529, + "auxiliary_loss_mlp": 0.01027119, + "balance_loss_clip": 1.22907805, + "balance_loss_mlp": 1.01454198, + "epoch": 0.6536600030061627, + "flos": 23368948610040.0, + "grad_norm": 1.83336464025245, + "language_loss": 0.55914891, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.58277297, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12591553, + "step": 10872, + "time_per_iteration": 2.8128366470336914 + }, + { + "auxiliary_loss_clip": 0.01336676, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.22894621, + "balance_loss_mlp": 1.02076519, + "epoch": 0.6537201262588306, + "flos": 23883787311480.0, + "grad_norm": 1.679559031041067, + "language_loss": 0.75266039, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77636278, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12774658, + "step": 10873, + "time_per_iteration": 2.7838850021362305 + }, + { + "auxiliary_loss_clip": 0.0133579, + "auxiliary_loss_mlp": 0.01030012, + "balance_loss_clip": 1.22655189, + "balance_loss_mlp": 1.01665485, + "epoch": 0.6537802495114986, + "flos": 24360389919000.0, + "grad_norm": 1.9858047474805351, + "language_loss": 0.75774992, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.78140789, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13354492, + "step": 10874, + "time_per_iteration": 3.050840139389038 + }, + { + "auxiliary_loss_clip": 0.01336659, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.22733593, + "balance_loss_mlp": 1.01290679, + "epoch": 0.6538403727641665, + "flos": 28002155074920.0, + "grad_norm": 1.5612346262173815, + "language_loss": 0.81547076, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83909631, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13000488, + "step": 10875, + "time_per_iteration": 2.8121230602264404 + }, + { + "auxiliary_loss_clip": 0.01341614, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.23129845, + "balance_loss_mlp": 1.01683474, + "epoch": 0.6539004960168345, + "flos": 27569392515000.0, + "grad_norm": 1.5531631027173955, + "language_loss": 0.70299566, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72670603, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12591553, + "step": 10876, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.01336312, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.22708452, + "balance_loss_mlp": 1.02102852, + "epoch": 0.6539606192695024, + "flos": 14533105388760.0, + "grad_norm": 1.9078112430496335, + "language_loss": 0.79822087, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.8219263, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13208008, + "step": 10877, + "time_per_iteration": 2.754983425140381 + }, + { + "auxiliary_loss_clip": 0.01334853, + "auxiliary_loss_mlp": 0.01024976, + "balance_loss_clip": 1.22509408, + "balance_loss_mlp": 1.0121249, + "epoch": 0.6540207425221705, + "flos": 21621370767840.0, + "grad_norm": 2.1320769402368227, + "language_loss": 0.79911447, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82271278, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12835693, + "step": 10878, + "time_per_iteration": 2.786318063735962 + }, + { + "auxiliary_loss_clip": 0.01332908, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.22333062, + "balance_loss_mlp": 1.01304364, + "epoch": 0.6540808657748384, + "flos": 17672579826480.0, + "grad_norm": 2.0568456369921355, + "language_loss": 0.84609532, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86969388, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13903809, + "step": 10879, + "time_per_iteration": 2.805715322494507 + }, + { + "auxiliary_loss_clip": 0.01342437, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.22950244, + "balance_loss_mlp": 1.01340222, + "epoch": 0.6541409890275064, + "flos": 14542404703200.0, + "grad_norm": 2.569735811263235, + "language_loss": 0.72338438, + "learning_rate": 1.128800362199601e-06, + "loss": 0.74708641, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14367676, + "step": 10880, + "time_per_iteration": 2.714179515838623 + }, + { + "auxiliary_loss_clip": 0.01333964, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.22628176, + "balance_loss_mlp": 1.01253164, + "epoch": 0.6542011122801744, + "flos": 17169761199600.0, + "grad_norm": 2.025429260830428, + "language_loss": 0.84616756, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.8697648, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13238525, + "step": 10881, + "time_per_iteration": 2.772235870361328 + }, + { + "auxiliary_loss_clip": 0.01350349, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.23729098, + "balance_loss_mlp": 1.01754308, + "epoch": 0.6542612355328423, + "flos": 18191154497040.0, + "grad_norm": 2.03135353198695, + "language_loss": 0.78095376, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.804775, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14215088, + "step": 10882, + "time_per_iteration": 2.7583417892456055 + }, + { + "auxiliary_loss_clip": 0.01342383, + "auxiliary_loss_mlp": 0.01027411, + "balance_loss_clip": 1.23052311, + "balance_loss_mlp": 1.01283789, + "epoch": 0.6543213587855103, + "flos": 19797280129440.0, + "grad_norm": 2.3577432604197996, + "language_loss": 0.82605231, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84975022, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14562988, + "step": 10883, + "time_per_iteration": 2.7885901927948 + }, + { + "auxiliary_loss_clip": 0.01339482, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.22842073, + "balance_loss_mlp": 1.01656008, + "epoch": 0.6543814820381783, + "flos": 21109983777000.0, + "grad_norm": 3.791658982431743, + "language_loss": 0.86004174, + "learning_rate": 1.127398345803988e-06, + "loss": 0.88373721, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.1348877, + "step": 10884, + "time_per_iteration": 2.779416561126709 + }, + { + "auxiliary_loss_clip": 0.01343046, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.23168695, + "balance_loss_mlp": 1.01794696, + "epoch": 0.6544416052908463, + "flos": 20198953666800.0, + "grad_norm": 2.1776464437001026, + "language_loss": 0.80078721, + "learning_rate": 1.127047924394715e-06, + "loss": 0.8245337, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13653564, + "step": 10885, + "time_per_iteration": 2.764327049255371 + }, + { + "auxiliary_loss_clip": 0.01331018, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.22325802, + "balance_loss_mlp": 1.01443076, + "epoch": 0.6545017285435142, + "flos": 23373577963080.0, + "grad_norm": 1.8176825223910604, + "language_loss": 0.72144419, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74502945, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13085938, + "step": 10886, + "time_per_iteration": 2.9469499588012695 + }, + { + "auxiliary_loss_clip": 0.01340378, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.2313273, + "balance_loss_mlp": 1.01824141, + "epoch": 0.6545618517961822, + "flos": 19139324275440.0, + "grad_norm": 1.7118152676645666, + "language_loss": 0.78255594, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80626559, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12335205, + "step": 10887, + "time_per_iteration": 2.875281572341919 + }, + { + "auxiliary_loss_clip": 0.01335796, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.22625995, + "balance_loss_mlp": 1.01682901, + "epoch": 0.6546219750488501, + "flos": 14942656947960.0, + "grad_norm": 1.7603756537340864, + "language_loss": 0.79030704, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81396174, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.1282959, + "step": 10888, + "time_per_iteration": 4.235857009887695 + }, + { + "auxiliary_loss_clip": 0.01330321, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.22333789, + "balance_loss_mlp": 1.01682591, + "epoch": 0.6546820983015181, + "flos": 36327504689280.0, + "grad_norm": 1.4920658035227, + "language_loss": 0.66912377, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.69271803, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12286377, + "step": 10889, + "time_per_iteration": 2.935654401779175 + }, + { + "auxiliary_loss_clip": 0.01336143, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.22568989, + "balance_loss_mlp": 1.01578116, + "epoch": 0.654742221554186, + "flos": 20416187722320.0, + "grad_norm": 1.4933023839492803, + "language_loss": 0.79765999, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82131374, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13446045, + "step": 10890, + "time_per_iteration": 4.112157344818115 + }, + { + "auxiliary_loss_clip": 0.01341327, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.22954297, + "balance_loss_mlp": 1.01353264, + "epoch": 0.6548023448068541, + "flos": 24869096758080.0, + "grad_norm": 2.0437657974572634, + "language_loss": 0.65509629, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67877823, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 1.11767578, + "router_z_loss_mlp": 0.13342285, + "step": 10891, + "time_per_iteration": 2.7684178352355957 + }, + { + "auxiliary_loss_clip": 0.0133731, + "auxiliary_loss_mlp": 0.01035551, + "balance_loss_clip": 1.22737992, + "balance_loss_mlp": 1.02321267, + "epoch": 0.654862468059522, + "flos": 21431733415920.0, + "grad_norm": 2.095781064033408, + "language_loss": 0.7972647, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.8209933, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12335205, + "step": 10892, + "time_per_iteration": 4.273289918899536 + }, + { + "auxiliary_loss_clip": 0.01345591, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.23165298, + "balance_loss_mlp": 1.01953626, + "epoch": 0.65492259131219, + "flos": 26583433334640.0, + "grad_norm": 1.7255638538868938, + "language_loss": 0.77842581, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80221295, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13574219, + "step": 10893, + "time_per_iteration": 2.7964820861816406 + }, + { + "auxiliary_loss_clip": 0.01336704, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.22565484, + "balance_loss_mlp": 1.01708448, + "epoch": 0.6549827145648579, + "flos": 21505606668720.0, + "grad_norm": 1.6628372015775452, + "language_loss": 0.70602834, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72970796, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.14178467, + "step": 10894, + "time_per_iteration": 2.7714734077453613 + }, + { + "auxiliary_loss_clip": 0.0134654, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.23377633, + "balance_loss_mlp": 1.01922584, + "epoch": 0.6550428378175259, + "flos": 22598477325720.0, + "grad_norm": 3.15968298461327, + "language_loss": 0.62590849, + "learning_rate": 1.123545533127549e-06, + "loss": 0.64970458, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.1383667, + "step": 10895, + "time_per_iteration": 2.903517961502075 + }, + { + "auxiliary_loss_clip": 0.01330945, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.22209108, + "balance_loss_mlp": 1.02131653, + "epoch": 0.655102961070194, + "flos": 12827743259760.0, + "grad_norm": 1.9294888013806146, + "language_loss": 0.79120147, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81484914, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12530518, + "step": 10896, + "time_per_iteration": 2.8532440662384033 + }, + { + "auxiliary_loss_clip": 0.01328122, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.22240543, + "balance_loss_mlp": 1.01875472, + "epoch": 0.6551630843228619, + "flos": 24796563581160.0, + "grad_norm": 1.369098746956579, + "language_loss": 0.70721388, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.73080587, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12304688, + "step": 10897, + "time_per_iteration": 2.996699810028076 + }, + { + "auxiliary_loss_clip": 0.01338837, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.22635293, + "balance_loss_mlp": 1.01980388, + "epoch": 0.6552232075755299, + "flos": 16728470884080.0, + "grad_norm": 1.679100330076495, + "language_loss": 0.75468385, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77839679, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12634277, + "step": 10898, + "time_per_iteration": 2.7372076511383057 + }, + { + "auxiliary_loss_clip": 0.01332244, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.2232852, + "balance_loss_mlp": 1.02180815, + "epoch": 0.6552833308281978, + "flos": 22021379362440.0, + "grad_norm": 2.341249357186882, + "language_loss": 0.7408452, + "learning_rate": 1.122145506463827e-06, + "loss": 0.76451385, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12823486, + "step": 10899, + "time_per_iteration": 2.789360284805298 + }, + { + "auxiliary_loss_clip": 0.01338683, + "auxiliary_loss_mlp": 0.01031165, + "balance_loss_clip": 1.22752738, + "balance_loss_mlp": 1.01815379, + "epoch": 0.6553434540808658, + "flos": 24868812499560.0, + "grad_norm": 2.2760265790016816, + "language_loss": 0.55999482, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58369327, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13018799, + "step": 10900, + "time_per_iteration": 2.7790842056274414 + }, + { + "auxiliary_loss_clip": 0.01337643, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.22735023, + "balance_loss_mlp": 1.01935911, + "epoch": 0.6554035773335337, + "flos": 23226196932720.0, + "grad_norm": 1.6196221819634091, + "language_loss": 0.77074122, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79444802, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13690186, + "step": 10901, + "time_per_iteration": 2.776033878326416 + }, + { + "auxiliary_loss_clip": 0.01331836, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.22341132, + "balance_loss_mlp": 1.01558328, + "epoch": 0.6554637005862017, + "flos": 22788723803040.0, + "grad_norm": 1.6075183414769336, + "language_loss": 0.73727733, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.76088142, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12994385, + "step": 10902, + "time_per_iteration": 4.312322378158569 + }, + { + "auxiliary_loss_clip": 0.01326918, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.22022295, + "balance_loss_mlp": 1.01407182, + "epoch": 0.6555238238388696, + "flos": 21512266439760.0, + "grad_norm": 1.5295221337722698, + "language_loss": 0.68129158, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.70483863, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13708496, + "step": 10903, + "time_per_iteration": 2.758497714996338 + }, + { + "auxiliary_loss_clip": 0.01345628, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.23098195, + "balance_loss_mlp": 1.02092886, + "epoch": 0.6555839470915377, + "flos": 30525726938400.0, + "grad_norm": 1.9977055757375373, + "language_loss": 0.66993666, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.6937409, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.13879395, + "step": 10904, + "time_per_iteration": 2.924649477005005 + }, + { + "auxiliary_loss_clip": 0.0133629, + "auxiliary_loss_mlp": 0.01038802, + "balance_loss_clip": 1.22427821, + "balance_loss_mlp": 1.02447271, + "epoch": 0.6556440703442056, + "flos": 24648126733440.0, + "grad_norm": 1.7278468082475043, + "language_loss": 0.91238165, + "learning_rate": 1.120046465383464e-06, + "loss": 0.93613255, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14331055, + "step": 10905, + "time_per_iteration": 2.8064565658569336 + }, + { + "auxiliary_loss_clip": 0.01325759, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.21982372, + "balance_loss_mlp": 1.01821303, + "epoch": 0.6557041935968736, + "flos": 23737746357000.0, + "grad_norm": 1.9932590483140502, + "language_loss": 0.75710154, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.78067046, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12921143, + "step": 10906, + "time_per_iteration": 2.8510773181915283 + }, + { + "auxiliary_loss_clip": 0.01337056, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.22552609, + "balance_loss_mlp": 1.02807188, + "epoch": 0.6557643168495415, + "flos": 11105650486440.0, + "grad_norm": 2.4800105968589747, + "language_loss": 0.75848413, + "learning_rate": 1.119347051825267e-06, + "loss": 0.78227103, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13574219, + "step": 10907, + "time_per_iteration": 2.78262996673584 + }, + { + "auxiliary_loss_clip": 0.01331572, + "auxiliary_loss_mlp": 0.01032587, + "balance_loss_clip": 1.22124922, + "balance_loss_mlp": 1.01821029, + "epoch": 0.6558244401022095, + "flos": 30197236311720.0, + "grad_norm": 1.3839335359789382, + "language_loss": 0.72282791, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74646956, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14379883, + "step": 10908, + "time_per_iteration": 2.789546251296997 + }, + { + "auxiliary_loss_clip": 0.01332228, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.22292185, + "balance_loss_mlp": 1.02212453, + "epoch": 0.6558845633548775, + "flos": 17935562522520.0, + "grad_norm": 2.0604010058814235, + "language_loss": 0.81838822, + "learning_rate": 1.118647771844861e-06, + "loss": 0.8420676, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13586426, + "step": 10909, + "time_per_iteration": 2.8595361709594727 + }, + { + "auxiliary_loss_clip": 0.0134055, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.22835183, + "balance_loss_mlp": 1.02263832, + "epoch": 0.6559446866075455, + "flos": 21908660890320.0, + "grad_norm": 2.067268441269894, + "language_loss": 0.64174491, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.66551399, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13739014, + "step": 10910, + "time_per_iteration": 2.717533588409424 + }, + { + "auxiliary_loss_clip": 0.0134611, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.22896647, + "balance_loss_mlp": 1.02023101, + "epoch": 0.6560048098602135, + "flos": 14130863334360.0, + "grad_norm": 2.6949735647903252, + "language_loss": 0.76079178, + "learning_rate": 1.117948625548313e-06, + "loss": 0.78459799, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.14282227, + "step": 10911, + "time_per_iteration": 2.7000701427459717 + }, + { + "auxiliary_loss_clip": 0.01327229, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.2204572, + "balance_loss_mlp": 1.01779616, + "epoch": 0.6560649331128814, + "flos": 18812295549720.0, + "grad_norm": 1.626486378640359, + "language_loss": 0.7542311, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77780414, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12286377, + "step": 10912, + "time_per_iteration": 2.7364799976348877 + }, + { + "auxiliary_loss_clip": 0.01354555, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.23811698, + "balance_loss_mlp": 1.02355218, + "epoch": 0.6561250563655494, + "flos": 17057773677960.0, + "grad_norm": 1.6596031085051237, + "language_loss": 0.77748454, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.80141258, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.14691162, + "step": 10913, + "time_per_iteration": 2.790769577026367 + }, + { + "auxiliary_loss_clip": 0.01326559, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.21996486, + "balance_loss_mlp": 1.02177703, + "epoch": 0.6561851796182173, + "flos": 22642764065280.0, + "grad_norm": 1.6975068870950445, + "language_loss": 0.71571118, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73931324, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11871338, + "step": 10914, + "time_per_iteration": 2.7833902835845947 + }, + { + "auxiliary_loss_clip": 0.01328295, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.21953321, + "balance_loss_mlp": 1.01786172, + "epoch": 0.6562453028708853, + "flos": 19243190125080.0, + "grad_norm": 1.6631003167127814, + "language_loss": 0.73926461, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76285547, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.12939453, + "step": 10915, + "time_per_iteration": 2.778670072555542 + }, + { + "auxiliary_loss_clip": 0.01330021, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.22252917, + "balance_loss_mlp": 1.01762855, + "epoch": 0.6563054261235532, + "flos": 23806056264480.0, + "grad_norm": 1.7559151076080515, + "language_loss": 0.79734111, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.8209542, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13659668, + "step": 10916, + "time_per_iteration": 2.8395841121673584 + }, + { + "auxiliary_loss_clip": 0.01336349, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.22504759, + "balance_loss_mlp": 1.02198529, + "epoch": 0.6563655493762213, + "flos": 19244124117360.0, + "grad_norm": 1.6131636290251044, + "language_loss": 0.76422477, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78793514, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.1270752, + "step": 10917, + "time_per_iteration": 2.8623054027557373 + }, + { + "auxiliary_loss_clip": 0.01330117, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.22112155, + "balance_loss_mlp": 1.01489913, + "epoch": 0.6564256726288892, + "flos": 25561918212120.0, + "grad_norm": 1.693579634260719, + "language_loss": 0.69960874, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.72319466, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13562012, + "step": 10918, + "time_per_iteration": 2.8695430755615234 + }, + { + "auxiliary_loss_clip": 0.01323501, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.21942151, + "balance_loss_mlp": 1.01756954, + "epoch": 0.6564857958815572, + "flos": 22205778235920.0, + "grad_norm": 1.5704836977522068, + "language_loss": 0.76492268, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78845477, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.12145996, + "step": 10919, + "time_per_iteration": 2.754323720932007 + }, + { + "auxiliary_loss_clip": 0.01160146, + "auxiliary_loss_mlp": 0.01009127, + "balance_loss_clip": 1.11319792, + "balance_loss_mlp": 1.00569332, + "epoch": 0.6565459191342251, + "flos": 58135670223840.0, + "grad_norm": 0.718337150663312, + "language_loss": 0.5306716, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55236435, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.03442383, + "step": 10920, + "time_per_iteration": 3.2656569480895996 + }, + { + "auxiliary_loss_clip": 0.01331197, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.2235992, + "balance_loss_mlp": 1.01860023, + "epoch": 0.6566060423868931, + "flos": 30815697212640.0, + "grad_norm": 1.4545056476842837, + "language_loss": 0.65765047, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.68128467, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.1362915, + "step": 10921, + "time_per_iteration": 3.0208921432495117 + }, + { + "auxiliary_loss_clip": 0.01332022, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.22247434, + "balance_loss_mlp": 1.01677465, + "epoch": 0.6566661656395612, + "flos": 23372643970800.0, + "grad_norm": 1.8041995248443665, + "language_loss": 0.81467366, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83830249, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.14086914, + "step": 10922, + "time_per_iteration": 2.846104860305786 + }, + { + "auxiliary_loss_clip": 0.01335313, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.22477221, + "balance_loss_mlp": 1.02019012, + "epoch": 0.6567262888922291, + "flos": 25740672523560.0, + "grad_norm": 2.162673627921043, + "language_loss": 0.71740985, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.74109334, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12841797, + "step": 10923, + "time_per_iteration": 2.778459310531616 + }, + { + "auxiliary_loss_clip": 0.01337141, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.22821617, + "balance_loss_mlp": 1.01582909, + "epoch": 0.6567864121448971, + "flos": 17127504878040.0, + "grad_norm": 1.9289906101288319, + "language_loss": 0.80940187, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.83306336, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13195801, + "step": 10924, + "time_per_iteration": 2.70061993598938 + }, + { + "auxiliary_loss_clip": 0.01334901, + "auxiliary_loss_mlp": 0.01024026, + "balance_loss_clip": 1.22563529, + "balance_loss_mlp": 1.01143169, + "epoch": 0.656846535397565, + "flos": 22424474192400.0, + "grad_norm": 1.4917298572444417, + "language_loss": 0.72493422, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74852353, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1260376, + "step": 10925, + "time_per_iteration": 2.809258460998535 + }, + { + "auxiliary_loss_clip": 0.01337342, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.22816479, + "balance_loss_mlp": 1.01534557, + "epoch": 0.656906658650233, + "flos": 17708013943560.0, + "grad_norm": 2.3974610791981736, + "language_loss": 0.72815871, + "learning_rate": 1.112709300197942e-06, + "loss": 0.75181472, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12896729, + "step": 10926, + "time_per_iteration": 2.6751506328582764 + }, + { + "auxiliary_loss_clip": 0.01341814, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.22991395, + "balance_loss_mlp": 1.02014899, + "epoch": 0.6569667819029009, + "flos": 21179674368720.0, + "grad_norm": 1.6212441192414997, + "language_loss": 0.72451031, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74826539, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13549805, + "step": 10927, + "time_per_iteration": 4.251676797866821 + }, + { + "auxiliary_loss_clip": 0.01157244, + "auxiliary_loss_mlp": 0.01013105, + "balance_loss_clip": 1.11067915, + "balance_loss_mlp": 1.00926626, + "epoch": 0.6570269051555689, + "flos": 68778606161880.0, + "grad_norm": 0.7445036681880609, + "language_loss": 0.64525884, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66696233, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.03833008, + "step": 10928, + "time_per_iteration": 3.3508141040802 + }, + { + "auxiliary_loss_clip": 0.01333613, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.2260468, + "balance_loss_mlp": 1.0163039, + "epoch": 0.6570870284082369, + "flos": 26324430257880.0, + "grad_norm": 1.8698587835732936, + "language_loss": 0.77928495, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80291563, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.1315918, + "step": 10929, + "time_per_iteration": 4.202442169189453 + }, + { + "auxiliary_loss_clip": 0.01334645, + "auxiliary_loss_mlp": 0.01029934, + "balance_loss_clip": 1.22620094, + "balance_loss_mlp": 1.01587284, + "epoch": 0.6571471516609049, + "flos": 26180054246160.0, + "grad_norm": 1.704310456556481, + "language_loss": 0.65296412, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67660993, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.140625, + "step": 10930, + "time_per_iteration": 2.76900315284729 + }, + { + "auxiliary_loss_clip": 0.01328167, + "auxiliary_loss_mlp": 0.01027927, + "balance_loss_clip": 1.21919024, + "balance_loss_mlp": 1.01420641, + "epoch": 0.6572072749135728, + "flos": 20381159688840.0, + "grad_norm": 1.5190909622232818, + "language_loss": 0.70770711, + "learning_rate": 1.110964538515258e-06, + "loss": 0.73126805, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13726807, + "step": 10931, + "time_per_iteration": 4.170510530471802 + }, + { + "auxiliary_loss_clip": 0.01338925, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.2273761, + "balance_loss_mlp": 1.02133524, + "epoch": 0.6572673981662408, + "flos": 17133190048440.0, + "grad_norm": 2.4346614780275186, + "language_loss": 0.6891734, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.71290553, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12939453, + "step": 10932, + "time_per_iteration": 3.0522122383117676 + }, + { + "auxiliary_loss_clip": 0.01334808, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.2256813, + "balance_loss_mlp": 1.01368594, + "epoch": 0.6573275214189087, + "flos": 41281770450960.0, + "grad_norm": 1.7539390242001591, + "language_loss": 0.80520719, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82882404, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13201904, + "step": 10933, + "time_per_iteration": 2.9752438068389893 + }, + { + "auxiliary_loss_clip": 0.01344786, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.23368502, + "balance_loss_mlp": 1.0188812, + "epoch": 0.6573876446715767, + "flos": 22894904329200.0, + "grad_norm": 1.7551511127033688, + "language_loss": 0.73884857, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.76262039, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13519287, + "step": 10934, + "time_per_iteration": 2.8465747833251953 + }, + { + "auxiliary_loss_clip": 0.01335713, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.22822237, + "balance_loss_mlp": 1.0178355, + "epoch": 0.6574477679242448, + "flos": 44026028080560.0, + "grad_norm": 1.5694224596480728, + "language_loss": 0.76166844, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.7853412, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13720703, + "step": 10935, + "time_per_iteration": 2.9954440593719482 + }, + { + "auxiliary_loss_clip": 0.01336262, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.22611856, + "balance_loss_mlp": 1.02150548, + "epoch": 0.6575078911769127, + "flos": 24576893024040.0, + "grad_norm": 1.6458826432946485, + "language_loss": 0.78558958, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80931431, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14709473, + "step": 10936, + "time_per_iteration": 2.8100509643554688 + }, + { + "auxiliary_loss_clip": 0.01328721, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.22154737, + "balance_loss_mlp": 1.01560354, + "epoch": 0.6575680144295807, + "flos": 20929077222480.0, + "grad_norm": 2.360858859625451, + "language_loss": 0.69938582, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.72295618, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.1270752, + "step": 10937, + "time_per_iteration": 2.8934853076934814 + }, + { + "auxiliary_loss_clip": 0.01335795, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.2273339, + "balance_loss_mlp": 1.0156579, + "epoch": 0.6576281376822486, + "flos": 10929210851520.0, + "grad_norm": 2.144598071866834, + "language_loss": 0.68520743, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70885837, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13635254, + "step": 10938, + "time_per_iteration": 2.7224061489105225 + }, + { + "auxiliary_loss_clip": 0.01341969, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.2311641, + "balance_loss_mlp": 1.01959324, + "epoch": 0.6576882609349166, + "flos": 19286299222200.0, + "grad_norm": 2.141181474121191, + "language_loss": 0.71590626, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73966503, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14312744, + "step": 10939, + "time_per_iteration": 2.923290729522705 + }, + { + "auxiliary_loss_clip": 0.01341214, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.22958386, + "balance_loss_mlp": 1.01367426, + "epoch": 0.6577483841875845, + "flos": 20224154477160.0, + "grad_norm": 2.2234340177782985, + "language_loss": 0.78406858, + "learning_rate": 1.107826092473037e-06, + "loss": 0.80776036, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14282227, + "step": 10940, + "time_per_iteration": 4.3081018924713135 + }, + { + "auxiliary_loss_clip": 0.01343198, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.22934926, + "balance_loss_mlp": 1.0173285, + "epoch": 0.6578085074402525, + "flos": 34757787774600.0, + "grad_norm": 2.436065797296304, + "language_loss": 0.68765092, + "learning_rate": 1.107477545226471e-06, + "loss": 0.71139467, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13824463, + "step": 10941, + "time_per_iteration": 2.9256739616394043 + }, + { + "auxiliary_loss_clip": 0.01337101, + "auxiliary_loss_mlp": 0.01025028, + "balance_loss_clip": 1.22729671, + "balance_loss_mlp": 1.01166475, + "epoch": 0.6578686306929205, + "flos": 23475454003080.0, + "grad_norm": 1.9187400083452433, + "language_loss": 0.68983495, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.71345615, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13348389, + "step": 10942, + "time_per_iteration": 2.7581591606140137 + }, + { + "auxiliary_loss_clip": 0.01349352, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.23319376, + "balance_loss_mlp": 1.0144912, + "epoch": 0.6579287539455885, + "flos": 18081603477000.0, + "grad_norm": 1.9857782837508786, + "language_loss": 0.71622682, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.74001527, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.15002441, + "step": 10943, + "time_per_iteration": 2.7274818420410156 + }, + { + "auxiliary_loss_clip": 0.01334429, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.22658229, + "balance_loss_mlp": 1.01797974, + "epoch": 0.6579888771982564, + "flos": 28667745300600.0, + "grad_norm": 1.6300708754469846, + "language_loss": 0.59799302, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.62165529, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13824463, + "step": 10944, + "time_per_iteration": 2.965787649154663 + }, + { + "auxiliary_loss_clip": 0.01353441, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.23812866, + "balance_loss_mlp": 1.01728976, + "epoch": 0.6580490004509244, + "flos": 25052074338960.0, + "grad_norm": 1.9815250162395976, + "language_loss": 0.72596478, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.7498188, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14685059, + "step": 10945, + "time_per_iteration": 2.7799878120422363 + }, + { + "auxiliary_loss_clip": 0.0133174, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.22311854, + "balance_loss_mlp": 1.01551461, + "epoch": 0.6581091237035923, + "flos": 43516793332800.0, + "grad_norm": 1.5350077090907321, + "language_loss": 0.71163642, + "learning_rate": 1.105735316926046e-06, + "loss": 0.73524582, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13677979, + "step": 10946, + "time_per_iteration": 2.928375244140625 + }, + { + "auxiliary_loss_clip": 0.0134354, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.23340368, + "balance_loss_mlp": 1.01642942, + "epoch": 0.6581692469562603, + "flos": 22420047881160.0, + "grad_norm": 2.100835434533717, + "language_loss": 0.82085156, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84459746, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14599609, + "step": 10947, + "time_per_iteration": 2.9005117416381836 + }, + { + "auxiliary_loss_clip": 0.01339507, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.22792184, + "balance_loss_mlp": 1.01337099, + "epoch": 0.6582293702089284, + "flos": 24864711055200.0, + "grad_norm": 1.6060211131757927, + "language_loss": 0.77652627, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.80018771, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13275146, + "step": 10948, + "time_per_iteration": 2.8034651279449463 + }, + { + "auxiliary_loss_clip": 0.01334906, + "auxiliary_loss_mlp": 0.01027711, + "balance_loss_clip": 1.22601771, + "balance_loss_mlp": 1.01466334, + "epoch": 0.6582894934615963, + "flos": 23044924902960.0, + "grad_norm": 1.6635867060555116, + "language_loss": 0.79233599, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.8159622, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13037109, + "step": 10949, + "time_per_iteration": 2.7523887157440186 + }, + { + "auxiliary_loss_clip": 0.01156962, + "auxiliary_loss_mlp": 0.01008769, + "balance_loss_clip": 1.11030364, + "balance_loss_mlp": 1.00497866, + "epoch": 0.6583496167142643, + "flos": 72568605123720.0, + "grad_norm": 0.7369323168591003, + "language_loss": 0.61827505, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63993227, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.0378418, + "step": 10950, + "time_per_iteration": 3.2927615642547607 + }, + { + "auxiliary_loss_clip": 0.01331024, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.22478819, + "balance_loss_mlp": 1.01642942, + "epoch": 0.6584097399669322, + "flos": 13082766717240.0, + "grad_norm": 1.8610955828504767, + "language_loss": 0.66726869, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69087172, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12854004, + "step": 10951, + "time_per_iteration": 2.7349207401275635 + }, + { + "auxiliary_loss_clip": 0.01335795, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.22788405, + "balance_loss_mlp": 1.01921892, + "epoch": 0.6584698632196002, + "flos": 28698915539880.0, + "grad_norm": 1.4080909592912043, + "language_loss": 0.76820779, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.79189092, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13305664, + "step": 10952, + "time_per_iteration": 2.864258289337158 + }, + { + "auxiliary_loss_clip": 0.01331977, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.22492075, + "balance_loss_mlp": 1.01502132, + "epoch": 0.6585299864722681, + "flos": 14323343271480.0, + "grad_norm": 2.0624403236171074, + "language_loss": 0.73635453, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75995338, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12908936, + "step": 10953, + "time_per_iteration": 2.779017210006714 + }, + { + "auxiliary_loss_clip": 0.01339145, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.23003948, + "balance_loss_mlp": 1.01920569, + "epoch": 0.6585901097249361, + "flos": 26803672408800.0, + "grad_norm": 1.8756242846206679, + "language_loss": 0.78946716, + "learning_rate": 1.102949515683546e-06, + "loss": 0.81318998, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.1394043, + "step": 10954, + "time_per_iteration": 2.8404853343963623 + }, + { + "auxiliary_loss_clip": 0.01339027, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.22992182, + "balance_loss_mlp": 1.01737928, + "epoch": 0.658650232977604, + "flos": 18737650738080.0, + "grad_norm": 3.1932067106258573, + "language_loss": 0.69657123, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.72026885, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13342285, + "step": 10955, + "time_per_iteration": 2.738945722579956 + }, + { + "auxiliary_loss_clip": 0.01329093, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.22530448, + "balance_loss_mlp": 1.01637447, + "epoch": 0.6587103562302721, + "flos": 24758774179200.0, + "grad_norm": 2.2403600619308155, + "language_loss": 0.81141746, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.83499181, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.11968994, + "step": 10956, + "time_per_iteration": 2.9158997535705566 + }, + { + "auxiliary_loss_clip": 0.01332777, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.22428083, + "balance_loss_mlp": 1.02439916, + "epoch": 0.65877047948294, + "flos": 22351250673360.0, + "grad_norm": 4.613811129375449, + "language_loss": 0.81548786, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83920068, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.14099121, + "step": 10957, + "time_per_iteration": 2.7385306358337402 + }, + { + "auxiliary_loss_clip": 0.01335624, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.22982466, + "balance_loss_mlp": 1.01860738, + "epoch": 0.658830602735608, + "flos": 45187898987160.0, + "grad_norm": 1.656464755893335, + "language_loss": 0.75864947, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78230882, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11712646, + "step": 10958, + "time_per_iteration": 3.0151591300964355 + }, + { + "auxiliary_loss_clip": 0.01337437, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.23037314, + "balance_loss_mlp": 1.01434815, + "epoch": 0.6588907259882759, + "flos": 19906425065880.0, + "grad_norm": 1.5913485467194644, + "language_loss": 0.75500488, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77865016, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12762451, + "step": 10959, + "time_per_iteration": 2.7629377841949463 + }, + { + "auxiliary_loss_clip": 0.01335027, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.22701406, + "balance_loss_mlp": 1.01441956, + "epoch": 0.6589508492409439, + "flos": 24139095027480.0, + "grad_norm": 1.4523159675108246, + "language_loss": 0.64761508, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.67123842, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.12896729, + "step": 10960, + "time_per_iteration": 2.788222551345825 + }, + { + "auxiliary_loss_clip": 0.01355062, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.24113667, + "balance_loss_mlp": 1.01616406, + "epoch": 0.659010972493612, + "flos": 18227441389680.0, + "grad_norm": 2.308137222628629, + "language_loss": 0.82005823, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.84391111, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14068604, + "step": 10961, + "time_per_iteration": 2.778475522994995 + }, + { + "auxiliary_loss_clip": 0.01343708, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.23430443, + "balance_loss_mlp": 1.0186162, + "epoch": 0.6590710957462799, + "flos": 27605801232720.0, + "grad_norm": 2.272952085764295, + "language_loss": 0.73610693, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75986433, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13421631, + "step": 10962, + "time_per_iteration": 2.7916202545166016 + }, + { + "auxiliary_loss_clip": 0.01346075, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.23533607, + "balance_loss_mlp": 1.01830411, + "epoch": 0.6591312189989479, + "flos": 20307814344720.0, + "grad_norm": 2.081033409755657, + "language_loss": 0.80454707, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82832807, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.137146, + "step": 10963, + "time_per_iteration": 2.760267734527588 + }, + { + "auxiliary_loss_clip": 0.01335546, + "auxiliary_loss_mlp": 0.01025564, + "balance_loss_clip": 1.2289238, + "balance_loss_mlp": 1.01242745, + "epoch": 0.6591913422516158, + "flos": 12316802960880.0, + "grad_norm": 1.7361745650365141, + "language_loss": 0.78685451, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.81046557, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13153076, + "step": 10964, + "time_per_iteration": 2.7457544803619385 + }, + { + "auxiliary_loss_clip": 0.01346615, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.23525381, + "balance_loss_mlp": 1.01416183, + "epoch": 0.6592514655042838, + "flos": 25890083971920.0, + "grad_norm": 2.7881137711138404, + "language_loss": 0.73702145, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76076365, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13433838, + "step": 10965, + "time_per_iteration": 2.874835968017578 + }, + { + "auxiliary_loss_clip": 0.01351626, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.23746395, + "balance_loss_mlp": 1.01693892, + "epoch": 0.6593115887569517, + "flos": 14067629471880.0, + "grad_norm": 1.7707808086860262, + "language_loss": 0.73409784, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75792795, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.14465332, + "step": 10966, + "time_per_iteration": 4.257760286331177 + }, + { + "auxiliary_loss_clip": 0.01344912, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.23539138, + "balance_loss_mlp": 1.01799703, + "epoch": 0.6593717120096197, + "flos": 24723380670480.0, + "grad_norm": 1.5463726590123341, + "language_loss": 0.77003115, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79379892, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13885498, + "step": 10967, + "time_per_iteration": 2.7663328647613525 + }, + { + "auxiliary_loss_clip": 0.01162301, + "auxiliary_loss_mlp": 0.01012195, + "balance_loss_clip": 1.1169529, + "balance_loss_mlp": 1.00826156, + "epoch": 0.6594318352622877, + "flos": 55573594989120.0, + "grad_norm": 0.6946208622959203, + "language_loss": 0.48548198, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50722694, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.03930664, + "step": 10968, + "time_per_iteration": 4.7345969676971436 + }, + { + "auxiliary_loss_clip": 0.01343196, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.23259509, + "balance_loss_mlp": 1.01765549, + "epoch": 0.6594919585149557, + "flos": 17461396416600.0, + "grad_norm": 1.694172713349179, + "language_loss": 0.79745287, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.82119298, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13153076, + "step": 10969, + "time_per_iteration": 2.7438859939575195 + }, + { + "auxiliary_loss_clip": 0.01336267, + "auxiliary_loss_mlp": 0.01023643, + "balance_loss_clip": 1.22772002, + "balance_loss_mlp": 1.0108819, + "epoch": 0.6595520817676236, + "flos": 18227806864920.0, + "grad_norm": 1.9077778575583668, + "language_loss": 0.65783381, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.6814329, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12756348, + "step": 10970, + "time_per_iteration": 4.242666482925415 + }, + { + "auxiliary_loss_clip": 0.01343357, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.23302007, + "balance_loss_mlp": 1.01852643, + "epoch": 0.6596122050202916, + "flos": 22204803635280.0, + "grad_norm": 1.5505180158919625, + "language_loss": 0.76658285, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.79033715, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13562012, + "step": 10971, + "time_per_iteration": 2.7681446075439453 + }, + { + "auxiliary_loss_clip": 0.01343456, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.23266816, + "balance_loss_mlp": 1.01965404, + "epoch": 0.6596723282729595, + "flos": 14177952050760.0, + "grad_norm": 2.328513553508039, + "language_loss": 0.70441312, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72817183, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12768555, + "step": 10972, + "time_per_iteration": 2.7465057373046875 + }, + { + "auxiliary_loss_clip": 0.01341048, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.23214221, + "balance_loss_mlp": 1.01332057, + "epoch": 0.6597324515256275, + "flos": 30559090029120.0, + "grad_norm": 1.6787276615652231, + "language_loss": 0.56104243, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.58472294, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13684082, + "step": 10973, + "time_per_iteration": 2.854573965072632 + }, + { + "auxiliary_loss_clip": 0.01358198, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.24264085, + "balance_loss_mlp": 1.02320814, + "epoch": 0.6597925747782956, + "flos": 17644414605840.0, + "grad_norm": 1.871830127991855, + "language_loss": 0.78971744, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81366813, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.13659668, + "step": 10974, + "time_per_iteration": 2.7973949909210205 + }, + { + "auxiliary_loss_clip": 0.01343369, + "auxiliary_loss_mlp": 0.01029532, + "balance_loss_clip": 1.23285341, + "balance_loss_mlp": 1.01584053, + "epoch": 0.6598526980309635, + "flos": 22824117311760.0, + "grad_norm": 2.949465753925389, + "language_loss": 0.69424027, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71796924, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13690186, + "step": 10975, + "time_per_iteration": 2.8532750606536865 + }, + { + "auxiliary_loss_clip": 0.01347726, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.23771656, + "balance_loss_mlp": 1.01329088, + "epoch": 0.6599128212836315, + "flos": 21072844108800.0, + "grad_norm": 1.6419501874358957, + "language_loss": 0.71154255, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.7352885, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13580322, + "step": 10976, + "time_per_iteration": 2.9036307334899902 + }, + { + "auxiliary_loss_clip": 0.01337166, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.23022366, + "balance_loss_mlp": 1.0138762, + "epoch": 0.6599729445362994, + "flos": 22168597959360.0, + "grad_norm": 1.6522783466186046, + "language_loss": 0.67718315, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.7008298, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13641357, + "step": 10977, + "time_per_iteration": 3.019888162612915 + }, + { + "auxiliary_loss_clip": 0.01353913, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.24027348, + "balance_loss_mlp": 1.01574135, + "epoch": 0.6600330677889674, + "flos": 18154623954240.0, + "grad_norm": 4.8055816207000985, + "language_loss": 0.818712, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.8425597, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.15112305, + "step": 10978, + "time_per_iteration": 4.430091142654419 + }, + { + "auxiliary_loss_clip": 0.01353436, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.24152899, + "balance_loss_mlp": 1.02291834, + "epoch": 0.6600931910416353, + "flos": 18154664562600.0, + "grad_norm": 2.0979861761765424, + "language_loss": 0.67445338, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69836408, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14703369, + "step": 10979, + "time_per_iteration": 2.8251116275787354 + }, + { + "auxiliary_loss_clip": 0.01349492, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.23863578, + "balance_loss_mlp": 1.01423669, + "epoch": 0.6601533142943034, + "flos": 17425231349040.0, + "grad_norm": 2.235481760920297, + "language_loss": 0.73956966, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.7633428, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13580322, + "step": 10980, + "time_per_iteration": 2.8076462745666504 + }, + { + "auxiliary_loss_clip": 0.01335246, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.23004532, + "balance_loss_mlp": 1.01672494, + "epoch": 0.6602134375469713, + "flos": 28225358559360.0, + "grad_norm": 1.6349016207694738, + "language_loss": 0.73093545, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.75457537, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.12023926, + "step": 10981, + "time_per_iteration": 2.8212268352508545 + }, + { + "auxiliary_loss_clip": 0.01344273, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.23372221, + "balance_loss_mlp": 1.01715565, + "epoch": 0.6602735607996393, + "flos": 29423272708440.0, + "grad_norm": 2.1930703430552208, + "language_loss": 0.68747091, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71122146, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13616943, + "step": 10982, + "time_per_iteration": 2.8487749099731445 + }, + { + "auxiliary_loss_clip": 0.0134016, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.23293114, + "balance_loss_mlp": 1.01763177, + "epoch": 0.6603336840523072, + "flos": 18592665600960.0, + "grad_norm": 1.5225353431857653, + "language_loss": 0.69729471, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.72100592, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13342285, + "step": 10983, + "time_per_iteration": 2.829072952270508 + }, + { + "auxiliary_loss_clip": 0.0134957, + "auxiliary_loss_mlp": 0.0102713, + "balance_loss_clip": 1.23881507, + "balance_loss_mlp": 1.01311111, + "epoch": 0.6603938073049752, + "flos": 33261334987320.0, + "grad_norm": 1.845250619003782, + "language_loss": 0.70531243, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72907948, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14025879, + "step": 10984, + "time_per_iteration": 2.8784101009368896 + }, + { + "auxiliary_loss_clip": 0.01339349, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.23297572, + "balance_loss_mlp": 1.01576424, + "epoch": 0.6604539305576431, + "flos": 17388782022960.0, + "grad_norm": 1.5189269139236443, + "language_loss": 0.84383798, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.86752391, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13494873, + "step": 10985, + "time_per_iteration": 2.7035768032073975 + }, + { + "auxiliary_loss_clip": 0.01348704, + "auxiliary_loss_mlp": 0.01035737, + "balance_loss_clip": 1.23745549, + "balance_loss_mlp": 1.0221833, + "epoch": 0.6605140538103111, + "flos": 21256065339840.0, + "grad_norm": 2.183316704233602, + "language_loss": 0.74409688, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76794124, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 1.11279297, + "router_z_loss_mlp": 0.13543701, + "step": 10986, + "time_per_iteration": 2.7682127952575684 + }, + { + "auxiliary_loss_clip": 0.01335009, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.22844839, + "balance_loss_mlp": 1.01386809, + "epoch": 0.6605741770629792, + "flos": 13885748316720.0, + "grad_norm": 1.8591044448860918, + "language_loss": 0.79397321, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.8175959, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13397217, + "step": 10987, + "time_per_iteration": 2.773254871368408 + }, + { + "auxiliary_loss_clip": 0.01171138, + "auxiliary_loss_mlp": 0.01017761, + "balance_loss_clip": 1.12383246, + "balance_loss_mlp": 1.01418483, + "epoch": 0.6606343003156471, + "flos": 69333914417040.0, + "grad_norm": 0.8224785686385586, + "language_loss": 0.54220319, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56409216, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.03564453, + "step": 10988, + "time_per_iteration": 3.4291582107543945 + }, + { + "auxiliary_loss_clip": 0.0133885, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.23225403, + "balance_loss_mlp": 1.01820755, + "epoch": 0.6606944235683151, + "flos": 27278244598320.0, + "grad_norm": 1.4757495327964754, + "language_loss": 0.77463824, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79833126, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12243652, + "step": 10989, + "time_per_iteration": 2.8604891300201416 + }, + { + "auxiliary_loss_clip": 0.01338556, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.23200321, + "balance_loss_mlp": 1.01490402, + "epoch": 0.660754546820983, + "flos": 13776928247160.0, + "grad_norm": 1.8913012047675375, + "language_loss": 0.77555525, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.79922235, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13256836, + "step": 10990, + "time_per_iteration": 2.86413836479187 + }, + { + "auxiliary_loss_clip": 0.01341623, + "auxiliary_loss_mlp": 0.01028856, + "balance_loss_clip": 1.23224521, + "balance_loss_mlp": 1.01566517, + "epoch": 0.660814670073651, + "flos": 15709392263160.0, + "grad_norm": 1.907306715673072, + "language_loss": 0.61027777, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.63398254, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13208008, + "step": 10991, + "time_per_iteration": 2.780268430709839 + }, + { + "auxiliary_loss_clip": 0.01346901, + "auxiliary_loss_mlp": 0.01027667, + "balance_loss_clip": 1.23574591, + "balance_loss_mlp": 1.01416099, + "epoch": 0.6608747933263189, + "flos": 20854919711160.0, + "grad_norm": 3.892688233685857, + "language_loss": 0.68688089, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.71062654, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13500977, + "step": 10992, + "time_per_iteration": 2.753157615661621 + }, + { + "auxiliary_loss_clip": 0.01346799, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.23507047, + "balance_loss_mlp": 1.01308882, + "epoch": 0.660934916578987, + "flos": 20637117138600.0, + "grad_norm": 2.5932028260525324, + "language_loss": 0.88087463, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.90460771, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13427734, + "step": 10993, + "time_per_iteration": 2.77226185798645 + }, + { + "auxiliary_loss_clip": 0.01354575, + "auxiliary_loss_mlp": 0.01036124, + "balance_loss_clip": 1.24012172, + "balance_loss_mlp": 1.02099681, + "epoch": 0.6609950398316549, + "flos": 25118353828440.0, + "grad_norm": 1.7256715448143811, + "language_loss": 0.67324996, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69715691, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.15130615, + "step": 10994, + "time_per_iteration": 2.788133382797241 + }, + { + "auxiliary_loss_clip": 0.01344128, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.23491108, + "balance_loss_mlp": 1.01990438, + "epoch": 0.6610551630843229, + "flos": 18665848511640.0, + "grad_norm": 1.8152472931569748, + "language_loss": 0.77495086, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.7987324, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.14111328, + "step": 10995, + "time_per_iteration": 2.731516122817993 + }, + { + "auxiliary_loss_clip": 0.01341452, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.23235559, + "balance_loss_mlp": 1.0175935, + "epoch": 0.6611152863369908, + "flos": 23263417817640.0, + "grad_norm": 1.7932280274925076, + "language_loss": 0.74540335, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76911551, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12176514, + "step": 10996, + "time_per_iteration": 2.79123854637146 + }, + { + "auxiliary_loss_clip": 0.01346713, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.23800874, + "balance_loss_mlp": 1.02016306, + "epoch": 0.6611754095896588, + "flos": 22164293473200.0, + "grad_norm": 1.7854753687612108, + "language_loss": 0.6955238, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71931916, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12658691, + "step": 10997, + "time_per_iteration": 2.742424488067627 + }, + { + "auxiliary_loss_clip": 0.01347522, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.2352047, + "balance_loss_mlp": 1.02109766, + "epoch": 0.6612355328423267, + "flos": 13995299336760.0, + "grad_norm": 2.132245506982641, + "language_loss": 0.68774235, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.71156609, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13763428, + "step": 10998, + "time_per_iteration": 2.7971463203430176 + }, + { + "auxiliary_loss_clip": 0.01168085, + "auxiliary_loss_mlp": 0.01012312, + "balance_loss_clip": 1.1229248, + "balance_loss_mlp": 1.00897408, + "epoch": 0.6612956560949947, + "flos": 61468030682280.0, + "grad_norm": 0.9849956146056819, + "language_loss": 0.51172549, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53352946, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.03344727, + "step": 10999, + "time_per_iteration": 3.2565042972564697 + }, + { + "auxiliary_loss_clip": 0.01350324, + "auxiliary_loss_mlp": 0.01032697, + "balance_loss_clip": 1.2375778, + "balance_loss_mlp": 1.02025139, + "epoch": 0.6613557793476627, + "flos": 21621695634720.0, + "grad_norm": 2.2635129168502814, + "language_loss": 0.70691574, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73074591, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12457275, + "step": 11000, + "time_per_iteration": 2.839951515197754 + }, + { + "auxiliary_loss_clip": 0.01335459, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.22941458, + "balance_loss_mlp": 1.02136421, + "epoch": 0.6614159026003307, + "flos": 34026567793200.0, + "grad_norm": 1.4861133634280268, + "language_loss": 0.65145028, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67513925, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12078857, + "step": 11001, + "time_per_iteration": 2.9393181800842285 + }, + { + "auxiliary_loss_clip": 0.01344505, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.23679805, + "balance_loss_mlp": 1.01664233, + "epoch": 0.6614760258529987, + "flos": 24102605093040.0, + "grad_norm": 1.6253867414763679, + "language_loss": 0.73159599, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75533843, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13104248, + "step": 11002, + "time_per_iteration": 2.759711265563965 + }, + { + "auxiliary_loss_clip": 0.0134227, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.23479676, + "balance_loss_mlp": 1.01955163, + "epoch": 0.6615361491056666, + "flos": 14908034998080.0, + "grad_norm": 2.0646925852445377, + "language_loss": 0.79031456, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.81406504, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13201904, + "step": 11003, + "time_per_iteration": 2.902745246887207 + }, + { + "auxiliary_loss_clip": 0.0135018, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.23945343, + "balance_loss_mlp": 1.02369821, + "epoch": 0.6615962723583346, + "flos": 15309140018400.0, + "grad_norm": 1.8154901445390537, + "language_loss": 0.69416094, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71804231, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 1.10693359, + "router_z_loss_mlp": 0.14257812, + "step": 11004, + "time_per_iteration": 2.7142701148986816 + }, + { + "auxiliary_loss_clip": 0.01350531, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.23760676, + "balance_loss_mlp": 1.0205543, + "epoch": 0.6616563956110025, + "flos": 18737041612680.0, + "grad_norm": 3.276076908208207, + "language_loss": 0.69458079, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71843398, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14227295, + "step": 11005, + "time_per_iteration": 4.219159126281738 + }, + { + "auxiliary_loss_clip": 0.01338342, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.23099422, + "balance_loss_mlp": 1.01918411, + "epoch": 0.6617165188636706, + "flos": 24499974144240.0, + "grad_norm": 1.5623388337606534, + "language_loss": 0.78618103, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80988282, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12664795, + "step": 11006, + "time_per_iteration": 4.251449823379517 + }, + { + "auxiliary_loss_clip": 0.01344905, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.23658168, + "balance_loss_mlp": 1.02232504, + "epoch": 0.6617766421163385, + "flos": 22384410722280.0, + "grad_norm": 1.5363894895177976, + "language_loss": 0.76689827, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.79070443, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.1338501, + "step": 11007, + "time_per_iteration": 2.7803795337677 + }, + { + "auxiliary_loss_clip": 0.01342787, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.2343328, + "balance_loss_mlp": 1.02281165, + "epoch": 0.6618367653690065, + "flos": 20855366403120.0, + "grad_norm": 1.5827143168142472, + "language_loss": 0.78410906, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80789483, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12982178, + "step": 11008, + "time_per_iteration": 2.74794864654541 + }, + { + "auxiliary_loss_clip": 0.01354841, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.24092531, + "balance_loss_mlp": 1.0249877, + "epoch": 0.6618968886216744, + "flos": 17716825957680.0, + "grad_norm": 1.621677261262259, + "language_loss": 0.82232159, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.84626603, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14611816, + "step": 11009, + "time_per_iteration": 4.226950168609619 + }, + { + "auxiliary_loss_clip": 0.01168165, + "auxiliary_loss_mlp": 0.01004146, + "balance_loss_clip": 1.12329638, + "balance_loss_mlp": 1.00113034, + "epoch": 0.6619570118743424, + "flos": 67050300309480.0, + "grad_norm": 0.9687740759150534, + "language_loss": 0.67438471, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69610786, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.03015137, + "step": 11010, + "time_per_iteration": 3.2207939624786377 + }, + { + "auxiliary_loss_clip": 0.01345429, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.23527181, + "balance_loss_mlp": 1.01984656, + "epoch": 0.6620171351270103, + "flos": 18665523644760.0, + "grad_norm": 1.5242270513992706, + "language_loss": 0.71120179, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73499084, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13653564, + "step": 11011, + "time_per_iteration": 2.8217921257019043 + }, + { + "auxiliary_loss_clip": 0.01340779, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.23263931, + "balance_loss_mlp": 1.02167189, + "epoch": 0.6620772583796783, + "flos": 24175869220440.0, + "grad_norm": 1.5889870979676246, + "language_loss": 0.72855234, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.7522999, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12304688, + "step": 11012, + "time_per_iteration": 2.8585190773010254 + }, + { + "auxiliary_loss_clip": 0.01327068, + "auxiliary_loss_mlp": 0.01036887, + "balance_loss_clip": 1.22616637, + "balance_loss_mlp": 1.02583051, + "epoch": 0.6621373816323463, + "flos": 23628885679080.0, + "grad_norm": 1.8290964516785944, + "language_loss": 0.79518348, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81882304, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.11053467, + "step": 11013, + "time_per_iteration": 2.9293620586395264 + }, + { + "auxiliary_loss_clip": 0.01341107, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.23406887, + "balance_loss_mlp": 1.01530707, + "epoch": 0.6621975048850143, + "flos": 18446827688280.0, + "grad_norm": 2.4233792169440296, + "language_loss": 0.70532548, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72902012, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.13061523, + "step": 11014, + "time_per_iteration": 2.7768032550811768 + }, + { + "auxiliary_loss_clip": 0.01338138, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.23289537, + "balance_loss_mlp": 1.02028501, + "epoch": 0.6622576281376823, + "flos": 14067913730400.0, + "grad_norm": 1.9931582251678595, + "language_loss": 0.77347404, + "learning_rate": 1.081779858400137e-06, + "loss": 0.79717356, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11529541, + "step": 11015, + "time_per_iteration": 2.729379892349243 + }, + { + "auxiliary_loss_clip": 0.01341662, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.23490548, + "balance_loss_mlp": 1.01745713, + "epoch": 0.6623177513903502, + "flos": 17023354769880.0, + "grad_norm": 2.1665322594340246, + "language_loss": 0.823861, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.8475737, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12145996, + "step": 11016, + "time_per_iteration": 2.822204351425171 + }, + { + "auxiliary_loss_clip": 0.01351691, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.24021435, + "balance_loss_mlp": 1.0221324, + "epoch": 0.6623778746430182, + "flos": 17275007733480.0, + "grad_norm": 2.02717176810383, + "language_loss": 0.70779395, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.73166502, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13293457, + "step": 11017, + "time_per_iteration": 4.323014974594116 + }, + { + "auxiliary_loss_clip": 0.01342854, + "auxiliary_loss_mlp": 0.01045726, + "balance_loss_clip": 1.23436952, + "balance_loss_mlp": 1.03230882, + "epoch": 0.6624379978956861, + "flos": 48800524321800.0, + "grad_norm": 2.8629085299849044, + "language_loss": 0.77482963, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79871547, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13427734, + "step": 11018, + "time_per_iteration": 2.9983084201812744 + }, + { + "auxiliary_loss_clip": 0.01335344, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.22801447, + "balance_loss_mlp": 1.02497768, + "epoch": 0.6624981211483542, + "flos": 18957037036680.0, + "grad_norm": 3.9144964809870837, + "language_loss": 0.83932805, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.86306566, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13433838, + "step": 11019, + "time_per_iteration": 2.8161683082580566 + }, + { + "auxiliary_loss_clip": 0.0133723, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.2314136, + "balance_loss_mlp": 1.01762521, + "epoch": 0.6625582444010221, + "flos": 23261590441440.0, + "grad_norm": 2.2098741547912937, + "language_loss": 0.7222687, + "learning_rate": 1.080050345253328e-06, + "loss": 0.74594277, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12542725, + "step": 11020, + "time_per_iteration": 2.7645111083984375 + }, + { + "auxiliary_loss_clip": 0.01354617, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.24102616, + "balance_loss_mlp": 1.02032828, + "epoch": 0.6626183676536901, + "flos": 21399466750920.0, + "grad_norm": 1.9872079753229863, + "language_loss": 0.72716665, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.75106192, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14569092, + "step": 11021, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.01344493, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.23702836, + "balance_loss_mlp": 1.01862192, + "epoch": 0.662678490906358, + "flos": 14574955626720.0, + "grad_norm": 2.2182785811553396, + "language_loss": 0.83487749, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85864747, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13885498, + "step": 11022, + "time_per_iteration": 2.7651939392089844 + }, + { + "auxiliary_loss_clip": 0.01361413, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.24324012, + "balance_loss_mlp": 1.02352464, + "epoch": 0.662738614159026, + "flos": 15995951435160.0, + "grad_norm": 2.7876272021660795, + "language_loss": 0.73280698, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.75680268, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.14642334, + "step": 11023, + "time_per_iteration": 2.7279911041259766 + }, + { + "auxiliary_loss_clip": 0.01343624, + "auxiliary_loss_mlp": 0.01027036, + "balance_loss_clip": 1.23672855, + "balance_loss_mlp": 1.01442981, + "epoch": 0.6627987374116939, + "flos": 19541038421160.0, + "grad_norm": 1.7165334149688807, + "language_loss": 0.74991792, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77362454, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12615967, + "step": 11024, + "time_per_iteration": 2.7631423473358154 + }, + { + "auxiliary_loss_clip": 0.01348989, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.23946261, + "balance_loss_mlp": 1.0161469, + "epoch": 0.662858860664362, + "flos": 15706712111400.0, + "grad_norm": 2.6931812688776513, + "language_loss": 0.69677192, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.7205621, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13909912, + "step": 11025, + "time_per_iteration": 2.8482823371887207 + }, + { + "auxiliary_loss_clip": 0.01347876, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.23994756, + "balance_loss_mlp": 1.01721048, + "epoch": 0.6629189839170299, + "flos": 20158768371600.0, + "grad_norm": 1.499094181633605, + "language_loss": 0.79454714, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.81833255, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13446045, + "step": 11026, + "time_per_iteration": 2.801440954208374 + }, + { + "auxiliary_loss_clip": 0.01342065, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.23611283, + "balance_loss_mlp": 1.01792789, + "epoch": 0.6629791071696979, + "flos": 20919331216080.0, + "grad_norm": 1.7630525514734114, + "language_loss": 0.7660718, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78979897, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12731934, + "step": 11027, + "time_per_iteration": 2.8229081630706787 + }, + { + "auxiliary_loss_clip": 0.01351403, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.24274671, + "balance_loss_mlp": 1.02037096, + "epoch": 0.6630392304223659, + "flos": 20851386783840.0, + "grad_norm": 2.0893905628417384, + "language_loss": 0.70535266, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72920346, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13305664, + "step": 11028, + "time_per_iteration": 2.7494282722473145 + }, + { + "auxiliary_loss_clip": 0.01342581, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.23478174, + "balance_loss_mlp": 1.02302217, + "epoch": 0.6630993536750338, + "flos": 21000717015480.0, + "grad_norm": 2.1915990071023645, + "language_loss": 0.79651606, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.82028627, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.11413574, + "step": 11029, + "time_per_iteration": 2.760312080383301 + }, + { + "auxiliary_loss_clip": 0.01351333, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.24105537, + "balance_loss_mlp": 1.0222621, + "epoch": 0.6631594769277018, + "flos": 18263525240520.0, + "grad_norm": 2.238090912960245, + "language_loss": 0.76294553, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78681839, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13708496, + "step": 11030, + "time_per_iteration": 2.7852251529693604 + }, + { + "auxiliary_loss_clip": 0.01356968, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.24339461, + "balance_loss_mlp": 1.01765108, + "epoch": 0.6632196001803697, + "flos": 17824833860040.0, + "grad_norm": 2.251646538295268, + "language_loss": 0.7572664, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.78114462, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13220215, + "step": 11031, + "time_per_iteration": 2.8111562728881836 + }, + { + "auxiliary_loss_clip": 0.01353737, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.24203253, + "balance_loss_mlp": 1.020818, + "epoch": 0.6632797234330378, + "flos": 12673093332960.0, + "grad_norm": 2.779587471260831, + "language_loss": 0.75125849, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77513498, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13110352, + "step": 11032, + "time_per_iteration": 2.7326126098632812 + }, + { + "auxiliary_loss_clip": 0.01342361, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.23558342, + "balance_loss_mlp": 1.02037311, + "epoch": 0.6633398466857057, + "flos": 23589756201240.0, + "grad_norm": 1.764572023147765, + "language_loss": 0.80652571, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12384033, + "step": 11033, + "time_per_iteration": 2.8162810802459717 + }, + { + "auxiliary_loss_clip": 0.01349016, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.23949635, + "balance_loss_mlp": 1.02043033, + "epoch": 0.6633999699383737, + "flos": 20636629838280.0, + "grad_norm": 1.7237017466748126, + "language_loss": 0.80767447, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.83150399, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1350708, + "step": 11034, + "time_per_iteration": 2.7701313495635986 + }, + { + "auxiliary_loss_clip": 0.01345885, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.23850107, + "balance_loss_mlp": 1.01914835, + "epoch": 0.6634600931910416, + "flos": 21801911847120.0, + "grad_norm": 2.8672794701224498, + "language_loss": 0.76232696, + "learning_rate": 1.074867045054166e-06, + "loss": 0.786098, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12078857, + "step": 11035, + "time_per_iteration": 2.8193163871765137 + }, + { + "auxiliary_loss_clip": 0.01354372, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.24154687, + "balance_loss_mlp": 1.01794755, + "epoch": 0.6635202164437096, + "flos": 18737366479560.0, + "grad_norm": 2.1689096904595075, + "language_loss": 0.83774894, + "learning_rate": 1.074521771867622e-06, + "loss": 0.861601, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.12878418, + "step": 11036, + "time_per_iteration": 2.7689008712768555 + }, + { + "auxiliary_loss_clip": 0.01170495, + "auxiliary_loss_mlp": 0.01008141, + "balance_loss_clip": 1.12559092, + "balance_loss_mlp": 1.00568509, + "epoch": 0.6635803396963775, + "flos": 60238174735080.0, + "grad_norm": 0.7754035484936735, + "language_loss": 0.52380025, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54558659, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02453613, + "step": 11037, + "time_per_iteration": 3.3135836124420166 + }, + { + "auxiliary_loss_clip": 0.01349686, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.24030221, + "balance_loss_mlp": 1.01972032, + "epoch": 0.6636404629490456, + "flos": 29172797387280.0, + "grad_norm": 1.5299625016782556, + "language_loss": 0.79636359, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.82019347, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13580322, + "step": 11038, + "time_per_iteration": 2.904365301132202 + }, + { + "auxiliary_loss_clip": 0.01349513, + "auxiliary_loss_mlp": 0.01043249, + "balance_loss_clip": 1.23953307, + "balance_loss_mlp": 1.02936697, + "epoch": 0.6637005862017135, + "flos": 38914594673760.0, + "grad_norm": 1.8265459486574136, + "language_loss": 0.64248443, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66641206, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13885498, + "step": 11039, + "time_per_iteration": 2.9086999893188477 + }, + { + "auxiliary_loss_clip": 0.01351785, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.23981345, + "balance_loss_mlp": 1.01763213, + "epoch": 0.6637607094543815, + "flos": 22788439544520.0, + "grad_norm": 2.392973689517394, + "language_loss": 0.64260995, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66643268, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.12860107, + "step": 11040, + "time_per_iteration": 2.8887600898742676 + }, + { + "auxiliary_loss_clip": 0.01343199, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.23578036, + "balance_loss_mlp": 1.01773763, + "epoch": 0.6638208327070495, + "flos": 18118702536840.0, + "grad_norm": 1.9361458470269084, + "language_loss": 0.72533435, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.74907064, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12689209, + "step": 11041, + "time_per_iteration": 2.764418601989746 + }, + { + "auxiliary_loss_clip": 0.01342821, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.23585176, + "balance_loss_mlp": 1.02592278, + "epoch": 0.6638809559597174, + "flos": 29430988296840.0, + "grad_norm": 1.8818674445557129, + "language_loss": 0.61944479, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.64325988, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12744141, + "step": 11042, + "time_per_iteration": 2.8231546878814697 + }, + { + "auxiliary_loss_clip": 0.01358872, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.24559236, + "balance_loss_mlp": 1.01476955, + "epoch": 0.6639410792123854, + "flos": 28078343004240.0, + "grad_norm": 2.0421588186807993, + "language_loss": 0.68616068, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.71004659, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.1496582, + "step": 11043, + "time_per_iteration": 2.834238290786743 + }, + { + "auxiliary_loss_clip": 0.01334348, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.23122728, + "balance_loss_mlp": 1.0160017, + "epoch": 0.6640012024650533, + "flos": 25561836995400.0, + "grad_norm": 1.4675516482009556, + "language_loss": 0.83892345, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86254501, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.11798096, + "step": 11044, + "time_per_iteration": 4.286802053451538 + }, + { + "auxiliary_loss_clip": 0.01346738, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.23921919, + "balance_loss_mlp": 1.01721025, + "epoch": 0.6640613257177214, + "flos": 14871138980040.0, + "grad_norm": 1.9637958281447478, + "language_loss": 0.69807613, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.72185093, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13531494, + "step": 11045, + "time_per_iteration": 4.184414863586426 + }, + { + "auxiliary_loss_clip": 0.01349053, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.23928928, + "balance_loss_mlp": 1.01523066, + "epoch": 0.6641214489703893, + "flos": 23226237541080.0, + "grad_norm": 1.6055431714531383, + "language_loss": 0.64651716, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.67029315, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13305664, + "step": 11046, + "time_per_iteration": 2.8210699558258057 + }, + { + "auxiliary_loss_clip": 0.01343314, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.23599184, + "balance_loss_mlp": 1.01611471, + "epoch": 0.6641815722230573, + "flos": 37749678140160.0, + "grad_norm": 1.761036129211659, + "language_loss": 0.71730894, + "learning_rate": 1.070726085914088e-06, + "loss": 0.74103141, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1282959, + "step": 11047, + "time_per_iteration": 4.4105775356292725 + }, + { + "auxiliary_loss_clip": 0.01350689, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.24279976, + "balance_loss_mlp": 1.01711822, + "epoch": 0.6642416954757252, + "flos": 17935643739240.0, + "grad_norm": 1.962060786036905, + "language_loss": 0.77321798, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79702866, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13262939, + "step": 11048, + "time_per_iteration": 2.8391010761260986 + }, + { + "auxiliary_loss_clip": 0.01170263, + "auxiliary_loss_mlp": 0.01007636, + "balance_loss_clip": 1.12639761, + "balance_loss_mlp": 1.00522828, + "epoch": 0.6643018187283932, + "flos": 52007351047560.0, + "grad_norm": 0.7451509948327153, + "language_loss": 0.55067587, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57245481, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02404785, + "step": 11049, + "time_per_iteration": 3.2822577953338623 + }, + { + "auxiliary_loss_clip": 0.01344697, + "auxiliary_loss_mlp": 0.01026714, + "balance_loss_clip": 1.23757935, + "balance_loss_mlp": 1.01440585, + "epoch": 0.6643619419810611, + "flos": 30232751645520.0, + "grad_norm": 2.2100997808312264, + "language_loss": 0.64715749, + "learning_rate": 1.069691638104648e-06, + "loss": 0.67087162, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12304688, + "step": 11050, + "time_per_iteration": 2.8864011764526367 + }, + { + "auxiliary_loss_clip": 0.01340123, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.23468602, + "balance_loss_mlp": 1.01728606, + "epoch": 0.6644220652337292, + "flos": 22971295300320.0, + "grad_norm": 2.4620977890377245, + "language_loss": 0.79777217, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.8214705, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12426758, + "step": 11051, + "time_per_iteration": 2.796206474304199 + }, + { + "auxiliary_loss_clip": 0.01347169, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.23869514, + "balance_loss_mlp": 1.01742327, + "epoch": 0.6644821884863971, + "flos": 21147204661920.0, + "grad_norm": 1.6465165051520758, + "language_loss": 0.85911781, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.88289052, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12689209, + "step": 11052, + "time_per_iteration": 2.789944887161255 + }, + { + "auxiliary_loss_clip": 0.01352232, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.24028528, + "balance_loss_mlp": 1.01649141, + "epoch": 0.6645423117390651, + "flos": 20197572982560.0, + "grad_norm": 3.1881381948003717, + "language_loss": 0.75265133, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.77647889, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14031982, + "step": 11053, + "time_per_iteration": 2.796922445297241 + }, + { + "auxiliary_loss_clip": 0.01341153, + "auxiliary_loss_mlp": 0.01026192, + "balance_loss_clip": 1.23542237, + "balance_loss_mlp": 1.01351428, + "epoch": 0.6646024349917331, + "flos": 24357222466920.0, + "grad_norm": 1.5338547938533174, + "language_loss": 0.79707211, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.82074559, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12683105, + "step": 11054, + "time_per_iteration": 2.8330724239349365 + }, + { + "auxiliary_loss_clip": 0.01342051, + "auxiliary_loss_mlp": 0.01025546, + "balance_loss_clip": 1.23594248, + "balance_loss_mlp": 1.01333332, + "epoch": 0.664662558244401, + "flos": 18811320949080.0, + "grad_norm": 3.399520459978607, + "language_loss": 0.74301445, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76669037, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12194824, + "step": 11055, + "time_per_iteration": 2.7540078163146973 + }, + { + "auxiliary_loss_clip": 0.01351229, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.24185467, + "balance_loss_mlp": 1.01962113, + "epoch": 0.664722681497069, + "flos": 18957443120280.0, + "grad_norm": 2.7522357549825776, + "language_loss": 0.73137939, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75523579, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.14764404, + "step": 11056, + "time_per_iteration": 4.481196165084839 + }, + { + "auxiliary_loss_clip": 0.01345473, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.23940337, + "balance_loss_mlp": 1.01555634, + "epoch": 0.6647828047497369, + "flos": 19575782196120.0, + "grad_norm": 1.7910756067647595, + "language_loss": 0.69798028, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.72172058, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13006592, + "step": 11057, + "time_per_iteration": 2.9087178707122803 + }, + { + "auxiliary_loss_clip": 0.01348817, + "auxiliary_loss_mlp": 0.01028767, + "balance_loss_clip": 1.24061072, + "balance_loss_mlp": 1.01569617, + "epoch": 0.664842928002405, + "flos": 23154760181520.0, + "grad_norm": 1.8396868584857742, + "language_loss": 0.80636656, + "learning_rate": 1.066934663776291e-06, + "loss": 0.83014238, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13067627, + "step": 11058, + "time_per_iteration": 2.879638910293579 + }, + { + "auxiliary_loss_clip": 0.01165623, + "auxiliary_loss_mlp": 0.0100194, + "balance_loss_clip": 1.12217104, + "balance_loss_mlp": 0.99949664, + "epoch": 0.6649030512550729, + "flos": 65259288503280.0, + "grad_norm": 0.8000990440710919, + "language_loss": 0.6261903, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64786589, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02441406, + "step": 11059, + "time_per_iteration": 3.1938838958740234 + }, + { + "auxiliary_loss_clip": 0.01342506, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.23621523, + "balance_loss_mlp": 1.01674438, + "epoch": 0.6649631745077409, + "flos": 20199806442360.0, + "grad_norm": 1.5210828839480819, + "language_loss": 0.78751624, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81123108, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12231445, + "step": 11060, + "time_per_iteration": 2.9791741371154785 + }, + { + "auxiliary_loss_clip": 0.01347545, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.23968863, + "balance_loss_mlp": 1.01741862, + "epoch": 0.6650232977604088, + "flos": 17243065935360.0, + "grad_norm": 2.025109951868451, + "language_loss": 0.78951901, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81329656, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12786865, + "step": 11061, + "time_per_iteration": 2.6996870040893555 + }, + { + "auxiliary_loss_clip": 0.01348972, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.24269509, + "balance_loss_mlp": 1.01081359, + "epoch": 0.6650834210130768, + "flos": 10009449943920.0, + "grad_norm": 1.9838689269928795, + "language_loss": 0.57493079, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59865248, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1237793, + "step": 11062, + "time_per_iteration": 2.744843006134033 + }, + { + "auxiliary_loss_clip": 0.01352026, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.2393837, + "balance_loss_mlp": 1.01207662, + "epoch": 0.6651435442657447, + "flos": 10455694479360.0, + "grad_norm": 2.4775195006640343, + "language_loss": 0.76729572, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.79107738, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14080811, + "step": 11063, + "time_per_iteration": 2.7201249599456787 + }, + { + "auxiliary_loss_clip": 0.01352796, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.24458456, + "balance_loss_mlp": 1.02411532, + "epoch": 0.6652036675184128, + "flos": 22349098430280.0, + "grad_norm": 1.3757194363424101, + "language_loss": 0.70892358, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.73281956, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12689209, + "step": 11064, + "time_per_iteration": 2.828070640563965 + }, + { + "auxiliary_loss_clip": 0.01171087, + "auxiliary_loss_mlp": 0.01006694, + "balance_loss_clip": 1.12787104, + "balance_loss_mlp": 1.00404763, + "epoch": 0.6652637907710807, + "flos": 52921995301800.0, + "grad_norm": 0.8474631170224253, + "language_loss": 0.63064069, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65241849, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02648926, + "step": 11065, + "time_per_iteration": 3.2068517208099365 + }, + { + "auxiliary_loss_clip": 0.01353704, + "auxiliary_loss_mlp": 0.01029155, + "balance_loss_clip": 1.24525142, + "balance_loss_mlp": 1.01646483, + "epoch": 0.6653239140237487, + "flos": 23108849107560.0, + "grad_norm": 1.5410402324712462, + "language_loss": 0.62344658, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64727515, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12683105, + "step": 11066, + "time_per_iteration": 2.8032310009002686 + }, + { + "auxiliary_loss_clip": 0.01353665, + "auxiliary_loss_mlp": 0.01025532, + "balance_loss_clip": 1.24352646, + "balance_loss_mlp": 1.01196575, + "epoch": 0.6653840372764167, + "flos": 25965134867160.0, + "grad_norm": 1.4653689649611492, + "language_loss": 0.70028794, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72407991, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13562012, + "step": 11067, + "time_per_iteration": 2.8302969932556152 + }, + { + "auxiliary_loss_clip": 0.01169871, + "auxiliary_loss_mlp": 0.01008541, + "balance_loss_clip": 1.12616086, + "balance_loss_mlp": 1.00588286, + "epoch": 0.6654441605290846, + "flos": 66055732156800.0, + "grad_norm": 0.9145881241605457, + "language_loss": 0.72095257, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.7427367, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02661133, + "step": 11068, + "time_per_iteration": 3.171086311340332 + }, + { + "auxiliary_loss_clip": 0.01170762, + "auxiliary_loss_mlp": 0.01001692, + "balance_loss_clip": 1.12628305, + "balance_loss_mlp": 0.99909347, + "epoch": 0.6655042837817526, + "flos": 65213638060680.0, + "grad_norm": 0.7088034744462343, + "language_loss": 0.57826155, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59998608, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02600098, + "step": 11069, + "time_per_iteration": 3.361241579055786 + }, + { + "auxiliary_loss_clip": 0.0116827, + "auxiliary_loss_mlp": 0.01001326, + "balance_loss_clip": 1.12436712, + "balance_loss_mlp": 0.99857217, + "epoch": 0.6655644070344205, + "flos": 69024370913280.0, + "grad_norm": 0.7713352154271402, + "language_loss": 0.63585722, + "learning_rate": 1.062803450204029e-06, + "loss": 0.6575532, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.02758789, + "step": 11070, + "time_per_iteration": 3.3822555541992188 + }, + { + "auxiliary_loss_clip": 0.01344861, + "auxiliary_loss_mlp": 0.01026253, + "balance_loss_clip": 1.23663068, + "balance_loss_mlp": 1.013587, + "epoch": 0.6656245302870886, + "flos": 36321900735600.0, + "grad_norm": 1.6236720109904774, + "language_loss": 0.58815002, + "learning_rate": 1.062459413096116e-06, + "loss": 0.61186111, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12677002, + "step": 11071, + "time_per_iteration": 3.153409242630005 + }, + { + "auxiliary_loss_clip": 0.01346433, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.23945069, + "balance_loss_mlp": 1.01535034, + "epoch": 0.6656846535397565, + "flos": 21799353520440.0, + "grad_norm": 1.9888917502289687, + "language_loss": 0.73354077, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75728452, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12591553, + "step": 11072, + "time_per_iteration": 2.7945303916931152 + }, + { + "auxiliary_loss_clip": 0.01339585, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.23455572, + "balance_loss_mlp": 1.0165931, + "epoch": 0.6657447767924245, + "flos": 37494695291040.0, + "grad_norm": 1.6744343409147406, + "language_loss": 0.71194571, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.73564565, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.13824463, + "step": 11073, + "time_per_iteration": 2.924969434738159 + }, + { + "auxiliary_loss_clip": 0.01353541, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.24280405, + "balance_loss_mlp": 1.01614213, + "epoch": 0.6658049000450924, + "flos": 16842204565200.0, + "grad_norm": 1.9770097170920982, + "language_loss": 0.56323457, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58706391, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13244629, + "step": 11074, + "time_per_iteration": 2.8113033771514893 + }, + { + "auxiliary_loss_clip": 0.01344381, + "auxiliary_loss_mlp": 0.0102777, + "balance_loss_clip": 1.2380867, + "balance_loss_mlp": 1.01487756, + "epoch": 0.6658650232977604, + "flos": 33517576695600.0, + "grad_norm": 1.4574667389787792, + "language_loss": 0.72539884, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74912035, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12890625, + "step": 11075, + "time_per_iteration": 2.856207847595215 + }, + { + "auxiliary_loss_clip": 0.01341393, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.23648357, + "balance_loss_mlp": 1.01575756, + "epoch": 0.6659251465504283, + "flos": 37713391247520.0, + "grad_norm": 1.9823847751064716, + "language_loss": 0.66369748, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.687392, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12310791, + "step": 11076, + "time_per_iteration": 2.926015615463257 + }, + { + "auxiliary_loss_clip": 0.01344622, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.23825788, + "balance_loss_mlp": 1.01564026, + "epoch": 0.6659852698030964, + "flos": 24897992929200.0, + "grad_norm": 1.5901779899449966, + "language_loss": 0.75307381, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77680999, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13360596, + "step": 11077, + "time_per_iteration": 2.828404664993286 + }, + { + "auxiliary_loss_clip": 0.01343809, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.23652315, + "balance_loss_mlp": 1.0184201, + "epoch": 0.6660453930557643, + "flos": 24358197067560.0, + "grad_norm": 1.6200597564457564, + "language_loss": 0.66865408, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.6924026, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.1262207, + "step": 11078, + "time_per_iteration": 2.872981071472168 + }, + { + "auxiliary_loss_clip": 0.01351542, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.24147165, + "balance_loss_mlp": 1.01803589, + "epoch": 0.6661055163084323, + "flos": 10601410566960.0, + "grad_norm": 2.0127266062502405, + "language_loss": 0.69635367, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.7201879, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1385498, + "step": 11079, + "time_per_iteration": 2.734593152999878 + }, + { + "auxiliary_loss_clip": 0.01344129, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.23756468, + "balance_loss_mlp": 1.01310027, + "epoch": 0.6661656395611003, + "flos": 24062379189480.0, + "grad_norm": 1.461770741899833, + "language_loss": 0.80317628, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82687283, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12432861, + "step": 11080, + "time_per_iteration": 2.7632999420166016 + }, + { + "auxiliary_loss_clip": 0.01334535, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.23121428, + "balance_loss_mlp": 1.01549387, + "epoch": 0.6662257628137682, + "flos": 23040985892040.0, + "grad_norm": 1.6214005761797887, + "language_loss": 0.78321755, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80683821, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.12036133, + "step": 11081, + "time_per_iteration": 2.835564374923706 + }, + { + "auxiliary_loss_clip": 0.01348585, + "auxiliary_loss_mlp": 0.01025858, + "balance_loss_clip": 1.23949599, + "balance_loss_mlp": 1.01223826, + "epoch": 0.6662858860664362, + "flos": 24759911213280.0, + "grad_norm": 2.14704892595807, + "language_loss": 0.8032195, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82696396, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13598633, + "step": 11082, + "time_per_iteration": 2.832305431365967 + }, + { + "auxiliary_loss_clip": 0.01340767, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.23440564, + "balance_loss_mlp": 1.0195787, + "epoch": 0.6663460093191041, + "flos": 20013620801040.0, + "grad_norm": 1.484829716766624, + "language_loss": 0.83662307, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86035061, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12408447, + "step": 11083, + "time_per_iteration": 4.389414310455322 + }, + { + "auxiliary_loss_clip": 0.01357728, + "auxiliary_loss_mlp": 0.01030914, + "balance_loss_clip": 1.24675834, + "balance_loss_mlp": 1.0172286, + "epoch": 0.6664061325717722, + "flos": 17825849069040.0, + "grad_norm": 2.3578879780070428, + "language_loss": 0.85879153, + "learning_rate": 1.057990170638731e-06, + "loss": 0.88267791, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13684082, + "step": 11084, + "time_per_iteration": 4.182518243789673 + }, + { + "auxiliary_loss_clip": 0.01349875, + "auxiliary_loss_mlp": 0.01025204, + "balance_loss_clip": 1.23970759, + "balance_loss_mlp": 1.01178694, + "epoch": 0.6664662558244401, + "flos": 18081400435200.0, + "grad_norm": 4.128594690462965, + "language_loss": 0.74378312, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.7675339, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.13397217, + "step": 11085, + "time_per_iteration": 2.7770142555236816 + }, + { + "auxiliary_loss_clip": 0.0134678, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.23981416, + "balance_loss_mlp": 1.01922226, + "epoch": 0.6665263790771081, + "flos": 21578464712520.0, + "grad_norm": 2.4036039503734217, + "language_loss": 0.80409628, + "learning_rate": 1.057303129975894e-06, + "loss": 0.8278898, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13366699, + "step": 11086, + "time_per_iteration": 4.187839508056641 + }, + { + "auxiliary_loss_clip": 0.013419, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_clip": 1.23560345, + "balance_loss_mlp": 1.01514363, + "epoch": 0.666586502329776, + "flos": 24211953071280.0, + "grad_norm": 1.9488648516259852, + "language_loss": 0.74540722, + "learning_rate": 1.056959663258702e-06, + "loss": 0.76911533, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.13775635, + "step": 11087, + "time_per_iteration": 2.8132529258728027 + }, + { + "auxiliary_loss_clip": 0.01343534, + "auxiliary_loss_mlp": 0.01032541, + "balance_loss_clip": 1.23686004, + "balance_loss_mlp": 1.01922548, + "epoch": 0.666646625582444, + "flos": 22205656410840.0, + "grad_norm": 1.8543573433055893, + "language_loss": 0.65482986, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67859066, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13323975, + "step": 11088, + "time_per_iteration": 2.7803752422332764 + }, + { + "auxiliary_loss_clip": 0.0134863, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.24028158, + "balance_loss_mlp": 1.01636553, + "epoch": 0.6667067488351119, + "flos": 18264540449520.0, + "grad_norm": 1.6665882806152, + "language_loss": 0.64185929, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66565335, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.14422607, + "step": 11089, + "time_per_iteration": 2.7573111057281494 + }, + { + "auxiliary_loss_clip": 0.01341464, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.23701572, + "balance_loss_mlp": 1.01729226, + "epoch": 0.66676687208778, + "flos": 17240791867200.0, + "grad_norm": 2.146978390484903, + "language_loss": 0.81261921, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83632755, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12091064, + "step": 11090, + "time_per_iteration": 2.805652618408203 + }, + { + "auxiliary_loss_clip": 0.01347551, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.23728645, + "balance_loss_mlp": 1.01719141, + "epoch": 0.6668269953404479, + "flos": 19756526317200.0, + "grad_norm": 1.9364805027299763, + "language_loss": 0.77511311, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79888779, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12731934, + "step": 11091, + "time_per_iteration": 2.737236976623535 + }, + { + "auxiliary_loss_clip": 0.013385, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.23112965, + "balance_loss_mlp": 1.0167377, + "epoch": 0.6668871185931159, + "flos": 20563325102520.0, + "grad_norm": 1.9022966132974641, + "language_loss": 0.79631078, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81999516, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13214111, + "step": 11092, + "time_per_iteration": 2.8076071739196777 + }, + { + "auxiliary_loss_clip": 0.01166121, + "auxiliary_loss_mlp": 0.01001293, + "balance_loss_clip": 1.12196326, + "balance_loss_mlp": 0.99833632, + "epoch": 0.6669472418457839, + "flos": 58100885840520.0, + "grad_norm": 0.7643342444070869, + "language_loss": 0.57696378, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59863794, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02954102, + "step": 11093, + "time_per_iteration": 3.3145980834960938 + }, + { + "auxiliary_loss_clip": 0.01339883, + "auxiliary_loss_mlp": 0.01028955, + "balance_loss_clip": 1.23437393, + "balance_loss_mlp": 1.01594901, + "epoch": 0.6670073650984518, + "flos": 26070381401040.0, + "grad_norm": 1.528781479555066, + "language_loss": 0.76574665, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78943503, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13012695, + "step": 11094, + "time_per_iteration": 3.0479907989501953 + }, + { + "auxiliary_loss_clip": 0.01342328, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.2348125, + "balance_loss_mlp": 1.02029836, + "epoch": 0.6670674883511198, + "flos": 32423365962720.0, + "grad_norm": 1.7853644114238183, + "language_loss": 0.73088551, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75464463, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13299561, + "step": 11095, + "time_per_iteration": 4.3668248653411865 + }, + { + "auxiliary_loss_clip": 0.0134294, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.23740506, + "balance_loss_mlp": 1.023664, + "epoch": 0.6671276116037878, + "flos": 18041946090480.0, + "grad_norm": 1.9335103635069915, + "language_loss": 0.7364791, + "learning_rate": 1.053870073574727e-06, + "loss": 0.76027393, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12890625, + "step": 11096, + "time_per_iteration": 2.7839577198028564 + }, + { + "auxiliary_loss_clip": 0.01333881, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.23021996, + "balance_loss_mlp": 1.0190208, + "epoch": 0.6671877348564558, + "flos": 23772084048360.0, + "grad_norm": 1.8309169583556117, + "language_loss": 0.64854866, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.67220199, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.12426758, + "step": 11097, + "time_per_iteration": 2.7708301544189453 + }, + { + "auxiliary_loss_clip": 0.01350799, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.24055016, + "balance_loss_mlp": 1.02362001, + "epoch": 0.6672478581091237, + "flos": 20922661101600.0, + "grad_norm": 1.7712771756440442, + "language_loss": 0.75851667, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.78238803, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12713623, + "step": 11098, + "time_per_iteration": 2.8051302433013916 + }, + { + "auxiliary_loss_clip": 0.01346929, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.23812485, + "balance_loss_mlp": 1.02061129, + "epoch": 0.6673079813617917, + "flos": 27861677465760.0, + "grad_norm": 1.8796005872956656, + "language_loss": 0.74570525, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76950455, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1239624, + "step": 11099, + "time_per_iteration": 2.792268753051758 + }, + { + "auxiliary_loss_clip": 0.01332516, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.22881532, + "balance_loss_mlp": 1.01840162, + "epoch": 0.6673681046144596, + "flos": 21621939284880.0, + "grad_norm": 1.8313428161686995, + "language_loss": 0.78424358, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80787516, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.12219238, + "step": 11100, + "time_per_iteration": 2.883171319961548 + }, + { + "auxiliary_loss_clip": 0.01342164, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.23633552, + "balance_loss_mlp": 1.02262175, + "epoch": 0.6674282278671276, + "flos": 20895226831440.0, + "grad_norm": 1.6796698272341375, + "language_loss": 0.60424894, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62801945, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1227417, + "step": 11101, + "time_per_iteration": 2.75504207611084 + }, + { + "auxiliary_loss_clip": 0.01358954, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.24461365, + "balance_loss_mlp": 1.02457833, + "epoch": 0.6674883511197955, + "flos": 23629616629560.0, + "grad_norm": 1.824207277762817, + "language_loss": 0.71481395, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.7387948, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 1.14404297, + "router_z_loss_mlp": 0.14550781, + "step": 11102, + "time_per_iteration": 2.79121994972229 + }, + { + "auxiliary_loss_clip": 0.01344332, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.23601246, + "balance_loss_mlp": 1.01937687, + "epoch": 0.6675484743724636, + "flos": 19614424373640.0, + "grad_norm": 1.55913702492734, + "language_loss": 0.84889364, + "learning_rate": 1.051469068021034e-06, + "loss": 0.8726579, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12731934, + "step": 11103, + "time_per_iteration": 2.720072031021118 + }, + { + "auxiliary_loss_clip": 0.01341891, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.23376679, + "balance_loss_mlp": 1.01712692, + "epoch": 0.6676085976251315, + "flos": 14323546313280.0, + "grad_norm": 1.9253906926883366, + "language_loss": 0.78523374, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80893981, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.11584473, + "step": 11104, + "time_per_iteration": 2.7325024604797363 + }, + { + "auxiliary_loss_clip": 0.0135174, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.2403779, + "balance_loss_mlp": 1.01878142, + "epoch": 0.6676687208777995, + "flos": 38111328815760.0, + "grad_norm": 1.7178824679256588, + "language_loss": 0.58428454, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60811722, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12756348, + "step": 11105, + "time_per_iteration": 2.958812713623047 + }, + { + "auxiliary_loss_clip": 0.01352434, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.24011242, + "balance_loss_mlp": 1.01963818, + "epoch": 0.6677288441304675, + "flos": 23986069435080.0, + "grad_norm": 1.6036276675299699, + "language_loss": 0.73542398, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75928533, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14050293, + "step": 11106, + "time_per_iteration": 2.976999282836914 + }, + { + "auxiliary_loss_clip": 0.0134213, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.23513329, + "balance_loss_mlp": 1.01851201, + "epoch": 0.6677889673831354, + "flos": 24176072262240.0, + "grad_norm": 2.049544535669703, + "language_loss": 0.76929975, + "learning_rate": 1.0500978558659e-06, + "loss": 0.79303437, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1282959, + "step": 11107, + "time_per_iteration": 2.8696813583374023 + }, + { + "auxiliary_loss_clip": 0.01330295, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.22792578, + "balance_loss_mlp": 1.01716602, + "epoch": 0.6678490906358034, + "flos": 22314760738920.0, + "grad_norm": 2.4047763278820335, + "language_loss": 0.90078574, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92438126, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.12084961, + "step": 11108, + "time_per_iteration": 2.8110105991363525 + }, + { + "auxiliary_loss_clip": 0.01335894, + "auxiliary_loss_mlp": 0.01025323, + "balance_loss_clip": 1.23138428, + "balance_loss_mlp": 1.01414704, + "epoch": 0.6679092138884714, + "flos": 36905089952880.0, + "grad_norm": 1.4568088950661255, + "language_loss": 0.82914507, + "learning_rate": 1.049412465858646e-06, + "loss": 0.85275722, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.11181641, + "step": 11109, + "time_per_iteration": 3.055443048477173 + }, + { + "auxiliary_loss_clip": 0.01337828, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.23164248, + "balance_loss_mlp": 1.0168066, + "epoch": 0.6679693371411394, + "flos": 18154867604400.0, + "grad_norm": 1.9353374524049205, + "language_loss": 0.69845021, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.72212243, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12585449, + "step": 11110, + "time_per_iteration": 2.733267307281494 + }, + { + "auxiliary_loss_clip": 0.01341786, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.23155499, + "balance_loss_mlp": 1.01766062, + "epoch": 0.6680294603938073, + "flos": 27203843436840.0, + "grad_norm": 1.540775014321194, + "language_loss": 0.7319622, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75569534, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13867188, + "step": 11111, + "time_per_iteration": 2.8842012882232666 + }, + { + "auxiliary_loss_clip": 0.01336442, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.23084891, + "balance_loss_mlp": 1.0173595, + "epoch": 0.6680895836464753, + "flos": 21730272054120.0, + "grad_norm": 2.2448068856379053, + "language_loss": 0.66060287, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.6842618, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12078857, + "step": 11112, + "time_per_iteration": 2.7681753635406494 + }, + { + "auxiliary_loss_clip": 0.01335921, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.22939491, + "balance_loss_mlp": 1.01754558, + "epoch": 0.6681497068991432, + "flos": 19651279783320.0, + "grad_norm": 1.9152772112905514, + "language_loss": 0.63207376, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65573299, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12451172, + "step": 11113, + "time_per_iteration": 2.7881243228912354 + }, + { + "auxiliary_loss_clip": 0.01331072, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.22682667, + "balance_loss_mlp": 1.01619375, + "epoch": 0.6682098301518112, + "flos": 17423444581200.0, + "grad_norm": 2.0873395198028915, + "language_loss": 0.66487807, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68847036, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.11962891, + "step": 11114, + "time_per_iteration": 2.8948590755462646 + }, + { + "auxiliary_loss_clip": 0.01338301, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.23239183, + "balance_loss_mlp": 1.0210042, + "epoch": 0.6682699534044791, + "flos": 22603837629240.0, + "grad_norm": 2.110388213056616, + "language_loss": 0.78563017, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80935371, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13061523, + "step": 11115, + "time_per_iteration": 2.7964589595794678 + }, + { + "auxiliary_loss_clip": 0.01338186, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.23003769, + "balance_loss_mlp": 1.01862597, + "epoch": 0.6683300766571472, + "flos": 24869137366440.0, + "grad_norm": 1.6598417361820244, + "language_loss": 0.80045736, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.82414901, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12341309, + "step": 11116, + "time_per_iteration": 2.8764193058013916 + }, + { + "auxiliary_loss_clip": 0.01343013, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.23384356, + "balance_loss_mlp": 1.01537669, + "epoch": 0.6683901999098151, + "flos": 27132487902360.0, + "grad_norm": 2.7790397283800057, + "language_loss": 0.79812682, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.82185096, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.14025879, + "step": 11117, + "time_per_iteration": 2.810687303543091 + }, + { + "auxiliary_loss_clip": 0.01340567, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.23198783, + "balance_loss_mlp": 1.01635218, + "epoch": 0.6684503231624831, + "flos": 20743703748360.0, + "grad_norm": 1.7204261434403774, + "language_loss": 0.66100025, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.68470913, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13964844, + "step": 11118, + "time_per_iteration": 2.9443979263305664 + }, + { + "auxiliary_loss_clip": 0.01338851, + "auxiliary_loss_mlp": 0.01022596, + "balance_loss_clip": 1.23295474, + "balance_loss_mlp": 1.01075244, + "epoch": 0.668510446415151, + "flos": 21767208680520.0, + "grad_norm": 1.5883585571313534, + "language_loss": 0.69327229, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71688676, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.11828613, + "step": 11119, + "time_per_iteration": 2.8031232357025146 + }, + { + "auxiliary_loss_clip": 0.01340001, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.23262119, + "balance_loss_mlp": 1.0154078, + "epoch": 0.668570569667819, + "flos": 30197479961880.0, + "grad_norm": 1.8710792839706194, + "language_loss": 0.67674619, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.70042574, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12548828, + "step": 11120, + "time_per_iteration": 4.249925851821899 + }, + { + "auxiliary_loss_clip": 0.01339369, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.2328496, + "balance_loss_mlp": 1.01828146, + "epoch": 0.668630692920487, + "flos": 24176031653880.0, + "grad_norm": 1.8479865548624084, + "language_loss": 0.72795701, + "learning_rate": 1.045303157347638e-06, + "loss": 0.75166678, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13323975, + "step": 11121, + "time_per_iteration": 2.811338186264038 + }, + { + "auxiliary_loss_clip": 0.01341783, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.23302901, + "balance_loss_mlp": 1.01730072, + "epoch": 0.668690816173155, + "flos": 17461640066760.0, + "grad_norm": 3.4071352164370676, + "language_loss": 0.70151913, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72523701, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12713623, + "step": 11122, + "time_per_iteration": 4.128631830215454 + }, + { + "auxiliary_loss_clip": 0.01344057, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.23613989, + "balance_loss_mlp": 1.02316356, + "epoch": 0.668750939425823, + "flos": 25009980450840.0, + "grad_norm": 1.7116606129760596, + "language_loss": 0.71512371, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73892546, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1295166, + "step": 11123, + "time_per_iteration": 2.7516064643859863 + }, + { + "auxiliary_loss_clip": 0.01348994, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.23970985, + "balance_loss_mlp": 1.01994848, + "epoch": 0.6688110626784909, + "flos": 24102036576000.0, + "grad_norm": 1.8205042586493068, + "language_loss": 0.79057038, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81439579, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13604736, + "step": 11124, + "time_per_iteration": 2.805955171585083 + }, + { + "auxiliary_loss_clip": 0.01339468, + "auxiliary_loss_mlp": 0.01036829, + "balance_loss_clip": 1.23335624, + "balance_loss_mlp": 1.02402592, + "epoch": 0.6688711859311589, + "flos": 21764203661880.0, + "grad_norm": 1.6115635895255112, + "language_loss": 0.74596292, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76972586, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12805176, + "step": 11125, + "time_per_iteration": 4.145724296569824 + }, + { + "auxiliary_loss_clip": 0.01345595, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.23741031, + "balance_loss_mlp": 1.01589251, + "epoch": 0.6689313091838268, + "flos": 22934805365880.0, + "grad_norm": 1.9792270484349817, + "language_loss": 0.66855979, + "learning_rate": 1.043592482774116e-06, + "loss": 0.69230258, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12805176, + "step": 11126, + "time_per_iteration": 2.8017711639404297 + }, + { + "auxiliary_loss_clip": 0.01340613, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.23336911, + "balance_loss_mlp": 1.01537013, + "epoch": 0.6689914324364948, + "flos": 20891003562000.0, + "grad_norm": 1.6471528534209998, + "language_loss": 0.71119261, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73488116, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12866211, + "step": 11127, + "time_per_iteration": 2.8350086212158203 + }, + { + "auxiliary_loss_clip": 0.01346887, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.23640585, + "balance_loss_mlp": 1.01696301, + "epoch": 0.6690515556891627, + "flos": 22753492727760.0, + "grad_norm": 1.992202532830834, + "language_loss": 0.80404866, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82782954, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14239502, + "step": 11128, + "time_per_iteration": 2.7782492637634277 + }, + { + "auxiliary_loss_clip": 0.01347821, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.23835969, + "balance_loss_mlp": 1.0146631, + "epoch": 0.6691116789418308, + "flos": 23336722553400.0, + "grad_norm": 1.96447627934551, + "language_loss": 0.81347334, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83722961, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13140869, + "step": 11129, + "time_per_iteration": 2.971273183822632 + }, + { + "auxiliary_loss_clip": 0.01326182, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.22448635, + "balance_loss_mlp": 1.01751304, + "epoch": 0.6691718021944987, + "flos": 32452708825800.0, + "grad_norm": 1.4800778202587943, + "language_loss": 0.70512104, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72867483, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.11688232, + "step": 11130, + "time_per_iteration": 2.922774076461792 + }, + { + "auxiliary_loss_clip": 0.01330988, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.22732449, + "balance_loss_mlp": 1.01769781, + "epoch": 0.6692319254471667, + "flos": 23736771756360.0, + "grad_norm": 1.7211583960771515, + "language_loss": 0.70044726, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72405469, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.12054443, + "step": 11131, + "time_per_iteration": 2.8590128421783447 + }, + { + "auxiliary_loss_clip": 0.01339181, + "auxiliary_loss_mlp": 0.01024501, + "balance_loss_clip": 1.23082268, + "balance_loss_mlp": 1.01131058, + "epoch": 0.6692920486998346, + "flos": 14431879082520.0, + "grad_norm": 2.341126662934668, + "language_loss": 0.6630733, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.68671018, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13189697, + "step": 11132, + "time_per_iteration": 2.753500461578369 + }, + { + "auxiliary_loss_clip": 0.01344454, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.23581457, + "balance_loss_mlp": 1.02070761, + "epoch": 0.6693521719525026, + "flos": 21512550698280.0, + "grad_norm": 1.6731671702580617, + "language_loss": 0.74661887, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.77040607, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13543701, + "step": 11133, + "time_per_iteration": 4.344938278198242 + }, + { + "auxiliary_loss_clip": 0.01350106, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.23927474, + "balance_loss_mlp": 1.01625872, + "epoch": 0.6694122952051706, + "flos": 25411938246720.0, + "grad_norm": 1.8683403175897084, + "language_loss": 0.67060095, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.69440657, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14202881, + "step": 11134, + "time_per_iteration": 2.7996273040771484 + }, + { + "auxiliary_loss_clip": 0.01344353, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.23290348, + "balance_loss_mlp": 1.0177598, + "epoch": 0.6694724184578386, + "flos": 25666677445680.0, + "grad_norm": 1.7585622356952433, + "language_loss": 0.77377081, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79753256, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14074707, + "step": 11135, + "time_per_iteration": 2.804982900619507 + }, + { + "auxiliary_loss_clip": 0.0133412, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.22957301, + "balance_loss_mlp": 1.01339531, + "epoch": 0.6695325417105066, + "flos": 17713414855440.0, + "grad_norm": 2.6575345481998855, + "language_loss": 0.74598563, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76958996, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12921143, + "step": 11136, + "time_per_iteration": 2.7459681034088135 + }, + { + "auxiliary_loss_clip": 0.01354708, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.2437973, + "balance_loss_mlp": 1.01903713, + "epoch": 0.6695926649631745, + "flos": 24465149152560.0, + "grad_norm": 1.524118377710354, + "language_loss": 0.62770391, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.65157962, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13818359, + "step": 11137, + "time_per_iteration": 2.7914087772369385 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.23276329, + "balance_loss_mlp": 1.01660585, + "epoch": 0.6696527882158425, + "flos": 24285542065560.0, + "grad_norm": 1.9477484816317792, + "language_loss": 0.66045398, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.68415415, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.1315918, + "step": 11138, + "time_per_iteration": 2.7519283294677734 + }, + { + "auxiliary_loss_clip": 0.01329092, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.22561955, + "balance_loss_mlp": 1.01473868, + "epoch": 0.6697129114685104, + "flos": 23007947668200.0, + "grad_norm": 1.6756656725472794, + "language_loss": 0.72968483, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75325155, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12854004, + "step": 11139, + "time_per_iteration": 2.7597107887268066 + }, + { + "auxiliary_loss_clip": 0.01326581, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.22463155, + "balance_loss_mlp": 1.01514781, + "epoch": 0.6697730347211784, + "flos": 22643291973960.0, + "grad_norm": 1.768923539829714, + "language_loss": 0.7039364, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72747231, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.11846924, + "step": 11140, + "time_per_iteration": 2.7660744190216064 + }, + { + "auxiliary_loss_clip": 0.01340017, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.23118424, + "balance_loss_mlp": 1.0158174, + "epoch": 0.6698331579738463, + "flos": 28883233196640.0, + "grad_norm": 1.8343987572040574, + "language_loss": 0.75684655, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.78054017, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13525391, + "step": 11141, + "time_per_iteration": 2.9584810733795166 + }, + { + "auxiliary_loss_clip": 0.01342213, + "auxiliary_loss_mlp": 0.01031371, + "balance_loss_clip": 1.23460102, + "balance_loss_mlp": 1.01819205, + "epoch": 0.6698932812265144, + "flos": 24212480979960.0, + "grad_norm": 1.7631637351877996, + "language_loss": 0.82806838, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.8518042, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1317749, + "step": 11142, + "time_per_iteration": 2.8942863941192627 + }, + { + "auxiliary_loss_clip": 0.01331865, + "auxiliary_loss_mlp": 0.01022216, + "balance_loss_clip": 1.22642851, + "balance_loss_mlp": 1.01025927, + "epoch": 0.6699534044791823, + "flos": 22095293223600.0, + "grad_norm": 1.4460244400046816, + "language_loss": 0.70243722, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72597808, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.11956787, + "step": 11143, + "time_per_iteration": 2.781269073486328 + }, + { + "auxiliary_loss_clip": 0.01329301, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.22652674, + "balance_loss_mlp": 1.01767337, + "epoch": 0.6700135277318503, + "flos": 25197912251640.0, + "grad_norm": 1.6361317486800286, + "language_loss": 0.70496035, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.72854555, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.11541748, + "step": 11144, + "time_per_iteration": 2.8580057621002197 + }, + { + "auxiliary_loss_clip": 0.01333818, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.22925019, + "balance_loss_mlp": 1.01664472, + "epoch": 0.6700736509845182, + "flos": 23445380189520.0, + "grad_norm": 1.601055187029838, + "language_loss": 0.7466197, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.77025861, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.13415527, + "step": 11145, + "time_per_iteration": 2.724248170852661 + }, + { + "auxiliary_loss_clip": 0.01342231, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.2339927, + "balance_loss_mlp": 1.01543045, + "epoch": 0.6701337742371862, + "flos": 24395417952480.0, + "grad_norm": 1.5357506129711431, + "language_loss": 0.71038949, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.73409951, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13330078, + "step": 11146, + "time_per_iteration": 2.784482479095459 + }, + { + "auxiliary_loss_clip": 0.01323737, + "auxiliary_loss_mlp": 0.01026286, + "balance_loss_clip": 1.22258699, + "balance_loss_mlp": 1.01375103, + "epoch": 0.6701938974898543, + "flos": 14797793635920.0, + "grad_norm": 2.023167149734756, + "language_loss": 0.78745687, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.81095713, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.12536621, + "step": 11147, + "time_per_iteration": 2.746155023574829 + }, + { + "auxiliary_loss_clip": 0.01332249, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.22745991, + "balance_loss_mlp": 1.01629996, + "epoch": 0.6702540207425222, + "flos": 20158443504720.0, + "grad_norm": 1.7220034885289592, + "language_loss": 0.70068204, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72429591, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12841797, + "step": 11148, + "time_per_iteration": 2.781454563140869 + }, + { + "auxiliary_loss_clip": 0.01339626, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.23305154, + "balance_loss_mlp": 1.02230561, + "epoch": 0.6703141439951902, + "flos": 21219006888360.0, + "grad_norm": 1.7876871093799278, + "language_loss": 0.7017864, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72553396, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12835693, + "step": 11149, + "time_per_iteration": 2.8029427528381348 + }, + { + "auxiliary_loss_clip": 0.0133785, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.22929263, + "balance_loss_mlp": 1.01768875, + "epoch": 0.6703742672478581, + "flos": 23117782946760.0, + "grad_norm": 2.1026189655031375, + "language_loss": 0.73827243, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.76195157, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12390137, + "step": 11150, + "time_per_iteration": 2.7886722087860107 + }, + { + "auxiliary_loss_clip": 0.01339348, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.23186922, + "balance_loss_mlp": 1.02033019, + "epoch": 0.6704343905005261, + "flos": 22534106429160.0, + "grad_norm": 1.6832883776472856, + "language_loss": 0.7908181, + "learning_rate": 1.035052742460671e-06, + "loss": 0.8145386, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12359619, + "step": 11151, + "time_per_iteration": 2.8259544372558594 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.0100434, + "balance_loss_clip": 1.11976337, + "balance_loss_mlp": 1.00140715, + "epoch": 0.670494513753194, + "flos": 64810421405640.0, + "grad_norm": 0.8039994524101014, + "language_loss": 0.5549053, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57659721, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.02929688, + "step": 11152, + "time_per_iteration": 3.3451223373413086 + }, + { + "auxiliary_loss_clip": 0.0133891, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.23093033, + "balance_loss_mlp": 1.01888812, + "epoch": 0.670554637005862, + "flos": 23516492073840.0, + "grad_norm": 1.6396367422221068, + "language_loss": 0.81123543, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.83493936, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1260376, + "step": 11153, + "time_per_iteration": 3.039008855819702 + }, + { + "auxiliary_loss_clip": 0.01337544, + "auxiliary_loss_mlp": 0.01029011, + "balance_loss_clip": 1.23077905, + "balance_loss_mlp": 1.0173223, + "epoch": 0.67061476025853, + "flos": 19468099160640.0, + "grad_norm": 1.4475838133328143, + "language_loss": 0.76303113, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78669667, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.11694336, + "step": 11154, + "time_per_iteration": 2.99072265625 + }, + { + "auxiliary_loss_clip": 0.01339285, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.22967505, + "balance_loss_mlp": 1.0226742, + "epoch": 0.670674883511198, + "flos": 20524601708280.0, + "grad_norm": 1.506681515635572, + "language_loss": 0.76415277, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78791094, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13867188, + "step": 11155, + "time_per_iteration": 2.861074447631836 + }, + { + "auxiliary_loss_clip": 0.01339331, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.23222518, + "balance_loss_mlp": 1.0197109, + "epoch": 0.6707350067638659, + "flos": 25489506860280.0, + "grad_norm": 1.8068869796879177, + "language_loss": 0.82105172, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84476435, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.12213135, + "step": 11156, + "time_per_iteration": 2.856278657913208 + }, + { + "auxiliary_loss_clip": 0.01333578, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.22922421, + "balance_loss_mlp": 1.02104771, + "epoch": 0.6707951300165339, + "flos": 22278595671360.0, + "grad_norm": 1.8498299626185875, + "language_loss": 0.74963796, + "learning_rate": 1.033006600114165e-06, + "loss": 0.77330714, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12298584, + "step": 11157, + "time_per_iteration": 2.7636146545410156 + }, + { + "auxiliary_loss_clip": 0.01342226, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.23523521, + "balance_loss_mlp": 1.02070761, + "epoch": 0.6708552532692018, + "flos": 23989480537320.0, + "grad_norm": 1.5782547578408548, + "language_loss": 0.74493766, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76869392, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12701416, + "step": 11158, + "time_per_iteration": 2.8919317722320557 + }, + { + "auxiliary_loss_clip": 0.01339375, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.230335, + "balance_loss_mlp": 1.01955497, + "epoch": 0.6709153765218698, + "flos": 24943091835960.0, + "grad_norm": 2.041207383490966, + "language_loss": 0.82056522, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.84428883, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13433838, + "step": 11159, + "time_per_iteration": 4.2395124435424805 + }, + { + "auxiliary_loss_clip": 0.01341219, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.23351872, + "balance_loss_mlp": 1.02036834, + "epoch": 0.6709754997745379, + "flos": 17534741760720.0, + "grad_norm": 1.6134055956546824, + "language_loss": 0.77352548, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79726839, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12719727, + "step": 11160, + "time_per_iteration": 2.7044782638549805 + }, + { + "auxiliary_loss_clip": 0.01329349, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.22480583, + "balance_loss_mlp": 1.01521432, + "epoch": 0.6710356230272058, + "flos": 22095983565720.0, + "grad_norm": 1.8739683284704314, + "language_loss": 0.74218911, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.76575804, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12341309, + "step": 11161, + "time_per_iteration": 4.1121602058410645 + }, + { + "auxiliary_loss_clip": 0.01347458, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.23595786, + "balance_loss_mlp": 1.02351642, + "epoch": 0.6710957462798738, + "flos": 24211750029480.0, + "grad_norm": 1.6210114662145245, + "language_loss": 0.68575466, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70960265, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 1.11376953, + "router_z_loss_mlp": 0.13800049, + "step": 11162, + "time_per_iteration": 2.8824551105499268 + }, + { + "auxiliary_loss_clip": 0.01333274, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.22809327, + "balance_loss_mlp": 1.02225208, + "epoch": 0.6711558695325417, + "flos": 19098001946160.0, + "grad_norm": 1.717393089394006, + "language_loss": 0.70177376, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72544682, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11761475, + "step": 11163, + "time_per_iteration": 2.703925371170044 + }, + { + "auxiliary_loss_clip": 0.01329928, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.22725868, + "balance_loss_mlp": 1.02184546, + "epoch": 0.6712159927852097, + "flos": 25563542546520.0, + "grad_norm": 1.6231013366148739, + "language_loss": 0.75659227, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.78022879, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.11859131, + "step": 11164, + "time_per_iteration": 4.4161646366119385 + }, + { + "auxiliary_loss_clip": 0.01334733, + "auxiliary_loss_mlp": 0.01030766, + "balance_loss_clip": 1.22866106, + "balance_loss_mlp": 1.0180347, + "epoch": 0.6712761160378776, + "flos": 22232887639200.0, + "grad_norm": 5.843866002862509, + "language_loss": 0.65304267, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67669761, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.1272583, + "step": 11165, + "time_per_iteration": 2.776367425918579 + }, + { + "auxiliary_loss_clip": 0.01335282, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.22979617, + "balance_loss_mlp": 1.02164602, + "epoch": 0.6713362392905456, + "flos": 22460639259960.0, + "grad_norm": 2.1547389792024885, + "language_loss": 0.72020936, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.74389982, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12109375, + "step": 11166, + "time_per_iteration": 2.798707962036133 + }, + { + "auxiliary_loss_clip": 0.01330543, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.22748291, + "balance_loss_mlp": 1.01794469, + "epoch": 0.6713963625432136, + "flos": 25635791464920.0, + "grad_norm": 1.76396422316643, + "language_loss": 0.76732707, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79092681, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.11486816, + "step": 11167, + "time_per_iteration": 2.819491147994995 + }, + { + "auxiliary_loss_clip": 0.01337698, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.22959137, + "balance_loss_mlp": 1.02316082, + "epoch": 0.6714564857958816, + "flos": 35013217315680.0, + "grad_norm": 1.8413783387773173, + "language_loss": 0.69007778, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71381605, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12969971, + "step": 11168, + "time_per_iteration": 2.876152276992798 + }, + { + "auxiliary_loss_clip": 0.01343821, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.23553705, + "balance_loss_mlp": 1.0258559, + "epoch": 0.6715166090485495, + "flos": 26284691654640.0, + "grad_norm": 1.7547638447911305, + "language_loss": 0.73795807, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.7617988, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.14385986, + "step": 11169, + "time_per_iteration": 2.8192553520202637 + }, + { + "auxiliary_loss_clip": 0.01342951, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.23510599, + "balance_loss_mlp": 1.01627803, + "epoch": 0.6715767323012175, + "flos": 15928372478160.0, + "grad_norm": 1.9208249273050102, + "language_loss": 0.76390505, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78763098, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13360596, + "step": 11170, + "time_per_iteration": 2.7389461994171143 + }, + { + "auxiliary_loss_clip": 0.01340584, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.2323761, + "balance_loss_mlp": 1.01747298, + "epoch": 0.6716368555538854, + "flos": 17495652891240.0, + "grad_norm": 2.1034744197213753, + "language_loss": 0.74811256, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.77181399, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12097168, + "step": 11171, + "time_per_iteration": 4.324563980102539 + }, + { + "auxiliary_loss_clip": 0.01343657, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.23443627, + "balance_loss_mlp": 1.02393436, + "epoch": 0.6716969788065534, + "flos": 16765610552280.0, + "grad_norm": 1.615266939402785, + "language_loss": 0.86544722, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.8892538, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13079834, + "step": 11172, + "time_per_iteration": 2.8608622550964355 + }, + { + "auxiliary_loss_clip": 0.01340344, + "auxiliary_loss_mlp": 0.01036423, + "balance_loss_clip": 1.23272359, + "balance_loss_mlp": 1.02316141, + "epoch": 0.6717571020592215, + "flos": 22714809941880.0, + "grad_norm": 1.802434495737085, + "language_loss": 0.64171815, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.66548574, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13250732, + "step": 11173, + "time_per_iteration": 2.7691073417663574 + }, + { + "auxiliary_loss_clip": 0.01352251, + "auxiliary_loss_mlp": 0.01040347, + "balance_loss_clip": 1.23803377, + "balance_loss_mlp": 1.02630401, + "epoch": 0.6718172253118894, + "flos": 18738747163800.0, + "grad_norm": 2.6412280464847173, + "language_loss": 0.71912211, + "learning_rate": 1.02721637475002e-06, + "loss": 0.74304807, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14050293, + "step": 11174, + "time_per_iteration": 2.7594027519226074 + }, + { + "auxiliary_loss_clip": 0.01331999, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.2286737, + "balance_loss_mlp": 1.02027822, + "epoch": 0.6718773485645574, + "flos": 15636737261160.0, + "grad_norm": 4.165654705228492, + "language_loss": 0.69366419, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.71730793, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12103271, + "step": 11175, + "time_per_iteration": 2.76017165184021 + }, + { + "auxiliary_loss_clip": 0.01327146, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.22337055, + "balance_loss_mlp": 1.02176332, + "epoch": 0.6719374718172253, + "flos": 19359563349600.0, + "grad_norm": 2.13705548575835, + "language_loss": 0.74311328, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.76672733, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.12493896, + "step": 11176, + "time_per_iteration": 2.8515615463256836 + }, + { + "auxiliary_loss_clip": 0.01340893, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.23244238, + "balance_loss_mlp": 1.01801538, + "epoch": 0.6719975950698933, + "flos": 21986391937320.0, + "grad_norm": 1.632217389115932, + "language_loss": 0.7363131, + "learning_rate": 1.026195675108182e-06, + "loss": 0.76003373, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13153076, + "step": 11177, + "time_per_iteration": 2.791971445083618 + }, + { + "auxiliary_loss_clip": 0.01341357, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.23323965, + "balance_loss_mlp": 1.02098083, + "epoch": 0.6720577183225612, + "flos": 25233508802160.0, + "grad_norm": 3.8213864286143, + "language_loss": 0.76366973, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78742886, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.13574219, + "step": 11178, + "time_per_iteration": 2.7804291248321533 + }, + { + "auxiliary_loss_clip": 0.01344167, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.23531759, + "balance_loss_mlp": 1.02138019, + "epoch": 0.6721178415752292, + "flos": 16950253075920.0, + "grad_norm": 1.585109024345341, + "language_loss": 0.70286453, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72664487, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12493896, + "step": 11179, + "time_per_iteration": 2.775432825088501 + }, + { + "auxiliary_loss_clip": 0.01337734, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.23213124, + "balance_loss_mlp": 1.01776719, + "epoch": 0.6721779648278972, + "flos": 21546116830800.0, + "grad_norm": 1.5623690767953928, + "language_loss": 0.74257553, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76625788, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.1272583, + "step": 11180, + "time_per_iteration": 2.765080213546753 + }, + { + "auxiliary_loss_clip": 0.01334416, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.22894704, + "balance_loss_mlp": 1.0198555, + "epoch": 0.6722380880805652, + "flos": 22611106525680.0, + "grad_norm": 1.549378232719077, + "language_loss": 0.75731874, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.78098786, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12646484, + "step": 11181, + "time_per_iteration": 2.8026669025421143 + }, + { + "auxiliary_loss_clip": 0.01343825, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.23536134, + "balance_loss_mlp": 1.02106333, + "epoch": 0.6722982113332331, + "flos": 15929631337320.0, + "grad_norm": 4.611906329814458, + "language_loss": 0.75416958, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.77794421, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12585449, + "step": 11182, + "time_per_iteration": 2.693977117538452 + }, + { + "auxiliary_loss_clip": 0.0133224, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.22797, + "balance_loss_mlp": 1.02043176, + "epoch": 0.6723583345859011, + "flos": 20601236329560.0, + "grad_norm": 1.7316901379980096, + "language_loss": 0.69759464, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72125137, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.12994385, + "step": 11183, + "time_per_iteration": 2.8541018962860107 + }, + { + "auxiliary_loss_clip": 0.01342078, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.23444819, + "balance_loss_mlp": 1.01825619, + "epoch": 0.672418457838569, + "flos": 21730881179520.0, + "grad_norm": 1.4955520568387424, + "language_loss": 0.77976406, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.80349696, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1295166, + "step": 11184, + "time_per_iteration": 2.763397455215454 + }, + { + "auxiliary_loss_clip": 0.01361839, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.24761319, + "balance_loss_mlp": 1.02410555, + "epoch": 0.672478581091237, + "flos": 21475370421720.0, + "grad_norm": 2.0955708714015717, + "language_loss": 0.66832042, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.69232458, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.14477539, + "step": 11185, + "time_per_iteration": 2.7552454471588135 + }, + { + "auxiliary_loss_clip": 0.01336912, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.23034549, + "balance_loss_mlp": 1.01801598, + "epoch": 0.6725387043439051, + "flos": 30852918097560.0, + "grad_norm": 1.5895648156299018, + "language_loss": 0.80647659, + "learning_rate": 1.023135571620345e-06, + "loss": 0.83015978, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13397217, + "step": 11186, + "time_per_iteration": 2.9280834197998047 + }, + { + "auxiliary_loss_clip": 0.01329902, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.22699559, + "balance_loss_mlp": 1.02075624, + "epoch": 0.672598827596573, + "flos": 24060633030000.0, + "grad_norm": 1.7155443903525698, + "language_loss": 0.80386221, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82747936, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.11053467, + "step": 11187, + "time_per_iteration": 2.801513433456421 + }, + { + "auxiliary_loss_clip": 0.01354479, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.24184966, + "balance_loss_mlp": 1.02384198, + "epoch": 0.672658950849241, + "flos": 21876922134000.0, + "grad_norm": 1.8313168580453298, + "language_loss": 0.70846856, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73239923, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.14746094, + "step": 11188, + "time_per_iteration": 2.9771816730499268 + }, + { + "auxiliary_loss_clip": 0.01330878, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.22868538, + "balance_loss_mlp": 1.01927197, + "epoch": 0.6727190741019089, + "flos": 23227374575160.0, + "grad_norm": 1.886052874879484, + "language_loss": 0.75974512, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.78337008, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.12353516, + "step": 11189, + "time_per_iteration": 2.839759349822998 + }, + { + "auxiliary_loss_clip": 0.01349096, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.23824191, + "balance_loss_mlp": 1.01987052, + "epoch": 0.6727791973545769, + "flos": 15782859432360.0, + "grad_norm": 4.438842288602311, + "language_loss": 0.75654459, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.78037137, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13708496, + "step": 11190, + "time_per_iteration": 2.832606554031372 + }, + { + "auxiliary_loss_clip": 0.01333181, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.22688437, + "balance_loss_mlp": 1.02426994, + "epoch": 0.6728393206072448, + "flos": 21254603438880.0, + "grad_norm": 1.4550237538480957, + "language_loss": 0.77259612, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79630387, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13330078, + "step": 11191, + "time_per_iteration": 2.836432456970215 + }, + { + "auxiliary_loss_clip": 0.01334188, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.2303673, + "balance_loss_mlp": 1.01738715, + "epoch": 0.6728994438599128, + "flos": 32129537894280.0, + "grad_norm": 1.931105116992345, + "language_loss": 0.86537552, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88901973, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.128479, + "step": 11192, + "time_per_iteration": 2.864353656768799 + }, + { + "auxiliary_loss_clip": 0.01341402, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.23242831, + "balance_loss_mlp": 1.02228951, + "epoch": 0.6729595671125808, + "flos": 23117579904960.0, + "grad_norm": 2.1127073457969248, + "language_loss": 0.76082313, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78459322, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13323975, + "step": 11193, + "time_per_iteration": 2.791191577911377 + }, + { + "auxiliary_loss_clip": 0.01341062, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.23414421, + "balance_loss_mlp": 1.01861978, + "epoch": 0.6730196903652488, + "flos": 14615547005520.0, + "grad_norm": 1.9739332610102236, + "language_loss": 0.78639305, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.81011945, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1295166, + "step": 11194, + "time_per_iteration": 2.957892417907715 + }, + { + "auxiliary_loss_clip": 0.01344861, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.23611331, + "balance_loss_mlp": 1.01680112, + "epoch": 0.6730798136179167, + "flos": 21110917769280.0, + "grad_norm": 1.8245687570469276, + "language_loss": 0.89816833, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92190897, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12402344, + "step": 11195, + "time_per_iteration": 2.8668594360351562 + }, + { + "auxiliary_loss_clip": 0.01337491, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.23233438, + "balance_loss_mlp": 1.02122366, + "epoch": 0.6731399368705847, + "flos": 28992215699640.0, + "grad_norm": 1.7512832343715174, + "language_loss": 0.72336924, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74708027, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12390137, + "step": 11196, + "time_per_iteration": 2.855912923812866 + }, + { + "auxiliary_loss_clip": 0.01166522, + "auxiliary_loss_mlp": 0.01006404, + "balance_loss_clip": 1.1233815, + "balance_loss_mlp": 1.00328088, + "epoch": 0.6732000601232526, + "flos": 64759271853240.0, + "grad_norm": 0.7709489285166851, + "language_loss": 0.5664854, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58821476, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.03125, + "step": 11197, + "time_per_iteration": 3.214873790740967 + }, + { + "auxiliary_loss_clip": 0.01334553, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.23202014, + "balance_loss_mlp": 1.01654291, + "epoch": 0.6732601833759206, + "flos": 17206088700600.0, + "grad_norm": 2.254337903141198, + "language_loss": 0.75684881, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.78048533, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.12536621, + "step": 11198, + "time_per_iteration": 4.291730642318726 + }, + { + "auxiliary_loss_clip": 0.01345521, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.23682499, + "balance_loss_mlp": 1.01889169, + "epoch": 0.6733203066285887, + "flos": 18663208968240.0, + "grad_norm": 3.3987528122435338, + "language_loss": 0.81762385, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.84141248, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.14453125, + "step": 11199, + "time_per_iteration": 4.354953050613403 + }, + { + "auxiliary_loss_clip": 0.01347898, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.23781621, + "balance_loss_mlp": 1.01681614, + "epoch": 0.6733804298812566, + "flos": 35816198915160.0, + "grad_norm": 1.75087160388343, + "language_loss": 0.7147696, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73855096, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13433838, + "step": 11200, + "time_per_iteration": 2.915170192718506 + }, + { + "auxiliary_loss_clip": 0.01344566, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.23720241, + "balance_loss_mlp": 1.01833212, + "epoch": 0.6734405531339246, + "flos": 61648757822160.0, + "grad_norm": 1.518942506371759, + "language_loss": 0.64577782, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.6695376, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13079834, + "step": 11201, + "time_per_iteration": 3.2931618690490723 + }, + { + "auxiliary_loss_clip": 0.01350444, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.24039531, + "balance_loss_mlp": 1.02024281, + "epoch": 0.6735006763865925, + "flos": 20527281860040.0, + "grad_norm": 1.5843067915820694, + "language_loss": 0.6370042, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.66084433, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13330078, + "step": 11202, + "time_per_iteration": 2.7950799465179443 + }, + { + "auxiliary_loss_clip": 0.01343595, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.23536694, + "balance_loss_mlp": 1.01450622, + "epoch": 0.6735607996392605, + "flos": 13923009810000.0, + "grad_norm": 1.7371603253434256, + "language_loss": 0.74766272, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.7713716, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12774658, + "step": 11203, + "time_per_iteration": 4.178842067718506 + }, + { + "auxiliary_loss_clip": 0.01359144, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.24513435, + "balance_loss_mlp": 1.0135603, + "epoch": 0.6736209228919284, + "flos": 18811930074480.0, + "grad_norm": 1.849659248263731, + "language_loss": 0.68122101, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.70509744, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14941406, + "step": 11204, + "time_per_iteration": 2.7950239181518555 + }, + { + "auxiliary_loss_clip": 0.01356875, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.24483812, + "balance_loss_mlp": 1.01674128, + "epoch": 0.6736810461445965, + "flos": 20377626761520.0, + "grad_norm": 1.4623811441703078, + "language_loss": 0.74629819, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.77017194, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13751221, + "step": 11205, + "time_per_iteration": 2.802105665206909 + }, + { + "auxiliary_loss_clip": 0.01334507, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.23000777, + "balance_loss_mlp": 1.01463223, + "epoch": 0.6737411693972644, + "flos": 30013568388720.0, + "grad_norm": 1.5494684378071049, + "language_loss": 0.71421534, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73783773, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.13104248, + "step": 11206, + "time_per_iteration": 2.8135457038879395 + }, + { + "auxiliary_loss_clip": 0.01361618, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.24813533, + "balance_loss_mlp": 1.02236104, + "epoch": 0.6738012926499324, + "flos": 25452854492400.0, + "grad_norm": 1.9422282079980506, + "language_loss": 0.67575032, + "learning_rate": 1.016007014855092e-06, + "loss": 0.6997273, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13726807, + "step": 11207, + "time_per_iteration": 2.865903615951538 + }, + { + "auxiliary_loss_clip": 0.01335236, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.23234689, + "balance_loss_mlp": 1.01704299, + "epoch": 0.6738614159026003, + "flos": 20781736800480.0, + "grad_norm": 2.026791840754808, + "language_loss": 0.73882663, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76248163, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.13220215, + "step": 11208, + "time_per_iteration": 2.757430076599121 + }, + { + "auxiliary_loss_clip": 0.01348064, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.23942006, + "balance_loss_mlp": 1.01616621, + "epoch": 0.6739215391552683, + "flos": 19570827976200.0, + "grad_norm": 6.442870253073396, + "language_loss": 0.75659835, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78038406, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.14355469, + "step": 11209, + "time_per_iteration": 2.7885360717773438 + }, + { + "auxiliary_loss_clip": 0.01330338, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.22674704, + "balance_loss_mlp": 1.01472878, + "epoch": 0.6739816624079362, + "flos": 24393428142840.0, + "grad_norm": 1.9172032051579837, + "language_loss": 0.66409099, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68766344, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.12176514, + "step": 11210, + "time_per_iteration": 4.528246879577637 + }, + { + "auxiliary_loss_clip": 0.01332923, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.22961152, + "balance_loss_mlp": 1.01995254, + "epoch": 0.6740417856606042, + "flos": 22533253653600.0, + "grad_norm": 2.101476881249128, + "language_loss": 0.79748446, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82113039, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.11730957, + "step": 11211, + "time_per_iteration": 2.7992546558380127 + }, + { + "auxiliary_loss_clip": 0.01331338, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.2271086, + "balance_loss_mlp": 1.01452327, + "epoch": 0.6741019089132723, + "flos": 25780979643840.0, + "grad_norm": 1.355301590019138, + "language_loss": 0.76396096, + "learning_rate": 1.014312160327143e-06, + "loss": 0.7875405, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12084961, + "step": 11212, + "time_per_iteration": 2.9040844440460205 + }, + { + "auxiliary_loss_clip": 0.01341776, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.23365545, + "balance_loss_mlp": 1.01369214, + "epoch": 0.6741620321659402, + "flos": 21110633510760.0, + "grad_norm": 1.5655050974849625, + "language_loss": 0.77981138, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.80350196, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.13580322, + "step": 11213, + "time_per_iteration": 2.9296391010284424 + }, + { + "auxiliary_loss_clip": 0.01347922, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.23747253, + "balance_loss_mlp": 1.02171373, + "epoch": 0.6742221554186082, + "flos": 20745287474400.0, + "grad_norm": 1.7074038447782875, + "language_loss": 0.6839906, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.70783496, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.14788818, + "step": 11214, + "time_per_iteration": 2.822777509689331 + }, + { + "auxiliary_loss_clip": 0.01347531, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.23806441, + "balance_loss_mlp": 1.02292371, + "epoch": 0.6742822786712761, + "flos": 37780564120920.0, + "grad_norm": 1.7509876849353636, + "language_loss": 0.72801864, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.75184977, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12658691, + "step": 11215, + "time_per_iteration": 2.9481749534606934 + }, + { + "auxiliary_loss_clip": 0.01345953, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.23661423, + "balance_loss_mlp": 1.01965964, + "epoch": 0.6743424019239441, + "flos": 37270314164160.0, + "grad_norm": 1.6062440999906817, + "language_loss": 0.67602962, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69980961, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12402344, + "step": 11216, + "time_per_iteration": 2.891200542449951 + }, + { + "auxiliary_loss_clip": 0.01162358, + "auxiliary_loss_mlp": 0.00999608, + "balance_loss_clip": 1.11926246, + "balance_loss_mlp": 0.99668694, + "epoch": 0.674402525176612, + "flos": 66015238975920.0, + "grad_norm": 0.6759382153360868, + "language_loss": 0.56327951, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58489919, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.0291748, + "step": 11217, + "time_per_iteration": 3.402512311935425 + }, + { + "auxiliary_loss_clip": 0.01337667, + "auxiliary_loss_mlp": 0.01026172, + "balance_loss_clip": 1.23201144, + "balance_loss_mlp": 1.01288664, + "epoch": 0.67446264842928, + "flos": 26465841859320.0, + "grad_norm": 1.9891430933509242, + "language_loss": 0.74573547, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76937389, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13287354, + "step": 11218, + "time_per_iteration": 2.7640910148620605 + }, + { + "auxiliary_loss_clip": 0.01343096, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.23490763, + "balance_loss_mlp": 1.02057266, + "epoch": 0.674522771681948, + "flos": 23737665140280.0, + "grad_norm": 1.5660574437317842, + "language_loss": 0.66237652, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68615711, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.14385986, + "step": 11219, + "time_per_iteration": 2.784395933151245 + }, + { + "auxiliary_loss_clip": 0.01346888, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.23656344, + "balance_loss_mlp": 1.01866531, + "epoch": 0.674582894934616, + "flos": 24759789388200.0, + "grad_norm": 1.8844541437498241, + "language_loss": 0.74906814, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77285242, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12896729, + "step": 11220, + "time_per_iteration": 2.820497751235962 + }, + { + "auxiliary_loss_clip": 0.01342313, + "auxiliary_loss_mlp": 0.01028164, + "balance_loss_clip": 1.23416102, + "balance_loss_mlp": 1.01453853, + "epoch": 0.6746430181872839, + "flos": 24831997698240.0, + "grad_norm": 1.4735711838680348, + "language_loss": 0.70626491, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72996974, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13623047, + "step": 11221, + "time_per_iteration": 2.786193370819092 + }, + { + "auxiliary_loss_clip": 0.01344018, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.23816884, + "balance_loss_mlp": 1.01416969, + "epoch": 0.6747031414399519, + "flos": 16877922940800.0, + "grad_norm": 2.02743309842391, + "language_loss": 0.58228719, + "learning_rate": 1.010925256180498e-06, + "loss": 0.6059916, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12261963, + "step": 11222, + "time_per_iteration": 2.9040539264678955 + }, + { + "auxiliary_loss_clip": 0.01341295, + "auxiliary_loss_mlp": 0.0102796, + "balance_loss_clip": 1.2334013, + "balance_loss_mlp": 1.0148952, + "epoch": 0.6747632646926198, + "flos": 22790226312360.0, + "grad_norm": 1.6218371277408585, + "language_loss": 0.76815474, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.79184729, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13061523, + "step": 11223, + "time_per_iteration": 2.744945764541626 + }, + { + "auxiliary_loss_clip": 0.01340397, + "auxiliary_loss_mlp": 0.01023778, + "balance_loss_clip": 1.23272824, + "balance_loss_mlp": 1.01050377, + "epoch": 0.6748233879452878, + "flos": 20050719860880.0, + "grad_norm": 1.6848288123834487, + "language_loss": 0.75850248, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.78214419, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13256836, + "step": 11224, + "time_per_iteration": 2.825960874557495 + }, + { + "auxiliary_loss_clip": 0.01337779, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.23423219, + "balance_loss_mlp": 1.01777005, + "epoch": 0.6748835111979558, + "flos": 23008028884920.0, + "grad_norm": 1.5673909632621865, + "language_loss": 0.63346136, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65713316, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.11633301, + "step": 11225, + "time_per_iteration": 2.8313095569610596 + }, + { + "auxiliary_loss_clip": 0.01328235, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.22626972, + "balance_loss_mlp": 1.01550436, + "epoch": 0.6749436344506238, + "flos": 12200348519640.0, + "grad_norm": 1.6590633683071765, + "language_loss": 0.64393485, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66748488, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.11260986, + "step": 11226, + "time_per_iteration": 2.8161280155181885 + }, + { + "auxiliary_loss_clip": 0.01347335, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.23856819, + "balance_loss_mlp": 1.01741076, + "epoch": 0.6750037577032918, + "flos": 11877340021560.0, + "grad_norm": 2.283595790925166, + "language_loss": 0.72283912, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.74661827, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13153076, + "step": 11227, + "time_per_iteration": 2.7857749462127686 + }, + { + "auxiliary_loss_clip": 0.01336359, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.23149848, + "balance_loss_mlp": 1.0158788, + "epoch": 0.6750638809559597, + "flos": 17024085720360.0, + "grad_norm": 2.2292273103079645, + "language_loss": 0.7114588, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73511094, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12994385, + "step": 11228, + "time_per_iteration": 2.8653974533081055 + }, + { + "auxiliary_loss_clip": 0.01165578, + "auxiliary_loss_mlp": 0.01005664, + "balance_loss_clip": 1.12240481, + "balance_loss_mlp": 1.00293446, + "epoch": 0.6751240042086277, + "flos": 70968002228280.0, + "grad_norm": 0.7543378285430569, + "language_loss": 0.53315997, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55487239, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02734375, + "step": 11229, + "time_per_iteration": 3.319159507751465 + }, + { + "auxiliary_loss_clip": 0.01331417, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.22729707, + "balance_loss_mlp": 1.01541829, + "epoch": 0.6751841274612956, + "flos": 22680391033800.0, + "grad_norm": 1.8095726782958146, + "language_loss": 0.80982339, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.83342183, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.13006592, + "step": 11230, + "time_per_iteration": 2.7779085636138916 + }, + { + "auxiliary_loss_clip": 0.01337372, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.23282254, + "balance_loss_mlp": 1.0176152, + "epoch": 0.6752442507139637, + "flos": 21293692308360.0, + "grad_norm": 1.519051054834528, + "language_loss": 0.65881151, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.68248081, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.1194458, + "step": 11231, + "time_per_iteration": 2.8230159282684326 + }, + { + "auxiliary_loss_clip": 0.01353333, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.24128151, + "balance_loss_mlp": 1.02153945, + "epoch": 0.6753043739666316, + "flos": 28262823094440.0, + "grad_norm": 2.6056108096334336, + "language_loss": 0.66971886, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.69361585, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 1.12158203, + "router_z_loss_mlp": 0.14807129, + "step": 11232, + "time_per_iteration": 2.8293092250823975 + }, + { + "auxiliary_loss_clip": 0.01331749, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.22724557, + "balance_loss_mlp": 1.01670957, + "epoch": 0.6753644972192996, + "flos": 21365250884640.0, + "grad_norm": 1.7846129399832593, + "language_loss": 0.72521961, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.7488308, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12664795, + "step": 11233, + "time_per_iteration": 2.804215431213379 + }, + { + "auxiliary_loss_clip": 0.01342571, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.23620009, + "balance_loss_mlp": 1.01563978, + "epoch": 0.6754246204719675, + "flos": 26547633742320.0, + "grad_norm": 1.5547795917051963, + "language_loss": 0.77329117, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79700047, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12719727, + "step": 11234, + "time_per_iteration": 2.9479284286499023 + }, + { + "auxiliary_loss_clip": 0.01339207, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.23330367, + "balance_loss_mlp": 1.0177871, + "epoch": 0.6754847437246355, + "flos": 25562202470640.0, + "grad_norm": 1.496665030062305, + "language_loss": 0.75507635, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77877784, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.1315918, + "step": 11235, + "time_per_iteration": 2.8841662406921387 + }, + { + "auxiliary_loss_clip": 0.01165562, + "auxiliary_loss_mlp": 0.01005811, + "balance_loss_clip": 1.12238431, + "balance_loss_mlp": 1.00325954, + "epoch": 0.6755448669773034, + "flos": 59526511002000.0, + "grad_norm": 0.7892436408210584, + "language_loss": 0.51434386, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53605759, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.0255127, + "step": 11236, + "time_per_iteration": 3.2443697452545166 + }, + { + "auxiliary_loss_clip": 0.01334784, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.22991133, + "balance_loss_mlp": 1.01394022, + "epoch": 0.6756049902299714, + "flos": 23299745318640.0, + "grad_norm": 1.9128488226595475, + "language_loss": 0.75643748, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.78006011, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.13537598, + "step": 11237, + "time_per_iteration": 5.643535614013672 + }, + { + "auxiliary_loss_clip": 0.01344535, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.23919749, + "balance_loss_mlp": 1.02016163, + "epoch": 0.6756651134826394, + "flos": 31581539143920.0, + "grad_norm": 1.7552973788529427, + "language_loss": 0.77597749, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.7997514, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.1270752, + "step": 11238, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.01348928, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.23744321, + "balance_loss_mlp": 1.02015305, + "epoch": 0.6757252367353074, + "flos": 27278122773240.0, + "grad_norm": 1.6514110371033632, + "language_loss": 0.66762036, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.69144738, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.1361084, + "step": 11239, + "time_per_iteration": 2.8306212425231934 + }, + { + "auxiliary_loss_clip": 0.01331813, + "auxiliary_loss_mlp": 0.01025105, + "balance_loss_clip": 1.22854638, + "balance_loss_mlp": 1.01280284, + "epoch": 0.6757853599879754, + "flos": 16834529585160.0, + "grad_norm": 1.8654431563636114, + "language_loss": 0.83000004, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85356927, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.12310791, + "step": 11240, + "time_per_iteration": 4.160295724868774 + }, + { + "auxiliary_loss_clip": 0.01356935, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.24485481, + "balance_loss_mlp": 1.01272392, + "epoch": 0.6758454832406433, + "flos": 23225059898640.0, + "grad_norm": 2.704402065256848, + "language_loss": 0.74239898, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76624322, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14746094, + "step": 11241, + "time_per_iteration": 2.8349220752716064 + }, + { + "auxiliary_loss_clip": 0.01340452, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.23445463, + "balance_loss_mlp": 1.02415705, + "epoch": 0.6759056064933113, + "flos": 16293880947960.0, + "grad_norm": 2.1189407593812617, + "language_loss": 0.80950135, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.83327836, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13092041, + "step": 11242, + "time_per_iteration": 2.830881118774414 + }, + { + "auxiliary_loss_clip": 0.01342198, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.23494411, + "balance_loss_mlp": 1.01902139, + "epoch": 0.6759657297459792, + "flos": 25927954590600.0, + "grad_norm": 2.6345553208336003, + "language_loss": 0.72602272, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74976075, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12591553, + "step": 11243, + "time_per_iteration": 2.813936233520508 + }, + { + "auxiliary_loss_clip": 0.01334677, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.22858036, + "balance_loss_mlp": 1.02147317, + "epoch": 0.6760258529986473, + "flos": 23005754816760.0, + "grad_norm": 1.619266601504122, + "language_loss": 0.73267412, + "learning_rate": 1.003487287162221e-06, + "loss": 0.75635493, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11938477, + "step": 11244, + "time_per_iteration": 2.870431661605835 + }, + { + "auxiliary_loss_clip": 0.0134445, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.23719883, + "balance_loss_mlp": 1.02037501, + "epoch": 0.6760859762513152, + "flos": 20964227081040.0, + "grad_norm": 1.9029050410630401, + "language_loss": 0.85952991, + "learning_rate": 1.003149631190393e-06, + "loss": 0.88330913, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13079834, + "step": 11245, + "time_per_iteration": 2.9430248737335205 + }, + { + "auxiliary_loss_clip": 0.01357109, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.24522114, + "balance_loss_mlp": 1.02004075, + "epoch": 0.6761460995039832, + "flos": 23628195336960.0, + "grad_norm": 1.7836714093261354, + "language_loss": 0.73636806, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.76027614, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13659668, + "step": 11246, + "time_per_iteration": 2.7918567657470703 + }, + { + "auxiliary_loss_clip": 0.01336589, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.23034215, + "balance_loss_mlp": 1.01545632, + "epoch": 0.6762062227566511, + "flos": 20774955204360.0, + "grad_norm": 1.8109090987303125, + "language_loss": 0.88096422, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90461266, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12792969, + "step": 11247, + "time_per_iteration": 2.880470037460327 + }, + { + "auxiliary_loss_clip": 0.0116488, + "auxiliary_loss_mlp": 0.0100801, + "balance_loss_clip": 1.12136984, + "balance_loss_mlp": 1.00565004, + "epoch": 0.6762663460093191, + "flos": 52832789070120.0, + "grad_norm": 0.8217964473326624, + "language_loss": 0.54024935, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56197822, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.02355957, + "step": 11248, + "time_per_iteration": 4.7143895626068115 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.23053169, + "balance_loss_mlp": 1.01409471, + "epoch": 0.676326469261987, + "flos": 23701621897800.0, + "grad_norm": 1.5458868586527852, + "language_loss": 0.73781955, + "learning_rate": 1.001799385437761e-06, + "loss": 0.7613982, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.1217041, + "step": 11249, + "time_per_iteration": 2.8020877838134766 + }, + { + "auxiliary_loss_clip": 0.01343761, + "auxiliary_loss_mlp": 0.01033807, + "balance_loss_clip": 1.23503184, + "balance_loss_mlp": 1.02022934, + "epoch": 0.676386592514655, + "flos": 14067751296960.0, + "grad_norm": 1.927020468059806, + "language_loss": 0.7444483, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.768224, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13580322, + "step": 11250, + "time_per_iteration": 2.7875852584838867 + }, + { + "auxiliary_loss_clip": 0.01340089, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.23334157, + "balance_loss_mlp": 1.01536655, + "epoch": 0.676446715767323, + "flos": 20417081106240.0, + "grad_norm": 1.767870750838966, + "language_loss": 0.75091541, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77459776, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12786865, + "step": 11251, + "time_per_iteration": 2.8205487728118896 + }, + { + "auxiliary_loss_clip": 0.01339069, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.23421741, + "balance_loss_mlp": 1.01665485, + "epoch": 0.676506839019991, + "flos": 21293245616400.0, + "grad_norm": 1.914572210768531, + "language_loss": 0.69859779, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72228044, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12542725, + "step": 11252, + "time_per_iteration": 2.808305263519287 + }, + { + "auxiliary_loss_clip": 0.01339675, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.23461103, + "balance_loss_mlp": 1.01646686, + "epoch": 0.676566962272659, + "flos": 29938233234960.0, + "grad_norm": 1.9438523381013695, + "language_loss": 0.66693133, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69061697, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12402344, + "step": 11253, + "time_per_iteration": 2.907738208770752 + }, + { + "auxiliary_loss_clip": 0.0134743, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.23804617, + "balance_loss_mlp": 1.02193081, + "epoch": 0.6766270855253269, + "flos": 17935724955960.0, + "grad_norm": 1.5916359770452673, + "language_loss": 0.77348793, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79732168, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.14013672, + "step": 11254, + "time_per_iteration": 2.8157501220703125 + }, + { + "auxiliary_loss_clip": 0.01342042, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.23493052, + "balance_loss_mlp": 1.01693356, + "epoch": 0.6766872087779949, + "flos": 23109092757720.0, + "grad_norm": 2.068737163811804, + "language_loss": 0.7250855, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74880332, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12817383, + "step": 11255, + "time_per_iteration": 2.7948875427246094 + }, + { + "auxiliary_loss_clip": 0.013449, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.23695982, + "balance_loss_mlp": 1.01838875, + "epoch": 0.6767473320306628, + "flos": 26218696423680.0, + "grad_norm": 1.9335328106714211, + "language_loss": 0.75563765, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77940094, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13037109, + "step": 11256, + "time_per_iteration": 2.817239761352539 + }, + { + "auxiliary_loss_clip": 0.01337653, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.23199797, + "balance_loss_mlp": 1.01566732, + "epoch": 0.6768074552833309, + "flos": 18373279302360.0, + "grad_norm": 2.16533671045458, + "language_loss": 0.65283895, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67650193, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.12969971, + "step": 11257, + "time_per_iteration": 2.875453472137451 + }, + { + "auxiliary_loss_clip": 0.01329228, + "auxiliary_loss_mlp": 0.01025811, + "balance_loss_clip": 1.22571206, + "balance_loss_mlp": 1.01363349, + "epoch": 0.6768675785359988, + "flos": 23045209161480.0, + "grad_norm": 2.0962499135306327, + "language_loss": 0.75827146, + "learning_rate": 9.987635480759109e-07, + "loss": 0.78182185, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.12182617, + "step": 11258, + "time_per_iteration": 2.749969005584717 + }, + { + "auxiliary_loss_clip": 0.01325971, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.223616, + "balance_loss_mlp": 1.01424265, + "epoch": 0.6769277017886668, + "flos": 33043045114440.0, + "grad_norm": 1.635517528822049, + "language_loss": 0.6703614, + "learning_rate": 9.984264224779127e-07, + "loss": 0.69388485, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.12139893, + "step": 11259, + "time_per_iteration": 2.903341770172119 + }, + { + "auxiliary_loss_clip": 0.0133691, + "auxiliary_loss_mlp": 0.01027619, + "balance_loss_clip": 1.2306267, + "balance_loss_mlp": 1.01477396, + "epoch": 0.6769878250413347, + "flos": 20853092334960.0, + "grad_norm": 1.9894038097313593, + "language_loss": 0.86134022, + "learning_rate": 9.980893348596839e-07, + "loss": 0.88498551, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12854004, + "step": 11260, + "time_per_iteration": 2.8086507320404053 + }, + { + "auxiliary_loss_clip": 0.01348558, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.23838925, + "balance_loss_mlp": 1.01849413, + "epoch": 0.6770479482940027, + "flos": 15600166110000.0, + "grad_norm": 2.333961069654454, + "language_loss": 0.77971315, + "learning_rate": 9.977522852340081e-07, + "loss": 0.80351597, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 1.10107422, + "router_z_loss_mlp": 0.13238525, + "step": 11261, + "time_per_iteration": 2.7106292247772217 + }, + { + "auxiliary_loss_clip": 0.0133828, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.23083138, + "balance_loss_mlp": 1.02006602, + "epoch": 0.6771080715466706, + "flos": 18625175916120.0, + "grad_norm": 1.6232855549055318, + "language_loss": 0.8774842, + "learning_rate": 9.97415273613666e-07, + "loss": 0.90119994, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13238525, + "step": 11262, + "time_per_iteration": 2.753131866455078 + }, + { + "auxiliary_loss_clip": 0.01343879, + "auxiliary_loss_mlp": 0.01026005, + "balance_loss_clip": 1.23587179, + "balance_loss_mlp": 1.0128566, + "epoch": 0.6771681947993387, + "flos": 12499333849800.0, + "grad_norm": 1.801447525500762, + "language_loss": 0.74701691, + "learning_rate": 9.97078300011439e-07, + "loss": 0.77071577, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13146973, + "step": 11263, + "time_per_iteration": 2.707916259765625 + }, + { + "auxiliary_loss_clip": 0.01347283, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.23745179, + "balance_loss_mlp": 1.01982796, + "epoch": 0.6772283180520066, + "flos": 22242105736920.0, + "grad_norm": 3.9456406137197844, + "language_loss": 0.68094432, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70475894, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14355469, + "step": 11264, + "time_per_iteration": 2.80880069732666 + }, + { + "auxiliary_loss_clip": 0.01332123, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.22681808, + "balance_loss_mlp": 1.01485097, + "epoch": 0.6772884413046746, + "flos": 16147393301520.0, + "grad_norm": 2.5047816010392254, + "language_loss": 0.73490286, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75850201, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12927246, + "step": 11265, + "time_per_iteration": 2.8181378841400146 + }, + { + "auxiliary_loss_clip": 0.01336428, + "auxiliary_loss_mlp": 0.01027479, + "balance_loss_clip": 1.23292053, + "balance_loss_mlp": 1.01564205, + "epoch": 0.6773485645573426, + "flos": 19140501917880.0, + "grad_norm": 1.6081347737367861, + "language_loss": 0.6204077, + "learning_rate": 9.96067607441207e-07, + "loss": 0.64404678, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.11816406, + "step": 11266, + "time_per_iteration": 2.8116343021392822 + }, + { + "auxiliary_loss_clip": 0.01342982, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.23588681, + "balance_loss_mlp": 1.01906872, + "epoch": 0.6774086878100105, + "flos": 14140974816000.0, + "grad_norm": 2.043006220775083, + "language_loss": 0.70966059, + "learning_rate": 9.957307860391976e-07, + "loss": 0.73340631, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12518311, + "step": 11267, + "time_per_iteration": 2.812002658843994 + }, + { + "auxiliary_loss_clip": 0.01333784, + "auxiliary_loss_mlp": 0.01028701, + "balance_loss_clip": 1.2276082, + "balance_loss_mlp": 1.01644063, + "epoch": 0.6774688110626785, + "flos": 22201758008280.0, + "grad_norm": 1.9116621708203767, + "language_loss": 0.71119201, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73481691, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12255859, + "step": 11268, + "time_per_iteration": 2.8026530742645264 + }, + { + "auxiliary_loss_clip": 0.01343564, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.23695946, + "balance_loss_mlp": 1.01589966, + "epoch": 0.6775289343153464, + "flos": 23045249769840.0, + "grad_norm": 1.4689560751184458, + "language_loss": 0.76789451, + "learning_rate": 9.950572574939194e-07, + "loss": 0.79162109, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13195801, + "step": 11269, + "time_per_iteration": 2.987332582473755 + }, + { + "auxiliary_loss_clip": 0.0134306, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.23420167, + "balance_loss_mlp": 1.02355945, + "epoch": 0.6775890575680145, + "flos": 18297741106800.0, + "grad_norm": 2.8810464896132264, + "language_loss": 0.7478444, + "learning_rate": 9.94720550376189e-07, + "loss": 0.77164739, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13677979, + "step": 11270, + "time_per_iteration": 2.737368106842041 + }, + { + "auxiliary_loss_clip": 0.01337243, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.2312535, + "balance_loss_mlp": 1.01936138, + "epoch": 0.6776491808206824, + "flos": 25341760354680.0, + "grad_norm": 1.6043936219227608, + "language_loss": 0.72838581, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75208664, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13470459, + "step": 11271, + "time_per_iteration": 2.82279634475708 + }, + { + "auxiliary_loss_clip": 0.0134396, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.23788285, + "balance_loss_mlp": 1.01931095, + "epoch": 0.6777093040733504, + "flos": 26033444774640.0, + "grad_norm": 1.7887168467301566, + "language_loss": 0.68016982, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70392752, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.125, + "step": 11272, + "time_per_iteration": 2.8666317462921143 + }, + { + "auxiliary_loss_clip": 0.01344155, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.2352035, + "balance_loss_mlp": 1.01755261, + "epoch": 0.6777694273260183, + "flos": 18008055091080.0, + "grad_norm": 1.7744940413025927, + "language_loss": 0.74129212, + "learning_rate": 9.937106577958481e-07, + "loss": 0.76505154, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.14245605, + "step": 11273, + "time_per_iteration": 2.8112704753875732 + }, + { + "auxiliary_loss_clip": 0.013333, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.22912073, + "balance_loss_mlp": 1.0255357, + "epoch": 0.6778295505786863, + "flos": 23446395398520.0, + "grad_norm": 1.8157900234967095, + "language_loss": 0.70343965, + "learning_rate": 9.933741032359015e-07, + "loss": 0.7271539, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.12597656, + "step": 11274, + "time_per_iteration": 2.8263425827026367 + }, + { + "auxiliary_loss_clip": 0.01341194, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.23267102, + "balance_loss_mlp": 1.01281428, + "epoch": 0.6778896738313542, + "flos": 19103037382800.0, + "grad_norm": 1.5607094580679455, + "language_loss": 0.65950638, + "learning_rate": 9.930375868473093e-07, + "loss": 0.68317574, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1293335, + "step": 11275, + "time_per_iteration": 2.8053972721099854 + }, + { + "auxiliary_loss_clip": 0.0134343, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.23687279, + "balance_loss_mlp": 1.01665914, + "epoch": 0.6779497970840223, + "flos": 26109510878880.0, + "grad_norm": 1.611908795511099, + "language_loss": 0.72880435, + "learning_rate": 9.927011086428335e-07, + "loss": 0.75252843, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12322998, + "step": 11276, + "time_per_iteration": 5.696898460388184 + }, + { + "auxiliary_loss_clip": 0.01338624, + "auxiliary_loss_mlp": 0.01025759, + "balance_loss_clip": 1.23381495, + "balance_loss_mlp": 1.01303959, + "epoch": 0.6780099203366902, + "flos": 19724016002040.0, + "grad_norm": 2.1161222708018035, + "language_loss": 0.77198505, + "learning_rate": 9.923646686352317e-07, + "loss": 0.79562891, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.1270752, + "step": 11277, + "time_per_iteration": 2.7620108127593994 + }, + { + "auxiliary_loss_clip": 0.01346547, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.23740292, + "balance_loss_mlp": 1.01708078, + "epoch": 0.6780700435893582, + "flos": 18217979641800.0, + "grad_norm": 2.46277846950528, + "language_loss": 0.84153831, + "learning_rate": 9.920282668372627e-07, + "loss": 0.8653059, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13122559, + "step": 11278, + "time_per_iteration": 2.673510789871216 + }, + { + "auxiliary_loss_clip": 0.01331258, + "auxiliary_loss_mlp": 0.01027157, + "balance_loss_clip": 1.22850251, + "balance_loss_mlp": 1.01539135, + "epoch": 0.6781301668420262, + "flos": 25381986258240.0, + "grad_norm": 1.5439983901093086, + "language_loss": 0.70313013, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72671425, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.11773682, + "step": 11279, + "time_per_iteration": 4.23581600189209 + }, + { + "auxiliary_loss_clip": 0.01336572, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.22966838, + "balance_loss_mlp": 1.01777017, + "epoch": 0.6781902900946941, + "flos": 24025239521280.0, + "grad_norm": 1.8463483825083689, + "language_loss": 0.74431038, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76798606, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13220215, + "step": 11280, + "time_per_iteration": 2.950439929962158 + }, + { + "auxiliary_loss_clip": 0.01345606, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.23682141, + "balance_loss_mlp": 1.0179342, + "epoch": 0.6782504133473621, + "flos": 19651604650200.0, + "grad_norm": 1.916788396430499, + "language_loss": 0.70645237, + "learning_rate": 9.910192908287104e-07, + "loss": 0.73021603, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12817383, + "step": 11281, + "time_per_iteration": 2.7185893058776855 + }, + { + "auxiliary_loss_clip": 0.01328998, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.22620177, + "balance_loss_mlp": 1.01492488, + "epoch": 0.67831053660003, + "flos": 24937690924080.0, + "grad_norm": 1.5680446839347995, + "language_loss": 0.64082718, + "learning_rate": 9.906830419968217e-07, + "loss": 0.66438794, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.12158203, + "step": 11282, + "time_per_iteration": 2.8675286769866943 + }, + { + "auxiliary_loss_clip": 0.01352568, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.24008536, + "balance_loss_mlp": 1.02451682, + "epoch": 0.6783706598526981, + "flos": 31214162689560.0, + "grad_norm": 1.4814012649894468, + "language_loss": 0.74588001, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76979095, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14013672, + "step": 11283, + "time_per_iteration": 2.860476493835449 + }, + { + "auxiliary_loss_clip": 0.01336074, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.22920156, + "balance_loss_mlp": 1.01308429, + "epoch": 0.678430783105366, + "flos": 35447726035080.0, + "grad_norm": 1.5330074155730904, + "language_loss": 0.56804025, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59165812, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12628174, + "step": 11284, + "time_per_iteration": 2.9541633129119873 + }, + { + "auxiliary_loss_clip": 0.0133428, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.22835445, + "balance_loss_mlp": 1.01503885, + "epoch": 0.678490906358034, + "flos": 14432894291520.0, + "grad_norm": 1.997193013536305, + "language_loss": 0.75859845, + "learning_rate": 9.896745251925535e-07, + "loss": 0.78221804, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12646484, + "step": 11285, + "time_per_iteration": 2.8394086360931396 + }, + { + "auxiliary_loss_clip": 0.0132786, + "auxiliary_loss_mlp": 0.01024909, + "balance_loss_clip": 1.22438705, + "balance_loss_mlp": 1.01258278, + "epoch": 0.6785510296107019, + "flos": 24315940746000.0, + "grad_norm": 1.5634508807170373, + "language_loss": 0.66618991, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68971759, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.12322998, + "step": 11286, + "time_per_iteration": 2.8572967052459717 + }, + { + "auxiliary_loss_clip": 0.01341858, + "auxiliary_loss_mlp": 0.01026527, + "balance_loss_clip": 1.23350513, + "balance_loss_mlp": 1.01399791, + "epoch": 0.6786111528633699, + "flos": 26982629762040.0, + "grad_norm": 2.401968414652056, + "language_loss": 0.53121912, + "learning_rate": 9.890023721933447e-07, + "loss": 0.55490291, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12524414, + "step": 11287, + "time_per_iteration": 4.415710926055908 + }, + { + "auxiliary_loss_clip": 0.01335789, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.22910249, + "balance_loss_mlp": 1.01533723, + "epoch": 0.6786712761160378, + "flos": 24323087817360.0, + "grad_norm": 1.50338695926981, + "language_loss": 0.77207404, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79571307, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12780762, + "step": 11288, + "time_per_iteration": 2.9471142292022705 + }, + { + "auxiliary_loss_clip": 0.01338061, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.23103893, + "balance_loss_mlp": 1.02408791, + "epoch": 0.6787313993687059, + "flos": 22935698749800.0, + "grad_norm": 2.054960193951617, + "language_loss": 0.73473769, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75848478, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12548828, + "step": 11289, + "time_per_iteration": 2.789858818054199 + }, + { + "auxiliary_loss_clip": 0.01338572, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.23108745, + "balance_loss_mlp": 1.01715112, + "epoch": 0.6787915226213738, + "flos": 26873890909200.0, + "grad_norm": 1.4608000784222734, + "language_loss": 0.80224717, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82593095, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12670898, + "step": 11290, + "time_per_iteration": 2.817153215408325 + }, + { + "auxiliary_loss_clip": 0.01328746, + "auxiliary_loss_mlp": 0.01028875, + "balance_loss_clip": 1.22544384, + "balance_loss_mlp": 1.01650119, + "epoch": 0.6788516458740418, + "flos": 20013377150880.0, + "grad_norm": 1.4711815640729755, + "language_loss": 0.75251919, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77609539, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.1237793, + "step": 11291, + "time_per_iteration": 2.7704713344573975 + }, + { + "auxiliary_loss_clip": 0.01339118, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.2312603, + "balance_loss_mlp": 1.01656771, + "epoch": 0.6789117691267098, + "flos": 28732522280760.0, + "grad_norm": 1.74143243550185, + "language_loss": 0.75489783, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77858621, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1315918, + "step": 11292, + "time_per_iteration": 2.8934054374694824 + }, + { + "auxiliary_loss_clip": 0.01336104, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.22949362, + "balance_loss_mlp": 1.01859832, + "epoch": 0.6789718923793777, + "flos": 23408605996560.0, + "grad_norm": 2.2518409620320856, + "language_loss": 0.84213746, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86581111, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12670898, + "step": 11293, + "time_per_iteration": 2.7486653327941895 + }, + { + "auxiliary_loss_clip": 0.01355312, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.24224877, + "balance_loss_mlp": 1.0214653, + "epoch": 0.6790320156320457, + "flos": 20453733474120.0, + "grad_norm": 2.1701130579340844, + "language_loss": 0.79530871, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81921875, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14221191, + "step": 11294, + "time_per_iteration": 2.759709119796753 + }, + { + "auxiliary_loss_clip": 0.01334836, + "auxiliary_loss_mlp": 0.0102564, + "balance_loss_clip": 1.2281363, + "balance_loss_mlp": 1.0134089, + "epoch": 0.6790921388847136, + "flos": 24172133251320.0, + "grad_norm": 1.7159512030329427, + "language_loss": 0.79334772, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81695241, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12231445, + "step": 11295, + "time_per_iteration": 2.802757978439331 + }, + { + "auxiliary_loss_clip": 0.01325616, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.22353601, + "balance_loss_mlp": 1.01929712, + "epoch": 0.6791522621373817, + "flos": 21913127809920.0, + "grad_norm": 1.6853997459299856, + "language_loss": 0.71561778, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73917437, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.10742188, + "step": 11296, + "time_per_iteration": 2.845446825027466 + }, + { + "auxiliary_loss_clip": 0.01339001, + "auxiliary_loss_mlp": 0.0102643, + "balance_loss_clip": 1.23202765, + "balance_loss_mlp": 1.01424122, + "epoch": 0.6792123853900496, + "flos": 24831875873160.0, + "grad_norm": 1.5714748011599924, + "language_loss": 0.70848286, + "learning_rate": 9.856439094633949e-07, + "loss": 0.7321372, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12194824, + "step": 11297, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.01346865, + "auxiliary_loss_mlp": 0.01027848, + "balance_loss_clip": 1.23596728, + "balance_loss_mlp": 1.01454449, + "epoch": 0.6792725086427176, + "flos": 17570744394840.0, + "grad_norm": 2.49668560299827, + "language_loss": 0.66441786, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68816501, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13305664, + "step": 11298, + "time_per_iteration": 2.7014427185058594 + }, + { + "auxiliary_loss_clip": 0.01341722, + "auxiliary_loss_mlp": 0.0102602, + "balance_loss_clip": 1.23373473, + "balance_loss_mlp": 1.0143261, + "epoch": 0.6793326318953855, + "flos": 26947358078400.0, + "grad_norm": 1.731724202525951, + "language_loss": 0.72084951, + "learning_rate": 9.84972678083801e-07, + "loss": 0.74452698, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.11682129, + "step": 11299, + "time_per_iteration": 2.799295425415039 + }, + { + "auxiliary_loss_clip": 0.01337679, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.22978294, + "balance_loss_mlp": 1.01751208, + "epoch": 0.6793927551480535, + "flos": 24323899984560.0, + "grad_norm": 1.349183976850776, + "language_loss": 0.775581, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79926831, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13549805, + "step": 11300, + "time_per_iteration": 2.761147975921631 + }, + { + "auxiliary_loss_clip": 0.01335156, + "auxiliary_loss_mlp": 0.0102716, + "balance_loss_clip": 1.22806144, + "balance_loss_mlp": 1.01489294, + "epoch": 0.6794528784007214, + "flos": 11440272975480.0, + "grad_norm": 2.248621733748329, + "language_loss": 0.63615459, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65977776, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12268066, + "step": 11301, + "time_per_iteration": 2.7383551597595215 + }, + { + "auxiliary_loss_clip": 0.01337736, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.23015523, + "balance_loss_mlp": 1.01540327, + "epoch": 0.6795130016533895, + "flos": 25235336178360.0, + "grad_norm": 1.6466451002693199, + "language_loss": 0.82838976, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85204518, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12402344, + "step": 11302, + "time_per_iteration": 2.762908697128296 + }, + { + "auxiliary_loss_clip": 0.01342042, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.2326405, + "balance_loss_mlp": 1.01652622, + "epoch": 0.6795731249060574, + "flos": 18301030383960.0, + "grad_norm": 1.9163206187040278, + "language_loss": 0.6957038, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71941459, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12518311, + "step": 11303, + "time_per_iteration": 2.75152850151062 + }, + { + "auxiliary_loss_clip": 0.01347329, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.2357614, + "balance_loss_mlp": 1.02077687, + "epoch": 0.6796332481587254, + "flos": 20304890542800.0, + "grad_norm": 1.902166055059363, + "language_loss": 0.70740414, + "learning_rate": 9.832952734313813e-07, + "loss": 0.7312206, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.13531494, + "step": 11304, + "time_per_iteration": 2.9432711601257324 + }, + { + "auxiliary_loss_clip": 0.01347746, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.23936868, + "balance_loss_mlp": 1.01952517, + "epoch": 0.6796933714113934, + "flos": 23592152094480.0, + "grad_norm": 2.106610035031064, + "language_loss": 0.73356193, + "learning_rate": 9.829599081106536e-07, + "loss": 0.75736856, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.13397217, + "step": 11305, + "time_per_iteration": 2.7961013317108154 + }, + { + "auxiliary_loss_clip": 0.01342063, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.23383081, + "balance_loss_mlp": 1.01559019, + "epoch": 0.6797534946640613, + "flos": 27124894139040.0, + "grad_norm": 2.3002623224172263, + "language_loss": 0.66268384, + "learning_rate": 9.826245813561882e-07, + "loss": 0.68638974, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1295166, + "step": 11306, + "time_per_iteration": 2.871058702468872 + }, + { + "auxiliary_loss_clip": 0.01338901, + "auxiliary_loss_mlp": 0.01024417, + "balance_loss_clip": 1.23259974, + "balance_loss_mlp": 1.0115962, + "epoch": 0.6798136179167293, + "flos": 22132554716880.0, + "grad_norm": 1.508936978152821, + "language_loss": 0.80077922, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82441241, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12805176, + "step": 11307, + "time_per_iteration": 2.820819854736328 + }, + { + "auxiliary_loss_clip": 0.01335542, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.22976983, + "balance_loss_mlp": 1.01717401, + "epoch": 0.6798737411693972, + "flos": 17492485439160.0, + "grad_norm": 1.6318577342338283, + "language_loss": 0.89338124, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91703844, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13006592, + "step": 11308, + "time_per_iteration": 2.8406035900115967 + }, + { + "auxiliary_loss_clip": 0.01342404, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.23396969, + "balance_loss_mlp": 1.01903212, + "epoch": 0.6799338644220653, + "flos": 22897300222440.0, + "grad_norm": 2.034536031290667, + "language_loss": 0.71229446, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73604304, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13427734, + "step": 11309, + "time_per_iteration": 2.845761299133301 + }, + { + "auxiliary_loss_clip": 0.01342396, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.2340672, + "balance_loss_mlp": 1.02545333, + "epoch": 0.6799939876747332, + "flos": 23184996428520.0, + "grad_norm": 1.6809647404266375, + "language_loss": 0.84064591, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86445016, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12573242, + "step": 11310, + "time_per_iteration": 2.8416390419006348 + }, + { + "auxiliary_loss_clip": 0.01333339, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.22983193, + "balance_loss_mlp": 1.01474762, + "epoch": 0.6800541109274012, + "flos": 19504629703440.0, + "grad_norm": 2.2873728470204515, + "language_loss": 0.83565664, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85925382, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.11633301, + "step": 11311, + "time_per_iteration": 2.795889377593994 + }, + { + "auxiliary_loss_clip": 0.01348078, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.23520494, + "balance_loss_mlp": 1.01692092, + "epoch": 0.6801142341800691, + "flos": 22283346849480.0, + "grad_norm": 1.6052428636875908, + "language_loss": 0.76677328, + "learning_rate": 9.806134314328767e-07, + "loss": 0.79056799, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14465332, + "step": 11312, + "time_per_iteration": 2.834824800491333 + }, + { + "auxiliary_loss_clip": 0.01165712, + "auxiliary_loss_mlp": 0.01012337, + "balance_loss_clip": 1.12226117, + "balance_loss_mlp": 1.00934494, + "epoch": 0.6801743574327371, + "flos": 68730136761240.0, + "grad_norm": 0.6737948675968469, + "language_loss": 0.57243645, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59421694, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02990723, + "step": 11313, + "time_per_iteration": 3.3764610290527344 + }, + { + "auxiliary_loss_clip": 0.01341418, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.23125124, + "balance_loss_mlp": 1.01838064, + "epoch": 0.680234480685405, + "flos": 29466463022280.0, + "grad_norm": 1.7652465195636589, + "language_loss": 0.68765771, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71138406, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12841797, + "step": 11314, + "time_per_iteration": 5.723975419998169 + }, + { + "auxiliary_loss_clip": 0.01335304, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.22967017, + "balance_loss_mlp": 1.0181886, + "epoch": 0.6802946039380731, + "flos": 15920128981080.0, + "grad_norm": 1.7024594253357865, + "language_loss": 0.81964624, + "learning_rate": 9.796083781453972e-07, + "loss": 0.84329796, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11688232, + "step": 11315, + "time_per_iteration": 2.8565847873687744 + }, + { + "auxiliary_loss_clip": 0.0133558, + "auxiliary_loss_mlp": 0.01025934, + "balance_loss_clip": 1.22874093, + "balance_loss_mlp": 1.01353669, + "epoch": 0.680354727190741, + "flos": 22023856472400.0, + "grad_norm": 1.8292190488074611, + "language_loss": 0.70108259, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72469771, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12408447, + "step": 11316, + "time_per_iteration": 2.8196451663970947 + }, + { + "auxiliary_loss_clip": 0.01338561, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.23215485, + "balance_loss_mlp": 1.01704729, + "epoch": 0.680414850443409, + "flos": 18446015521080.0, + "grad_norm": 2.2271959939952195, + "language_loss": 0.6673407, + "learning_rate": 9.789385360660003e-07, + "loss": 0.69101852, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12182617, + "step": 11317, + "time_per_iteration": 2.7317161560058594 + }, + { + "auxiliary_loss_clip": 0.01347017, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.23813772, + "balance_loss_mlp": 1.02448225, + "epoch": 0.680474973696077, + "flos": 26364128252760.0, + "grad_norm": 1.4770330580306006, + "language_loss": 0.75269032, + "learning_rate": 9.78603673098082e-07, + "loss": 0.7765286, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12329102, + "step": 11318, + "time_per_iteration": 4.232100248336792 + }, + { + "auxiliary_loss_clip": 0.01326412, + "auxiliary_loss_mlp": 0.01025797, + "balance_loss_clip": 1.22233331, + "balance_loss_mlp": 1.01397753, + "epoch": 0.6805350969487449, + "flos": 18337763968560.0, + "grad_norm": 1.6677140746270875, + "language_loss": 0.68788123, + "learning_rate": 9.782688488616143e-07, + "loss": 0.71140325, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.1182251, + "step": 11319, + "time_per_iteration": 2.8035502433776855 + }, + { + "auxiliary_loss_clip": 0.01330983, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.22640514, + "balance_loss_mlp": 1.01950526, + "epoch": 0.6805952202014129, + "flos": 19942224658200.0, + "grad_norm": 1.6480227915463637, + "language_loss": 0.77434307, + "learning_rate": 9.779340633692945e-07, + "loss": 0.79796767, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11981201, + "step": 11320, + "time_per_iteration": 2.766533374786377 + }, + { + "auxiliary_loss_clip": 0.0133356, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.22785711, + "balance_loss_mlp": 1.01453996, + "epoch": 0.6806553434540809, + "flos": 25229244924360.0, + "grad_norm": 1.690119220681656, + "language_loss": 0.74948227, + "learning_rate": 9.77599316633817e-07, + "loss": 0.77308655, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12329102, + "step": 11321, + "time_per_iteration": 2.769037961959839 + }, + { + "auxiliary_loss_clip": 0.01338514, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.23108888, + "balance_loss_mlp": 1.01715016, + "epoch": 0.6807154667067489, + "flos": 17790130693440.0, + "grad_norm": 1.777253101841394, + "language_loss": 0.73192972, + "learning_rate": 9.772646086678758e-07, + "loss": 0.75560832, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12213135, + "step": 11322, + "time_per_iteration": 2.769836664199829 + }, + { + "auxiliary_loss_clip": 0.01338765, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.22992015, + "balance_loss_mlp": 1.0143019, + "epoch": 0.6807755899594168, + "flos": 22204966068720.0, + "grad_norm": 1.526956116482684, + "language_loss": 0.78497326, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80863619, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13220215, + "step": 11323, + "time_per_iteration": 2.819401979446411 + }, + { + "auxiliary_loss_clip": 0.01165709, + "auxiliary_loss_mlp": 0.01011778, + "balance_loss_clip": 1.1220026, + "balance_loss_mlp": 1.00919116, + "epoch": 0.6808357132120848, + "flos": 68644974384360.0, + "grad_norm": 0.7591612446014817, + "language_loss": 0.5714429, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59321773, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02587891, + "step": 11324, + "time_per_iteration": 3.1525726318359375 + }, + { + "auxiliary_loss_clip": 0.01339511, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.23026657, + "balance_loss_mlp": 1.02429056, + "epoch": 0.6808958364647527, + "flos": 23848840494720.0, + "grad_norm": 1.8244461675886312, + "language_loss": 0.68455642, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70833045, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13586426, + "step": 11325, + "time_per_iteration": 4.432799577713013 + }, + { + "auxiliary_loss_clip": 0.01345677, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.23400724, + "balance_loss_mlp": 1.01819706, + "epoch": 0.6809559597174207, + "flos": 17716135615560.0, + "grad_norm": 2.3459967381958817, + "language_loss": 0.70548487, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72925478, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13110352, + "step": 11326, + "time_per_iteration": 3.0280871391296387 + }, + { + "auxiliary_loss_clip": 0.0133529, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.22876477, + "balance_loss_mlp": 1.01573944, + "epoch": 0.6810160829700886, + "flos": 22497007369320.0, + "grad_norm": 1.7106104496389996, + "language_loss": 0.735241, + "learning_rate": 9.75591650825392e-07, + "loss": 0.7588799, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12884521, + "step": 11327, + "time_per_iteration": 2.861016035079956 + }, + { + "auxiliary_loss_clip": 0.01331689, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.22593164, + "balance_loss_mlp": 1.01476026, + "epoch": 0.6810762062227567, + "flos": 16836763044960.0, + "grad_norm": 1.6214651891440288, + "language_loss": 0.77214092, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79573321, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12780762, + "step": 11328, + "time_per_iteration": 2.757760524749756 + }, + { + "auxiliary_loss_clip": 0.01339095, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.22926044, + "balance_loss_mlp": 1.01514626, + "epoch": 0.6811363294754246, + "flos": 12718638931680.0, + "grad_norm": 1.6941949808546577, + "language_loss": 0.64353675, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66720176, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12243652, + "step": 11329, + "time_per_iteration": 2.764146566390991 + }, + { + "auxiliary_loss_clip": 0.01338944, + "auxiliary_loss_mlp": 0.01024537, + "balance_loss_clip": 1.23011136, + "balance_loss_mlp": 1.01127529, + "epoch": 0.6811964527280926, + "flos": 17716419874080.0, + "grad_norm": 4.021336282052161, + "language_loss": 0.79788077, + "learning_rate": 9.745883421664096e-07, + "loss": 0.82151562, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13269043, + "step": 11330, + "time_per_iteration": 2.930623769760132 + }, + { + "auxiliary_loss_clip": 0.0133688, + "auxiliary_loss_mlp": 0.01026474, + "balance_loss_clip": 1.22944117, + "balance_loss_mlp": 1.01318836, + "epoch": 0.6812565759807605, + "flos": 24868893716280.0, + "grad_norm": 1.7022206912659976, + "language_loss": 0.64226294, + "learning_rate": 9.742539836972665e-07, + "loss": 0.66589642, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13293457, + "step": 11331, + "time_per_iteration": 3.0075650215148926 + }, + { + "auxiliary_loss_clip": 0.0133252, + "auxiliary_loss_mlp": 0.01026696, + "balance_loss_clip": 1.22554564, + "balance_loss_mlp": 1.01394093, + "epoch": 0.6813166992334285, + "flos": 17170573366800.0, + "grad_norm": 1.8283643711597737, + "language_loss": 0.72670716, + "learning_rate": 9.739196641245148e-07, + "loss": 0.75029927, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12762451, + "step": 11332, + "time_per_iteration": 2.8521511554718018 + }, + { + "auxiliary_loss_clip": 0.01339743, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.23090625, + "balance_loss_mlp": 1.0211575, + "epoch": 0.6813768224860965, + "flos": 18848216967120.0, + "grad_norm": 1.9574395114339556, + "language_loss": 0.75089616, + "learning_rate": 9.735853834608326e-07, + "loss": 0.77463639, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13128662, + "step": 11333, + "time_per_iteration": 2.7361979484558105 + }, + { + "auxiliary_loss_clip": 0.01342555, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.23278213, + "balance_loss_mlp": 1.01800942, + "epoch": 0.6814369457387645, + "flos": 24537804154560.0, + "grad_norm": 1.4178221212904487, + "language_loss": 0.72063237, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74437058, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13256836, + "step": 11334, + "time_per_iteration": 2.8148186206817627 + }, + { + "auxiliary_loss_clip": 0.01330289, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.225173, + "balance_loss_mlp": 1.02017987, + "epoch": 0.6814970689914325, + "flos": 18227522606400.0, + "grad_norm": 1.7152150171325657, + "language_loss": 0.86311245, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88674593, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12884521, + "step": 11335, + "time_per_iteration": 2.7160446643829346 + }, + { + "auxiliary_loss_clip": 0.0131995, + "auxiliary_loss_mlp": 0.0102288, + "balance_loss_clip": 1.21783447, + "balance_loss_mlp": 1.0108819, + "epoch": 0.6815571922441004, + "flos": 25234320969360.0, + "grad_norm": 1.631418503909199, + "language_loss": 0.8198328, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84326112, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.11993408, + "step": 11336, + "time_per_iteration": 2.779449701309204 + }, + { + "auxiliary_loss_clip": 0.01326909, + "auxiliary_loss_mlp": 0.01029133, + "balance_loss_clip": 1.22360039, + "balance_loss_mlp": 1.01665759, + "epoch": 0.6816173154967684, + "flos": 19459815055200.0, + "grad_norm": 1.765050953391719, + "language_loss": 0.81928825, + "learning_rate": 9.72248650150294e-07, + "loss": 0.84284866, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.12469482, + "step": 11337, + "time_per_iteration": 2.799067258834839 + }, + { + "auxiliary_loss_clip": 0.01331422, + "auxiliary_loss_mlp": 0.01028366, + "balance_loss_clip": 1.22739589, + "balance_loss_mlp": 1.01584339, + "epoch": 0.6816774387494363, + "flos": 17936009214480.0, + "grad_norm": 1.9444897189094397, + "language_loss": 0.72587597, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74947381, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12530518, + "step": 11338, + "time_per_iteration": 2.7876522541046143 + }, + { + "auxiliary_loss_clip": 0.01333108, + "auxiliary_loss_mlp": 0.01025925, + "balance_loss_clip": 1.22630882, + "balance_loss_mlp": 1.01303887, + "epoch": 0.6817375620021043, + "flos": 22237638817320.0, + "grad_norm": 1.4409316775516334, + "language_loss": 0.77336317, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79695356, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12872314, + "step": 11339, + "time_per_iteration": 2.882404088973999 + }, + { + "auxiliary_loss_clip": 0.01340542, + "auxiliary_loss_mlp": 0.01026197, + "balance_loss_clip": 1.23299742, + "balance_loss_mlp": 1.01365638, + "epoch": 0.6817976852547722, + "flos": 25379915231880.0, + "grad_norm": 1.8251833090945857, + "language_loss": 0.70466632, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72833371, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12567139, + "step": 11340, + "time_per_iteration": 2.763310670852661 + }, + { + "auxiliary_loss_clip": 0.01345948, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.23521674, + "balance_loss_mlp": 1.01779246, + "epoch": 0.6818578085074403, + "flos": 22269905482320.0, + "grad_norm": 2.1663223476202975, + "language_loss": 0.83666933, + "learning_rate": 9.709125403986722e-07, + "loss": 0.86043304, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12658691, + "step": 11341, + "time_per_iteration": 2.7519724369049072 + }, + { + "auxiliary_loss_clip": 0.01339499, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.23045433, + "balance_loss_mlp": 1.01868844, + "epoch": 0.6819179317601082, + "flos": 19322910981720.0, + "grad_norm": 1.5409413843451205, + "language_loss": 0.68262947, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70635116, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13964844, + "step": 11342, + "time_per_iteration": 2.7623443603515625 + }, + { + "auxiliary_loss_clip": 0.0133295, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.22659349, + "balance_loss_mlp": 1.01442504, + "epoch": 0.6819780550127762, + "flos": 21108887351280.0, + "grad_norm": 1.5206041310762854, + "language_loss": 0.74944836, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77305359, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13153076, + "step": 11343, + "time_per_iteration": 2.867572069168091 + }, + { + "auxiliary_loss_clip": 0.01340274, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.23236537, + "balance_loss_mlp": 1.01771271, + "epoch": 0.6820381782654441, + "flos": 29722501688760.0, + "grad_norm": 1.729105413967048, + "language_loss": 0.80023527, + "learning_rate": 9.699108677831639e-07, + "loss": 0.8239488, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13366699, + "step": 11344, + "time_per_iteration": 2.8900997638702393 + }, + { + "auxiliary_loss_clip": 0.01333626, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.2261076, + "balance_loss_mlp": 1.02303016, + "epoch": 0.6820983015181121, + "flos": 29248173149400.0, + "grad_norm": 2.0178691166175153, + "language_loss": 0.66451603, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68821627, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13366699, + "step": 11345, + "time_per_iteration": 2.8723647594451904 + }, + { + "auxiliary_loss_clip": 0.01346604, + "auxiliary_loss_mlp": 0.0103549, + "balance_loss_clip": 1.23661685, + "balance_loss_mlp": 1.02197766, + "epoch": 0.6821584247707801, + "flos": 18875366978760.0, + "grad_norm": 2.6705556854755814, + "language_loss": 0.65562189, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67944288, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13531494, + "step": 11346, + "time_per_iteration": 2.792454719543457 + }, + { + "auxiliary_loss_clip": 0.01346968, + "auxiliary_loss_mlp": 0.01026968, + "balance_loss_clip": 1.23704827, + "balance_loss_mlp": 1.0136764, + "epoch": 0.6822185480234481, + "flos": 21329573117400.0, + "grad_norm": 1.6066433957666921, + "language_loss": 0.78914934, + "learning_rate": 9.689095467173952e-07, + "loss": 0.81288874, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13299561, + "step": 11347, + "time_per_iteration": 2.7958269119262695 + }, + { + "auxiliary_loss_clip": 0.01161255, + "auxiliary_loss_mlp": 0.01009061, + "balance_loss_clip": 1.11779022, + "balance_loss_mlp": 1.00636673, + "epoch": 0.6822786712761161, + "flos": 63501761612880.0, + "grad_norm": 0.7785970055914699, + "language_loss": 0.52563596, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54733914, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02697754, + "step": 11348, + "time_per_iteration": 3.3357605934143066 + }, + { + "auxiliary_loss_clip": 0.01332572, + "auxiliary_loss_mlp": 0.01027275, + "balance_loss_clip": 1.22758079, + "balance_loss_mlp": 1.0146867, + "epoch": 0.682338794528784, + "flos": 21073859317800.0, + "grad_norm": 7.13356322739085, + "language_loss": 0.80317587, + "learning_rate": 9.682421948143873e-07, + "loss": 0.82677436, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12591553, + "step": 11349, + "time_per_iteration": 2.7968339920043945 + }, + { + "auxiliary_loss_clip": 0.01354718, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.23945129, + "balance_loss_mlp": 1.01333976, + "epoch": 0.682398917781452, + "flos": 36290121370920.0, + "grad_norm": 1.7126722793810705, + "language_loss": 0.73582619, + "learning_rate": 9.67908577543096e-07, + "loss": 0.75965858, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.15179443, + "step": 11350, + "time_per_iteration": 2.8907997608184814 + }, + { + "auxiliary_loss_clip": 0.01335567, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.22937191, + "balance_loss_mlp": 1.01534855, + "epoch": 0.6824590410341199, + "flos": 24864061321440.0, + "grad_norm": 1.4607938838397536, + "language_loss": 0.79635775, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81999773, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13079834, + "step": 11351, + "time_per_iteration": 2.94618558883667 + }, + { + "auxiliary_loss_clip": 0.01330161, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.22386742, + "balance_loss_mlp": 1.01633954, + "epoch": 0.6825191642867879, + "flos": 22457106332640.0, + "grad_norm": 2.3262242307809875, + "language_loss": 0.73814142, + "learning_rate": 9.672414604241954e-07, + "loss": 0.76173073, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12438965, + "step": 11352, + "time_per_iteration": 2.8041036128997803 + }, + { + "auxiliary_loss_clip": 0.01342733, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.23275816, + "balance_loss_mlp": 1.01948118, + "epoch": 0.6825792875394558, + "flos": 29430419779800.0, + "grad_norm": 1.435856367328204, + "language_loss": 0.8054986, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82926238, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14172363, + "step": 11353, + "time_per_iteration": 5.853459596633911 + }, + { + "auxiliary_loss_clip": 0.01334929, + "auxiliary_loss_mlp": 0.01026176, + "balance_loss_clip": 1.22809327, + "balance_loss_mlp": 1.01326013, + "epoch": 0.6826394107921239, + "flos": 18775683790200.0, + "grad_norm": 1.7036680415262526, + "language_loss": 0.78736496, + "learning_rate": 9.665744999545218e-07, + "loss": 0.81097603, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12915039, + "step": 11354, + "time_per_iteration": 2.8798716068267822 + }, + { + "auxiliary_loss_clip": 0.01334984, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.22808564, + "balance_loss_mlp": 1.01240373, + "epoch": 0.6826995340447918, + "flos": 16621924882680.0, + "grad_norm": 1.9701258664021832, + "language_loss": 0.6219492, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64554757, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12451172, + "step": 11355, + "time_per_iteration": 2.736362934112549 + }, + { + "auxiliary_loss_clip": 0.0133531, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.22734988, + "balance_loss_mlp": 1.01693201, + "epoch": 0.6827596572974598, + "flos": 20852970509880.0, + "grad_norm": 1.9579885382839552, + "language_loss": 0.82327116, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84692085, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12731934, + "step": 11356, + "time_per_iteration": 4.1911303997039795 + }, + { + "auxiliary_loss_clip": 0.0134162, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.23251152, + "balance_loss_mlp": 1.0166049, + "epoch": 0.6828197805501277, + "flos": 22753330294320.0, + "grad_norm": 1.7100369966567424, + "language_loss": 0.7878226, + "learning_rate": 9.655743531886052e-07, + "loss": 0.81153393, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12908936, + "step": 11357, + "time_per_iteration": 2.814849615097046 + }, + { + "auxiliary_loss_clip": 0.01162894, + "auxiliary_loss_mlp": 0.01016323, + "balance_loss_clip": 1.1187799, + "balance_loss_mlp": 1.01354504, + "epoch": 0.6828799038027957, + "flos": 71663950526040.0, + "grad_norm": 0.842930743407364, + "language_loss": 0.59683239, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61862457, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02783203, + "step": 11358, + "time_per_iteration": 3.3223767280578613 + }, + { + "auxiliary_loss_clip": 0.01348153, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.2360332, + "balance_loss_mlp": 1.02143872, + "epoch": 0.6829400270554637, + "flos": 19834013714040.0, + "grad_norm": 1.6511162682591485, + "language_loss": 0.79103172, + "learning_rate": 9.64907784784544e-07, + "loss": 0.81487179, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14428711, + "step": 11359, + "time_per_iteration": 2.748072862625122 + }, + { + "auxiliary_loss_clip": 0.0134222, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.23340702, + "balance_loss_mlp": 1.02236176, + "epoch": 0.6830001503081317, + "flos": 21985620378480.0, + "grad_norm": 1.8118581376542122, + "language_loss": 0.82084697, + "learning_rate": 9.645745594523958e-07, + "loss": 0.84462082, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12805176, + "step": 11360, + "time_per_iteration": 2.7558975219726562 + }, + { + "auxiliary_loss_clip": 0.01336957, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.22697306, + "balance_loss_mlp": 1.02035522, + "epoch": 0.6830602735607997, + "flos": 24322194433440.0, + "grad_norm": 1.8173719950977583, + "language_loss": 0.75362468, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77732664, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.12884521, + "step": 11361, + "time_per_iteration": 2.9053428173065186 + }, + { + "auxiliary_loss_clip": 0.01160782, + "auxiliary_loss_mlp": 0.01007177, + "balance_loss_clip": 1.11673093, + "balance_loss_mlp": 1.00466204, + "epoch": 0.6831203968134676, + "flos": 57703151314080.0, + "grad_norm": 0.8616548750826535, + "language_loss": 0.59795582, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61963546, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02514648, + "step": 11362, + "time_per_iteration": 3.3187386989593506 + }, + { + "auxiliary_loss_clip": 0.01340193, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.23033273, + "balance_loss_mlp": 1.01385164, + "epoch": 0.6831805200661356, + "flos": 14392018654200.0, + "grad_norm": 2.1355466467415125, + "language_loss": 0.75538403, + "learning_rate": 9.635751190871074e-07, + "loss": 0.7790572, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13287354, + "step": 11363, + "time_per_iteration": 2.748253583908081 + }, + { + "auxiliary_loss_clip": 0.01335093, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.22923708, + "balance_loss_mlp": 1.01639414, + "epoch": 0.6832406433188035, + "flos": 22825457387640.0, + "grad_norm": 2.665369004595697, + "language_loss": 0.89321375, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91686565, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13708496, + "step": 11364, + "time_per_iteration": 2.842914581298828 + }, + { + "auxiliary_loss_clip": 0.01333002, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.22713149, + "balance_loss_mlp": 1.01728058, + "epoch": 0.6833007665714715, + "flos": 17565587133120.0, + "grad_norm": 1.7532680265854264, + "language_loss": 0.88197744, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90560502, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12481689, + "step": 11365, + "time_per_iteration": 4.478847026824951 + }, + { + "auxiliary_loss_clip": 0.0135001, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.2375021, + "balance_loss_mlp": 1.02351964, + "epoch": 0.6833608898241395, + "flos": 22449796827840.0, + "grad_norm": 2.325437887426935, + "language_loss": 0.81791133, + "learning_rate": 9.625760324338272e-07, + "loss": 0.84178257, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 1.12548828, + "router_z_loss_mlp": 0.13580322, + "step": 11366, + "time_per_iteration": 2.8582353591918945 + }, + { + "auxiliary_loss_clip": 0.01338864, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.2292546, + "balance_loss_mlp": 1.01802242, + "epoch": 0.6834210130768075, + "flos": 24539956397640.0, + "grad_norm": 1.6444592235532156, + "language_loss": 0.77000242, + "learning_rate": 9.622430822110062e-07, + "loss": 0.79370129, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.13006592, + "step": 11367, + "time_per_iteration": 2.8826003074645996 + }, + { + "auxiliary_loss_clip": 0.01335014, + "auxiliary_loss_mlp": 0.01030881, + "balance_loss_clip": 1.22707999, + "balance_loss_mlp": 1.0171423, + "epoch": 0.6834811363294754, + "flos": 20051897503320.0, + "grad_norm": 1.5681647526320437, + "language_loss": 0.6928761, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71653509, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13726807, + "step": 11368, + "time_per_iteration": 2.8067283630371094 + }, + { + "auxiliary_loss_clip": 0.01337751, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.23035824, + "balance_loss_mlp": 1.01810336, + "epoch": 0.6835412595821434, + "flos": 24796198105920.0, + "grad_norm": 1.627934380940285, + "language_loss": 0.74105287, + "learning_rate": 9.615772998335261e-07, + "loss": 0.76474035, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12902832, + "step": 11369, + "time_per_iteration": 2.788888454437256 + }, + { + "auxiliary_loss_clip": 0.01341998, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.23302579, + "balance_loss_mlp": 1.01706052, + "epoch": 0.6836013828348113, + "flos": 19505117003760.0, + "grad_norm": 1.7822296208337916, + "language_loss": 0.78807747, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81180096, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13287354, + "step": 11370, + "time_per_iteration": 2.8489491939544678 + }, + { + "auxiliary_loss_clip": 0.0116093, + "auxiliary_loss_mlp": 0.01005093, + "balance_loss_clip": 1.11641443, + "balance_loss_mlp": 1.0024941, + "epoch": 0.6836615060874793, + "flos": 58379421538440.0, + "grad_norm": 0.7551347603237476, + "language_loss": 0.59872591, + "learning_rate": 9.609116749644162e-07, + "loss": 0.62038612, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02600098, + "step": 11371, + "time_per_iteration": 3.1251773834228516 + }, + { + "auxiliary_loss_clip": 0.01328883, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.22443426, + "balance_loss_mlp": 1.01639676, + "epoch": 0.6837216293401474, + "flos": 12170721398040.0, + "grad_norm": 1.5901693174953264, + "language_loss": 0.64127398, + "learning_rate": 9.605789216270511e-07, + "loss": 0.66485029, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12359619, + "step": 11372, + "time_per_iteration": 2.7269036769866943 + }, + { + "auxiliary_loss_clip": 0.01339843, + "auxiliary_loss_mlp": 0.01028169, + "balance_loss_clip": 1.23313344, + "balance_loss_mlp": 1.01516283, + "epoch": 0.6837817525928153, + "flos": 22132717150320.0, + "grad_norm": 1.422984021827098, + "language_loss": 0.71844137, + "learning_rate": 9.602462077046375e-07, + "loss": 0.74212146, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13012695, + "step": 11373, + "time_per_iteration": 2.8776113986968994 + }, + { + "auxiliary_loss_clip": 0.01158022, + "auxiliary_loss_mlp": 0.01008561, + "balance_loss_clip": 1.11374259, + "balance_loss_mlp": 1.00597465, + "epoch": 0.6838418758454833, + "flos": 65022416982720.0, + "grad_norm": 1.2008676612925901, + "language_loss": 0.56702226, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58868814, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.02587891, + "step": 11374, + "time_per_iteration": 3.3754804134368896 + }, + { + "auxiliary_loss_clip": 0.01345572, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.23687077, + "balance_loss_mlp": 1.01648045, + "epoch": 0.6839019990981512, + "flos": 21035542007160.0, + "grad_norm": 1.781506394164277, + "language_loss": 0.74073118, + "learning_rate": 9.595808981551312e-07, + "loss": 0.76448685, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13519287, + "step": 11375, + "time_per_iteration": 2.9265682697296143 + }, + { + "auxiliary_loss_clip": 0.01332279, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.22609758, + "balance_loss_mlp": 1.01753998, + "epoch": 0.6839621223508192, + "flos": 24940858376160.0, + "grad_norm": 1.8907298153597656, + "language_loss": 0.70560658, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72923088, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1262207, + "step": 11376, + "time_per_iteration": 2.8346593379974365 + }, + { + "auxiliary_loss_clip": 0.01343795, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.23281717, + "balance_loss_mlp": 1.02065277, + "epoch": 0.6840222456034871, + "flos": 26364371902920.0, + "grad_norm": 1.8208296186041186, + "language_loss": 0.74479055, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76856673, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.1317749, + "step": 11377, + "time_per_iteration": 2.907633066177368 + }, + { + "auxiliary_loss_clip": 0.01157055, + "auxiliary_loss_mlp": 0.01016835, + "balance_loss_clip": 1.11339831, + "balance_loss_mlp": 1.01397443, + "epoch": 0.6840823688561551, + "flos": 66004599585600.0, + "grad_norm": 0.7262479429681833, + "language_loss": 0.56901616, + "learning_rate": 9.585832297583707e-07, + "loss": 0.59075511, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02856445, + "step": 11378, + "time_per_iteration": 3.3715853691101074 + }, + { + "auxiliary_loss_clip": 0.01340642, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.23067379, + "balance_loss_mlp": 1.02298439, + "epoch": 0.684142492108823, + "flos": 21402593594640.0, + "grad_norm": 1.6515296983820609, + "language_loss": 0.78438276, + "learning_rate": 9.58250752590561e-07, + "loss": 0.8081612, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14202881, + "step": 11379, + "time_per_iteration": 2.7625393867492676 + }, + { + "auxiliary_loss_clip": 0.01330988, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.22871196, + "balance_loss_mlp": 1.01463711, + "epoch": 0.6842026153614911, + "flos": 18805351520160.0, + "grad_norm": 1.8253710820961548, + "language_loss": 0.69186091, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71543622, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.11901855, + "step": 11380, + "time_per_iteration": 2.787769317626953 + }, + { + "auxiliary_loss_clip": 0.01336637, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.22876906, + "balance_loss_mlp": 1.02209461, + "epoch": 0.684262738614159, + "flos": 19651198566600.0, + "grad_norm": 1.8889322175674725, + "language_loss": 0.78738314, + "learning_rate": 9.575859167772568e-07, + "loss": 0.81110847, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13806152, + "step": 11381, + "time_per_iteration": 2.7677135467529297 + }, + { + "auxiliary_loss_clip": 0.01157216, + "auxiliary_loss_mlp": 0.01006284, + "balance_loss_clip": 1.11379433, + "balance_loss_mlp": 1.00330377, + "epoch": 0.684322861866827, + "flos": 62367406193160.0, + "grad_norm": 0.8613558987944923, + "language_loss": 0.67225236, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69388735, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02978516, + "step": 11382, + "time_per_iteration": 3.113734483718872 + }, + { + "auxiliary_loss_clip": 0.01156717, + "auxiliary_loss_mlp": 0.01005332, + "balance_loss_clip": 1.11317015, + "balance_loss_mlp": 1.00257826, + "epoch": 0.6843829851194949, + "flos": 65821175312760.0, + "grad_norm": 0.8347347770874938, + "language_loss": 0.58171743, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60333794, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.02758789, + "step": 11383, + "time_per_iteration": 3.2315690517425537 + }, + { + "auxiliary_loss_clip": 0.01333419, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.22699761, + "balance_loss_mlp": 1.020087, + "epoch": 0.6844431083721629, + "flos": 27860946515280.0, + "grad_norm": 1.8019189781626999, + "language_loss": 0.79844415, + "learning_rate": 9.565889595521517e-07, + "loss": 0.82210171, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12243652, + "step": 11384, + "time_per_iteration": 2.96505069732666 + }, + { + "auxiliary_loss_clip": 0.01342966, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.23171186, + "balance_loss_mlp": 1.02183056, + "epoch": 0.684503231624831, + "flos": 18258977104200.0, + "grad_norm": 1.7181132332155538, + "language_loss": 0.77218097, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79596925, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.14025879, + "step": 11385, + "time_per_iteration": 2.724390745162964 + }, + { + "auxiliary_loss_clip": 0.01351682, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.23672342, + "balance_loss_mlp": 1.02222419, + "epoch": 0.6845633548774989, + "flos": 17644211564040.0, + "grad_norm": 1.873860060665714, + "language_loss": 0.84707338, + "learning_rate": 9.55924519212335e-07, + "loss": 0.87096596, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.15344238, + "step": 11386, + "time_per_iteration": 2.7062387466430664 + }, + { + "auxiliary_loss_clip": 0.0134213, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.23354232, + "balance_loss_mlp": 1.02095914, + "epoch": 0.6846234781301669, + "flos": 20811688788960.0, + "grad_norm": 2.18816304453941, + "language_loss": 0.8297416, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85349822, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12561035, + "step": 11387, + "time_per_iteration": 2.7845656871795654 + }, + { + "auxiliary_loss_clip": 0.0133482, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.22861564, + "balance_loss_mlp": 1.01723289, + "epoch": 0.6846836013828348, + "flos": 36108077782320.0, + "grad_norm": 1.5277400346099015, + "language_loss": 0.72450942, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74815923, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12927246, + "step": 11388, + "time_per_iteration": 2.8798558712005615 + }, + { + "auxiliary_loss_clip": 0.01330314, + "auxiliary_loss_mlp": 0.01025943, + "balance_loss_clip": 1.22421956, + "balance_loss_mlp": 1.01326537, + "epoch": 0.6847437246355028, + "flos": 43150229045640.0, + "grad_norm": 2.017612871409587, + "language_loss": 0.62387592, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64743853, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12683105, + "step": 11389, + "time_per_iteration": 2.992587089538574 + }, + { + "auxiliary_loss_clip": 0.01156316, + "auxiliary_loss_mlp": 0.01007493, + "balance_loss_clip": 1.11270022, + "balance_loss_mlp": 1.00495434, + "epoch": 0.6848038478881707, + "flos": 71678593162800.0, + "grad_norm": 0.7389675017814182, + "language_loss": 0.56026554, + "learning_rate": 9.54596113730818e-07, + "loss": 0.5819037, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02539062, + "step": 11390, + "time_per_iteration": 3.359941244125366 + }, + { + "auxiliary_loss_clip": 0.01343746, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.23624039, + "balance_loss_mlp": 1.01920676, + "epoch": 0.6848639711408387, + "flos": 19942590133440.0, + "grad_norm": 1.7929834841822603, + "language_loss": 0.88488615, + "learning_rate": 9.542641114335109e-07, + "loss": 0.90864348, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12780762, + "step": 11391, + "time_per_iteration": 2.727085828781128 + }, + { + "auxiliary_loss_clip": 0.01343465, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.23320818, + "balance_loss_mlp": 1.02193165, + "epoch": 0.6849240943935067, + "flos": 26872835091840.0, + "grad_norm": 1.538791858934177, + "language_loss": 0.79063308, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81441879, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13165283, + "step": 11392, + "time_per_iteration": 5.71622109413147 + }, + { + "auxiliary_loss_clip": 0.01331678, + "auxiliary_loss_mlp": 0.01030445, + "balance_loss_clip": 1.22735906, + "balance_loss_mlp": 1.01817191, + "epoch": 0.6849842176461747, + "flos": 13739341887000.0, + "grad_norm": 2.573480384558412, + "language_loss": 0.71094298, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73456419, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12261963, + "step": 11393, + "time_per_iteration": 2.7133028507232666 + }, + { + "auxiliary_loss_clip": 0.01350633, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.23937905, + "balance_loss_mlp": 1.0222137, + "epoch": 0.6850443408988426, + "flos": 24978525953040.0, + "grad_norm": 1.695823768658546, + "language_loss": 0.64197028, + "learning_rate": 9.532683425183936e-07, + "loss": 0.665833, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13427734, + "step": 11394, + "time_per_iteration": 2.737971305847168 + }, + { + "auxiliary_loss_clip": 0.01338945, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.22975588, + "balance_loss_mlp": 1.02318561, + "epoch": 0.6851044641515106, + "flos": 27749811769200.0, + "grad_norm": 1.7669290674539382, + "language_loss": 0.81211835, + "learning_rate": 9.529364989142468e-07, + "loss": 0.83587027, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1307373, + "step": 11395, + "time_per_iteration": 4.356601238250732 + }, + { + "auxiliary_loss_clip": 0.01337739, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.2306602, + "balance_loss_mlp": 1.02040589, + "epoch": 0.6851645874041785, + "flos": 24356207257920.0, + "grad_norm": 1.7003996979811147, + "language_loss": 0.72898275, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75271338, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.14916992, + "step": 11396, + "time_per_iteration": 2.7829525470733643 + }, + { + "auxiliary_loss_clip": 0.01343173, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.23239505, + "balance_loss_mlp": 1.01612055, + "epoch": 0.6852247106568465, + "flos": 15081022922400.0, + "grad_norm": 2.1463700067738665, + "language_loss": 0.79734558, + "learning_rate": 9.522729308327931e-07, + "loss": 0.82107377, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13531494, + "step": 11397, + "time_per_iteration": 2.7078590393066406 + }, + { + "auxiliary_loss_clip": 0.01335443, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.22701097, + "balance_loss_mlp": 1.01932156, + "epoch": 0.6852848339095146, + "flos": 18774587364480.0, + "grad_norm": 1.714198483071771, + "language_loss": 0.71414006, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73781562, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12799072, + "step": 11398, + "time_per_iteration": 2.7355237007141113 + }, + { + "auxiliary_loss_clip": 0.01329532, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.22357845, + "balance_loss_mlp": 1.01731884, + "epoch": 0.6853449571621825, + "flos": 27860053131360.0, + "grad_norm": 1.6870746277546063, + "language_loss": 0.71211922, + "learning_rate": 9.516095216709996e-07, + "loss": 0.73570704, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.1194458, + "step": 11399, + "time_per_iteration": 2.7628490924835205 + }, + { + "auxiliary_loss_clip": 0.0134114, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.23291016, + "balance_loss_mlp": 1.01723671, + "epoch": 0.6854050804148505, + "flos": 18155111254560.0, + "grad_norm": 1.477678762498131, + "language_loss": 0.7009517, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72466767, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.13238525, + "step": 11400, + "time_per_iteration": 2.8068645000457764 + }, + { + "auxiliary_loss_clip": 0.01362282, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.24403238, + "balance_loss_mlp": 1.02298319, + "epoch": 0.6854652036675184, + "flos": 16330573924200.0, + "grad_norm": 4.755352770386316, + "language_loss": 0.78194034, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80595326, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.16027832, + "step": 11401, + "time_per_iteration": 2.753588914871216 + }, + { + "auxiliary_loss_clip": 0.01330403, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.22511744, + "balance_loss_mlp": 1.0174799, + "epoch": 0.6855253269201864, + "flos": 14945134057920.0, + "grad_norm": 1.7947637422240115, + "language_loss": 0.76129121, + "learning_rate": 9.50614706122786e-07, + "loss": 0.78489321, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12322998, + "step": 11402, + "time_per_iteration": 2.7364935874938965 + }, + { + "auxiliary_loss_clip": 0.01342476, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.23134625, + "balance_loss_mlp": 1.01853418, + "epoch": 0.6855854501728543, + "flos": 23042732051520.0, + "grad_norm": 1.5441412597845134, + "language_loss": 0.72593629, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74968094, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13458252, + "step": 11403, + "time_per_iteration": 4.400287866592407 + }, + { + "auxiliary_loss_clip": 0.0133231, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.22696829, + "balance_loss_mlp": 1.01846504, + "epoch": 0.6856455734255223, + "flos": 13256566808760.0, + "grad_norm": 2.057713965841127, + "language_loss": 0.8161732, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83980966, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12878418, + "step": 11404, + "time_per_iteration": 2.7109313011169434 + }, + { + "auxiliary_loss_clip": 0.01334172, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.22873974, + "balance_loss_mlp": 1.01929092, + "epoch": 0.6857056966781903, + "flos": 23338996621560.0, + "grad_norm": 1.4683680840555697, + "language_loss": 0.77967, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80333734, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.13275146, + "step": 11405, + "time_per_iteration": 2.7615809440612793 + }, + { + "auxiliary_loss_clip": 0.01155683, + "auxiliary_loss_mlp": 0.01010019, + "balance_loss_clip": 1.1119051, + "balance_loss_mlp": 1.00722933, + "epoch": 0.6857658199308583, + "flos": 61866699201000.0, + "grad_norm": 0.7894987220174786, + "language_loss": 0.60984886, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63150585, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.0279541, + "step": 11406, + "time_per_iteration": 3.315457820892334 + }, + { + "auxiliary_loss_clip": 0.01337725, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.23003292, + "balance_loss_mlp": 1.01799464, + "epoch": 0.6858259431835262, + "flos": 16658942725800.0, + "grad_norm": 2.0897091056450816, + "language_loss": 0.77201754, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79571342, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.13873291, + "step": 11407, + "time_per_iteration": 2.9848721027374268 + }, + { + "auxiliary_loss_clip": 0.01337846, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.22896111, + "balance_loss_mlp": 1.02186143, + "epoch": 0.6858860664361942, + "flos": 21878180993160.0, + "grad_norm": 2.472988984166123, + "language_loss": 0.70913255, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73286748, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13775635, + "step": 11408, + "time_per_iteration": 2.849447011947632 + }, + { + "auxiliary_loss_clip": 0.01345282, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.2346971, + "balance_loss_mlp": 1.01526761, + "epoch": 0.6859461896888621, + "flos": 15271797308400.0, + "grad_norm": 1.9225422548784805, + "language_loss": 0.70327449, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72700971, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12988281, + "step": 11409, + "time_per_iteration": 2.7348403930664062 + }, + { + "auxiliary_loss_clip": 0.01324406, + "auxiliary_loss_mlp": 0.0102634, + "balance_loss_clip": 1.22290802, + "balance_loss_mlp": 1.01465762, + "epoch": 0.6860063129415301, + "flos": 18624363748920.0, + "grad_norm": 1.6021857589293655, + "language_loss": 0.78212821, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80563569, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.11688232, + "step": 11410, + "time_per_iteration": 2.731114625930786 + }, + { + "auxiliary_loss_clip": 0.01347648, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.23582768, + "balance_loss_mlp": 1.02243924, + "epoch": 0.6860664361941982, + "flos": 23956685963640.0, + "grad_norm": 1.9054687417038418, + "language_loss": 0.7155093, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73934793, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13781738, + "step": 11411, + "time_per_iteration": 2.7660181522369385 + }, + { + "auxiliary_loss_clip": 0.01337943, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.23012924, + "balance_loss_mlp": 1.01505041, + "epoch": 0.6861265594468661, + "flos": 20412045669600.0, + "grad_norm": 1.911688816861359, + "language_loss": 0.70290458, + "learning_rate": 9.473012427332654e-07, + "loss": 0.7265715, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13708496, + "step": 11412, + "time_per_iteration": 2.775538921356201 + }, + { + "auxiliary_loss_clip": 0.01335249, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.22798014, + "balance_loss_mlp": 1.01517296, + "epoch": 0.6861866826995341, + "flos": 11431176702840.0, + "grad_norm": 2.900558011560707, + "language_loss": 0.72389108, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74752939, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13409424, + "step": 11413, + "time_per_iteration": 2.708432912826538 + }, + { + "auxiliary_loss_clip": 0.01340092, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.23225045, + "balance_loss_mlp": 1.02355528, + "epoch": 0.686246805952202, + "flos": 16001880255720.0, + "grad_norm": 1.7973914012387775, + "language_loss": 0.74653578, + "learning_rate": 9.466390286747164e-07, + "loss": 0.77029598, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.12390137, + "step": 11414, + "time_per_iteration": 2.6894097328186035 + }, + { + "auxiliary_loss_clip": 0.01339843, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.23120356, + "balance_loss_mlp": 1.01646972, + "epoch": 0.68630692920487, + "flos": 19831252345560.0, + "grad_norm": 2.1630117926835473, + "language_loss": 0.86685485, + "learning_rate": 9.46307981554495e-07, + "loss": 0.89054787, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12988281, + "step": 11415, + "time_per_iteration": 2.7585363388061523 + }, + { + "auxiliary_loss_clip": 0.01338758, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.22827268, + "balance_loss_mlp": 1.01731277, + "epoch": 0.6863670524575379, + "flos": 26291798117640.0, + "grad_norm": 1.8277924178781622, + "language_loss": 0.67962062, + "learning_rate": 9.459769743903801e-07, + "loss": 0.70331824, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13677979, + "step": 11416, + "time_per_iteration": 2.75892972946167 + }, + { + "auxiliary_loss_clip": 0.01327704, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.22162771, + "balance_loss_mlp": 1.01755786, + "epoch": 0.686427175710206, + "flos": 19178007061320.0, + "grad_norm": 1.3502158032732101, + "language_loss": 0.7627936, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78637719, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.13110352, + "step": 11417, + "time_per_iteration": 2.7285678386688232 + }, + { + "auxiliary_loss_clip": 0.0134179, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.23457456, + "balance_loss_mlp": 1.01848769, + "epoch": 0.6864872989628739, + "flos": 18921318661080.0, + "grad_norm": 1.926083138353895, + "language_loss": 0.77567667, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79940212, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1227417, + "step": 11418, + "time_per_iteration": 2.7203285694122314 + }, + { + "auxiliary_loss_clip": 0.0134218, + "auxiliary_loss_mlp": 0.01025172, + "balance_loss_clip": 1.23469114, + "balance_loss_mlp": 1.01320934, + "epoch": 0.6865474222155419, + "flos": 25961520723120.0, + "grad_norm": 2.010694980505223, + "language_loss": 0.76608104, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78975457, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.11975098, + "step": 11419, + "time_per_iteration": 2.9971842765808105 + }, + { + "auxiliary_loss_clip": 0.01339811, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.23284101, + "balance_loss_mlp": 1.02293015, + "epoch": 0.6866075454682098, + "flos": 18482627280600.0, + "grad_norm": 1.6901366393239672, + "language_loss": 0.71842492, + "learning_rate": 9.446533455460044e-07, + "loss": 0.74217749, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12512207, + "step": 11420, + "time_per_iteration": 2.8540468215942383 + }, + { + "auxiliary_loss_clip": 0.01336144, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.22970462, + "balance_loss_mlp": 1.01586366, + "epoch": 0.6866676687208778, + "flos": 34246603825560.0, + "grad_norm": 1.327844261021319, + "language_loss": 0.74497527, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76862186, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12664795, + "step": 11421, + "time_per_iteration": 2.8730084896087646 + }, + { + "auxiliary_loss_clip": 0.01334734, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.23032379, + "balance_loss_mlp": 1.01900864, + "epoch": 0.6867277919735457, + "flos": 21726251826480.0, + "grad_norm": 1.7541899399357737, + "language_loss": 0.77535123, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79901373, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.125, + "step": 11422, + "time_per_iteration": 2.794557809829712 + }, + { + "auxiliary_loss_clip": 0.01340762, + "auxiliary_loss_mlp": 0.01039243, + "balance_loss_clip": 1.23284769, + "balance_loss_mlp": 1.02600455, + "epoch": 0.6867879152262137, + "flos": 24103417260240.0, + "grad_norm": 2.3692451572216613, + "language_loss": 0.77532101, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79912102, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13238525, + "step": 11423, + "time_per_iteration": 2.768872022628784 + }, + { + "auxiliary_loss_clip": 0.01344204, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.23509991, + "balance_loss_mlp": 1.02071226, + "epoch": 0.6868480384788818, + "flos": 21620761642440.0, + "grad_norm": 1.5700977070376767, + "language_loss": 0.7286315, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75241041, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12982178, + "step": 11424, + "time_per_iteration": 2.8770177364349365 + }, + { + "auxiliary_loss_clip": 0.01337817, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.22935474, + "balance_loss_mlp": 1.01532125, + "epoch": 0.6869081617315497, + "flos": 26291716900920.0, + "grad_norm": 1.7619228485942096, + "language_loss": 0.65803206, + "learning_rate": 9.429997100087112e-07, + "loss": 0.6816873, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12390137, + "step": 11425, + "time_per_iteration": 2.856609582901001 + }, + { + "auxiliary_loss_clip": 0.01334635, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.22992492, + "balance_loss_mlp": 1.01564693, + "epoch": 0.6869682849842177, + "flos": 21109983777000.0, + "grad_norm": 1.392123933679672, + "language_loss": 0.72137338, + "learning_rate": 9.426691030957657e-07, + "loss": 0.74499917, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.1229248, + "step": 11426, + "time_per_iteration": 2.815854549407959 + }, + { + "auxiliary_loss_clip": 0.01344675, + "auxiliary_loss_mlp": 0.01028678, + "balance_loss_clip": 1.2364192, + "balance_loss_mlp": 1.01644731, + "epoch": 0.6870284082368856, + "flos": 17097146805960.0, + "grad_norm": 2.4066293402033163, + "language_loss": 0.85444647, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87818003, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12237549, + "step": 11427, + "time_per_iteration": 2.73589825630188 + }, + { + "auxiliary_loss_clip": 0.01336582, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.23124909, + "balance_loss_mlp": 1.0169605, + "epoch": 0.6870885314895536, + "flos": 27313841148840.0, + "grad_norm": 1.3620587738025214, + "language_loss": 0.76108617, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78474569, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12432861, + "step": 11428, + "time_per_iteration": 2.8937535285949707 + }, + { + "auxiliary_loss_clip": 0.01351776, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.24025309, + "balance_loss_mlp": 1.02232432, + "epoch": 0.6871486547422215, + "flos": 20819566810800.0, + "grad_norm": 1.6688622999782463, + "language_loss": 0.73668367, + "learning_rate": 9.4167752297163e-07, + "loss": 0.7605651, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14050293, + "step": 11429, + "time_per_iteration": 2.8759124279022217 + }, + { + "auxiliary_loss_clip": 0.01342822, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.23397207, + "balance_loss_mlp": 1.01557279, + "epoch": 0.6872087779948896, + "flos": 30160259076960.0, + "grad_norm": 1.644548842002584, + "language_loss": 0.835302, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85901153, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12561035, + "step": 11430, + "time_per_iteration": 4.218959331512451 + }, + { + "auxiliary_loss_clip": 0.0134023, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.23252153, + "balance_loss_mlp": 1.01940322, + "epoch": 0.6872689012475575, + "flos": 20709528490440.0, + "grad_norm": 1.8363748562616018, + "language_loss": 0.70020747, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72393078, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12695312, + "step": 11431, + "time_per_iteration": 4.318030834197998 + }, + { + "auxiliary_loss_clip": 0.01342833, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.23339069, + "balance_loss_mlp": 1.01918602, + "epoch": 0.6873290245002255, + "flos": 25526443486680.0, + "grad_norm": 1.8976125864037459, + "language_loss": 0.80737567, + "learning_rate": 9.406863040327355e-07, + "loss": 0.8311224, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12652588, + "step": 11432, + "time_per_iteration": 2.8045494556427 + }, + { + "auxiliary_loss_clip": 0.01330848, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.22744131, + "balance_loss_mlp": 1.01622164, + "epoch": 0.6873891477528934, + "flos": 25196897042640.0, + "grad_norm": 1.6190288770848411, + "language_loss": 0.68042767, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70401955, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12133789, + "step": 11433, + "time_per_iteration": 2.7546067237854004 + }, + { + "auxiliary_loss_clip": 0.01341281, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.23354959, + "balance_loss_mlp": 1.02135897, + "epoch": 0.6874492710055614, + "flos": 35159136445080.0, + "grad_norm": 1.9988483651086029, + "language_loss": 0.73077208, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75453067, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.13208008, + "step": 11434, + "time_per_iteration": 2.89054799079895 + }, + { + "auxiliary_loss_clip": 0.0133623, + "auxiliary_loss_mlp": 0.01034113, + "balance_loss_clip": 1.23056722, + "balance_loss_mlp": 1.02087533, + "epoch": 0.6875093942582293, + "flos": 17826783061320.0, + "grad_norm": 1.6419406063080524, + "language_loss": 0.80401832, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82772177, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13250732, + "step": 11435, + "time_per_iteration": 4.243057489395142 + }, + { + "auxiliary_loss_clip": 0.01343896, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.23434186, + "balance_loss_mlp": 1.01786828, + "epoch": 0.6875695175108973, + "flos": 20709487882080.0, + "grad_norm": 3.004145599375982, + "language_loss": 0.81596607, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83972073, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13690186, + "step": 11436, + "time_per_iteration": 2.7698843479156494 + }, + { + "auxiliary_loss_clip": 0.01323772, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.22182918, + "balance_loss_mlp": 1.02087688, + "epoch": 0.6876296407635654, + "flos": 25379468539920.0, + "grad_norm": 1.7463749384620695, + "language_loss": 0.82461506, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84817219, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.1105957, + "step": 11437, + "time_per_iteration": 2.82942533493042 + }, + { + "auxiliary_loss_clip": 0.01362264, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.24664402, + "balance_loss_mlp": 1.01846647, + "epoch": 0.6876897640162333, + "flos": 23227618225320.0, + "grad_norm": 2.2845500761076583, + "language_loss": 0.78428495, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80823493, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.14263916, + "step": 11438, + "time_per_iteration": 2.7876665592193604 + }, + { + "auxiliary_loss_clip": 0.01329012, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.22574341, + "balance_loss_mlp": 1.01662672, + "epoch": 0.6877498872689013, + "flos": 27129888967320.0, + "grad_norm": 1.663731922208447, + "language_loss": 0.72525316, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74883169, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.12213135, + "step": 11439, + "time_per_iteration": 2.8197131156921387 + }, + { + "auxiliary_loss_clip": 0.01337021, + "auxiliary_loss_mlp": 0.01024707, + "balance_loss_clip": 1.23065782, + "balance_loss_mlp": 1.01218462, + "epoch": 0.6878100105215692, + "flos": 11585217504240.0, + "grad_norm": 1.8192199889754506, + "language_loss": 0.75774813, + "learning_rate": 9.380448218957623e-07, + "loss": 0.78136539, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12536621, + "step": 11440, + "time_per_iteration": 2.739675283432007 + }, + { + "auxiliary_loss_clip": 0.01332415, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.22778046, + "balance_loss_mlp": 1.0168047, + "epoch": 0.6878701337742372, + "flos": 20308057994880.0, + "grad_norm": 1.5303684324932312, + "language_loss": 0.71932453, + "learning_rate": 9.377148177097167e-07, + "loss": 0.74294114, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12438965, + "step": 11441, + "time_per_iteration": 2.7160539627075195 + }, + { + "auxiliary_loss_clip": 0.01347368, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.23564672, + "balance_loss_mlp": 1.01620722, + "epoch": 0.6879302570269051, + "flos": 13843451386800.0, + "grad_norm": 2.0140273438107785, + "language_loss": 0.66929728, + "learning_rate": 9.373848538056317e-07, + "loss": 0.6930722, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13916016, + "step": 11442, + "time_per_iteration": 4.359332799911499 + }, + { + "auxiliary_loss_clip": 0.01341858, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.2350992, + "balance_loss_mlp": 1.01954126, + "epoch": 0.6879903802795732, + "flos": 21329654334120.0, + "grad_norm": 1.9856762048776062, + "language_loss": 0.70250118, + "learning_rate": 9.370549301960189e-07, + "loss": 0.72624183, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12670898, + "step": 11443, + "time_per_iteration": 2.803687572479248 + }, + { + "auxiliary_loss_clip": 0.0134514, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.23659384, + "balance_loss_mlp": 1.01932883, + "epoch": 0.6880505035322411, + "flos": 25157077222680.0, + "grad_norm": 1.414980577074327, + "language_loss": 0.76471269, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78848684, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12957764, + "step": 11444, + "time_per_iteration": 2.7700839042663574 + }, + { + "auxiliary_loss_clip": 0.01334286, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.22909391, + "balance_loss_mlp": 1.01311255, + "epoch": 0.6881106267849091, + "flos": 23219943245280.0, + "grad_norm": 2.0067756254651625, + "language_loss": 0.7656076, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78920102, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.1194458, + "step": 11445, + "time_per_iteration": 2.801657199859619 + }, + { + "auxiliary_loss_clip": 0.01154016, + "auxiliary_loss_mlp": 0.01002191, + "balance_loss_clip": 1.11076844, + "balance_loss_mlp": 0.99940139, + "epoch": 0.688170750037577, + "flos": 48494814985080.0, + "grad_norm": 0.8276223158802781, + "language_loss": 0.5845741, + "learning_rate": 9.360654012591183e-07, + "loss": 0.6061362, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.0279541, + "step": 11446, + "time_per_iteration": 3.30515456199646 + }, + { + "auxiliary_loss_clip": 0.01346677, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.23590112, + "balance_loss_mlp": 1.01475132, + "epoch": 0.688230873290245, + "flos": 22788764411400.0, + "grad_norm": 1.499610582012681, + "language_loss": 0.75876403, + "learning_rate": 9.357356389524886e-07, + "loss": 0.78251243, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13415527, + "step": 11447, + "time_per_iteration": 2.806490421295166 + }, + { + "auxiliary_loss_clip": 0.01342894, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.23381066, + "balance_loss_mlp": 1.01742268, + "epoch": 0.6882909965429129, + "flos": 22460801693400.0, + "grad_norm": 1.8409508158601162, + "language_loss": 0.73291475, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75664544, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12780762, + "step": 11448, + "time_per_iteration": 2.860663652420044 + }, + { + "auxiliary_loss_clip": 0.01350641, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.23780847, + "balance_loss_mlp": 1.01725388, + "epoch": 0.688351119795581, + "flos": 26219833457760.0, + "grad_norm": 1.8082502142555121, + "language_loss": 0.74863243, + "learning_rate": 9.350762354227673e-07, + "loss": 0.77244735, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13598633, + "step": 11449, + "time_per_iteration": 2.992832660675049 + }, + { + "auxiliary_loss_clip": 0.01337888, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.23181772, + "balance_loss_mlp": 1.02356315, + "epoch": 0.6884112430482489, + "flos": 22570433930160.0, + "grad_norm": 1.6820601157449306, + "language_loss": 0.7023319, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72606981, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12353516, + "step": 11450, + "time_per_iteration": 2.792462110519409 + }, + { + "auxiliary_loss_clip": 0.01353598, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.24067461, + "balance_loss_mlp": 1.01715124, + "epoch": 0.6884713663009169, + "flos": 17344657716840.0, + "grad_norm": 2.2207672658470576, + "language_loss": 0.76617634, + "learning_rate": 9.344169934211068e-07, + "loss": 0.79002094, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13702393, + "step": 11451, + "time_per_iteration": 2.699768304824829 + }, + { + "auxiliary_loss_clip": 0.01348958, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.23977089, + "balance_loss_mlp": 1.01738834, + "epoch": 0.6885314895535849, + "flos": 26476724899800.0, + "grad_norm": 1.3470452299630018, + "language_loss": 0.6909411, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71472847, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.12408447, + "step": 11452, + "time_per_iteration": 2.836073398590088 + }, + { + "auxiliary_loss_clip": 0.01342473, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.23507333, + "balance_loss_mlp": 1.01842785, + "epoch": 0.6885916128062528, + "flos": 20526388476120.0, + "grad_norm": 1.8425305055395416, + "language_loss": 0.72188997, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74564266, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.14379883, + "step": 11453, + "time_per_iteration": 2.7308857440948486 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01012433, + "balance_loss_clip": 1.1122843, + "balance_loss_mlp": 1.00965583, + "epoch": 0.6886517360589208, + "flos": 70730626426200.0, + "grad_norm": 0.7946264868961926, + "language_loss": 0.50676483, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52844852, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02783203, + "step": 11454, + "time_per_iteration": 3.2250680923461914 + }, + { + "auxiliary_loss_clip": 0.01330836, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.22826648, + "balance_loss_mlp": 1.01948738, + "epoch": 0.6887118593115887, + "flos": 17897894945640.0, + "grad_norm": 2.129889020896911, + "language_loss": 0.75664258, + "learning_rate": 9.330989944019263e-07, + "loss": 0.78027707, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.13128662, + "step": 11455, + "time_per_iteration": 2.7291882038116455 + }, + { + "auxiliary_loss_clip": 0.01346982, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.23542178, + "balance_loss_mlp": 1.02101684, + "epoch": 0.6887719825642568, + "flos": 17456970105360.0, + "grad_norm": 2.3611941041322515, + "language_loss": 0.73021078, + "learning_rate": 9.327695957583803e-07, + "loss": 0.75403237, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 1.11474609, + "router_z_loss_mlp": 0.14160156, + "step": 11456, + "time_per_iteration": 2.689026355743408 + }, + { + "auxiliary_loss_clip": 0.01336794, + "auxiliary_loss_mlp": 0.01036123, + "balance_loss_clip": 1.23041463, + "balance_loss_mlp": 1.02345061, + "epoch": 0.6888321058169247, + "flos": 23074673849640.0, + "grad_norm": 1.7615881149082633, + "language_loss": 0.80872774, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83245689, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12670898, + "step": 11457, + "time_per_iteration": 2.766050338745117 + }, + { + "auxiliary_loss_clip": 0.01347547, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.23804522, + "balance_loss_mlp": 1.01716638, + "epoch": 0.6888922290695927, + "flos": 23374227696840.0, + "grad_norm": 1.693117523054795, + "language_loss": 0.76370293, + "learning_rate": 9.321109198922301e-07, + "loss": 0.78748369, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13354492, + "step": 11458, + "time_per_iteration": 2.7534983158111572 + }, + { + "auxiliary_loss_clip": 0.01342766, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.2346251, + "balance_loss_mlp": 1.01795363, + "epoch": 0.6889523523222606, + "flos": 17635115291400.0, + "grad_norm": 2.32266295901033, + "language_loss": 0.68398237, + "learning_rate": 9.31781642694603e-07, + "loss": 0.7077142, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12469482, + "step": 11459, + "time_per_iteration": 2.7303390502929688 + }, + { + "auxiliary_loss_clip": 0.01345781, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.23840499, + "balance_loss_mlp": 1.02202642, + "epoch": 0.6890124755749286, + "flos": 25233590018880.0, + "grad_norm": 1.416844045314607, + "language_loss": 0.688743, + "learning_rate": 9.314524060039221e-07, + "loss": 0.71254802, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12713623, + "step": 11460, + "time_per_iteration": 2.8734495639801025 + }, + { + "auxiliary_loss_clip": 0.01360298, + "auxiliary_loss_mlp": 0.01036035, + "balance_loss_clip": 1.24449599, + "balance_loss_mlp": 1.02095461, + "epoch": 0.6890725988275965, + "flos": 20235118734360.0, + "grad_norm": 1.7808936906731105, + "language_loss": 0.77695751, + "learning_rate": 9.311232098326731e-07, + "loss": 0.80092078, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.15075684, + "step": 11461, + "time_per_iteration": 2.859924793243408 + }, + { + "auxiliary_loss_clip": 0.01340468, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.23233211, + "balance_loss_mlp": 1.02168357, + "epoch": 0.6891327220802645, + "flos": 14539034209320.0, + "grad_norm": 1.7228953680694803, + "language_loss": 0.69770062, + "learning_rate": 9.307940541933401e-07, + "loss": 0.72145975, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.13751221, + "step": 11462, + "time_per_iteration": 2.7679357528686523 + }, + { + "auxiliary_loss_clip": 0.01349029, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.24001646, + "balance_loss_mlp": 1.01623607, + "epoch": 0.6891928453329325, + "flos": 21143712342960.0, + "grad_norm": 1.4973664552043828, + "language_loss": 0.87624979, + "learning_rate": 9.304649390984034e-07, + "loss": 0.90003061, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12823486, + "step": 11463, + "time_per_iteration": 2.843512535095215 + }, + { + "auxiliary_loss_clip": 0.01328047, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.2244494, + "balance_loss_mlp": 1.01664782, + "epoch": 0.6892529685856005, + "flos": 17863069953960.0, + "grad_norm": 1.6591344644049497, + "language_loss": 0.68676126, + "learning_rate": 9.301358645603428e-07, + "loss": 0.71032494, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.11676025, + "step": 11464, + "time_per_iteration": 2.931483030319214 + }, + { + "auxiliary_loss_clip": 0.01343817, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.23577821, + "balance_loss_mlp": 1.01790738, + "epoch": 0.6893130918382685, + "flos": 29940872778360.0, + "grad_norm": 1.8708899082685098, + "language_loss": 0.65406442, + "learning_rate": 9.298068305916373e-07, + "loss": 0.677809, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12738037, + "step": 11465, + "time_per_iteration": 2.904247999191284 + }, + { + "auxiliary_loss_clip": 0.01355851, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.2438488, + "balance_loss_mlp": 1.01739347, + "epoch": 0.6893732150909364, + "flos": 24393793618080.0, + "grad_norm": 1.3546533626571817, + "language_loss": 0.72505307, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74891758, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13208008, + "step": 11466, + "time_per_iteration": 3.012827157974243 + }, + { + "auxiliary_loss_clip": 0.01344146, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.23644054, + "balance_loss_mlp": 1.01734638, + "epoch": 0.6894333383436044, + "flos": 16987311527400.0, + "grad_norm": 1.808919932222913, + "language_loss": 0.72546315, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74920464, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12664795, + "step": 11467, + "time_per_iteration": 2.739743947982788 + }, + { + "auxiliary_loss_clip": 0.01346559, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.23513389, + "balance_loss_mlp": 1.02113152, + "epoch": 0.6894934615962723, + "flos": 18989709785280.0, + "grad_norm": 2.428081794051202, + "language_loss": 0.81339788, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83721936, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14465332, + "step": 11468, + "time_per_iteration": 2.7900500297546387 + }, + { + "auxiliary_loss_clip": 0.01351681, + "auxiliary_loss_mlp": 0.01036609, + "balance_loss_clip": 1.24263072, + "balance_loss_mlp": 1.02285242, + "epoch": 0.6895535848489404, + "flos": 34538279650920.0, + "grad_norm": 1.5470604021071495, + "language_loss": 0.66668302, + "learning_rate": 9.284911006598875e-07, + "loss": 0.69056594, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13769531, + "step": 11469, + "time_per_iteration": 4.348276615142822 + }, + { + "auxiliary_loss_clip": 0.01152777, + "auxiliary_loss_mlp": 0.01004706, + "balance_loss_clip": 1.10930455, + "balance_loss_mlp": 1.00197625, + "epoch": 0.6896137081016083, + "flos": 50089424824440.0, + "grad_norm": 0.7939013268721036, + "language_loss": 0.55195236, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57352722, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02734375, + "step": 11470, + "time_per_iteration": 4.677281141281128 + }, + { + "auxiliary_loss_clip": 0.01338856, + "auxiliary_loss_mlp": 0.01029484, + "balance_loss_clip": 1.23436165, + "balance_loss_mlp": 1.01869583, + "epoch": 0.6896738313542763, + "flos": 19943036825400.0, + "grad_norm": 2.0189086200062967, + "language_loss": 0.77860975, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80229318, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.10791016, + "step": 11471, + "time_per_iteration": 2.7076141834259033 + }, + { + "auxiliary_loss_clip": 0.01343719, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.2361654, + "balance_loss_mlp": 1.01928377, + "epoch": 0.6897339546069442, + "flos": 21730475095920.0, + "grad_norm": 1.6739849398329727, + "language_loss": 0.78735101, + "learning_rate": 9.275047298005232e-07, + "loss": 0.81110799, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12689209, + "step": 11472, + "time_per_iteration": 4.185056209564209 + }, + { + "auxiliary_loss_clip": 0.01341669, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.23544836, + "balance_loss_mlp": 1.01839268, + "epoch": 0.6897940778596122, + "flos": 19831008695400.0, + "grad_norm": 1.6903168500272552, + "language_loss": 0.76532125, + "learning_rate": 9.271760208357024e-07, + "loss": 0.789047, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12524414, + "step": 11473, + "time_per_iteration": 2.780151844024658 + }, + { + "auxiliary_loss_clip": 0.01352049, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.24024248, + "balance_loss_mlp": 1.02260256, + "epoch": 0.6898542011122801, + "flos": 17314380861480.0, + "grad_norm": 3.078077067130651, + "language_loss": 0.75962317, + "learning_rate": 9.268473525524751e-07, + "loss": 0.78351498, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14520264, + "step": 11474, + "time_per_iteration": 2.706878185272217 + }, + { + "auxiliary_loss_clip": 0.01348001, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.23979354, + "balance_loss_mlp": 1.01717412, + "epoch": 0.6899143243649482, + "flos": 24759464521320.0, + "grad_norm": 1.5040196117691393, + "language_loss": 0.74550974, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76929009, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12866211, + "step": 11475, + "time_per_iteration": 2.7753384113311768 + }, + { + "auxiliary_loss_clip": 0.01338464, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.23255754, + "balance_loss_mlp": 1.01728845, + "epoch": 0.6899744476176161, + "flos": 17238517799040.0, + "grad_norm": 2.0265523618183345, + "language_loss": 0.89081025, + "learning_rate": 9.261901380806491e-07, + "loss": 0.91449523, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12774658, + "step": 11476, + "time_per_iteration": 2.7066218852996826 + }, + { + "auxiliary_loss_clip": 0.01338482, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.23164165, + "balance_loss_mlp": 1.01873112, + "epoch": 0.6900345708702841, + "flos": 25416161516160.0, + "grad_norm": 1.5715064827416256, + "language_loss": 0.70576143, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72945917, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12561035, + "step": 11477, + "time_per_iteration": 2.9499030113220215 + }, + { + "auxiliary_loss_clip": 0.01353164, + "auxiliary_loss_mlp": 0.01035717, + "balance_loss_clip": 1.24123168, + "balance_loss_mlp": 1.02199554, + "epoch": 0.6900946941229521, + "flos": 23438598593400.0, + "grad_norm": 2.071175102786827, + "language_loss": 0.68328208, + "learning_rate": 9.255330864847313e-07, + "loss": 0.7071709, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.137146, + "step": 11478, + "time_per_iteration": 2.7494547367095947 + }, + { + "auxiliary_loss_clip": 0.01348725, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.23939359, + "balance_loss_mlp": 1.0194416, + "epoch": 0.69015481737562, + "flos": 17824387168080.0, + "grad_norm": 2.2181073304499055, + "language_loss": 0.76918674, + "learning_rate": 9.252046217963843e-07, + "loss": 0.79299903, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13067627, + "step": 11479, + "time_per_iteration": 4.294621467590332 + }, + { + "auxiliary_loss_clip": 0.01348468, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.23844707, + "balance_loss_mlp": 1.01831293, + "epoch": 0.690214940628288, + "flos": 17460909116280.0, + "grad_norm": 1.5270579937332873, + "language_loss": 0.78795302, + "learning_rate": 9.248761978643856e-07, + "loss": 0.81175238, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13153076, + "step": 11480, + "time_per_iteration": 2.830192804336548 + }, + { + "auxiliary_loss_clip": 0.01339695, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.23263407, + "balance_loss_mlp": 1.02001262, + "epoch": 0.6902750638809559, + "flos": 29572278073200.0, + "grad_norm": 1.5567100732084094, + "language_loss": 0.75610834, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77983218, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12677002, + "step": 11481, + "time_per_iteration": 2.805163621902466 + }, + { + "auxiliary_loss_clip": 0.01343109, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.23625624, + "balance_loss_mlp": 1.02169561, + "epoch": 0.690335187133624, + "flos": 25562689770960.0, + "grad_norm": 1.7311773964484851, + "language_loss": 0.69147491, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71525514, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13208008, + "step": 11482, + "time_per_iteration": 2.818099021911621 + }, + { + "auxiliary_loss_clip": 0.01340432, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.23246324, + "balance_loss_mlp": 1.01895809, + "epoch": 0.6903953103862919, + "flos": 22493108966760.0, + "grad_norm": 1.4545914133884952, + "language_loss": 0.82667184, + "learning_rate": 9.238911707310096e-07, + "loss": 0.85039252, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12677002, + "step": 11483, + "time_per_iteration": 2.785628080368042 + }, + { + "auxiliary_loss_clip": 0.01347857, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.23829079, + "balance_loss_mlp": 1.02312887, + "epoch": 0.6904554336389599, + "flos": 26105653084680.0, + "grad_norm": 1.780952725746426, + "language_loss": 0.65748918, + "learning_rate": 9.235629099489273e-07, + "loss": 0.6813184, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.11938477, + "step": 11484, + "time_per_iteration": 2.8260247707366943 + }, + { + "auxiliary_loss_clip": 0.01333386, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.22768712, + "balance_loss_mlp": 1.01864564, + "epoch": 0.6905155568916278, + "flos": 31177510321680.0, + "grad_norm": 1.9598730553279942, + "language_loss": 0.73836464, + "learning_rate": 9.232346899854479e-07, + "loss": 0.76201451, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12957764, + "step": 11485, + "time_per_iteration": 2.8629965782165527 + }, + { + "auxiliary_loss_clip": 0.013457, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.23683906, + "balance_loss_mlp": 1.02038598, + "epoch": 0.6905756801442958, + "flos": 17644211564040.0, + "grad_norm": 4.7555720563395765, + "language_loss": 0.85046542, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87425911, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1328125, + "step": 11486, + "time_per_iteration": 2.7127327919006348 + }, + { + "auxiliary_loss_clip": 0.01340935, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.23299789, + "balance_loss_mlp": 1.02102125, + "epoch": 0.6906358033969637, + "flos": 22348489304880.0, + "grad_norm": 1.4813616418123974, + "language_loss": 0.73060828, + "learning_rate": 9.225783725640786e-07, + "loss": 0.75435811, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13037109, + "step": 11487, + "time_per_iteration": 2.7832143306732178 + }, + { + "auxiliary_loss_clip": 0.01160749, + "auxiliary_loss_mlp": 0.01005656, + "balance_loss_clip": 1.11749458, + "balance_loss_mlp": 1.00305736, + "epoch": 0.6906959266496318, + "flos": 69764727775680.0, + "grad_norm": 0.9007753879153437, + "language_loss": 0.66701424, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68867826, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02600098, + "step": 11488, + "time_per_iteration": 3.266549587249756 + }, + { + "auxiliary_loss_clip": 0.01354071, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.24043703, + "balance_loss_mlp": 1.01950741, + "epoch": 0.6907560499022997, + "flos": 21439286570880.0, + "grad_norm": 2.4160727718705006, + "language_loss": 0.74931216, + "learning_rate": 9.219222185664519e-07, + "loss": 0.773193, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.14508057, + "step": 11489, + "time_per_iteration": 2.896308422088623 + }, + { + "auxiliary_loss_clip": 0.01348476, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.23775482, + "balance_loss_mlp": 1.02203596, + "epoch": 0.6908161731549677, + "flos": 14396282532000.0, + "grad_norm": 1.8806908912147615, + "language_loss": 0.62012851, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64397103, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13726807, + "step": 11490, + "time_per_iteration": 2.829284429550171 + }, + { + "auxiliary_loss_clip": 0.01346353, + "auxiliary_loss_mlp": 0.01029822, + "balance_loss_clip": 1.23751926, + "balance_loss_mlp": 1.01661944, + "epoch": 0.6908762964076357, + "flos": 20015773044120.0, + "grad_norm": 1.600187241039162, + "language_loss": 0.72911298, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75287473, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.13189697, + "step": 11491, + "time_per_iteration": 2.7472715377807617 + }, + { + "auxiliary_loss_clip": 0.01335709, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.2291286, + "balance_loss_mlp": 1.02195263, + "epoch": 0.6909364196603036, + "flos": 28775225294280.0, + "grad_norm": 1.4786433805380648, + "language_loss": 0.70269996, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72640842, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13189697, + "step": 11492, + "time_per_iteration": 2.822889804840088 + }, + { + "auxiliary_loss_clip": 0.01352932, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.23969185, + "balance_loss_mlp": 1.01871395, + "epoch": 0.6909965429129716, + "flos": 22533213045240.0, + "grad_norm": 2.0386331742522694, + "language_loss": 0.74898851, + "learning_rate": 9.206104012405049e-07, + "loss": 0.77284205, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13720703, + "step": 11493, + "time_per_iteration": 2.732909679412842 + }, + { + "auxiliary_loss_clip": 0.01337638, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.23074508, + "balance_loss_mlp": 1.01357174, + "epoch": 0.6910566661656395, + "flos": 18410581404000.0, + "grad_norm": 2.369223880852356, + "language_loss": 0.74581528, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76945823, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13085938, + "step": 11494, + "time_per_iteration": 2.757181167602539 + }, + { + "auxiliary_loss_clip": 0.01337249, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.23005152, + "balance_loss_mlp": 1.01882195, + "epoch": 0.6911167894183076, + "flos": 30780141270480.0, + "grad_norm": 1.9069798470994683, + "language_loss": 0.68701541, + "learning_rate": 9.19954738111161e-07, + "loss": 0.71070373, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12768555, + "step": 11495, + "time_per_iteration": 2.790100574493408 + }, + { + "auxiliary_loss_clip": 0.01342098, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.23397303, + "balance_loss_mlp": 1.01375949, + "epoch": 0.6911769126709755, + "flos": 13739747970600.0, + "grad_norm": 1.856643607349223, + "language_loss": 0.737921, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76160711, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12756348, + "step": 11496, + "time_per_iteration": 2.7905848026275635 + }, + { + "auxiliary_loss_clip": 0.01340627, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.23331571, + "balance_loss_mlp": 1.01545691, + "epoch": 0.6912370359236435, + "flos": 17571394128600.0, + "grad_norm": 1.5973636295951346, + "language_loss": 0.80262882, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82631427, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12451172, + "step": 11497, + "time_per_iteration": 2.698430299758911 + }, + { + "auxiliary_loss_clip": 0.0135021, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.24027705, + "balance_loss_mlp": 1.01967824, + "epoch": 0.6912971591763114, + "flos": 22095861740640.0, + "grad_norm": 1.4711419513506832, + "language_loss": 0.81413245, + "learning_rate": 9.189715506138993e-07, + "loss": 0.8379606, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12915039, + "step": 11498, + "time_per_iteration": 2.767021894454956 + }, + { + "auxiliary_loss_clip": 0.01332171, + "auxiliary_loss_mlp": 0.01032951, + "balance_loss_clip": 1.22760677, + "balance_loss_mlp": 1.01982021, + "epoch": 0.6913572824289794, + "flos": 29977565754600.0, + "grad_norm": 5.301278286195011, + "language_loss": 0.86083937, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88449061, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.13134766, + "step": 11499, + "time_per_iteration": 2.832481622695923 + }, + { + "auxiliary_loss_clip": 0.01329385, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.22507787, + "balance_loss_mlp": 1.01696539, + "epoch": 0.6914174056816473, + "flos": 20453814690840.0, + "grad_norm": 1.5829009833733259, + "language_loss": 0.75592601, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77951407, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.12457275, + "step": 11500, + "time_per_iteration": 2.7379090785980225 + }, + { + "auxiliary_loss_clip": 0.01337459, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.22898602, + "balance_loss_mlp": 1.01642179, + "epoch": 0.6914775289343154, + "flos": 21286788887160.0, + "grad_norm": 2.625171201191125, + "language_loss": 0.77932286, + "learning_rate": 9.179887320509921e-07, + "loss": 0.80299348, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1317749, + "step": 11501, + "time_per_iteration": 2.9880526065826416 + }, + { + "auxiliary_loss_clip": 0.01347387, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.23671317, + "balance_loss_mlp": 1.01928532, + "epoch": 0.6915376521869833, + "flos": 23883381227880.0, + "grad_norm": 1.895935417512027, + "language_loss": 0.73482001, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75861555, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12890625, + "step": 11502, + "time_per_iteration": 2.771143674850464 + }, + { + "auxiliary_loss_clip": 0.01342961, + "auxiliary_loss_mlp": 0.0103159, + "balance_loss_clip": 1.23230577, + "balance_loss_mlp": 1.01746941, + "epoch": 0.6915977754396513, + "flos": 11514024403200.0, + "grad_norm": 2.1742916456877284, + "language_loss": 0.73543477, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75918031, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14111328, + "step": 11503, + "time_per_iteration": 2.8050947189331055 + }, + { + "auxiliary_loss_clip": 0.01337803, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.23029459, + "balance_loss_mlp": 1.02077866, + "epoch": 0.6916578986923193, + "flos": 22606395955920.0, + "grad_norm": 1.6675024157914327, + "language_loss": 0.77325332, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79696441, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12524414, + "step": 11504, + "time_per_iteration": 2.759187698364258 + }, + { + "auxiliary_loss_clip": 0.01338527, + "auxiliary_loss_mlp": 0.01027751, + "balance_loss_clip": 1.2299087, + "balance_loss_mlp": 1.01498938, + "epoch": 0.6917180219449872, + "flos": 23482641682800.0, + "grad_norm": 1.7938323216444096, + "language_loss": 0.73978007, + "learning_rate": 9.166788817780499e-07, + "loss": 0.76344281, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12756348, + "step": 11505, + "time_per_iteration": 2.7975096702575684 + }, + { + "auxiliary_loss_clip": 0.01332458, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.22638726, + "balance_loss_mlp": 1.02053797, + "epoch": 0.6917781451976552, + "flos": 23737827573720.0, + "grad_norm": 1.7507548074464143, + "language_loss": 0.87831008, + "learning_rate": 9.163515218778886e-07, + "loss": 0.9019717, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13165283, + "step": 11506, + "time_per_iteration": 2.7725777626037598 + }, + { + "auxiliary_loss_clip": 0.01336408, + "auxiliary_loss_mlp": 0.01024877, + "balance_loss_clip": 1.23054361, + "balance_loss_mlp": 1.01197827, + "epoch": 0.6918382684503231, + "flos": 31472962724520.0, + "grad_norm": 1.9675817304876264, + "language_loss": 0.70596761, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72958046, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12902832, + "step": 11507, + "time_per_iteration": 2.8936195373535156 + }, + { + "auxiliary_loss_clip": 0.01339558, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.2295475, + "balance_loss_mlp": 1.01874518, + "epoch": 0.6918983917029912, + "flos": 21655180550520.0, + "grad_norm": 1.9361570805660915, + "language_loss": 0.77309823, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79680908, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12774658, + "step": 11508, + "time_per_iteration": 4.319936513900757 + }, + { + "auxiliary_loss_clip": 0.01329034, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.22541368, + "balance_loss_mlp": 1.01862693, + "epoch": 0.6919585149556591, + "flos": 25554730532400.0, + "grad_norm": 1.5171783810046664, + "language_loss": 0.7473098, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77090406, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.11779785, + "step": 11509, + "time_per_iteration": 4.349585294723511 + }, + { + "auxiliary_loss_clip": 0.01335955, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.22963893, + "balance_loss_mlp": 1.01647878, + "epoch": 0.6920186382083271, + "flos": 23665091355000.0, + "grad_norm": 1.5545857509032188, + "language_loss": 0.64227283, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66592383, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12670898, + "step": 11510, + "time_per_iteration": 2.744497060775757 + }, + { + "auxiliary_loss_clip": 0.01347566, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.23613834, + "balance_loss_mlp": 1.02315545, + "epoch": 0.692078761460995, + "flos": 19066588056720.0, + "grad_norm": 1.530047246604342, + "language_loss": 0.75547707, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77932191, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13751221, + "step": 11511, + "time_per_iteration": 4.215382814407349 + }, + { + "auxiliary_loss_clip": 0.01338231, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.23253679, + "balance_loss_mlp": 1.01850379, + "epoch": 0.692138884713663, + "flos": 29029314759480.0, + "grad_norm": 2.3043679603557683, + "language_loss": 0.62388039, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64756906, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12139893, + "step": 11512, + "time_per_iteration": 2.9650957584381104 + }, + { + "auxiliary_loss_clip": 0.01339147, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.23136055, + "balance_loss_mlp": 1.02066934, + "epoch": 0.6921990079663309, + "flos": 14762075260320.0, + "grad_norm": 1.6841790051796837, + "language_loss": 0.83376396, + "learning_rate": 9.140611538493666e-07, + "loss": 0.8574959, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1338501, + "step": 11513, + "time_per_iteration": 2.7137792110443115 + }, + { + "auxiliary_loss_clip": 0.01332803, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.22751009, + "balance_loss_mlp": 1.01972389, + "epoch": 0.692259131218999, + "flos": 23847134943600.0, + "grad_norm": 1.4271528272023044, + "language_loss": 0.78491575, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80855465, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11364746, + "step": 11514, + "time_per_iteration": 2.780233144760132 + }, + { + "auxiliary_loss_clip": 0.01346722, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.23569822, + "balance_loss_mlp": 1.02257133, + "epoch": 0.6923192544716669, + "flos": 19139730359040.0, + "grad_norm": 1.8461967994317814, + "language_loss": 0.7554608, + "learning_rate": 9.134071334081907e-07, + "loss": 0.77927971, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.1260376, + "step": 11515, + "time_per_iteration": 2.7232487201690674 + }, + { + "auxiliary_loss_clip": 0.01334451, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.23117471, + "balance_loss_mlp": 1.0192883, + "epoch": 0.6923793777243349, + "flos": 28080535855680.0, + "grad_norm": 1.9170904153394985, + "language_loss": 0.53817225, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56182587, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.11627197, + "step": 11516, + "time_per_iteration": 2.813985586166382 + }, + { + "auxiliary_loss_clip": 0.01325533, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.2243495, + "balance_loss_mlp": 1.01813269, + "epoch": 0.6924395009770029, + "flos": 16585597381680.0, + "grad_norm": 1.5927772199465045, + "language_loss": 0.73590922, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75947452, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.12854004, + "step": 11517, + "time_per_iteration": 2.722513437271118 + }, + { + "auxiliary_loss_clip": 0.01338365, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.22961581, + "balance_loss_mlp": 1.02418375, + "epoch": 0.6924996242296708, + "flos": 16659795501360.0, + "grad_norm": 1.8293881566087007, + "language_loss": 0.76523167, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78898805, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13092041, + "step": 11518, + "time_per_iteration": 4.223195314407349 + }, + { + "auxiliary_loss_clip": 0.01344973, + "auxiliary_loss_mlp": 0.01040307, + "balance_loss_clip": 1.23262405, + "balance_loss_mlp": 1.02600765, + "epoch": 0.6925597474823388, + "flos": 34761280093560.0, + "grad_norm": 1.3608947621028704, + "language_loss": 0.64772964, + "learning_rate": 9.120995870695376e-07, + "loss": 0.67158246, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14318848, + "step": 11519, + "time_per_iteration": 2.9682602882385254 + }, + { + "auxiliary_loss_clip": 0.01337719, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.22949815, + "balance_loss_mlp": 1.02345407, + "epoch": 0.6926198707350067, + "flos": 21876759700560.0, + "grad_norm": 2.173607370458649, + "language_loss": 0.63147813, + "learning_rate": 9.117728035871212e-07, + "loss": 0.65521646, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12664795, + "step": 11520, + "time_per_iteration": 2.8314058780670166 + }, + { + "auxiliary_loss_clip": 0.01351878, + "auxiliary_loss_mlp": 0.01041879, + "balance_loss_clip": 1.23757982, + "balance_loss_mlp": 1.027812, + "epoch": 0.6926799939876748, + "flos": 13010233540320.0, + "grad_norm": 2.780993713071533, + "language_loss": 0.78242147, + "learning_rate": 9.114460613703887e-07, + "loss": 0.80635905, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.140625, + "step": 11521, + "time_per_iteration": 2.8357346057891846 + }, + { + "auxiliary_loss_clip": 0.01342903, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.23286712, + "balance_loss_mlp": 1.0188483, + "epoch": 0.6927401172403427, + "flos": 16765285685400.0, + "grad_norm": 1.7427697363408416, + "language_loss": 0.82160497, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84535897, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.13641357, + "step": 11522, + "time_per_iteration": 2.7616324424743652 + }, + { + "auxiliary_loss_clip": 0.01334471, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_clip": 1.22841001, + "balance_loss_mlp": 1.02008009, + "epoch": 0.6928002404930107, + "flos": 25711573310640.0, + "grad_norm": 1.5253217510692756, + "language_loss": 0.76963437, + "learning_rate": 9.107927007835361e-07, + "loss": 0.7933026, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.1227417, + "step": 11523, + "time_per_iteration": 2.795623302459717 + }, + { + "auxiliary_loss_clip": 0.01336747, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.23106968, + "balance_loss_mlp": 1.01986945, + "epoch": 0.6928603637456786, + "flos": 18592909251120.0, + "grad_norm": 2.0391774826731144, + "language_loss": 0.69158047, + "learning_rate": 9.104660824381915e-07, + "loss": 0.71526736, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12060547, + "step": 11524, + "time_per_iteration": 2.9087395668029785 + }, + { + "auxiliary_loss_clip": 0.0134673, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.23666668, + "balance_loss_mlp": 1.02112925, + "epoch": 0.6929204869983466, + "flos": 22206062494440.0, + "grad_norm": 1.6601659195828276, + "language_loss": 0.65058005, + "learning_rate": 9.101395054080815e-07, + "loss": 0.67439556, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13690186, + "step": 11525, + "time_per_iteration": 2.7658960819244385 + }, + { + "auxiliary_loss_clip": 0.01337982, + "auxiliary_loss_mlp": 0.01041919, + "balance_loss_clip": 1.23082256, + "balance_loss_mlp": 1.02929485, + "epoch": 0.6929806102510145, + "flos": 17899275629880.0, + "grad_norm": 2.1642306381679335, + "language_loss": 0.70701689, + "learning_rate": 9.098129697055907e-07, + "loss": 0.73081589, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.1262207, + "step": 11526, + "time_per_iteration": 2.780810832977295 + }, + { + "auxiliary_loss_clip": 0.01332957, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.22711074, + "balance_loss_mlp": 1.02005935, + "epoch": 0.6930407335036826, + "flos": 19759896811080.0, + "grad_norm": 2.116697725094875, + "language_loss": 0.76471603, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78836697, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12078857, + "step": 11527, + "time_per_iteration": 2.7876603603363037 + }, + { + "auxiliary_loss_clip": 0.01337847, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.23076105, + "balance_loss_mlp": 1.01901686, + "epoch": 0.6931008567563505, + "flos": 21549365499600.0, + "grad_norm": 1.5997002944651342, + "language_loss": 0.7957288, + "learning_rate": 9.091600223329952e-07, + "loss": 0.8194145, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11706543, + "step": 11528, + "time_per_iteration": 2.8429417610168457 + }, + { + "auxiliary_loss_clip": 0.01323387, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.22055459, + "balance_loss_mlp": 1.01903439, + "epoch": 0.6931609800090185, + "flos": 26255633050080.0, + "grad_norm": 2.5964113134160014, + "language_loss": 0.76092756, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78447169, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.11993408, + "step": 11529, + "time_per_iteration": 2.8304364681243896 + }, + { + "auxiliary_loss_clip": 0.01329472, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.22617054, + "balance_loss_mlp": 1.02079332, + "epoch": 0.6932211032616865, + "flos": 32349817576800.0, + "grad_norm": 1.5962894742861018, + "language_loss": 0.72444332, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74806601, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.12005615, + "step": 11530, + "time_per_iteration": 2.792081356048584 + }, + { + "auxiliary_loss_clip": 0.01351039, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.23757696, + "balance_loss_mlp": 1.02082276, + "epoch": 0.6932812265143544, + "flos": 22053483594000.0, + "grad_norm": 2.0001191687224518, + "language_loss": 0.78270388, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80657113, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.14855957, + "step": 11531, + "time_per_iteration": 2.7939369678497314 + }, + { + "auxiliary_loss_clip": 0.01330376, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.22564876, + "balance_loss_mlp": 1.02252221, + "epoch": 0.6933413497670224, + "flos": 26263104988320.0, + "grad_norm": 1.4869023834042252, + "language_loss": 0.69649267, + "learning_rate": 9.078546240639484e-07, + "loss": 0.72013247, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.11065674, + "step": 11532, + "time_per_iteration": 2.911153793334961 + }, + { + "auxiliary_loss_clip": 0.01339787, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.2321645, + "balance_loss_mlp": 1.02174687, + "epoch": 0.6934014730196904, + "flos": 19577731397400.0, + "grad_norm": 1.3120931483206153, + "language_loss": 0.66894013, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69268918, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13360596, + "step": 11533, + "time_per_iteration": 2.796018600463867 + }, + { + "auxiliary_loss_clip": 0.0134023, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.23089457, + "balance_loss_mlp": 1.02489042, + "epoch": 0.6934615962723584, + "flos": 22123296010800.0, + "grad_norm": 2.3154934392988533, + "language_loss": 0.58917719, + "learning_rate": 9.072021733655007e-07, + "loss": 0.61295521, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.12658691, + "step": 11534, + "time_per_iteration": 2.721463680267334 + }, + { + "auxiliary_loss_clip": 0.01336509, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.22921336, + "balance_loss_mlp": 1.01536465, + "epoch": 0.6935217195250263, + "flos": 21365778793320.0, + "grad_norm": 2.07329109000411, + "language_loss": 0.71101224, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73465943, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.1282959, + "step": 11535, + "time_per_iteration": 2.767731189727783 + }, + { + "auxiliary_loss_clip": 0.01158757, + "auxiliary_loss_mlp": 0.01006425, + "balance_loss_clip": 1.11428809, + "balance_loss_mlp": 1.00380254, + "epoch": 0.6935818427776943, + "flos": 64081028800440.0, + "grad_norm": 0.717651533118849, + "language_loss": 0.59088266, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61253452, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02624512, + "step": 11536, + "time_per_iteration": 3.3823938369750977 + }, + { + "auxiliary_loss_clip": 0.01342399, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.23107934, + "balance_loss_mlp": 1.02297831, + "epoch": 0.6936419660303622, + "flos": 20307367652760.0, + "grad_norm": 1.579990754809715, + "language_loss": 0.7303918, + "learning_rate": 9.062238081412692e-07, + "loss": 0.75418329, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13775635, + "step": 11537, + "time_per_iteration": 2.7224693298339844 + }, + { + "auxiliary_loss_clip": 0.01156915, + "auxiliary_loss_mlp": 0.01010844, + "balance_loss_clip": 1.11286771, + "balance_loss_mlp": 1.00818527, + "epoch": 0.6937020892830302, + "flos": 67197721948200.0, + "grad_norm": 0.7463772127215501, + "language_loss": 0.55645835, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57813591, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02661133, + "step": 11538, + "time_per_iteration": 3.2200706005096436 + }, + { + "auxiliary_loss_clip": 0.01322495, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.22165632, + "balance_loss_mlp": 1.01792359, + "epoch": 0.6937622125356981, + "flos": 23883462444600.0, + "grad_norm": 1.425571791755986, + "language_loss": 0.77844876, + "learning_rate": 9.055717720183505e-07, + "loss": 0.80196762, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.11462402, + "step": 11539, + "time_per_iteration": 2.7441248893737793 + }, + { + "auxiliary_loss_clip": 0.01332258, + "auxiliary_loss_mlp": 0.01027338, + "balance_loss_clip": 1.22764063, + "balance_loss_mlp": 1.01547027, + "epoch": 0.6938223357883662, + "flos": 28736501900040.0, + "grad_norm": 1.692599861311078, + "language_loss": 0.64321864, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66681457, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11871338, + "step": 11540, + "time_per_iteration": 2.8489482402801514 + }, + { + "auxiliary_loss_clip": 0.01334358, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.22865558, + "balance_loss_mlp": 1.01781809, + "epoch": 0.6938824590410341, + "flos": 28660557620880.0, + "grad_norm": 2.0484203379379977, + "language_loss": 0.86889511, + "learning_rate": 9.049199018987437e-07, + "loss": 0.89254099, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12408447, + "step": 11541, + "time_per_iteration": 2.8422224521636963 + }, + { + "auxiliary_loss_clip": 0.01338949, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.23048878, + "balance_loss_mlp": 1.01864982, + "epoch": 0.6939425822937021, + "flos": 18986907808440.0, + "grad_norm": 1.6873546411805032, + "language_loss": 0.84373111, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86743653, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.1295166, + "step": 11542, + "time_per_iteration": 2.7591590881347656 + }, + { + "auxiliary_loss_clip": 0.01341591, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.23297, + "balance_loss_mlp": 1.01667416, + "epoch": 0.6940027055463701, + "flos": 23081049362160.0, + "grad_norm": 1.847810831319629, + "language_loss": 0.7563867, + "learning_rate": 9.04268197881323e-07, + "loss": 0.78010392, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13458252, + "step": 11543, + "time_per_iteration": 2.8879027366638184 + }, + { + "auxiliary_loss_clip": 0.01339448, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.23272538, + "balance_loss_mlp": 1.02092123, + "epoch": 0.694062828799038, + "flos": 18191113888680.0, + "grad_norm": 1.8653582289242572, + "language_loss": 0.76337653, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78710389, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12390137, + "step": 11544, + "time_per_iteration": 2.7415077686309814 + }, + { + "auxiliary_loss_clip": 0.01339918, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.23139954, + "balance_loss_mlp": 1.01705074, + "epoch": 0.694122952051706, + "flos": 17826011502480.0, + "grad_norm": 2.2891688010918467, + "language_loss": 0.71594977, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73964733, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12780762, + "step": 11545, + "time_per_iteration": 2.8394227027893066 + }, + { + "auxiliary_loss_clip": 0.01325747, + "auxiliary_loss_mlp": 0.0102771, + "balance_loss_clip": 1.2232945, + "balance_loss_mlp": 1.01653409, + "epoch": 0.694183075304374, + "flos": 21220306355880.0, + "grad_norm": 1.8120906441082736, + "language_loss": 0.79650575, + "learning_rate": 9.0329095351302e-07, + "loss": 0.82004035, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.11181641, + "step": 11546, + "time_per_iteration": 4.19689154624939 + }, + { + "auxiliary_loss_clip": 0.01336423, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.23060107, + "balance_loss_mlp": 1.0179019, + "epoch": 0.694243198557042, + "flos": 24065790291720.0, + "grad_norm": 1.2971827421518916, + "language_loss": 0.78728521, + "learning_rate": 9.029652885484194e-07, + "loss": 0.81095374, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12512207, + "step": 11547, + "time_per_iteration": 2.7981107234954834 + }, + { + "auxiliary_loss_clip": 0.01330891, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.22548068, + "balance_loss_mlp": 1.02211666, + "epoch": 0.6943033218097099, + "flos": 21146798578320.0, + "grad_norm": 2.068119699433738, + "language_loss": 0.80531675, + "learning_rate": 9.026396651834834e-07, + "loss": 0.82897395, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12713623, + "step": 11548, + "time_per_iteration": 4.391593933105469 + }, + { + "auxiliary_loss_clip": 0.01157665, + "auxiliary_loss_mlp": 0.01007206, + "balance_loss_clip": 1.11380184, + "balance_loss_mlp": 1.00477374, + "epoch": 0.6943634450623779, + "flos": 57826752826680.0, + "grad_norm": 0.6906787746385485, + "language_loss": 0.53844678, + "learning_rate": 9.023140834305613e-07, + "loss": 0.56009555, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02429199, + "step": 11549, + "time_per_iteration": 4.635455131530762 + }, + { + "auxiliary_loss_clip": 0.01333747, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.22724605, + "balance_loss_mlp": 1.01643157, + "epoch": 0.6944235683150458, + "flos": 30596716997640.0, + "grad_norm": 1.3991644968813985, + "language_loss": 0.73867738, + "learning_rate": 9.01988543302e-07, + "loss": 0.76230997, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13067627, + "step": 11550, + "time_per_iteration": 2.936220169067383 + }, + { + "auxiliary_loss_clip": 0.01342648, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.23378992, + "balance_loss_mlp": 1.02404249, + "epoch": 0.6944836915677138, + "flos": 19724259652200.0, + "grad_norm": 1.7859339900850908, + "language_loss": 0.74485391, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76865882, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13787842, + "step": 11551, + "time_per_iteration": 2.8918893337249756 + }, + { + "auxiliary_loss_clip": 0.01338556, + "auxiliary_loss_mlp": 0.01036771, + "balance_loss_clip": 1.23070943, + "balance_loss_mlp": 1.02349734, + "epoch": 0.6945438148203817, + "flos": 24869015541360.0, + "grad_norm": 1.4490602994126234, + "language_loss": 0.84528124, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86903453, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1328125, + "step": 11552, + "time_per_iteration": 2.882082939147949 + }, + { + "auxiliary_loss_clip": 0.01342113, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.23523712, + "balance_loss_mlp": 1.01996005, + "epoch": 0.6946039380730498, + "flos": 33332974780320.0, + "grad_norm": 1.59474403769859, + "language_loss": 0.67562759, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69937027, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12194824, + "step": 11553, + "time_per_iteration": 2.8616185188293457 + }, + { + "auxiliary_loss_clip": 0.01352896, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.24075162, + "balance_loss_mlp": 1.02138972, + "epoch": 0.6946640613257177, + "flos": 20855975528520.0, + "grad_norm": 1.4887719710857334, + "language_loss": 0.79681188, + "learning_rate": 9.006867992782195e-07, + "loss": 0.82069468, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14013672, + "step": 11554, + "time_per_iteration": 2.7448253631591797 + }, + { + "auxiliary_loss_clip": 0.01342443, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.23390627, + "balance_loss_mlp": 1.01781952, + "epoch": 0.6947241845783857, + "flos": 19359644566320.0, + "grad_norm": 1.671224249029957, + "language_loss": 0.72694063, + "learning_rate": 9.003614674565934e-07, + "loss": 0.75067288, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12969971, + "step": 11555, + "time_per_iteration": 2.7201175689697266 + }, + { + "auxiliary_loss_clip": 0.01338063, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.23057723, + "balance_loss_mlp": 1.01965261, + "epoch": 0.6947843078310536, + "flos": 27125259614280.0, + "grad_norm": 1.6799544988636996, + "language_loss": 0.78022337, + "learning_rate": 9.000361773333705e-07, + "loss": 0.80392486, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12451172, + "step": 11556, + "time_per_iteration": 2.8103156089782715 + }, + { + "auxiliary_loss_clip": 0.01336381, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.22896242, + "balance_loss_mlp": 1.01981664, + "epoch": 0.6948444310837216, + "flos": 28591029462600.0, + "grad_norm": 2.916695713891784, + "language_loss": 0.60131907, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62500471, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12353516, + "step": 11557, + "time_per_iteration": 4.2642927169799805 + }, + { + "auxiliary_loss_clip": 0.01331078, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.22811031, + "balance_loss_mlp": 1.01872671, + "epoch": 0.6949045543363896, + "flos": 15673511454120.0, + "grad_norm": 1.8351396129099387, + "language_loss": 0.85785788, + "learning_rate": 8.993857222314752e-07, + "loss": 0.88147897, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.12298584, + "step": 11558, + "time_per_iteration": 2.6812264919281006 + }, + { + "auxiliary_loss_clip": 0.01349092, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.23932791, + "balance_loss_mlp": 1.02566385, + "epoch": 0.6949646775890576, + "flos": 23264798501880.0, + "grad_norm": 1.700628086270935, + "language_loss": 0.70306647, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72694874, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13470459, + "step": 11559, + "time_per_iteration": 2.9619686603546143 + }, + { + "auxiliary_loss_clip": 0.01338772, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.23338068, + "balance_loss_mlp": 1.01835966, + "epoch": 0.6950248008417256, + "flos": 22387659391080.0, + "grad_norm": 1.5807249917403656, + "language_loss": 0.79243386, + "learning_rate": 8.987354340711921e-07, + "loss": 0.81612802, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12286377, + "step": 11560, + "time_per_iteration": 2.7973217964172363 + }, + { + "auxiliary_loss_clip": 0.01335, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.23014116, + "balance_loss_mlp": 1.0202843, + "epoch": 0.6950849240943935, + "flos": 23482641682800.0, + "grad_norm": 1.555390261289192, + "language_loss": 0.76952761, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79320085, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12030029, + "step": 11561, + "time_per_iteration": 2.8179454803466797 + }, + { + "auxiliary_loss_clip": 0.01337176, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.23144388, + "balance_loss_mlp": 1.02112103, + "epoch": 0.6951450473470615, + "flos": 17425028307240.0, + "grad_norm": 1.787140651905902, + "language_loss": 0.78550667, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80922341, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1338501, + "step": 11562, + "time_per_iteration": 2.8656532764434814 + }, + { + "auxiliary_loss_clip": 0.01342495, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.23378682, + "balance_loss_mlp": 1.01897776, + "epoch": 0.6952051705997294, + "flos": 20490670100520.0, + "grad_norm": 1.9261039058058096, + "language_loss": 0.69883275, + "learning_rate": 8.977603150620515e-07, + "loss": 0.72257435, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12689209, + "step": 11563, + "time_per_iteration": 2.773232936859131 + }, + { + "auxiliary_loss_clip": 0.01329943, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.22699225, + "balance_loss_mlp": 1.01998353, + "epoch": 0.6952652938523974, + "flos": 13993390743840.0, + "grad_norm": 2.06390496450356, + "language_loss": 0.74218136, + "learning_rate": 8.974353589699846e-07, + "loss": 0.76580298, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.12243652, + "step": 11564, + "time_per_iteration": 2.827765941619873 + }, + { + "auxiliary_loss_clip": 0.01367735, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.24858284, + "balance_loss_mlp": 1.0210253, + "epoch": 0.6953254171050653, + "flos": 30959829574200.0, + "grad_norm": 1.8876612878333756, + "language_loss": 0.72287583, + "learning_rate": 8.971104446872785e-07, + "loss": 0.74691486, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.15124512, + "step": 11565, + "time_per_iteration": 2.8232505321502686 + }, + { + "auxiliary_loss_clip": 0.01159487, + "auxiliary_loss_mlp": 0.01006825, + "balance_loss_clip": 1.11687768, + "balance_loss_mlp": 1.00466752, + "epoch": 0.6953855403577334, + "flos": 61684290137160.0, + "grad_norm": 0.9055260858969226, + "language_loss": 0.58436805, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60603118, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02160645, + "step": 11566, + "time_per_iteration": 3.1167984008789062 + }, + { + "auxiliary_loss_clip": 0.01341528, + "auxiliary_loss_mlp": 0.01028108, + "balance_loss_clip": 1.23127723, + "balance_loss_mlp": 1.01455355, + "epoch": 0.6954456636104013, + "flos": 23044193952480.0, + "grad_norm": 1.8894364455045891, + "language_loss": 0.74225426, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76595062, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13568115, + "step": 11567, + "time_per_iteration": 2.83156156539917 + }, + { + "auxiliary_loss_clip": 0.01331833, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.22732544, + "balance_loss_mlp": 1.01907063, + "epoch": 0.6955057868630693, + "flos": 23924297473560.0, + "grad_norm": 1.2702119870572715, + "language_loss": 0.76705056, + "learning_rate": 8.961359528185313e-07, + "loss": 0.7906903, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.13061523, + "step": 11568, + "time_per_iteration": 2.967029333114624 + }, + { + "auxiliary_loss_clip": 0.01337907, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.23277998, + "balance_loss_mlp": 1.02290082, + "epoch": 0.6955659101157372, + "flos": 22598883409320.0, + "grad_norm": 1.7669456129331564, + "language_loss": 0.73009074, + "learning_rate": 8.958112058964649e-07, + "loss": 0.75382501, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.12628174, + "step": 11569, + "time_per_iteration": 2.9446492195129395 + }, + { + "auxiliary_loss_clip": 0.01339803, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.23147583, + "balance_loss_mlp": 1.01741099, + "epoch": 0.6956260333684052, + "flos": 24578030058120.0, + "grad_norm": 1.4338847297465633, + "language_loss": 0.77075028, + "learning_rate": 8.954865008453471e-07, + "loss": 0.7944507, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12841797, + "step": 11570, + "time_per_iteration": 2.8364815711975098 + }, + { + "auxiliary_loss_clip": 0.01344237, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.23466921, + "balance_loss_mlp": 1.0186969, + "epoch": 0.6956861566210732, + "flos": 25851279360960.0, + "grad_norm": 1.9816327327347887, + "language_loss": 0.75275648, + "learning_rate": 8.95161837677493e-07, + "loss": 0.77651441, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.128479, + "step": 11571, + "time_per_iteration": 2.8791754245758057 + }, + { + "auxiliary_loss_clip": 0.01333806, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.23207712, + "balance_loss_mlp": 1.02362227, + "epoch": 0.6957462798737412, + "flos": 15305201007480.0, + "grad_norm": 1.7179696096061814, + "language_loss": 0.74541938, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76911664, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.1229248, + "step": 11572, + "time_per_iteration": 2.742342233657837 + }, + { + "auxiliary_loss_clip": 0.01337233, + "auxiliary_loss_mlp": 0.01026606, + "balance_loss_clip": 1.22908354, + "balance_loss_mlp": 1.0137434, + "epoch": 0.6958064031264092, + "flos": 36253712653200.0, + "grad_norm": 1.870110139130569, + "language_loss": 0.69928807, + "learning_rate": 8.94512637040814e-07, + "loss": 0.7229265, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12860107, + "step": 11573, + "time_per_iteration": 2.912428617477417 + }, + { + "auxiliary_loss_clip": 0.0135617, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.24558723, + "balance_loss_mlp": 1.02272582, + "epoch": 0.6958665263790771, + "flos": 19213481786760.0, + "grad_norm": 1.671295941173155, + "language_loss": 0.74824405, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77216774, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13470459, + "step": 11574, + "time_per_iteration": 2.7796614170074463 + }, + { + "auxiliary_loss_clip": 0.0134543, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.23474586, + "balance_loss_mlp": 1.02161324, + "epoch": 0.6959266496317451, + "flos": 21800490554520.0, + "grad_norm": 1.6694832919258586, + "language_loss": 0.75062585, + "learning_rate": 8.938636040849014e-07, + "loss": 0.77442503, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12866211, + "step": 11575, + "time_per_iteration": 2.771998167037964 + }, + { + "auxiliary_loss_clip": 0.01337264, + "auxiliary_loss_mlp": 0.01027143, + "balance_loss_clip": 1.22946966, + "balance_loss_mlp": 1.01392865, + "epoch": 0.695986772884413, + "flos": 20563122060720.0, + "grad_norm": 1.8515916368395173, + "language_loss": 0.78977978, + "learning_rate": 8.935391505179966e-07, + "loss": 0.81342381, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13201904, + "step": 11576, + "time_per_iteration": 2.765413999557495 + }, + { + "auxiliary_loss_clip": 0.01343266, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.23195159, + "balance_loss_mlp": 1.02274776, + "epoch": 0.696046896137081, + "flos": 14939936187840.0, + "grad_norm": 3.9188906454089807, + "language_loss": 0.57565761, + "learning_rate": 8.932147389081985e-07, + "loss": 0.59944278, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12493896, + "step": 11577, + "time_per_iteration": 2.722519636154175 + }, + { + "auxiliary_loss_clip": 0.01330648, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.22833753, + "balance_loss_mlp": 1.01648116, + "epoch": 0.696107019389749, + "flos": 30746737571400.0, + "grad_norm": 1.7388732466765706, + "language_loss": 0.76980311, + "learning_rate": 8.928903692678081e-07, + "loss": 0.79338688, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.11242676, + "step": 11578, + "time_per_iteration": 2.906541347503662 + }, + { + "auxiliary_loss_clip": 0.01342437, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.23489261, + "balance_loss_mlp": 1.02399468, + "epoch": 0.696167142642417, + "flos": 20781574367040.0, + "grad_norm": 1.722265594777811, + "language_loss": 0.80135864, + "learning_rate": 8.925660416091254e-07, + "loss": 0.82514781, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12493896, + "step": 11579, + "time_per_iteration": 2.7493979930877686 + }, + { + "auxiliary_loss_clip": 0.01333173, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.22782707, + "balance_loss_mlp": 1.01394725, + "epoch": 0.6962272658950849, + "flos": 22570312105080.0, + "grad_norm": 2.0829061952564554, + "language_loss": 0.72605056, + "learning_rate": 8.922417559444502e-07, + "loss": 0.7496475, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12579346, + "step": 11580, + "time_per_iteration": 2.914318799972534 + }, + { + "auxiliary_loss_clip": 0.01346897, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.23665166, + "balance_loss_mlp": 1.02175069, + "epoch": 0.6962873891477529, + "flos": 22205250327240.0, + "grad_norm": 1.8481980917107086, + "language_loss": 0.65988743, + "learning_rate": 8.919175122860787e-07, + "loss": 0.68371063, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 1.10302734, + "router_z_loss_mlp": 0.13671875, + "step": 11581, + "time_per_iteration": 2.882551431655884 + }, + { + "auxiliary_loss_clip": 0.01339709, + "auxiliary_loss_mlp": 0.01034664, + "balance_loss_clip": 1.2320776, + "balance_loss_mlp": 1.02250516, + "epoch": 0.6963475124004208, + "flos": 12492836512200.0, + "grad_norm": 1.8980759266809069, + "language_loss": 0.76677394, + "learning_rate": 8.915933106463056e-07, + "loss": 0.79051769, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12164307, + "step": 11582, + "time_per_iteration": 2.8381996154785156 + }, + { + "auxiliary_loss_clip": 0.01337209, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.23023415, + "balance_loss_mlp": 1.0206269, + "epoch": 0.6964076356530888, + "flos": 17169517549440.0, + "grad_norm": 1.9604610346443048, + "language_loss": 0.69948721, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72318399, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.11828613, + "step": 11583, + "time_per_iteration": 2.932244300842285 + }, + { + "auxiliary_loss_clip": 0.01339985, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.23332024, + "balance_loss_mlp": 1.022753, + "epoch": 0.6964677589057569, + "flos": 19942021616400.0, + "grad_norm": 1.5908036348483938, + "language_loss": 0.82832015, + "learning_rate": 8.909450334717301e-07, + "loss": 0.85207725, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12982178, + "step": 11584, + "time_per_iteration": 2.85951566696167 + }, + { + "auxiliary_loss_clip": 0.01345718, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.23631132, + "balance_loss_mlp": 1.02243674, + "epoch": 0.6965278821584248, + "flos": 22789251711720.0, + "grad_norm": 6.511908776850645, + "language_loss": 0.80554843, + "learning_rate": 8.906209579615107e-07, + "loss": 0.82936335, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13348389, + "step": 11585, + "time_per_iteration": 4.170030355453491 + }, + { + "auxiliary_loss_clip": 0.01334488, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.23077774, + "balance_loss_mlp": 1.01946366, + "epoch": 0.6965880054110928, + "flos": 20052384803640.0, + "grad_norm": 1.6527293003543277, + "language_loss": 0.77946401, + "learning_rate": 8.90296924519055e-07, + "loss": 0.80312347, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.12005615, + "step": 11586, + "time_per_iteration": 4.276267766952515 + }, + { + "auxiliary_loss_clip": 0.01328707, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.2273463, + "balance_loss_mlp": 1.01878476, + "epoch": 0.6966481286637607, + "flos": 21913290243360.0, + "grad_norm": 1.69042786620123, + "language_loss": 0.78418744, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80777752, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.11517334, + "step": 11587, + "time_per_iteration": 4.239785194396973 + }, + { + "auxiliary_loss_clip": 0.01331616, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.2288326, + "balance_loss_mlp": 1.0170604, + "epoch": 0.6967082519164287, + "flos": 15637468211640.0, + "grad_norm": 1.828465818092928, + "language_loss": 0.73829758, + "learning_rate": 8.896489838865857e-07, + "loss": 0.7619077, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.12335205, + "step": 11588, + "time_per_iteration": 2.765822649002075 + }, + { + "auxiliary_loss_clip": 0.01334608, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.22760272, + "balance_loss_mlp": 1.01875949, + "epoch": 0.6967683751690966, + "flos": 24029950091040.0, + "grad_norm": 1.963387531140501, + "language_loss": 0.75690913, + "learning_rate": 8.893250767211413e-07, + "loss": 0.78055489, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.11199951, + "step": 11589, + "time_per_iteration": 2.8315014839172363 + }, + { + "auxiliary_loss_clip": 0.01340055, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.23170638, + "balance_loss_mlp": 1.02124047, + "epoch": 0.6968284984217646, + "flos": 31030291724760.0, + "grad_norm": 2.1814937979556355, + "language_loss": 0.63999152, + "learning_rate": 8.890012116726012e-07, + "loss": 0.66372794, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12353516, + "step": 11590, + "time_per_iteration": 2.8498072624206543 + }, + { + "auxiliary_loss_clip": 0.01156701, + "auxiliary_loss_mlp": 0.00998814, + "balance_loss_clip": 1.11383438, + "balance_loss_mlp": 0.99658489, + "epoch": 0.6968886216744326, + "flos": 67638524963400.0, + "grad_norm": 0.7612030290146802, + "language_loss": 0.61289966, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63445485, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02233887, + "step": 11591, + "time_per_iteration": 3.3800759315490723 + }, + { + "auxiliary_loss_clip": 0.01344872, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.23692775, + "balance_loss_mlp": 1.02365661, + "epoch": 0.6969487449271006, + "flos": 24869137366440.0, + "grad_norm": 1.5270716493634056, + "language_loss": 0.69319814, + "learning_rate": 8.883536079753582e-07, + "loss": 0.71702552, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.14215088, + "step": 11592, + "time_per_iteration": 2.8697657585144043 + }, + { + "auxiliary_loss_clip": 0.01339795, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.23442256, + "balance_loss_mlp": 1.01615524, + "epoch": 0.6970088681797685, + "flos": 28773844610040.0, + "grad_norm": 1.4545903876605253, + "language_loss": 0.63026023, + "learning_rate": 8.880298693512109e-07, + "loss": 0.65393484, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.11517334, + "step": 11593, + "time_per_iteration": 2.9194624423980713 + }, + { + "auxiliary_loss_clip": 0.01332076, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.22821021, + "balance_loss_mlp": 1.01867175, + "epoch": 0.6970689914324365, + "flos": 27314369057520.0, + "grad_norm": 1.4696901861174652, + "language_loss": 0.54266959, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56629568, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.11859131, + "step": 11594, + "time_per_iteration": 2.9789440631866455 + }, + { + "auxiliary_loss_clip": 0.01341717, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.23396516, + "balance_loss_mlp": 1.01819181, + "epoch": 0.6971291146851044, + "flos": 19141070434920.0, + "grad_norm": 2.5416693165242616, + "language_loss": 0.77560061, + "learning_rate": 8.87382518613248e-07, + "loss": 0.7993201, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12030029, + "step": 11595, + "time_per_iteration": 4.332391023635864 + }, + { + "auxiliary_loss_clip": 0.01340807, + "auxiliary_loss_mlp": 0.01035842, + "balance_loss_clip": 1.23176765, + "balance_loss_mlp": 1.02218616, + "epoch": 0.6971892379377724, + "flos": 14614531796520.0, + "grad_norm": 2.104465137854661, + "language_loss": 0.71689808, + "learning_rate": 8.870589065239793e-07, + "loss": 0.74066454, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13653564, + "step": 11596, + "time_per_iteration": 2.746577739715576 + }, + { + "auxiliary_loss_clip": 0.01338515, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.23124385, + "balance_loss_mlp": 1.02395844, + "epoch": 0.6972493611904405, + "flos": 22312283628960.0, + "grad_norm": 1.6237783971150874, + "language_loss": 0.761545, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78529918, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.12945557, + "step": 11597, + "time_per_iteration": 2.769177198410034 + }, + { + "auxiliary_loss_clip": 0.01340677, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.23468351, + "balance_loss_mlp": 1.01950192, + "epoch": 0.6973094844431084, + "flos": 17424825265440.0, + "grad_norm": 1.9141053607093772, + "language_loss": 0.75115436, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77488083, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12463379, + "step": 11598, + "time_per_iteration": 2.7415452003479004 + }, + { + "auxiliary_loss_clip": 0.01345794, + "auxiliary_loss_mlp": 0.01037335, + "balance_loss_clip": 1.23534107, + "balance_loss_mlp": 1.02400112, + "epoch": 0.6973696076957764, + "flos": 27241145538480.0, + "grad_norm": 1.7085186983218126, + "language_loss": 0.89938074, + "learning_rate": 8.860883235222791e-07, + "loss": 0.92321193, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13354492, + "step": 11599, + "time_per_iteration": 2.8303418159484863 + }, + { + "auxiliary_loss_clip": 0.01354767, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.24115014, + "balance_loss_mlp": 1.02477169, + "epoch": 0.6974297309484443, + "flos": 22023206738640.0, + "grad_norm": 1.9598614646761985, + "language_loss": 0.69382197, + "learning_rate": 8.85764880317974e-07, + "loss": 0.71775788, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.14056396, + "step": 11600, + "time_per_iteration": 2.819546937942505 + }, + { + "auxiliary_loss_clip": 0.01340908, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.23263454, + "balance_loss_mlp": 1.01973438, + "epoch": 0.6974898542011123, + "flos": 28372374114480.0, + "grad_norm": 1.705337818620239, + "language_loss": 0.76511943, + "learning_rate": 8.854414793655771e-07, + "loss": 0.78885531, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1295166, + "step": 11601, + "time_per_iteration": 2.8436574935913086 + }, + { + "auxiliary_loss_clip": 0.01332947, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.22909808, + "balance_loss_mlp": 1.02336669, + "epoch": 0.6975499774537802, + "flos": 15236322582960.0, + "grad_norm": 1.818653839881728, + "language_loss": 0.72054875, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74423015, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.11834717, + "step": 11602, + "time_per_iteration": 2.799422264099121 + }, + { + "auxiliary_loss_clip": 0.01336324, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.22907007, + "balance_loss_mlp": 1.0223825, + "epoch": 0.6976101007064482, + "flos": 22161694538160.0, + "grad_norm": 2.0950648495500555, + "language_loss": 0.76552469, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78923088, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11920166, + "step": 11603, + "time_per_iteration": 2.879223346710205 + }, + { + "auxiliary_loss_clip": 0.01340734, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.23421049, + "balance_loss_mlp": 1.02219057, + "epoch": 0.6976702239591162, + "flos": 22278676888080.0, + "grad_norm": 1.7277244119953845, + "language_loss": 0.6253407, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64909244, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12249756, + "step": 11604, + "time_per_iteration": 2.9153661727905273 + }, + { + "auxiliary_loss_clip": 0.0134386, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.23465073, + "balance_loss_mlp": 1.02121842, + "epoch": 0.6977303472117842, + "flos": 25854162554520.0, + "grad_norm": 2.3023482130750437, + "language_loss": 0.81796288, + "learning_rate": 8.841482983203057e-07, + "loss": 0.84174997, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.1362915, + "step": 11605, + "time_per_iteration": 2.998992919921875 + }, + { + "auxiliary_loss_clip": 0.01343362, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.23652482, + "balance_loss_mlp": 1.02083898, + "epoch": 0.6977904704644521, + "flos": 20964511339560.0, + "grad_norm": 1.7799592469820573, + "language_loss": 0.70669901, + "learning_rate": 8.838251088113638e-07, + "loss": 0.73047209, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13104248, + "step": 11606, + "time_per_iteration": 2.8384780883789062 + }, + { + "auxiliary_loss_clip": 0.01344736, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.23474741, + "balance_loss_mlp": 1.01953697, + "epoch": 0.6978505937171201, + "flos": 22060224581760.0, + "grad_norm": 1.7484956162629615, + "language_loss": 0.82292998, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84670222, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12957764, + "step": 11607, + "time_per_iteration": 2.787728786468506 + }, + { + "auxiliary_loss_clip": 0.01347685, + "auxiliary_loss_mlp": 0.01031723, + "balance_loss_clip": 1.23539543, + "balance_loss_mlp": 1.01767421, + "epoch": 0.697910716969788, + "flos": 20048039709120.0, + "grad_norm": 2.0524597973363834, + "language_loss": 0.79213721, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81593126, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14044189, + "step": 11608, + "time_per_iteration": 2.7941107749938965 + }, + { + "auxiliary_loss_clip": 0.01342149, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.23372579, + "balance_loss_mlp": 1.01974094, + "epoch": 0.697970840222456, + "flos": 15892532277480.0, + "grad_norm": 1.7939210559637502, + "language_loss": 0.89987695, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92361909, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12322998, + "step": 11609, + "time_per_iteration": 2.7593395709991455 + }, + { + "auxiliary_loss_clip": 0.0134614, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.23575759, + "balance_loss_mlp": 1.01796544, + "epoch": 0.698030963475124, + "flos": 21220671831120.0, + "grad_norm": 1.57903205295623, + "language_loss": 0.65054095, + "learning_rate": 8.82532774152765e-07, + "loss": 0.67431223, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13031006, + "step": 11610, + "time_per_iteration": 2.7459628582000732 + }, + { + "auxiliary_loss_clip": 0.01330156, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.2252599, + "balance_loss_mlp": 1.01744175, + "epoch": 0.698091086727792, + "flos": 33765493690080.0, + "grad_norm": 1.6178642954527167, + "language_loss": 0.84593242, + "learning_rate": 8.822097963936643e-07, + "loss": 0.8695296, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12115479, + "step": 11611, + "time_per_iteration": 2.8763134479522705 + }, + { + "auxiliary_loss_clip": 0.0134146, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.2323873, + "balance_loss_mlp": 1.01773393, + "epoch": 0.69815120998046, + "flos": 15892085585520.0, + "grad_norm": 1.9163576286927948, + "language_loss": 0.71562171, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73934197, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1282959, + "step": 11612, + "time_per_iteration": 2.712397336959839 + }, + { + "auxiliary_loss_clip": 0.01336735, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.23214459, + "balance_loss_mlp": 1.02143514, + "epoch": 0.6982113332331279, + "flos": 18950945782680.0, + "grad_norm": 1.6812993351469059, + "language_loss": 0.80854213, + "learning_rate": 8.815639680478573e-07, + "loss": 0.8322475, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12365723, + "step": 11613, + "time_per_iteration": 2.7817130088806152 + }, + { + "auxiliary_loss_clip": 0.01332055, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.22609961, + "balance_loss_mlp": 1.01682532, + "epoch": 0.6982714564857959, + "flos": 24395011868880.0, + "grad_norm": 1.8045852288438027, + "language_loss": 0.75588661, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77949113, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.11566162, + "step": 11614, + "time_per_iteration": 2.865814208984375 + }, + { + "auxiliary_loss_clip": 0.01339035, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.23234248, + "balance_loss_mlp": 1.01965523, + "epoch": 0.6983315797384638, + "flos": 20088509262840.0, + "grad_norm": 3.16760850081127, + "language_loss": 0.77731752, + "learning_rate": 8.809183093468746e-07, + "loss": 0.80103076, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12634277, + "step": 11615, + "time_per_iteration": 2.777273654937744 + }, + { + "auxiliary_loss_clip": 0.01328614, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.22476399, + "balance_loss_mlp": 1.0200181, + "epoch": 0.6983917029911318, + "flos": 13515854144040.0, + "grad_norm": 1.8060658311520859, + "language_loss": 0.72946322, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75306952, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.12005615, + "step": 11616, + "time_per_iteration": 2.7665183544158936 + }, + { + "auxiliary_loss_clip": 0.01332051, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.22744596, + "balance_loss_mlp": 1.02209044, + "epoch": 0.6984518262437998, + "flos": 22023572213880.0, + "grad_norm": 1.5665871706073746, + "language_loss": 0.84503663, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86870325, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12536621, + "step": 11617, + "time_per_iteration": 2.8352484703063965 + }, + { + "auxiliary_loss_clip": 0.01342515, + "auxiliary_loss_mlp": 0.0103562, + "balance_loss_clip": 1.23243153, + "balance_loss_mlp": 1.0218811, + "epoch": 0.6985119494964678, + "flos": 18775277706600.0, + "grad_norm": 2.190374275681933, + "language_loss": 0.59685367, + "learning_rate": 8.799501395936682e-07, + "loss": 0.62063503, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13745117, + "step": 11618, + "time_per_iteration": 2.9283533096313477 + }, + { + "auxiliary_loss_clip": 0.01335161, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.22891295, + "balance_loss_mlp": 1.02002978, + "epoch": 0.6985720727491357, + "flos": 22387984257960.0, + "grad_norm": 1.9497752465574105, + "language_loss": 0.82972616, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85340393, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12591553, + "step": 11619, + "time_per_iteration": 2.782040596008301 + }, + { + "auxiliary_loss_clip": 0.0132861, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.22539425, + "balance_loss_mlp": 1.01861036, + "epoch": 0.6986321960018037, + "flos": 39575311896240.0, + "grad_norm": 1.7596370803684331, + "language_loss": 0.67826116, + "learning_rate": 8.793049054331494e-07, + "loss": 0.70184886, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.11547852, + "step": 11620, + "time_per_iteration": 3.0067427158355713 + }, + { + "auxiliary_loss_clip": 0.01335692, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.22676563, + "balance_loss_mlp": 1.01553142, + "epoch": 0.6986923192544716, + "flos": 17972255498760.0, + "grad_norm": 1.9190765349513232, + "language_loss": 0.7291075, + "learning_rate": 8.789823520920794e-07, + "loss": 0.75275052, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13092041, + "step": 11621, + "time_per_iteration": 2.8132684230804443 + }, + { + "auxiliary_loss_clip": 0.01344313, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.23417449, + "balance_loss_mlp": 1.01829696, + "epoch": 0.6987524425071396, + "flos": 25599991872600.0, + "grad_norm": 1.7097744755583584, + "language_loss": 0.68287385, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70663387, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13397217, + "step": 11622, + "time_per_iteration": 2.9033591747283936 + }, + { + "auxiliary_loss_clip": 0.01323436, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.21887672, + "balance_loss_mlp": 1.01379371, + "epoch": 0.6988125657598077, + "flos": 17534579327280.0, + "grad_norm": 1.624840648352957, + "language_loss": 0.62917322, + "learning_rate": 8.783373729494721e-07, + "loss": 0.65266395, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1184082, + "step": 11623, + "time_per_iteration": 2.870243787765503 + }, + { + "auxiliary_loss_clip": 0.01346094, + "auxiliary_loss_mlp": 0.01027329, + "balance_loss_clip": 1.23304081, + "balance_loss_mlp": 1.01369119, + "epoch": 0.6988726890124756, + "flos": 39172988625120.0, + "grad_norm": 2.7769529885358657, + "language_loss": 0.61429322, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63802743, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13647461, + "step": 11624, + "time_per_iteration": 4.595324516296387 + }, + { + "auxiliary_loss_clip": 0.01342342, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.23241198, + "balance_loss_mlp": 1.0187372, + "epoch": 0.6989328122651436, + "flos": 20198303933040.0, + "grad_norm": 1.7253139438932672, + "language_loss": 0.78555536, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80929792, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1317749, + "step": 11625, + "time_per_iteration": 2.841726541519165 + }, + { + "auxiliary_loss_clip": 0.0132621, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.22263384, + "balance_loss_mlp": 1.01894021, + "epoch": 0.6989929355178115, + "flos": 21839863682520.0, + "grad_norm": 1.7007357291254128, + "language_loss": 0.66353679, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68710798, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11968994, + "step": 11626, + "time_per_iteration": 4.292184114456177 + }, + { + "auxiliary_loss_clip": 0.0133575, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.22788775, + "balance_loss_mlp": 1.01592207, + "epoch": 0.6990530587704795, + "flos": 26328491093880.0, + "grad_norm": 1.766606760115997, + "language_loss": 0.70677358, + "learning_rate": 8.770479251647697e-07, + "loss": 0.73042297, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13269043, + "step": 11627, + "time_per_iteration": 2.862213611602783 + }, + { + "auxiliary_loss_clip": 0.01327528, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.22435558, + "balance_loss_mlp": 1.01326931, + "epoch": 0.6991131820231474, + "flos": 19833810672240.0, + "grad_norm": 1.8514845585116397, + "language_loss": 0.6232729, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64680403, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12322998, + "step": 11628, + "time_per_iteration": 2.8126070499420166 + }, + { + "auxiliary_loss_clip": 0.01341319, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.23199081, + "balance_loss_mlp": 1.02239561, + "epoch": 0.6991733052758154, + "flos": 33991417934640.0, + "grad_norm": 1.7149671674428277, + "language_loss": 0.67734843, + "learning_rate": 8.764034567182581e-07, + "loss": 0.7011165, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13098145, + "step": 11629, + "time_per_iteration": 2.9935483932495117 + }, + { + "auxiliary_loss_clip": 0.01328249, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.2227416, + "balance_loss_mlp": 1.02189887, + "epoch": 0.6992334285284834, + "flos": 15637427603280.0, + "grad_norm": 1.6284201680354113, + "language_loss": 0.72575974, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74939442, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.13323975, + "step": 11630, + "time_per_iteration": 2.71284556388855 + }, + { + "auxiliary_loss_clip": 0.01334874, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.22960532, + "balance_loss_mlp": 1.01649857, + "epoch": 0.6992935517811514, + "flos": 21731368479840.0, + "grad_norm": 1.7346211264638518, + "language_loss": 0.74469757, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76834464, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.13348389, + "step": 11631, + "time_per_iteration": 2.771329641342163 + }, + { + "auxiliary_loss_clip": 0.01341376, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.23130918, + "balance_loss_mlp": 1.01979637, + "epoch": 0.6993536750338193, + "flos": 20119395243600.0, + "grad_norm": 2.1017561138189698, + "language_loss": 0.89390898, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91766137, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.140625, + "step": 11632, + "time_per_iteration": 2.726487159729004 + }, + { + "auxiliary_loss_clip": 0.01339997, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.23094273, + "balance_loss_mlp": 1.02048635, + "epoch": 0.6994137982864873, + "flos": 22014678983040.0, + "grad_norm": 1.462507525101302, + "language_loss": 0.79885995, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82259071, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12585449, + "step": 11633, + "time_per_iteration": 2.7697854042053223 + }, + { + "auxiliary_loss_clip": 0.01345031, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.23426592, + "balance_loss_mlp": 1.01997161, + "epoch": 0.6994739215391552, + "flos": 25524372460320.0, + "grad_norm": 1.7797137202467177, + "language_loss": 0.67433774, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69813001, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14233398, + "step": 11634, + "time_per_iteration": 4.311300754547119 + }, + { + "auxiliary_loss_clip": 0.01155536, + "auxiliary_loss_mlp": 0.01007648, + "balance_loss_clip": 1.11131525, + "balance_loss_mlp": 1.00461984, + "epoch": 0.6995340447918232, + "flos": 59143639135320.0, + "grad_norm": 0.6858097089965534, + "language_loss": 0.53208506, + "learning_rate": 8.744710743350412e-07, + "loss": 0.5537169, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.03027344, + "step": 11635, + "time_per_iteration": 3.3407247066497803 + }, + { + "auxiliary_loss_clip": 0.01335373, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.22794962, + "balance_loss_mlp": 1.01843107, + "epoch": 0.6995941680444913, + "flos": 17972377323840.0, + "grad_norm": 1.7905822909141234, + "language_loss": 0.82357395, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84723985, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12792969, + "step": 11636, + "time_per_iteration": 2.752730131149292 + }, + { + "auxiliary_loss_clip": 0.01341653, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.23215902, + "balance_loss_mlp": 1.0163573, + "epoch": 0.6996542912971592, + "flos": 21985011253080.0, + "grad_norm": 1.92320909784796, + "language_loss": 0.83312148, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85683542, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13372803, + "step": 11637, + "time_per_iteration": 2.756614923477173 + }, + { + "auxiliary_loss_clip": 0.01332022, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.2251035, + "balance_loss_mlp": 1.02519763, + "epoch": 0.6997144145498272, + "flos": 11689083353880.0, + "grad_norm": 1.8713198682448098, + "language_loss": 0.68069053, + "learning_rate": 8.735054591608704e-07, + "loss": 0.70439637, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13360596, + "step": 11638, + "time_per_iteration": 2.7863056659698486 + }, + { + "auxiliary_loss_clip": 0.0134214, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.22982955, + "balance_loss_mlp": 1.01776981, + "epoch": 0.6997745378024951, + "flos": 29613275535600.0, + "grad_norm": 1.7990563722215862, + "language_loss": 0.78176248, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80550468, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 1.12255859, + "router_z_loss_mlp": 0.14331055, + "step": 11639, + "time_per_iteration": 2.885941982269287 + }, + { + "auxiliary_loss_clip": 0.01341743, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.23335445, + "balance_loss_mlp": 1.02004242, + "epoch": 0.6998346610551631, + "flos": 20891531470680.0, + "grad_norm": 2.6498619938141728, + "language_loss": 0.82562143, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84936988, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1305542, + "step": 11640, + "time_per_iteration": 2.991415023803711 + }, + { + "auxiliary_loss_clip": 0.01333327, + "auxiliary_loss_mlp": 0.01026746, + "balance_loss_clip": 1.22652805, + "balance_loss_mlp": 1.01453257, + "epoch": 0.699894784307831, + "flos": 27168896620080.0, + "grad_norm": 1.6702307698114274, + "language_loss": 0.75711244, + "learning_rate": 8.725402284377619e-07, + "loss": 0.78071314, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12200928, + "step": 11641, + "time_per_iteration": 3.010317325592041 + }, + { + "auxiliary_loss_clip": 0.01335127, + "auxiliary_loss_mlp": 0.01023195, + "balance_loss_clip": 1.2277056, + "balance_loss_mlp": 1.01044536, + "epoch": 0.699954907560499, + "flos": 20928630530520.0, + "grad_norm": 1.942854725250985, + "language_loss": 0.77788818, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80147141, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12738037, + "step": 11642, + "time_per_iteration": 2.780850887298584 + }, + { + "auxiliary_loss_clip": 0.01343192, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.23137999, + "balance_loss_mlp": 1.02024269, + "epoch": 0.700015030813167, + "flos": 28663684464600.0, + "grad_norm": 2.2889859690098495, + "language_loss": 0.75227129, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77605093, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.1451416, + "step": 11643, + "time_per_iteration": 2.8358540534973145 + }, + { + "auxiliary_loss_clip": 0.01338749, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.22922575, + "balance_loss_mlp": 1.01601362, + "epoch": 0.700075154065835, + "flos": 29211277131360.0, + "grad_norm": 2.4419482200987033, + "language_loss": 0.60138786, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62507087, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13543701, + "step": 11644, + "time_per_iteration": 2.8734939098358154 + }, + { + "auxiliary_loss_clip": 0.01328718, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.22375524, + "balance_loss_mlp": 1.01516485, + "epoch": 0.7001352773185029, + "flos": 23117661121680.0, + "grad_norm": 1.5916522229605314, + "language_loss": 0.81783617, + "learning_rate": 8.712538527446119e-07, + "loss": 0.84139633, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12139893, + "step": 11645, + "time_per_iteration": 2.8718433380126953 + }, + { + "auxiliary_loss_clip": 0.01330685, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.22447824, + "balance_loss_mlp": 1.01801229, + "epoch": 0.7001954005711709, + "flos": 21327623916120.0, + "grad_norm": 2.3093546905311073, + "language_loss": 0.67990005, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70351559, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12866211, + "step": 11646, + "time_per_iteration": 2.86152982711792 + }, + { + "auxiliary_loss_clip": 0.01336239, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.22976637, + "balance_loss_mlp": 1.01962996, + "epoch": 0.7002555238238388, + "flos": 24541296473520.0, + "grad_norm": 1.719604066490491, + "language_loss": 0.71236771, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73605084, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12432861, + "step": 11647, + "time_per_iteration": 2.815422534942627 + }, + { + "auxiliary_loss_clip": 0.0134144, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.23315334, + "balance_loss_mlp": 1.01922607, + "epoch": 0.7003156470765068, + "flos": 39063843688680.0, + "grad_norm": 1.6402654842137947, + "language_loss": 0.7198723, + "learning_rate": 8.702895203548155e-07, + "loss": 0.74360979, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13085938, + "step": 11648, + "time_per_iteration": 2.981761932373047 + }, + { + "auxiliary_loss_clip": 0.01330992, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.2255733, + "balance_loss_mlp": 1.01532638, + "epoch": 0.7003757703291749, + "flos": 28810050285960.0, + "grad_norm": 1.6508487521364874, + "language_loss": 0.7730031, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79658812, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12194824, + "step": 11649, + "time_per_iteration": 2.8302001953125 + }, + { + "auxiliary_loss_clip": 0.01330708, + "auxiliary_loss_mlp": 0.01036441, + "balance_loss_clip": 1.22516561, + "balance_loss_mlp": 1.02395415, + "epoch": 0.7004358935818428, + "flos": 15957187432560.0, + "grad_norm": 1.7367693298785989, + "language_loss": 0.79004896, + "learning_rate": 8.69646846268308e-07, + "loss": 0.81372046, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12487793, + "step": 11650, + "time_per_iteration": 2.746833562850952 + }, + { + "auxiliary_loss_clip": 0.01333557, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.22622454, + "balance_loss_mlp": 1.01941359, + "epoch": 0.7004960168345108, + "flos": 20416796847720.0, + "grad_norm": 2.0060292423107136, + "language_loss": 0.78620046, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80984855, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1184082, + "step": 11651, + "time_per_iteration": 2.746833324432373 + }, + { + "auxiliary_loss_clip": 0.01342735, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.23264313, + "balance_loss_mlp": 1.02027595, + "epoch": 0.7005561400871787, + "flos": 17352007830000.0, + "grad_norm": 1.664186503085521, + "language_loss": 0.69954944, + "learning_rate": 8.690043436342198e-07, + "loss": 0.72330552, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12591553, + "step": 11652, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.01334706, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.22861505, + "balance_loss_mlp": 1.02206135, + "epoch": 0.7006162633398467, + "flos": 25307706921840.0, + "grad_norm": 2.6748364865439074, + "language_loss": 0.7456516, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76934415, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12487793, + "step": 11653, + "time_per_iteration": 2.911827802658081 + }, + { + "auxiliary_loss_clip": 0.01339921, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.23083913, + "balance_loss_mlp": 1.01447797, + "epoch": 0.7006763865925146, + "flos": 20674053765000.0, + "grad_norm": 1.8578418369543435, + "language_loss": 0.70845759, + "learning_rate": 8.68362012550003e-07, + "loss": 0.73213923, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13769531, + "step": 11654, + "time_per_iteration": 2.7886650562286377 + }, + { + "auxiliary_loss_clip": 0.0133729, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.22865856, + "balance_loss_mlp": 1.01179624, + "epoch": 0.7007365098451827, + "flos": 20051125944480.0, + "grad_norm": 3.307097559689783, + "language_loss": 0.7380836, + "learning_rate": 8.680409113695453e-07, + "loss": 0.76171082, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13641357, + "step": 11655, + "time_per_iteration": 2.818047046661377 + }, + { + "auxiliary_loss_clip": 0.01355745, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.24193227, + "balance_loss_mlp": 1.0206449, + "epoch": 0.7007966330978506, + "flos": 20782305317520.0, + "grad_norm": 2.98026871064593, + "language_loss": 0.69812083, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72202229, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13751221, + "step": 11656, + "time_per_iteration": 2.7450125217437744 + }, + { + "auxiliary_loss_clip": 0.01330154, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.22489548, + "balance_loss_mlp": 1.02090275, + "epoch": 0.7008567563505186, + "flos": 29643592999320.0, + "grad_norm": 1.4479003107332546, + "language_loss": 0.78104007, + "learning_rate": 8.673988377928092e-07, + "loss": 0.80466491, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11413574, + "step": 11657, + "time_per_iteration": 2.8155922889709473 + }, + { + "auxiliary_loss_clip": 0.01345857, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.23419631, + "balance_loss_mlp": 1.02002394, + "epoch": 0.7009168796031865, + "flos": 17096415855480.0, + "grad_norm": 1.9531339541570318, + "language_loss": 0.78996241, + "learning_rate": 8.670778654208797e-07, + "loss": 0.8137604, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13922119, + "step": 11658, + "time_per_iteration": 2.7064549922943115 + }, + { + "auxiliary_loss_clip": 0.01329676, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.2252624, + "balance_loss_mlp": 1.02008224, + "epoch": 0.7009770028558545, + "flos": 20453855299200.0, + "grad_norm": 1.8298252861805189, + "language_loss": 0.82654101, + "learning_rate": 8.667569360094713e-07, + "loss": 0.85016441, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12579346, + "step": 11659, + "time_per_iteration": 2.740476369857788 + }, + { + "auxiliary_loss_clip": 0.01331109, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.22619247, + "balance_loss_mlp": 1.01460493, + "epoch": 0.7010371261085224, + "flos": 19249890504480.0, + "grad_norm": 2.0393019039057356, + "language_loss": 0.69900852, + "learning_rate": 8.664360495707526e-07, + "loss": 0.72258729, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.121521, + "step": 11660, + "time_per_iteration": 2.7363696098327637 + }, + { + "auxiliary_loss_clip": 0.01343564, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.23363233, + "balance_loss_mlp": 1.01542532, + "epoch": 0.7010972493611904, + "flos": 22132717150320.0, + "grad_norm": 1.7821105142123321, + "language_loss": 0.81078315, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83450305, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13006592, + "step": 11661, + "time_per_iteration": 2.7645161151885986 + }, + { + "auxiliary_loss_clip": 0.01335585, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.22863841, + "balance_loss_mlp": 1.01759899, + "epoch": 0.7011573726138585, + "flos": 31396977837000.0, + "grad_norm": 1.5587272092785998, + "language_loss": 0.79322195, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81688058, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12677002, + "step": 11662, + "time_per_iteration": 2.8454344272613525 + }, + { + "auxiliary_loss_clip": 0.01339424, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.2309351, + "balance_loss_mlp": 1.01625752, + "epoch": 0.7012174958665264, + "flos": 18155192471280.0, + "grad_norm": 1.7897180825368135, + "language_loss": 0.8379209, + "learning_rate": 8.654736482124134e-07, + "loss": 0.86160862, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13079834, + "step": 11663, + "time_per_iteration": 5.69816517829895 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01012989, + "balance_loss_clip": 1.11198235, + "balance_loss_mlp": 1.01067674, + "epoch": 0.7012776191191944, + "flos": 60665495774760.0, + "grad_norm": 0.816460460535945, + "language_loss": 0.53727734, + "learning_rate": 8.651529337861209e-07, + "loss": 0.5589667, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.02307129, + "step": 11664, + "time_per_iteration": 3.3482844829559326 + }, + { + "auxiliary_loss_clip": 0.01342764, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.23365414, + "balance_loss_mlp": 1.01830268, + "epoch": 0.7013377423718623, + "flos": 27204330737160.0, + "grad_norm": 2.1813918693689174, + "language_loss": 0.79350066, + "learning_rate": 8.64832262393344e-07, + "loss": 0.8172363, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12493896, + "step": 11665, + "time_per_iteration": 4.326288223266602 + }, + { + "auxiliary_loss_clip": 0.01328438, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.2235043, + "balance_loss_mlp": 1.01534641, + "epoch": 0.7013978656245303, + "flos": 16547726763000.0, + "grad_norm": 2.340537486033588, + "language_loss": 0.76852834, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79208982, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12365723, + "step": 11666, + "time_per_iteration": 2.7890055179595947 + }, + { + "auxiliary_loss_clip": 0.01334843, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.2281574, + "balance_loss_mlp": 1.01791537, + "epoch": 0.7014579888771982, + "flos": 23148344060640.0, + "grad_norm": 1.9356178945529872, + "language_loss": 0.81441158, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83806455, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12530518, + "step": 11667, + "time_per_iteration": 2.7912847995758057 + }, + { + "auxiliary_loss_clip": 0.01334668, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.22825718, + "balance_loss_mlp": 1.02156091, + "epoch": 0.7015181121298663, + "flos": 25087508456040.0, + "grad_norm": 2.156214852477139, + "language_loss": 0.65674984, + "learning_rate": 8.638705065376879e-07, + "loss": 0.68044502, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13287354, + "step": 11668, + "time_per_iteration": 2.8199963569641113 + }, + { + "auxiliary_loss_clip": 0.01340221, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.22960985, + "balance_loss_mlp": 1.01284027, + "epoch": 0.7015782353825342, + "flos": 23332702325760.0, + "grad_norm": 2.5705096484765533, + "language_loss": 0.76676917, + "learning_rate": 8.635500074005519e-07, + "loss": 0.7904228, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12316895, + "step": 11669, + "time_per_iteration": 2.798610210418701 + }, + { + "auxiliary_loss_clip": 0.01156136, + "auxiliary_loss_mlp": 0.01010719, + "balance_loss_clip": 1.11266625, + "balance_loss_mlp": 1.00828731, + "epoch": 0.7016383586352022, + "flos": 70413221881800.0, + "grad_norm": 0.706596410848693, + "language_loss": 0.54595268, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56762123, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02429199, + "step": 11670, + "time_per_iteration": 3.359309434890747 + }, + { + "auxiliary_loss_clip": 0.01328634, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.22308016, + "balance_loss_mlp": 1.0211041, + "epoch": 0.7016984818878701, + "flos": 19797077087640.0, + "grad_norm": 1.5576062025752038, + "language_loss": 0.81892705, + "learning_rate": 8.629091384213218e-07, + "loss": 0.84255159, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12701416, + "step": 11671, + "time_per_iteration": 2.756395101547241 + }, + { + "auxiliary_loss_clip": 0.01339199, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.23032546, + "balance_loss_mlp": 1.0171324, + "epoch": 0.7017586051405381, + "flos": 12900885562080.0, + "grad_norm": 1.952765503099391, + "language_loss": 0.75307345, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77676296, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12615967, + "step": 11672, + "time_per_iteration": 2.721580743789673 + }, + { + "auxiliary_loss_clip": 0.01335624, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.22712469, + "balance_loss_mlp": 1.01539755, + "epoch": 0.701818728393206, + "flos": 18337439101680.0, + "grad_norm": 1.513680712215339, + "language_loss": 0.87152296, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89516854, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13525391, + "step": 11673, + "time_per_iteration": 4.24501895904541 + }, + { + "auxiliary_loss_clip": 0.01325156, + "auxiliary_loss_mlp": 0.01027317, + "balance_loss_clip": 1.22109199, + "balance_loss_mlp": 1.01415062, + "epoch": 0.701878851645874, + "flos": 17388700806240.0, + "grad_norm": 4.77675556448663, + "language_loss": 0.73167425, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75519896, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.13171387, + "step": 11674, + "time_per_iteration": 2.754615306854248 + }, + { + "auxiliary_loss_clip": 0.01328715, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.22501969, + "balance_loss_mlp": 1.02412558, + "epoch": 0.701938974898542, + "flos": 23920886371320.0, + "grad_norm": 1.490790434096679, + "language_loss": 0.72451079, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74815798, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.11859131, + "step": 11675, + "time_per_iteration": 2.9372081756591797 + }, + { + "auxiliary_loss_clip": 0.01337796, + "auxiliary_loss_mlp": 0.01025691, + "balance_loss_clip": 1.22909737, + "balance_loss_mlp": 1.01309681, + "epoch": 0.70199909815121, + "flos": 21800206296000.0, + "grad_norm": 2.1652118772794173, + "language_loss": 0.52047181, + "learning_rate": 8.613077207613078e-07, + "loss": 0.54410672, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1260376, + "step": 11676, + "time_per_iteration": 2.822638988494873 + }, + { + "auxiliary_loss_clip": 0.01157077, + "auxiliary_loss_mlp": 0.01007688, + "balance_loss_clip": 1.11321187, + "balance_loss_mlp": 1.0050658, + "epoch": 0.702059221403878, + "flos": 71731854349920.0, + "grad_norm": 0.741881738327343, + "language_loss": 0.59259319, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61424083, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02624512, + "step": 11677, + "time_per_iteration": 3.32840633392334 + }, + { + "auxiliary_loss_clip": 0.01331401, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.22257996, + "balance_loss_mlp": 1.01434469, + "epoch": 0.7021193446565459, + "flos": 28116944573400.0, + "grad_norm": 1.9935860821015685, + "language_loss": 0.61887175, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64245874, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12969971, + "step": 11678, + "time_per_iteration": 2.822678804397583 + }, + { + "auxiliary_loss_clip": 0.01327801, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.22205544, + "balance_loss_mlp": 1.01664495, + "epoch": 0.7021794679092139, + "flos": 22929445062360.0, + "grad_norm": 1.7798080988540286, + "language_loss": 0.79544806, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81901658, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12408447, + "step": 11679, + "time_per_iteration": 2.7972569465637207 + }, + { + "auxiliary_loss_clip": 0.01326233, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.22301435, + "balance_loss_mlp": 1.0216676, + "epoch": 0.7022395911618818, + "flos": 18082456252560.0, + "grad_norm": 2.188696931859436, + "language_loss": 0.70708585, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73067939, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.11468506, + "step": 11680, + "time_per_iteration": 2.744346857070923 + }, + { + "auxiliary_loss_clip": 0.0134122, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.23049045, + "balance_loss_mlp": 1.01767635, + "epoch": 0.7022997144145499, + "flos": 16038613840320.0, + "grad_norm": 1.9510798929567672, + "language_loss": 0.75256717, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77628744, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13140869, + "step": 11681, + "time_per_iteration": 2.7908670902252197 + }, + { + "auxiliary_loss_clip": 0.01326572, + "auxiliary_loss_mlp": 0.01027582, + "balance_loss_clip": 1.22107911, + "balance_loss_mlp": 1.01596546, + "epoch": 0.7023598376672178, + "flos": 26474288398200.0, + "grad_norm": 1.4551519856444877, + "language_loss": 0.77133703, + "learning_rate": 8.593874446204434e-07, + "loss": 0.7948786, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.11627197, + "step": 11682, + "time_per_iteration": 2.9280810356140137 + }, + { + "auxiliary_loss_clip": 0.01338194, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.22805655, + "balance_loss_mlp": 1.0187006, + "epoch": 0.7024199609198858, + "flos": 17060372613000.0, + "grad_norm": 2.645039891868128, + "language_loss": 0.73551452, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75921094, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12756348, + "step": 11683, + "time_per_iteration": 2.817183017730713 + }, + { + "auxiliary_loss_clip": 0.0133255, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.22648311, + "balance_loss_mlp": 1.01984358, + "epoch": 0.7024800841725537, + "flos": 25854649854840.0, + "grad_norm": 1.7778137136632697, + "language_loss": 0.7179898, + "learning_rate": 8.587476984611976e-07, + "loss": 0.74164551, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13165283, + "step": 11684, + "time_per_iteration": 2.8528597354888916 + }, + { + "auxiliary_loss_clip": 0.01331253, + "auxiliary_loss_mlp": 0.01034228, + "balance_loss_clip": 1.22466946, + "balance_loss_mlp": 1.02164531, + "epoch": 0.7025402074252217, + "flos": 23518197624960.0, + "grad_norm": 1.789296655564506, + "language_loss": 0.72420597, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74786079, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12567139, + "step": 11685, + "time_per_iteration": 2.8889758586883545 + }, + { + "auxiliary_loss_clip": 0.01332673, + "auxiliary_loss_mlp": 0.01026094, + "balance_loss_clip": 1.22515786, + "balance_loss_mlp": 1.0138098, + "epoch": 0.7026003306778896, + "flos": 20155032402480.0, + "grad_norm": 1.5149785400957878, + "language_loss": 0.84691775, + "learning_rate": 8.581081254075582e-07, + "loss": 0.87050545, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12286377, + "step": 11686, + "time_per_iteration": 2.844712495803833 + }, + { + "auxiliary_loss_clip": 0.01157894, + "auxiliary_loss_mlp": 0.01000774, + "balance_loss_clip": 1.11431956, + "balance_loss_mlp": 0.9983657, + "epoch": 0.7026604539305576, + "flos": 64786949773560.0, + "grad_norm": 0.9737490764335778, + "language_loss": 0.7000258, + "learning_rate": 8.577884038256566e-07, + "loss": 0.72161245, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.02404785, + "step": 11687, + "time_per_iteration": 3.5285122394561768 + }, + { + "auxiliary_loss_clip": 0.01333094, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.22630119, + "balance_loss_mlp": 1.01432395, + "epoch": 0.7027205771832256, + "flos": 21876475442040.0, + "grad_norm": 1.8775033039213076, + "language_loss": 0.77449656, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79809189, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12121582, + "step": 11688, + "time_per_iteration": 2.8736326694488525 + }, + { + "auxiliary_loss_clip": 0.01331232, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.22435546, + "balance_loss_mlp": 1.01887429, + "epoch": 0.7027807004358936, + "flos": 23373050054400.0, + "grad_norm": 2.948257204881339, + "language_loss": 0.69241059, + "learning_rate": 8.571490906123107e-07, + "loss": 0.71603596, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12414551, + "step": 11689, + "time_per_iteration": 2.9208335876464844 + }, + { + "auxiliary_loss_clip": 0.01337912, + "auxiliary_loss_mlp": 0.01036424, + "balance_loss_clip": 1.22815204, + "balance_loss_mlp": 1.02278662, + "epoch": 0.7028408236885616, + "flos": 15308449676280.0, + "grad_norm": 2.0360663841428686, + "language_loss": 0.80491459, + "learning_rate": 8.568294990051086e-07, + "loss": 0.82865798, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13641357, + "step": 11690, + "time_per_iteration": 2.732186794281006 + }, + { + "auxiliary_loss_clip": 0.01330475, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.22401881, + "balance_loss_mlp": 1.01891828, + "epoch": 0.7029009469412295, + "flos": 22023328563720.0, + "grad_norm": 1.815237677271418, + "language_loss": 0.759969, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78358507, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12207031, + "step": 11691, + "time_per_iteration": 2.7845520973205566 + }, + { + "auxiliary_loss_clip": 0.0133044, + "auxiliary_loss_mlp": 0.01024663, + "balance_loss_clip": 1.2250576, + "balance_loss_mlp": 1.01212204, + "epoch": 0.7029610701938975, + "flos": 21840594633000.0, + "grad_norm": 2.956783380740678, + "language_loss": 0.82015896, + "learning_rate": 8.561904458502429e-07, + "loss": 0.84371001, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12536621, + "step": 11692, + "time_per_iteration": 2.718113660812378 + }, + { + "auxiliary_loss_clip": 0.0133334, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.22687531, + "balance_loss_mlp": 1.01532888, + "epoch": 0.7030211934465654, + "flos": 19140380092800.0, + "grad_norm": 1.4769866596172052, + "language_loss": 0.7650032, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78862214, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13214111, + "step": 11693, + "time_per_iteration": 2.774402618408203 + }, + { + "auxiliary_loss_clip": 0.01332247, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.22659278, + "balance_loss_mlp": 1.01787925, + "epoch": 0.7030813166992335, + "flos": 38553268865040.0, + "grad_norm": 2.173373704152998, + "language_loss": 0.68600798, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70963311, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12384033, + "step": 11694, + "time_per_iteration": 2.8772971630096436 + }, + { + "auxiliary_loss_clip": 0.01335605, + "auxiliary_loss_mlp": 0.01027985, + "balance_loss_clip": 1.22768366, + "balance_loss_mlp": 1.01577163, + "epoch": 0.7031414399519014, + "flos": 14725057417200.0, + "grad_norm": 2.06519240884783, + "language_loss": 0.76337147, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78700733, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12213135, + "step": 11695, + "time_per_iteration": 2.735255241394043 + }, + { + "auxiliary_loss_clip": 0.01340427, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.23135364, + "balance_loss_mlp": 1.02144563, + "epoch": 0.7032015632045694, + "flos": 14031261362520.0, + "grad_norm": 1.8943977560116287, + "language_loss": 0.7381407, + "learning_rate": 8.549128601178852e-07, + "loss": 0.76188731, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12792969, + "step": 11696, + "time_per_iteration": 2.7749063968658447 + }, + { + "auxiliary_loss_clip": 0.01336272, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.22771585, + "balance_loss_mlp": 1.01770473, + "epoch": 0.7032616864572373, + "flos": 27643062726000.0, + "grad_norm": 1.6614958389962486, + "language_loss": 0.75486916, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77854145, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13262939, + "step": 11697, + "time_per_iteration": 2.8167731761932373 + }, + { + "auxiliary_loss_clip": 0.01340066, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.2315098, + "balance_loss_mlp": 1.02358007, + "epoch": 0.7033218097099053, + "flos": 17971768198440.0, + "grad_norm": 1.7605775766205056, + "language_loss": 0.80347979, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82724851, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13232422, + "step": 11698, + "time_per_iteration": 2.70443058013916 + }, + { + "auxiliary_loss_clip": 0.01333917, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.22551203, + "balance_loss_mlp": 1.01662552, + "epoch": 0.7033819329625732, + "flos": 19506660121440.0, + "grad_norm": 1.4241958637738013, + "language_loss": 0.85078812, + "learning_rate": 8.539551267053222e-07, + "loss": 0.87442136, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12774658, + "step": 11699, + "time_per_iteration": 2.8771393299102783 + }, + { + "auxiliary_loss_clip": 0.01327818, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.2233665, + "balance_loss_mlp": 1.01510644, + "epoch": 0.7034420562152413, + "flos": 23992932247920.0, + "grad_norm": 2.0389958813803646, + "language_loss": 0.79428864, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81784749, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12969971, + "step": 11700, + "time_per_iteration": 2.8034090995788574 + }, + { + "auxiliary_loss_clip": 0.01336574, + "auxiliary_loss_mlp": 0.0102351, + "balance_loss_clip": 1.2271142, + "balance_loss_mlp": 1.01005721, + "epoch": 0.7035021794679092, + "flos": 35050235158800.0, + "grad_norm": 1.9316915824709469, + "language_loss": 0.74971575, + "learning_rate": 8.533168550341186e-07, + "loss": 0.77331662, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13458252, + "step": 11701, + "time_per_iteration": 2.9754157066345215 + }, + { + "auxiliary_loss_clip": 0.01339464, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.22989511, + "balance_loss_mlp": 1.01718402, + "epoch": 0.7035623027205772, + "flos": 11000850644520.0, + "grad_norm": 2.1234577435698934, + "language_loss": 0.84414291, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86784661, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.137146, + "step": 11702, + "time_per_iteration": 5.782102584838867 + }, + { + "auxiliary_loss_clip": 0.01338469, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.22933984, + "balance_loss_mlp": 1.02474594, + "epoch": 0.7036224259732452, + "flos": 23629088720880.0, + "grad_norm": 9.073035323294894, + "language_loss": 0.61273968, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63649803, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.1262207, + "step": 11703, + "time_per_iteration": 4.269595146179199 + }, + { + "auxiliary_loss_clip": 0.01337484, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.22908783, + "balance_loss_mlp": 1.01919508, + "epoch": 0.7036825492259131, + "flos": 31692470848200.0, + "grad_norm": 1.7830302704820773, + "language_loss": 0.61845309, + "learning_rate": 8.523597736751067e-07, + "loss": 0.64214814, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.1282959, + "step": 11704, + "time_per_iteration": 2.897728681564331 + }, + { + "auxiliary_loss_clip": 0.0132438, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_clip": 1.22007322, + "balance_loss_mlp": 1.02000022, + "epoch": 0.7037426724785811, + "flos": 30199307338080.0, + "grad_norm": 1.5705800992425392, + "language_loss": 0.71026075, + "learning_rate": 8.520408335765719e-07, + "loss": 0.73381984, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.11529541, + "step": 11705, + "time_per_iteration": 2.840249538421631 + }, + { + "auxiliary_loss_clip": 0.01330312, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.224419, + "balance_loss_mlp": 1.01814151, + "epoch": 0.703802795731249, + "flos": 24316265612880.0, + "grad_norm": 1.6654424950993723, + "language_loss": 0.62644446, + "learning_rate": 8.517219370087645e-07, + "loss": 0.65005249, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12347412, + "step": 11706, + "time_per_iteration": 2.8685357570648193 + }, + { + "auxiliary_loss_clip": 0.01338995, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.23117614, + "balance_loss_mlp": 1.01562214, + "epoch": 0.7038629189839171, + "flos": 22534147037520.0, + "grad_norm": 2.4509266812010657, + "language_loss": 0.68279421, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70646489, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12426758, + "step": 11707, + "time_per_iteration": 2.817866086959839 + }, + { + "auxiliary_loss_clip": 0.01327683, + "auxiliary_loss_mlp": 0.01025384, + "balance_loss_clip": 1.22319818, + "balance_loss_mlp": 1.01376092, + "epoch": 0.703923042236585, + "flos": 26256160958760.0, + "grad_norm": 1.665878508446638, + "language_loss": 0.76663089, + "learning_rate": 8.510842745136974e-07, + "loss": 0.79016155, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11627197, + "step": 11708, + "time_per_iteration": 2.82431960105896 + }, + { + "auxiliary_loss_clip": 0.01329857, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.22500098, + "balance_loss_mlp": 1.01593482, + "epoch": 0.703983165489253, + "flos": 19394916249960.0, + "grad_norm": 1.726588136898503, + "language_loss": 0.72259885, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74617016, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11340332, + "step": 11709, + "time_per_iteration": 2.7782278060913086 + }, + { + "auxiliary_loss_clip": 0.01328056, + "auxiliary_loss_mlp": 0.01028835, + "balance_loss_clip": 1.22204399, + "balance_loss_mlp": 1.01678312, + "epoch": 0.7040432887419209, + "flos": 16687067338080.0, + "grad_norm": 2.967977030473426, + "language_loss": 0.79116881, + "learning_rate": 8.504467862866267e-07, + "loss": 0.81473768, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12054443, + "step": 11710, + "time_per_iteration": 2.8177335262298584 + }, + { + "auxiliary_loss_clip": 0.01334988, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.22633231, + "balance_loss_mlp": 1.01913691, + "epoch": 0.7041034119945889, + "flos": 21146270669640.0, + "grad_norm": 2.620178758489867, + "language_loss": 0.7771455, + "learning_rate": 8.501281075538076e-07, + "loss": 0.80081373, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12701416, + "step": 11711, + "time_per_iteration": 4.266989707946777 + }, + { + "auxiliary_loss_clip": 0.01325134, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.22024584, + "balance_loss_mlp": 1.0172894, + "epoch": 0.7041635352472568, + "flos": 16914778350480.0, + "grad_norm": 3.0231322234045708, + "language_loss": 0.74122858, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76476967, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.11676025, + "step": 11712, + "time_per_iteration": 2.763524293899536 + }, + { + "auxiliary_loss_clip": 0.01151732, + "auxiliary_loss_mlp": 0.0100245, + "balance_loss_clip": 1.10897279, + "balance_loss_mlp": 0.99983895, + "epoch": 0.7042236584999249, + "flos": 71698369434120.0, + "grad_norm": 0.8892366535589807, + "language_loss": 0.64688021, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66842204, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02612305, + "step": 11713, + "time_per_iteration": 3.2940831184387207 + }, + { + "auxiliary_loss_clip": 0.01327493, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.22026324, + "balance_loss_mlp": 1.01723981, + "epoch": 0.7042837817525928, + "flos": 28664131156560.0, + "grad_norm": 1.7818174181353605, + "language_loss": 0.72824246, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75180489, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1151123, + "step": 11714, + "time_per_iteration": 2.836744785308838 + }, + { + "auxiliary_loss_clip": 0.01330424, + "auxiliary_loss_mlp": 0.01033087, + "balance_loss_clip": 1.22419631, + "balance_loss_mlp": 1.01963377, + "epoch": 0.7043439050052608, + "flos": 19758069434880.0, + "grad_norm": 1.6881851351697, + "language_loss": 0.80047929, + "learning_rate": 8.488538287759248e-07, + "loss": 0.82411432, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13452148, + "step": 11715, + "time_per_iteration": 2.7232964038848877 + }, + { + "auxiliary_loss_clip": 0.01333274, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.22498846, + "balance_loss_mlp": 1.01913333, + "epoch": 0.7044040282579288, + "flos": 11541215023200.0, + "grad_norm": 2.539530458673363, + "language_loss": 0.7164613, + "learning_rate": 8.485353681802037e-07, + "loss": 0.74010539, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.11999512, + "step": 11716, + "time_per_iteration": 2.811898946762085 + }, + { + "auxiliary_loss_clip": 0.01338169, + "auxiliary_loss_mlp": 0.01028726, + "balance_loss_clip": 1.22758031, + "balance_loss_mlp": 1.01576746, + "epoch": 0.7044641515105967, + "flos": 33662643049440.0, + "grad_norm": 6.252696417571959, + "language_loss": 0.66832221, + "learning_rate": 8.482169512481358e-07, + "loss": 0.69199109, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12957764, + "step": 11717, + "time_per_iteration": 2.819329023361206 + }, + { + "auxiliary_loss_clip": 0.01332462, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.2244091, + "balance_loss_mlp": 1.01549423, + "epoch": 0.7045242747632647, + "flos": 26729474289120.0, + "grad_norm": 1.3932553012199593, + "language_loss": 0.74273413, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76633388, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12023926, + "step": 11718, + "time_per_iteration": 2.7560832500457764 + }, + { + "auxiliary_loss_clip": 0.01328855, + "auxiliary_loss_mlp": 0.01025278, + "balance_loss_clip": 1.22403347, + "balance_loss_mlp": 1.01391697, + "epoch": 0.7045843980159326, + "flos": 26803388150280.0, + "grad_norm": 1.6581576886202345, + "language_loss": 0.80144531, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82498664, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.1137085, + "step": 11719, + "time_per_iteration": 2.825071096420288 + }, + { + "auxiliary_loss_clip": 0.01329107, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.22290134, + "balance_loss_mlp": 1.01997507, + "epoch": 0.7046445212686007, + "flos": 41582542548960.0, + "grad_norm": 1.6185317914002846, + "language_loss": 0.66073036, + "learning_rate": 8.472619625545951e-07, + "loss": 0.68435127, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.13012695, + "step": 11720, + "time_per_iteration": 2.9131696224212646 + }, + { + "auxiliary_loss_clip": 0.01345372, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.23450756, + "balance_loss_mlp": 1.01811409, + "epoch": 0.7047046445212686, + "flos": 15564447734400.0, + "grad_norm": 2.0387600783726345, + "language_loss": 0.80150867, + "learning_rate": 8.46943720397872e-07, + "loss": 0.8252697, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.1262207, + "step": 11721, + "time_per_iteration": 2.758997917175293 + }, + { + "auxiliary_loss_clip": 0.01153027, + "auxiliary_loss_mlp": 0.01005384, + "balance_loss_clip": 1.1101923, + "balance_loss_mlp": 1.00285709, + "epoch": 0.7047647677739366, + "flos": 70428043933200.0, + "grad_norm": 0.7645771953986207, + "language_loss": 0.64817095, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66975498, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02526855, + "step": 11722, + "time_per_iteration": 3.394792079925537 + }, + { + "auxiliary_loss_clip": 0.01331672, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.22426414, + "balance_loss_mlp": 1.01974213, + "epoch": 0.7048248910266045, + "flos": 23665375613520.0, + "grad_norm": 1.5284041862634656, + "language_loss": 0.66222143, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68585777, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12213135, + "step": 11723, + "time_per_iteration": 2.8562493324279785 + }, + { + "auxiliary_loss_clip": 0.01338475, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.22855282, + "balance_loss_mlp": 1.01612926, + "epoch": 0.7048850142792725, + "flos": 21402106294320.0, + "grad_norm": 1.8469379725842046, + "language_loss": 0.81021142, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83388162, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.12414551, + "step": 11724, + "time_per_iteration": 2.880028486251831 + }, + { + "auxiliary_loss_clip": 0.01339328, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.23006165, + "balance_loss_mlp": 1.02330685, + "epoch": 0.7049451375319404, + "flos": 21652500398760.0, + "grad_norm": 1.6882910843746077, + "language_loss": 0.73357916, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75733554, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13000488, + "step": 11725, + "time_per_iteration": 2.780716896057129 + }, + { + "auxiliary_loss_clip": 0.01340647, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.23012805, + "balance_loss_mlp": 1.01726973, + "epoch": 0.7050052607846085, + "flos": 14870976546600.0, + "grad_norm": 2.689552604364533, + "language_loss": 0.78350139, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80721521, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13476562, + "step": 11726, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.01331765, + "auxiliary_loss_mlp": 0.01031078, + "balance_loss_clip": 1.22452259, + "balance_loss_mlp": 1.01879394, + "epoch": 0.7050653840372764, + "flos": 19246073318640.0, + "grad_norm": 1.6708448192309726, + "language_loss": 0.70602155, + "learning_rate": 8.450351860839931e-07, + "loss": 0.7296499, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12286377, + "step": 11727, + "time_per_iteration": 2.778312921524048 + }, + { + "auxiliary_loss_clip": 0.01316087, + "auxiliary_loss_mlp": 0.01019937, + "balance_loss_clip": 1.21462262, + "balance_loss_mlp": 1.0088799, + "epoch": 0.7051255072899444, + "flos": 27785976836760.0, + "grad_norm": 1.699886695249063, + "language_loss": 0.69175136, + "learning_rate": 8.44717250248668e-07, + "loss": 0.71511161, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.11053467, + "step": 11728, + "time_per_iteration": 2.8718807697296143 + }, + { + "auxiliary_loss_clip": 0.01328505, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.22184515, + "balance_loss_mlp": 1.01853979, + "epoch": 0.7051856305426124, + "flos": 27897883141680.0, + "grad_norm": 1.8990630064513698, + "language_loss": 0.73065096, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75423932, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.11791992, + "step": 11729, + "time_per_iteration": 2.936516523361206 + }, + { + "auxiliary_loss_clip": 0.01340598, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.22766471, + "balance_loss_mlp": 1.02168643, + "epoch": 0.7052457537952803, + "flos": 25048703845080.0, + "grad_norm": 1.778186800084149, + "language_loss": 0.7832936, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80705112, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13458252, + "step": 11730, + "time_per_iteration": 2.799095869064331 + }, + { + "auxiliary_loss_clip": 0.01338608, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.22902703, + "balance_loss_mlp": 1.01804733, + "epoch": 0.7053058770479483, + "flos": 21876759700560.0, + "grad_norm": 2.065925841687632, + "language_loss": 0.63592017, + "learning_rate": 8.437637056415359e-07, + "loss": 0.65961468, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12811279, + "step": 11731, + "time_per_iteration": 2.798353910446167 + }, + { + "auxiliary_loss_clip": 0.01339163, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.23033106, + "balance_loss_mlp": 1.01491809, + "epoch": 0.7053660003006162, + "flos": 16403025884400.0, + "grad_norm": 2.020650225277231, + "language_loss": 0.74794215, + "learning_rate": 8.434459451122815e-07, + "loss": 0.7716164, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13342285, + "step": 11732, + "time_per_iteration": 2.7104439735412598 + }, + { + "auxiliary_loss_clip": 0.01327846, + "auxiliary_loss_mlp": 0.01025539, + "balance_loss_clip": 1.22318411, + "balance_loss_mlp": 1.01323605, + "epoch": 0.7054261235532843, + "flos": 22716799751520.0, + "grad_norm": 1.4247281234533589, + "language_loss": 0.7086947, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73222858, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1229248, + "step": 11733, + "time_per_iteration": 2.7695817947387695 + }, + { + "auxiliary_loss_clip": 0.0132796, + "auxiliary_loss_mlp": 0.01030205, + "balance_loss_clip": 1.22144687, + "balance_loss_mlp": 1.0182426, + "epoch": 0.7054862468059522, + "flos": 13593707016120.0, + "grad_norm": 1.7746381189609426, + "language_loss": 0.73159909, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75518072, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.11975098, + "step": 11734, + "time_per_iteration": 2.829369306564331 + }, + { + "auxiliary_loss_clip": 0.01347833, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.23491728, + "balance_loss_mlp": 1.01772034, + "epoch": 0.7055463700586202, + "flos": 15884004521880.0, + "grad_norm": 2.1520146674228493, + "language_loss": 0.69948125, + "learning_rate": 8.424929267125829e-07, + "loss": 0.72327185, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13494873, + "step": 11735, + "time_per_iteration": 2.748528242111206 + }, + { + "auxiliary_loss_clip": 0.01332112, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.22237945, + "balance_loss_mlp": 1.01887619, + "epoch": 0.7056064933112881, + "flos": 23081414837400.0, + "grad_norm": 1.6322457800849928, + "language_loss": 0.72544235, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74909383, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.14154053, + "step": 11736, + "time_per_iteration": 2.7745625972747803 + }, + { + "auxiliary_loss_clip": 0.01323419, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.21835828, + "balance_loss_mlp": 1.01731491, + "epoch": 0.7056666165639561, + "flos": 24062013714240.0, + "grad_norm": 1.8742456456204502, + "language_loss": 0.69418436, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71771246, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12084961, + "step": 11737, + "time_per_iteration": 2.824432611465454 + }, + { + "auxiliary_loss_clip": 0.01332148, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.22238898, + "balance_loss_mlp": 1.02512455, + "epoch": 0.705726739816624, + "flos": 17497683309240.0, + "grad_norm": 2.0543461991626164, + "language_loss": 0.67140257, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69510591, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13067627, + "step": 11738, + "time_per_iteration": 2.72446346282959 + }, + { + "auxiliary_loss_clip": 0.01332542, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.2242372, + "balance_loss_mlp": 1.01624238, + "epoch": 0.7057868630692921, + "flos": 51359083610400.0, + "grad_norm": 1.6798413999385102, + "language_loss": 0.75224257, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77586246, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13208008, + "step": 11739, + "time_per_iteration": 3.003413200378418 + }, + { + "auxiliary_loss_clip": 0.01321894, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.21578038, + "balance_loss_mlp": 1.01793814, + "epoch": 0.70584698632196, + "flos": 26108414453160.0, + "grad_norm": 1.6929946034185164, + "language_loss": 0.71977866, + "learning_rate": 8.409054407293032e-07, + "loss": 0.7433027, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12573242, + "step": 11740, + "time_per_iteration": 4.254818916320801 + }, + { + "auxiliary_loss_clip": 0.01324575, + "auxiliary_loss_mlp": 0.01027798, + "balance_loss_clip": 1.21896219, + "balance_loss_mlp": 1.01615775, + "epoch": 0.705907109574628, + "flos": 21548106640440.0, + "grad_norm": 8.007731902172255, + "language_loss": 0.82433069, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84785444, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11639404, + "step": 11741, + "time_per_iteration": 4.328679800033569 + }, + { + "auxiliary_loss_clip": 0.0133267, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.22422791, + "balance_loss_mlp": 1.01461005, + "epoch": 0.705967232827296, + "flos": 22715987584320.0, + "grad_norm": 1.6761492020652875, + "language_loss": 0.78342378, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80702877, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13232422, + "step": 11742, + "time_per_iteration": 4.254484176635742 + }, + { + "auxiliary_loss_clip": 0.01338144, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.22578382, + "balance_loss_mlp": 1.01786721, + "epoch": 0.7060273560799639, + "flos": 28696722688440.0, + "grad_norm": 1.497028090388646, + "language_loss": 0.64730215, + "learning_rate": 8.39953476478805e-07, + "loss": 0.67099059, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.1282959, + "step": 11743, + "time_per_iteration": 2.8256943225860596 + }, + { + "auxiliary_loss_clip": 0.01336199, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.22496057, + "balance_loss_mlp": 1.01612091, + "epoch": 0.7060874793326319, + "flos": 15710488688880.0, + "grad_norm": 2.020012533591865, + "language_loss": 0.66023767, + "learning_rate": 8.396362430240902e-07, + "loss": 0.68389559, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13482666, + "step": 11744, + "time_per_iteration": 2.830334424972534 + }, + { + "auxiliary_loss_clip": 0.01323401, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.21777821, + "balance_loss_mlp": 1.01717687, + "epoch": 0.7061476025852998, + "flos": 21511738531080.0, + "grad_norm": 1.8719739204747576, + "language_loss": 0.63451982, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65805149, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12591553, + "step": 11745, + "time_per_iteration": 2.8426408767700195 + }, + { + "auxiliary_loss_clip": 0.01332243, + "auxiliary_loss_mlp": 0.01027244, + "balance_loss_clip": 1.22477627, + "balance_loss_mlp": 1.01518023, + "epoch": 0.7062077258379679, + "flos": 28187447332320.0, + "grad_norm": 1.4799023675450977, + "language_loss": 0.7183423, + "learning_rate": 8.390019081300188e-07, + "loss": 0.74193716, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12060547, + "step": 11746, + "time_per_iteration": 2.854351282119751 + }, + { + "auxiliary_loss_clip": 0.01329768, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.22088575, + "balance_loss_mlp": 1.01864684, + "epoch": 0.7062678490906358, + "flos": 27859281572520.0, + "grad_norm": 1.412834135992517, + "language_loss": 0.79270649, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81631553, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12487793, + "step": 11747, + "time_per_iteration": 2.8787221908569336 + }, + { + "auxiliary_loss_clip": 0.013186, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.21298218, + "balance_loss_mlp": 1.0162878, + "epoch": 0.7063279723433038, + "flos": 23190031865160.0, + "grad_norm": 1.7927266966220772, + "language_loss": 0.65634716, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67981094, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.1149292, + "step": 11748, + "time_per_iteration": 2.831655263900757 + }, + { + "auxiliary_loss_clip": 0.01322829, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.21493959, + "balance_loss_mlp": 1.01690245, + "epoch": 0.7063880955959717, + "flos": 20192862412800.0, + "grad_norm": 3.2161332970171377, + "language_loss": 0.79850781, + "learning_rate": 8.380507360077003e-07, + "loss": 0.8220374, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13244629, + "step": 11749, + "time_per_iteration": 4.2991554737091064 + }, + { + "auxiliary_loss_clip": 0.01155022, + "auxiliary_loss_mlp": 0.01000505, + "balance_loss_clip": 1.11127889, + "balance_loss_mlp": 0.99742937, + "epoch": 0.7064482188486397, + "flos": 63681328091520.0, + "grad_norm": 0.7886771176314902, + "language_loss": 0.54091299, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56246823, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.03076172, + "step": 11750, + "time_per_iteration": 3.217243194580078 + }, + { + "auxiliary_loss_clip": 0.01326911, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.21930683, + "balance_loss_mlp": 1.01743484, + "epoch": 0.7065083421013076, + "flos": 25196612784120.0, + "grad_norm": 1.7666583794821886, + "language_loss": 0.78785968, + "learning_rate": 8.37416841545612e-07, + "loss": 0.81143898, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13592529, + "step": 11751, + "time_per_iteration": 2.773892879486084 + }, + { + "auxiliary_loss_clip": 0.01321309, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.21546364, + "balance_loss_mlp": 1.01552141, + "epoch": 0.7065684653539757, + "flos": 22898924556840.0, + "grad_norm": 2.1374472886160696, + "language_loss": 0.68311167, + "learning_rate": 8.370999604364634e-07, + "loss": 0.70660108, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12109375, + "step": 11752, + "time_per_iteration": 2.7510249614715576 + }, + { + "auxiliary_loss_clip": 0.01320977, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.21509576, + "balance_loss_mlp": 1.01798296, + "epoch": 0.7066285886066436, + "flos": 23555540334960.0, + "grad_norm": 2.557930052345346, + "language_loss": 0.77062893, + "learning_rate": 8.367831234246025e-07, + "loss": 0.79414487, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12640381, + "step": 11753, + "time_per_iteration": 2.7459003925323486 + }, + { + "auxiliary_loss_clip": 0.01318659, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.21457744, + "balance_loss_mlp": 1.01581001, + "epoch": 0.7066887118593116, + "flos": 21074265401400.0, + "grad_norm": 1.5795937789253798, + "language_loss": 0.70697522, + "learning_rate": 8.364663305220405e-07, + "loss": 0.73044366, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.12384033, + "step": 11754, + "time_per_iteration": 2.776703119277954 + }, + { + "auxiliary_loss_clip": 0.01326102, + "auxiliary_loss_mlp": 0.0102588, + "balance_loss_clip": 1.21795118, + "balance_loss_mlp": 1.01345825, + "epoch": 0.7067488351119796, + "flos": 21180973836240.0, + "grad_norm": 1.6605521004153825, + "language_loss": 0.89202428, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91554409, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12426758, + "step": 11755, + "time_per_iteration": 2.727304697036743 + }, + { + "auxiliary_loss_clip": 0.01320626, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.21447003, + "balance_loss_mlp": 1.0173018, + "epoch": 0.7068089583646475, + "flos": 20454139557720.0, + "grad_norm": 1.8351817896898348, + "language_loss": 0.7968002, + "learning_rate": 8.358328770928678e-07, + "loss": 0.82030833, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12890625, + "step": 11756, + "time_per_iteration": 2.889970064163208 + }, + { + "auxiliary_loss_clip": 0.01154189, + "auxiliary_loss_mlp": 0.01016316, + "balance_loss_clip": 1.10993528, + "balance_loss_mlp": 1.01259637, + "epoch": 0.7068690816173155, + "flos": 59121954271080.0, + "grad_norm": 0.8366290542178836, + "language_loss": 0.60381061, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62551564, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.03710938, + "step": 11757, + "time_per_iteration": 3.085045337677002 + }, + { + "auxiliary_loss_clip": 0.01324181, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.21815705, + "balance_loss_mlp": 1.0152123, + "epoch": 0.7069292048699835, + "flos": 16255685462400.0, + "grad_norm": 1.645194383265359, + "language_loss": 0.80564582, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82917178, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.13201904, + "step": 11758, + "time_per_iteration": 2.7151026725769043 + }, + { + "auxiliary_loss_clip": 0.0132209, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.2159605, + "balance_loss_mlp": 1.01645517, + "epoch": 0.7069893281226515, + "flos": 41180341102920.0, + "grad_norm": 1.6013020153933597, + "language_loss": 0.77765, + "learning_rate": 8.348830280691304e-07, + "loss": 0.80116153, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12609863, + "step": 11759, + "time_per_iteration": 2.964545488357544 + }, + { + "auxiliary_loss_clip": 0.01321244, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.21479821, + "balance_loss_mlp": 1.01506996, + "epoch": 0.7070494513753194, + "flos": 24212684021760.0, + "grad_norm": 1.60145332355429, + "language_loss": 0.68212968, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70562434, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13165283, + "step": 11760, + "time_per_iteration": 2.836582899093628 + }, + { + "auxiliary_loss_clip": 0.01329943, + "auxiliary_loss_mlp": 0.01025672, + "balance_loss_clip": 1.22007608, + "balance_loss_mlp": 1.01350093, + "epoch": 0.7071095746279874, + "flos": 20189491918920.0, + "grad_norm": 2.148716251331723, + "language_loss": 0.80564356, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82919967, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1217041, + "step": 11761, + "time_per_iteration": 2.7770135402679443 + }, + { + "auxiliary_loss_clip": 0.01328705, + "auxiliary_loss_mlp": 0.01028841, + "balance_loss_clip": 1.22085381, + "balance_loss_mlp": 1.01505494, + "epoch": 0.7071696978806553, + "flos": 18187134269400.0, + "grad_norm": 2.564014390732875, + "language_loss": 0.75412041, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77769589, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13793945, + "step": 11762, + "time_per_iteration": 2.7246899604797363 + }, + { + "auxiliary_loss_clip": 0.01323397, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.21754146, + "balance_loss_mlp": 1.01612604, + "epoch": 0.7072298211333233, + "flos": 24136780350960.0, + "grad_norm": 1.6370055211598797, + "language_loss": 0.77127874, + "learning_rate": 8.336171812990724e-07, + "loss": 0.79479897, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12512207, + "step": 11763, + "time_per_iteration": 2.768484592437744 + }, + { + "auxiliary_loss_clip": 0.01327062, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.21972871, + "balance_loss_mlp": 1.01948965, + "epoch": 0.7072899443859912, + "flos": 27204127695360.0, + "grad_norm": 2.154708779696085, + "language_loss": 0.78997791, + "learning_rate": 8.333008301499453e-07, + "loss": 0.81358445, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.14093018, + "step": 11764, + "time_per_iteration": 2.7766737937927246 + }, + { + "auxiliary_loss_clip": 0.01332446, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.22223377, + "balance_loss_mlp": 1.02240741, + "epoch": 0.7073500676386593, + "flos": 16439962510800.0, + "grad_norm": 1.6026080169564303, + "language_loss": 0.79774547, + "learning_rate": 8.32984523242167e-07, + "loss": 0.82143235, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13830566, + "step": 11765, + "time_per_iteration": 2.7746076583862305 + }, + { + "auxiliary_loss_clip": 0.01321405, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.21628916, + "balance_loss_mlp": 1.01579428, + "epoch": 0.7074101908913272, + "flos": 27679633877160.0, + "grad_norm": 1.656300398105101, + "language_loss": 0.68541348, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70889616, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1105957, + "step": 11766, + "time_per_iteration": 2.8179237842559814 + }, + { + "auxiliary_loss_clip": 0.013266, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.21915817, + "balance_loss_mlp": 1.02228236, + "epoch": 0.7074703141439952, + "flos": 22243364596080.0, + "grad_norm": 1.9029668470901764, + "language_loss": 0.64398932, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66760063, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12255859, + "step": 11767, + "time_per_iteration": 2.7800416946411133 + }, + { + "auxiliary_loss_clip": 0.0132545, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.21758664, + "balance_loss_mlp": 1.016505, + "epoch": 0.7075304373966632, + "flos": 29649278169720.0, + "grad_norm": 1.6577120668326517, + "language_loss": 0.53095031, + "learning_rate": 8.320358680868646e-07, + "loss": 0.55449843, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12860107, + "step": 11768, + "time_per_iteration": 2.886704206466675 + }, + { + "auxiliary_loss_clip": 0.01320185, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.21495306, + "balance_loss_mlp": 1.01833081, + "epoch": 0.7075905606493311, + "flos": 19759937419440.0, + "grad_norm": 1.8520839306972883, + "language_loss": 0.7576651, + "learning_rate": 8.317197382644119e-07, + "loss": 0.7811752, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12493896, + "step": 11769, + "time_per_iteration": 2.8614537715911865 + }, + { + "auxiliary_loss_clip": 0.01154329, + "auxiliary_loss_mlp": 0.01006027, + "balance_loss_clip": 1.10992432, + "balance_loss_mlp": 1.00288033, + "epoch": 0.7076506839019991, + "flos": 65730165332040.0, + "grad_norm": 0.8521078483127945, + "language_loss": 0.62047118, + "learning_rate": 8.314036527432637e-07, + "loss": 0.6420747, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.03149414, + "step": 11770, + "time_per_iteration": 3.2555336952209473 + }, + { + "auxiliary_loss_clip": 0.01329385, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.22037041, + "balance_loss_mlp": 1.01880908, + "epoch": 0.707710807154667, + "flos": 23770094238720.0, + "grad_norm": 1.6069209311642105, + "language_loss": 0.76436567, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78797138, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.1237793, + "step": 11771, + "time_per_iteration": 2.7484140396118164 + }, + { + "auxiliary_loss_clip": 0.01312394, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.2090323, + "balance_loss_mlp": 1.01448488, + "epoch": 0.7077709304073351, + "flos": 21256308990000.0, + "grad_norm": 1.4611485663563213, + "language_loss": 0.71881497, + "learning_rate": 8.307716146528221e-07, + "loss": 0.74220479, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.12097168, + "step": 11772, + "time_per_iteration": 2.789261817932129 + }, + { + "auxiliary_loss_clip": 0.01333283, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.22340155, + "balance_loss_mlp": 1.01652551, + "epoch": 0.707831053660003, + "flos": 20745652949640.0, + "grad_norm": 1.771915805118637, + "language_loss": 0.70163995, + "learning_rate": 8.30455662107496e-07, + "loss": 0.72527546, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13745117, + "step": 11773, + "time_per_iteration": 2.844423294067383 + }, + { + "auxiliary_loss_clip": 0.01329942, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.22211802, + "balance_loss_mlp": 1.01683712, + "epoch": 0.707891176912671, + "flos": 21986026462080.0, + "grad_norm": 1.4049561554827195, + "language_loss": 0.70261723, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72620344, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.11828613, + "step": 11774, + "time_per_iteration": 2.790625810623169 + }, + { + "auxiliary_loss_clip": 0.01316625, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.21361995, + "balance_loss_mlp": 1.01615691, + "epoch": 0.7079513001653389, + "flos": 21073899926160.0, + "grad_norm": 1.523668348809785, + "language_loss": 0.74581563, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76925731, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.1138916, + "step": 11775, + "time_per_iteration": 2.738095283508301 + }, + { + "auxiliary_loss_clip": 0.01323901, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.21643651, + "balance_loss_mlp": 1.01567721, + "epoch": 0.7080114234180069, + "flos": 18045194759280.0, + "grad_norm": 1.795864430068966, + "language_loss": 0.87354559, + "learning_rate": 8.295080706148665e-07, + "loss": 0.8970679, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12664795, + "step": 11776, + "time_per_iteration": 2.716228485107422 + }, + { + "auxiliary_loss_clip": 0.01318521, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.21318114, + "balance_loss_mlp": 1.01598144, + "epoch": 0.7080715466706748, + "flos": 15126487304400.0, + "grad_norm": 1.542490082306668, + "language_loss": 0.74790877, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77137184, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11804199, + "step": 11777, + "time_per_iteration": 2.7204277515411377 + }, + { + "auxiliary_loss_clip": 0.0133471, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.22360682, + "balance_loss_mlp": 1.01841378, + "epoch": 0.7081316699233429, + "flos": 14426112695400.0, + "grad_norm": 1.9974359039826652, + "language_loss": 0.82644266, + "learning_rate": 8.288765648590066e-07, + "loss": 0.85010517, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13122559, + "step": 11778, + "time_per_iteration": 2.674349308013916 + }, + { + "auxiliary_loss_clip": 0.01311043, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.20884645, + "balance_loss_mlp": 1.01695597, + "epoch": 0.7081917931760108, + "flos": 23227699442040.0, + "grad_norm": 1.5090869026328866, + "language_loss": 0.84981894, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87320828, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.10931396, + "step": 11779, + "time_per_iteration": 4.158705711364746 + }, + { + "auxiliary_loss_clip": 0.01328591, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.22019184, + "balance_loss_mlp": 1.02021456, + "epoch": 0.7082519164286788, + "flos": 39315293610480.0, + "grad_norm": 2.0174045766933406, + "language_loss": 0.71646917, + "learning_rate": 8.28245236739618e-07, + "loss": 0.74007881, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12158203, + "step": 11780, + "time_per_iteration": 4.483896255493164 + }, + { + "auxiliary_loss_clip": 0.01324079, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.21832085, + "balance_loss_mlp": 1.01785231, + "epoch": 0.7083120396813467, + "flos": 21655952109360.0, + "grad_norm": 1.5580438878636287, + "language_loss": 0.73372817, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75727057, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12310791, + "step": 11781, + "time_per_iteration": 4.179202318191528 + }, + { + "auxiliary_loss_clip": 0.013227, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.2173667, + "balance_loss_mlp": 1.0175612, + "epoch": 0.7083721629340147, + "flos": 17571597170400.0, + "grad_norm": 1.4998729226678797, + "language_loss": 0.77415806, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79768002, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.11932373, + "step": 11782, + "time_per_iteration": 2.712552547454834 + }, + { + "auxiliary_loss_clip": 0.01317666, + "auxiliary_loss_mlp": 0.01027231, + "balance_loss_clip": 1.21320879, + "balance_loss_mlp": 1.01627553, + "epoch": 0.7084322861866827, + "flos": 29356424701920.0, + "grad_norm": 1.434928727945007, + "language_loss": 0.70084202, + "learning_rate": 8.272985778383828e-07, + "loss": 0.72429097, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.10949707, + "step": 11783, + "time_per_iteration": 2.8931615352630615 + }, + { + "auxiliary_loss_clip": 0.01329988, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.22111368, + "balance_loss_mlp": 1.01587009, + "epoch": 0.7084924094393507, + "flos": 20199237925320.0, + "grad_norm": 1.558510633361424, + "language_loss": 0.79229337, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81587708, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12506104, + "step": 11784, + "time_per_iteration": 2.8353562355041504 + }, + { + "auxiliary_loss_clip": 0.01327098, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.2201736, + "balance_loss_mlp": 1.02041686, + "epoch": 0.7085525326920187, + "flos": 23482479249360.0, + "grad_norm": 1.7120380290828434, + "language_loss": 0.77611619, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79971898, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12744141, + "step": 11785, + "time_per_iteration": 2.8143532276153564 + }, + { + "auxiliary_loss_clip": 0.01319681, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.21308303, + "balance_loss_mlp": 1.02096868, + "epoch": 0.7086126559446866, + "flos": 25964485133400.0, + "grad_norm": 1.4969351870772711, + "language_loss": 0.78135991, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80489045, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12420654, + "step": 11786, + "time_per_iteration": 2.837562322616577 + }, + { + "auxiliary_loss_clip": 0.01325565, + "auxiliary_loss_mlp": 0.01024948, + "balance_loss_clip": 1.21732414, + "balance_loss_mlp": 1.01216841, + "epoch": 0.7086727791973546, + "flos": 26730530106480.0, + "grad_norm": 1.7866038061997775, + "language_loss": 0.78885329, + "learning_rate": 8.260369885912526e-07, + "loss": 0.8123585, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12774658, + "step": 11787, + "time_per_iteration": 2.7800259590148926 + }, + { + "auxiliary_loss_clip": 0.01321767, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.21526349, + "balance_loss_mlp": 1.0147438, + "epoch": 0.7087329024500225, + "flos": 21687609648960.0, + "grad_norm": 1.749017648960796, + "language_loss": 0.76855612, + "learning_rate": 8.257217025415615e-07, + "loss": 0.79204798, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12683105, + "step": 11788, + "time_per_iteration": 4.287750959396362 + }, + { + "auxiliary_loss_clip": 0.01335118, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.22398925, + "balance_loss_mlp": 1.01265216, + "epoch": 0.7087930257026905, + "flos": 17935562522520.0, + "grad_norm": 1.9106315601061148, + "language_loss": 0.68423963, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70784938, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 1.11083984, + "router_z_loss_mlp": 0.13226318, + "step": 11789, + "time_per_iteration": 2.7378101348876953 + }, + { + "auxiliary_loss_clip": 0.01333273, + "auxiliary_loss_mlp": 0.01028122, + "balance_loss_clip": 1.22296643, + "balance_loss_mlp": 1.01418078, + "epoch": 0.7088531489553584, + "flos": 18915552273960.0, + "grad_norm": 1.5540963372765626, + "language_loss": 0.77670348, + "learning_rate": 8.250912640403858e-07, + "loss": 0.80031741, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13934326, + "step": 11790, + "time_per_iteration": 2.6898574829101562 + }, + { + "auxiliary_loss_clip": 0.01336572, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.22419763, + "balance_loss_mlp": 1.01676726, + "epoch": 0.7089132722080265, + "flos": 27386617975920.0, + "grad_norm": 1.962699010053534, + "language_loss": 0.71506381, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73872977, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13232422, + "step": 11791, + "time_per_iteration": 2.923938035964966 + }, + { + "auxiliary_loss_clip": 0.01324132, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.21750951, + "balance_loss_mlp": 1.01648796, + "epoch": 0.7089733954606944, + "flos": 22167907617240.0, + "grad_norm": 1.5061438391326436, + "language_loss": 0.82413733, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84767663, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13323975, + "step": 11792, + "time_per_iteration": 2.7297728061676025 + }, + { + "auxiliary_loss_clip": 0.01334713, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.2231586, + "balance_loss_mlp": 1.01847744, + "epoch": 0.7090335187133624, + "flos": 24431217544800.0, + "grad_norm": 1.9340973622309643, + "language_loss": 0.64885545, + "learning_rate": 8.241459404634232e-07, + "loss": 0.67252517, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13793945, + "step": 11793, + "time_per_iteration": 2.7718665599823 + }, + { + "auxiliary_loss_clip": 0.01321079, + "auxiliary_loss_mlp": 0.01024372, + "balance_loss_clip": 1.21566248, + "balance_loss_mlp": 1.0124867, + "epoch": 0.7090936419660303, + "flos": 21840269766120.0, + "grad_norm": 1.9429757720608722, + "language_loss": 0.70536673, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72882116, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11877441, + "step": 11794, + "time_per_iteration": 2.7511069774627686 + }, + { + "auxiliary_loss_clip": 0.01323559, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.21837187, + "balance_loss_mlp": 1.02057493, + "epoch": 0.7091537652186983, + "flos": 20086966145160.0, + "grad_norm": 1.8123050769168774, + "language_loss": 0.76394439, + "learning_rate": 8.23515947668052e-07, + "loss": 0.78751087, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12512207, + "step": 11795, + "time_per_iteration": 2.7267022132873535 + }, + { + "auxiliary_loss_clip": 0.01321836, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.21456003, + "balance_loss_mlp": 1.0198791, + "epoch": 0.7092138884713663, + "flos": 13155462327600.0, + "grad_norm": 2.0712575717054498, + "language_loss": 0.75144196, + "learning_rate": 8.232010181829838e-07, + "loss": 0.7749877, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12866211, + "step": 11796, + "time_per_iteration": 2.72692608833313 + }, + { + "auxiliary_loss_clip": 0.01336292, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.22407091, + "balance_loss_mlp": 1.02218199, + "epoch": 0.7092740117240343, + "flos": 21650104505520.0, + "grad_norm": 3.234611288569461, + "language_loss": 0.74193656, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76566666, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14538574, + "step": 11797, + "time_per_iteration": 2.7203636169433594 + }, + { + "auxiliary_loss_clip": 0.01329463, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.22175145, + "balance_loss_mlp": 1.01604104, + "epoch": 0.7093341349767023, + "flos": 21037450600080.0, + "grad_norm": 1.3725650423886564, + "language_loss": 0.79325038, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81683302, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12768555, + "step": 11798, + "time_per_iteration": 2.7611260414123535 + }, + { + "auxiliary_loss_clip": 0.01324794, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.21860814, + "balance_loss_mlp": 1.01758552, + "epoch": 0.7093942582293702, + "flos": 22022719438320.0, + "grad_norm": 2.255026375146008, + "language_loss": 0.67177987, + "learning_rate": 8.222564975215529e-07, + "loss": 0.69533408, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13031006, + "step": 11799, + "time_per_iteration": 2.780195474624634 + }, + { + "auxiliary_loss_clip": 0.01327559, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.21913433, + "balance_loss_mlp": 1.01324856, + "epoch": 0.7094543814820382, + "flos": 27241389188640.0, + "grad_norm": 1.5641819524053537, + "language_loss": 0.81796694, + "learning_rate": 8.219417466054622e-07, + "loss": 0.84151548, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.14038086, + "step": 11800, + "time_per_iteration": 2.816986560821533 + }, + { + "auxiliary_loss_clip": 0.01323326, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.2168901, + "balance_loss_mlp": 1.01412439, + "epoch": 0.7095145047347061, + "flos": 12092340617280.0, + "grad_norm": 1.892906867781727, + "language_loss": 0.86885178, + "learning_rate": 8.21627040361459e-07, + "loss": 0.89235401, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12762451, + "step": 11801, + "time_per_iteration": 2.705665349960327 + }, + { + "auxiliary_loss_clip": 0.01328058, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.22022545, + "balance_loss_mlp": 1.01839137, + "epoch": 0.7095746279873741, + "flos": 19387606745160.0, + "grad_norm": 2.402003295848093, + "language_loss": 0.76378393, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78737485, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12652588, + "step": 11802, + "time_per_iteration": 2.768754005432129 + }, + { + "auxiliary_loss_clip": 0.01326964, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.21801138, + "balance_loss_mlp": 1.02465522, + "epoch": 0.709634751240042, + "flos": 21365535143160.0, + "grad_norm": 1.830320322257707, + "language_loss": 0.81685603, + "learning_rate": 8.209977619374462e-07, + "loss": 0.84050381, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13153076, + "step": 11803, + "time_per_iteration": 2.7305655479431152 + }, + { + "auxiliary_loss_clip": 0.01329799, + "auxiliary_loss_mlp": 0.0102931, + "balance_loss_clip": 1.21968436, + "balance_loss_mlp": 1.01570272, + "epoch": 0.7096948744927101, + "flos": 13920938783640.0, + "grad_norm": 2.0724679838577025, + "language_loss": 0.67769253, + "learning_rate": 8.206831897812995e-07, + "loss": 0.70128357, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.1362915, + "step": 11804, + "time_per_iteration": 2.8320553302764893 + }, + { + "auxiliary_loss_clip": 0.01315705, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.21347761, + "balance_loss_mlp": 1.01745343, + "epoch": 0.709754997745378, + "flos": 30304269613440.0, + "grad_norm": 1.8210453968333626, + "language_loss": 0.78660941, + "learning_rate": 8.203686623449637e-07, + "loss": 0.81005466, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.11364746, + "step": 11805, + "time_per_iteration": 2.8201305866241455 + }, + { + "auxiliary_loss_clip": 0.013327, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.22419322, + "balance_loss_mlp": 1.01752448, + "epoch": 0.709815120998046, + "flos": 18519766948800.0, + "grad_norm": 2.065065097015327, + "language_loss": 0.78951842, + "learning_rate": 8.200541796403667e-07, + "loss": 0.81315464, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13397217, + "step": 11806, + "time_per_iteration": 2.7676970958709717 + }, + { + "auxiliary_loss_clip": 0.013236, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.21633673, + "balance_loss_mlp": 1.01897025, + "epoch": 0.7098752442507139, + "flos": 22277621070720.0, + "grad_norm": 2.0736565789397408, + "language_loss": 0.56513721, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58868825, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12536621, + "step": 11807, + "time_per_iteration": 2.786287546157837 + }, + { + "auxiliary_loss_clip": 0.0133414, + "auxiliary_loss_mlp": 0.01032664, + "balance_loss_clip": 1.22083819, + "balance_loss_mlp": 1.01909781, + "epoch": 0.7099353675033819, + "flos": 19279395801000.0, + "grad_norm": 2.0241988880793906, + "language_loss": 0.68570769, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70937574, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13574219, + "step": 11808, + "time_per_iteration": 2.7617721557617188 + }, + { + "auxiliary_loss_clip": 0.01330052, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.21952605, + "balance_loss_mlp": 1.01624441, + "epoch": 0.70999549075605, + "flos": 21913615110240.0, + "grad_norm": 1.8418746296177575, + "language_loss": 0.71624768, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73983836, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12774658, + "step": 11809, + "time_per_iteration": 2.805612087249756 + }, + { + "auxiliary_loss_clip": 0.01154056, + "auxiliary_loss_mlp": 0.01003219, + "balance_loss_clip": 1.10979474, + "balance_loss_mlp": 0.99930871, + "epoch": 0.7100556140087179, + "flos": 70470283273560.0, + "grad_norm": 0.7540919254832366, + "language_loss": 0.59451449, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61608726, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.0390625, + "step": 11810, + "time_per_iteration": 3.376530170440674 + }, + { + "auxiliary_loss_clip": 0.01329574, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.22324347, + "balance_loss_mlp": 1.01926756, + "epoch": 0.7101157372613859, + "flos": 23044803077880.0, + "grad_norm": 1.7831731520814176, + "language_loss": 0.74183607, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76544833, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12384033, + "step": 11811, + "time_per_iteration": 2.762812376022339 + }, + { + "auxiliary_loss_clip": 0.01320082, + "auxiliary_loss_mlp": 0.01023989, + "balance_loss_clip": 1.21615112, + "balance_loss_mlp": 1.01160359, + "epoch": 0.7101758605140538, + "flos": 23190559773840.0, + "grad_norm": 1.791518331623077, + "language_loss": 0.83917427, + "learning_rate": 8.181682234469882e-07, + "loss": 0.86261493, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12384033, + "step": 11812, + "time_per_iteration": 2.751593828201294 + }, + { + "auxiliary_loss_clip": 0.01326397, + "auxiliary_loss_mlp": 0.01027742, + "balance_loss_clip": 1.21721101, + "balance_loss_mlp": 1.01442611, + "epoch": 0.7102359837667218, + "flos": 23701418856000.0, + "grad_norm": 2.26576684965624, + "language_loss": 0.70146453, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72500592, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13317871, + "step": 11813, + "time_per_iteration": 2.7622013092041016 + }, + { + "auxiliary_loss_clip": 0.0131932, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.2145412, + "balance_loss_mlp": 1.01365972, + "epoch": 0.7102961070193897, + "flos": 19395890850600.0, + "grad_norm": 1.7817874804407816, + "language_loss": 0.82209092, + "learning_rate": 8.175399297768495e-07, + "loss": 0.84554052, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.11975098, + "step": 11814, + "time_per_iteration": 2.7610065937042236 + }, + { + "auxiliary_loss_clip": 0.01326736, + "auxiliary_loss_mlp": 0.01026988, + "balance_loss_clip": 1.22010446, + "balance_loss_mlp": 1.01411915, + "epoch": 0.7103562302720577, + "flos": 21512591306640.0, + "grad_norm": 1.9557842675277413, + "language_loss": 0.76593614, + "learning_rate": 8.172258501943301e-07, + "loss": 0.78947341, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12860107, + "step": 11815, + "time_per_iteration": 2.765913963317871 + }, + { + "auxiliary_loss_clip": 0.01324202, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.2195816, + "balance_loss_mlp": 1.01480699, + "epoch": 0.7104163535247257, + "flos": 14538749950800.0, + "grad_norm": 1.6392778938955341, + "language_loss": 0.79355913, + "learning_rate": 8.16911815462725e-07, + "loss": 0.81707257, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.12335205, + "step": 11816, + "time_per_iteration": 2.8524978160858154 + }, + { + "auxiliary_loss_clip": 0.01326338, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.2186532, + "balance_loss_mlp": 1.02328587, + "epoch": 0.7104764767773937, + "flos": 11403986082840.0, + "grad_norm": 1.8099644582045984, + "language_loss": 0.86743569, + "learning_rate": 8.165978255939426e-07, + "loss": 0.8910585, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12646484, + "step": 11817, + "time_per_iteration": 2.699840545654297 + }, + { + "auxiliary_loss_clip": 0.0131967, + "auxiliary_loss_mlp": 0.01025278, + "balance_loss_clip": 1.21391106, + "balance_loss_mlp": 1.01295805, + "epoch": 0.7105366000300616, + "flos": 11693915748720.0, + "grad_norm": 2.2110370465163425, + "language_loss": 0.85014236, + "learning_rate": 8.162838805998897e-07, + "loss": 0.8735919, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12341309, + "step": 11818, + "time_per_iteration": 5.533309459686279 + }, + { + "auxiliary_loss_clip": 0.0132627, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.21743202, + "balance_loss_mlp": 1.01495576, + "epoch": 0.7105967232827296, + "flos": 19358710574040.0, + "grad_norm": 3.061404190965662, + "language_loss": 0.76173198, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78527272, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1282959, + "step": 11819, + "time_per_iteration": 4.260673761367798 + }, + { + "auxiliary_loss_clip": 0.01325455, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.2190057, + "balance_loss_mlp": 1.01550007, + "epoch": 0.7106568465353975, + "flos": 22935658141440.0, + "grad_norm": 1.4991723297333763, + "language_loss": 0.7106173, + "learning_rate": 8.156561252835883e-07, + "loss": 0.73416579, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13903809, + "step": 11820, + "time_per_iteration": 2.8578004837036133 + }, + { + "auxiliary_loss_clip": 0.01320837, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.2144649, + "balance_loss_mlp": 1.01579916, + "epoch": 0.7107169697880655, + "flos": 19104458675400.0, + "grad_norm": 1.712090378735194, + "language_loss": 0.75353426, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77702504, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12438965, + "step": 11821, + "time_per_iteration": 2.7280092239379883 + }, + { + "auxiliary_loss_clip": 0.01151639, + "auxiliary_loss_mlp": 0.01007016, + "balance_loss_clip": 1.10834253, + "balance_loss_mlp": 1.0031538, + "epoch": 0.7107770930407336, + "flos": 63652025836800.0, + "grad_norm": 0.7634731792145094, + "language_loss": 0.55192387, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57351047, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.03857422, + "step": 11822, + "time_per_iteration": 3.2724056243896484 + }, + { + "auxiliary_loss_clip": 0.01315832, + "auxiliary_loss_mlp": 0.01024049, + "balance_loss_clip": 1.21421313, + "balance_loss_mlp": 1.01216996, + "epoch": 0.7108372162934015, + "flos": 22059574848000.0, + "grad_norm": 2.02519398184246, + "language_loss": 0.60824871, + "learning_rate": 8.147148291671688e-07, + "loss": 0.63164747, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.11883545, + "step": 11823, + "time_per_iteration": 2.7986221313476562 + }, + { + "auxiliary_loss_clip": 0.01322618, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.21648574, + "balance_loss_mlp": 1.01353669, + "epoch": 0.7108973395460695, + "flos": 19139811575760.0, + "grad_norm": 1.882192050683841, + "language_loss": 0.71684456, + "learning_rate": 8.144011536714322e-07, + "loss": 0.74032557, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.1194458, + "step": 11824, + "time_per_iteration": 2.725027561187744 + }, + { + "auxiliary_loss_clip": 0.01318517, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.21599066, + "balance_loss_mlp": 1.01527476, + "epoch": 0.7109574627987374, + "flos": 17898910154640.0, + "grad_norm": 1.7445429537655341, + "language_loss": 0.72858989, + "learning_rate": 8.140875231337223e-07, + "loss": 0.75203693, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.10913086, + "step": 11825, + "time_per_iteration": 2.795346975326538 + }, + { + "auxiliary_loss_clip": 0.01330768, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.22209191, + "balance_loss_mlp": 1.02287555, + "epoch": 0.7110175860514054, + "flos": 28984703153040.0, + "grad_norm": 2.2508614435665555, + "language_loss": 0.8019591, + "learning_rate": 8.137739375659321e-07, + "loss": 0.82562006, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.12457275, + "step": 11826, + "time_per_iteration": 2.8018431663513184 + }, + { + "auxiliary_loss_clip": 0.01323354, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.21869147, + "balance_loss_mlp": 1.01794851, + "epoch": 0.7110777093040733, + "flos": 26178429911760.0, + "grad_norm": 1.4858979642058765, + "language_loss": 0.83528113, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85881245, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11846924, + "step": 11827, + "time_per_iteration": 4.398839950561523 + }, + { + "auxiliary_loss_clip": 0.01330461, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.22195148, + "balance_loss_mlp": 1.01465893, + "epoch": 0.7111378325567413, + "flos": 26875596460320.0, + "grad_norm": 1.343442723062059, + "language_loss": 0.62513268, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64871109, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1272583, + "step": 11828, + "time_per_iteration": 2.850043773651123 + }, + { + "auxiliary_loss_clip": 0.01331543, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.22447085, + "balance_loss_mlp": 1.01535892, + "epoch": 0.7111979558094093, + "flos": 27277594864560.0, + "grad_norm": 1.4549810477658305, + "language_loss": 0.72197324, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74556679, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12469482, + "step": 11829, + "time_per_iteration": 2.899317979812622 + }, + { + "auxiliary_loss_clip": 0.01326776, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.22069621, + "balance_loss_mlp": 1.01992869, + "epoch": 0.7112580790620773, + "flos": 25052521030920.0, + "grad_norm": 2.023436804854726, + "language_loss": 0.80752993, + "learning_rate": 8.125200452317697e-07, + "loss": 0.83111727, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12017822, + "step": 11830, + "time_per_iteration": 2.8342483043670654 + }, + { + "auxiliary_loss_clip": 0.01331044, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.22314906, + "balance_loss_mlp": 1.02078152, + "epoch": 0.7113182023147452, + "flos": 21650835456000.0, + "grad_norm": 1.6116172512541047, + "language_loss": 0.8439703, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86760682, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.11834717, + "step": 11831, + "time_per_iteration": 2.750643014907837 + }, + { + "auxiliary_loss_clip": 0.01329914, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.2217052, + "balance_loss_mlp": 1.01980639, + "epoch": 0.7113783255674132, + "flos": 21001244924160.0, + "grad_norm": 1.8363408014430016, + "language_loss": 0.77635419, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79997432, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12280273, + "step": 11832, + "time_per_iteration": 2.7877185344696045 + }, + { + "auxiliary_loss_clip": 0.01151078, + "auxiliary_loss_mlp": 0.01007392, + "balance_loss_clip": 1.10782635, + "balance_loss_mlp": 1.00398302, + "epoch": 0.7114384488200811, + "flos": 66784840503480.0, + "grad_norm": 0.7495127297020986, + "language_loss": 0.56611788, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58770251, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.03417969, + "step": 11833, + "time_per_iteration": 3.1878063678741455 + }, + { + "auxiliary_loss_clip": 0.0132405, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.2178731, + "balance_loss_mlp": 1.01990461, + "epoch": 0.7114985720727491, + "flos": 25015584404520.0, + "grad_norm": 1.6071058979007504, + "language_loss": 0.71063775, + "learning_rate": 8.11266873367315e-07, + "loss": 0.73419374, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.11651611, + "step": 11834, + "time_per_iteration": 2.7818071842193604 + }, + { + "auxiliary_loss_clip": 0.01331457, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.22280455, + "balance_loss_mlp": 1.01486528, + "epoch": 0.7115586953254172, + "flos": 21475086163200.0, + "grad_norm": 2.0349491481587254, + "language_loss": 0.79502809, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81861997, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12872314, + "step": 11835, + "time_per_iteration": 2.7643349170684814 + }, + { + "auxiliary_loss_clip": 0.0132222, + "auxiliary_loss_mlp": 0.0102551, + "balance_loss_clip": 1.21755707, + "balance_loss_mlp": 1.01366711, + "epoch": 0.7116188185780851, + "flos": 28628656431120.0, + "grad_norm": 2.3510855825701555, + "language_loss": 0.76267815, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78615546, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.11846924, + "step": 11836, + "time_per_iteration": 2.795050621032715 + }, + { + "auxiliary_loss_clip": 0.01322374, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.21693826, + "balance_loss_mlp": 1.01987755, + "epoch": 0.7116789418307531, + "flos": 25298082740520.0, + "grad_norm": 1.8201473701719413, + "language_loss": 0.70670927, + "learning_rate": 8.103274677346208e-07, + "loss": 0.73026055, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12896729, + "step": 11837, + "time_per_iteration": 2.825237512588501 + }, + { + "auxiliary_loss_clip": 0.01337736, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.22745359, + "balance_loss_mlp": 1.01873863, + "epoch": 0.711739065083421, + "flos": 25562892812760.0, + "grad_norm": 1.7871917189446895, + "language_loss": 0.61836982, + "learning_rate": 8.100144227328958e-07, + "loss": 0.64207327, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13873291, + "step": 11838, + "time_per_iteration": 2.8539316654205322 + }, + { + "auxiliary_loss_clip": 0.01328219, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.2219137, + "balance_loss_mlp": 1.0206902, + "epoch": 0.711799188336089, + "flos": 26146650547080.0, + "grad_norm": 2.4652904871907757, + "language_loss": 0.67923605, + "learning_rate": 8.097014228555426e-07, + "loss": 0.7028473, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12200928, + "step": 11839, + "time_per_iteration": 2.888000249862671 + }, + { + "auxiliary_loss_clip": 0.01328132, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.2221173, + "balance_loss_mlp": 1.01986361, + "epoch": 0.7118593115887569, + "flos": 21145417894080.0, + "grad_norm": 2.050608174071099, + "language_loss": 0.84212101, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86572611, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12506104, + "step": 11840, + "time_per_iteration": 2.854722499847412 + }, + { + "auxiliary_loss_clip": 0.01331782, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.22269845, + "balance_loss_mlp": 1.0145278, + "epoch": 0.711919434841425, + "flos": 14979796616160.0, + "grad_norm": 1.8356025464145513, + "language_loss": 0.77085912, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79444611, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12371826, + "step": 11841, + "time_per_iteration": 2.7557477951049805 + }, + { + "auxiliary_loss_clip": 0.01332249, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.22450686, + "balance_loss_mlp": 1.0166142, + "epoch": 0.7119795580940929, + "flos": 16513429680000.0, + "grad_norm": 1.857847440716752, + "language_loss": 0.75569189, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77930874, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1282959, + "step": 11842, + "time_per_iteration": 2.7599904537200928 + }, + { + "auxiliary_loss_clip": 0.01149131, + "auxiliary_loss_mlp": 0.01005067, + "balance_loss_clip": 1.10573268, + "balance_loss_mlp": 1.00184822, + "epoch": 0.7120396813467609, + "flos": 66586895418960.0, + "grad_norm": 0.7865559913420525, + "language_loss": 0.61722231, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63876426, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.03222656, + "step": 11843, + "time_per_iteration": 3.238779067993164 + }, + { + "auxiliary_loss_clip": 0.01324698, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.21857631, + "balance_loss_mlp": 1.01696229, + "epoch": 0.7120998045994288, + "flos": 26438407589160.0, + "grad_norm": 1.756872250491515, + "language_loss": 0.80171382, + "learning_rate": 8.081371007497171e-07, + "loss": 0.825248, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11767578, + "step": 11844, + "time_per_iteration": 2.8522634506225586 + }, + { + "auxiliary_loss_clip": 0.01328013, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.22053695, + "balance_loss_mlp": 1.01538706, + "epoch": 0.7121599278520968, + "flos": 16431353538480.0, + "grad_norm": 2.0437552591698473, + "language_loss": 0.79232013, + "learning_rate": 8.078243718677873e-07, + "loss": 0.81587875, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12457275, + "step": 11845, + "time_per_iteration": 2.7313342094421387 + }, + { + "auxiliary_loss_clip": 0.01322169, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.21882796, + "balance_loss_mlp": 1.01883066, + "epoch": 0.7122200511047647, + "flos": 28955847590280.0, + "grad_norm": 1.721075653651138, + "language_loss": 0.7762022, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79973584, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.12359619, + "step": 11846, + "time_per_iteration": 2.814741849899292 + }, + { + "auxiliary_loss_clip": 0.01333322, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.225245, + "balance_loss_mlp": 1.01521611, + "epoch": 0.7122801743574327, + "flos": 16476208795080.0, + "grad_norm": 2.9773381411293864, + "language_loss": 0.58289796, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60650992, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.12670898, + "step": 11847, + "time_per_iteration": 2.74680757522583 + }, + { + "auxiliary_loss_clip": 0.0131931, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.21769857, + "balance_loss_mlp": 1.02061677, + "epoch": 0.7123402976101008, + "flos": 20635898887800.0, + "grad_norm": 1.9073743481280048, + "language_loss": 0.71586597, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73939663, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.13128662, + "step": 11848, + "time_per_iteration": 2.770341157913208 + }, + { + "auxiliary_loss_clip": 0.01151237, + "auxiliary_loss_mlp": 0.01009601, + "balance_loss_clip": 1.10759163, + "balance_loss_mlp": 1.00640571, + "epoch": 0.7124004208627687, + "flos": 62339687664480.0, + "grad_norm": 0.8267268142529952, + "language_loss": 0.63116848, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65277684, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.03198242, + "step": 11849, + "time_per_iteration": 3.251316785812378 + }, + { + "auxiliary_loss_clip": 0.01334807, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.22609866, + "balance_loss_mlp": 1.01608396, + "epoch": 0.7124605441154367, + "flos": 39683238581880.0, + "grad_norm": 1.4412055525374912, + "language_loss": 0.64441204, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66805285, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13201904, + "step": 11850, + "time_per_iteration": 2.902050733566284 + }, + { + "auxiliary_loss_clip": 0.01331492, + "auxiliary_loss_mlp": 0.01026108, + "balance_loss_clip": 1.22407675, + "balance_loss_mlp": 1.01322103, + "epoch": 0.7125206673681046, + "flos": 28187487940680.0, + "grad_norm": 1.47225343169602, + "language_loss": 0.70269793, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72627389, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12890625, + "step": 11851, + "time_per_iteration": 2.933241128921509 + }, + { + "auxiliary_loss_clip": 0.01325215, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.21926975, + "balance_loss_mlp": 1.01615262, + "epoch": 0.7125807906207726, + "flos": 26182531356120.0, + "grad_norm": 1.5056023011637878, + "language_loss": 0.83215505, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85568583, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.1171875, + "step": 11852, + "time_per_iteration": 2.7843520641326904 + }, + { + "auxiliary_loss_clip": 0.01335936, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.22464728, + "balance_loss_mlp": 1.01499081, + "epoch": 0.7126409138734405, + "flos": 17160746143680.0, + "grad_norm": 5.287456738259691, + "language_loss": 0.73483461, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75848317, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13928223, + "step": 11853, + "time_per_iteration": 2.716858386993408 + }, + { + "auxiliary_loss_clip": 0.01313607, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.2118839, + "balance_loss_mlp": 1.01601362, + "epoch": 0.7127010371261085, + "flos": 18774506147760.0, + "grad_norm": 1.8134777554744381, + "language_loss": 0.92609298, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94950336, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.11419678, + "step": 11854, + "time_per_iteration": 2.7600908279418945 + }, + { + "auxiliary_loss_clip": 0.01324639, + "auxiliary_loss_mlp": 0.01023713, + "balance_loss_clip": 1.22084129, + "balance_loss_mlp": 1.01157165, + "epoch": 0.7127611603787765, + "flos": 20381362730640.0, + "grad_norm": 1.9133757736942822, + "language_loss": 0.79915243, + "learning_rate": 8.046995714123856e-07, + "loss": 0.82263589, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.12133789, + "step": 11855, + "time_per_iteration": 2.7287189960479736 + }, + { + "auxiliary_loss_clip": 0.01324497, + "auxiliary_loss_mlp": 0.01029326, + "balance_loss_clip": 1.21869087, + "balance_loss_mlp": 1.01561737, + "epoch": 0.7128212836314445, + "flos": 20453895907560.0, + "grad_norm": 1.7497382701765452, + "language_loss": 0.73392719, + "learning_rate": 8.043873404639192e-07, + "loss": 0.75746542, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13726807, + "step": 11856, + "time_per_iteration": 4.165524244308472 + }, + { + "auxiliary_loss_clip": 0.01331783, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.22300041, + "balance_loss_mlp": 1.01848364, + "epoch": 0.7128814068841124, + "flos": 23446111140000.0, + "grad_norm": 1.6656192405351375, + "language_loss": 0.70064521, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72427744, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12945557, + "step": 11857, + "time_per_iteration": 5.760505437850952 + }, + { + "auxiliary_loss_clip": 0.01320715, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.21599483, + "balance_loss_mlp": 1.0153439, + "epoch": 0.7129415301367804, + "flos": 18227481998040.0, + "grad_norm": 1.926966264460243, + "language_loss": 0.85328805, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87677693, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.128479, + "step": 11858, + "time_per_iteration": 2.8352370262145996 + }, + { + "auxiliary_loss_clip": 0.01335274, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.22549784, + "balance_loss_mlp": 1.01693654, + "epoch": 0.7130016533894483, + "flos": 15528201450120.0, + "grad_norm": 1.6988659606095344, + "language_loss": 0.80455256, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82820606, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13140869, + "step": 11859, + "time_per_iteration": 2.7014050483703613 + }, + { + "auxiliary_loss_clip": 0.0132699, + "auxiliary_loss_mlp": 0.01029038, + "balance_loss_clip": 1.22155595, + "balance_loss_mlp": 1.01691413, + "epoch": 0.7130617766421163, + "flos": 57125142985680.0, + "grad_norm": 1.4440497291254923, + "language_loss": 0.68582523, + "learning_rate": 8.031388701659456e-07, + "loss": 0.70938551, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12127686, + "step": 11860, + "time_per_iteration": 3.173159122467041 + }, + { + "auxiliary_loss_clip": 0.01330234, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.22293139, + "balance_loss_mlp": 1.01703286, + "epoch": 0.7131218998947844, + "flos": 19792528951320.0, + "grad_norm": 1.7654754417322767, + "language_loss": 0.64581144, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66942292, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13873291, + "step": 11861, + "time_per_iteration": 2.7424728870391846 + }, + { + "auxiliary_loss_clip": 0.01338727, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.22924805, + "balance_loss_mlp": 1.01896334, + "epoch": 0.7131820231474523, + "flos": 26657631454320.0, + "grad_norm": 1.7436712955180218, + "language_loss": 0.67239976, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69610751, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13098145, + "step": 11862, + "time_per_iteration": 2.875602960586548 + }, + { + "auxiliary_loss_clip": 0.01324127, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.21918178, + "balance_loss_mlp": 1.02150071, + "epoch": 0.7132421464001203, + "flos": 29211642606600.0, + "grad_norm": 2.1453449918371037, + "language_loss": 0.67246187, + "learning_rate": 8.022029939445214e-07, + "loss": 0.69603276, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11468506, + "step": 11863, + "time_per_iteration": 2.8628649711608887 + }, + { + "auxiliary_loss_clip": 0.013403, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.22826493, + "balance_loss_mlp": 1.02439129, + "epoch": 0.7133022696527882, + "flos": 23078531643840.0, + "grad_norm": 1.6590277435212002, + "language_loss": 0.66109133, + "learning_rate": 8.018911260294414e-07, + "loss": 0.68487114, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13293457, + "step": 11864, + "time_per_iteration": 2.7769720554351807 + }, + { + "auxiliary_loss_clip": 0.0133451, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.22552204, + "balance_loss_mlp": 1.02263498, + "epoch": 0.7133623929054562, + "flos": 17461599458400.0, + "grad_norm": 1.657862452942983, + "language_loss": 0.85814118, + "learning_rate": 8.015793035467697e-07, + "loss": 0.88184226, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12969971, + "step": 11865, + "time_per_iteration": 4.239623785018921 + }, + { + "auxiliary_loss_clip": 0.01329958, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.22157454, + "balance_loss_mlp": 1.0147773, + "epoch": 0.7134225161581241, + "flos": 19541282071320.0, + "grad_norm": 2.567829551309864, + "language_loss": 0.75354552, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77712846, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13549805, + "step": 11866, + "time_per_iteration": 2.773073196411133 + }, + { + "auxiliary_loss_clip": 0.01339726, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.22997439, + "balance_loss_mlp": 1.01871252, + "epoch": 0.7134826394107922, + "flos": 26255714266800.0, + "grad_norm": 2.02377532767581, + "language_loss": 0.70823324, + "learning_rate": 8.009557949259464e-07, + "loss": 0.73195207, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.13439941, + "step": 11867, + "time_per_iteration": 2.862332344055176 + }, + { + "auxiliary_loss_clip": 0.01323023, + "auxiliary_loss_mlp": 0.01023731, + "balance_loss_clip": 1.21907997, + "balance_loss_mlp": 1.0117265, + "epoch": 0.7135427626634601, + "flos": 15819593016960.0, + "grad_norm": 1.8198667313989478, + "language_loss": 0.72168565, + "learning_rate": 8.006441088114397e-07, + "loss": 0.74515319, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.11999512, + "step": 11868, + "time_per_iteration": 2.942927360534668 + }, + { + "auxiliary_loss_clip": 0.0133788, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.22723556, + "balance_loss_mlp": 1.0157795, + "epoch": 0.7136028859161281, + "flos": 18228009906720.0, + "grad_norm": 2.0831833504417716, + "language_loss": 0.66036242, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68403643, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13739014, + "step": 11869, + "time_per_iteration": 2.763077974319458 + }, + { + "auxiliary_loss_clip": 0.01335413, + "auxiliary_loss_mlp": 0.01023672, + "balance_loss_clip": 1.22620177, + "balance_loss_mlp": 1.01066017, + "epoch": 0.713663009168796, + "flos": 24320001582000.0, + "grad_norm": 1.5607878030348388, + "language_loss": 0.78256392, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80615473, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13024902, + "step": 11870, + "time_per_iteration": 2.790745973587036 + }, + { + "auxiliary_loss_clip": 0.0132791, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.22229147, + "balance_loss_mlp": 1.01658511, + "epoch": 0.713723132421464, + "flos": 26543694731400.0, + "grad_norm": 1.6385272319023505, + "language_loss": 0.8119061, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83548981, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.13891602, + "step": 11871, + "time_per_iteration": 2.8052995204925537 + }, + { + "auxiliary_loss_clip": 0.01336912, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.22741544, + "balance_loss_mlp": 1.01842725, + "epoch": 0.7137832556741319, + "flos": 19870544256840.0, + "grad_norm": 1.607662189482746, + "language_loss": 0.79015994, + "learning_rate": 7.993978192685331e-07, + "loss": 0.81385201, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13879395, + "step": 11872, + "time_per_iteration": 2.7396068572998047 + }, + { + "auxiliary_loss_clip": 0.01340621, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.23038089, + "balance_loss_mlp": 1.01615405, + "epoch": 0.7138433789267999, + "flos": 21693903944760.0, + "grad_norm": 5.201429654990061, + "language_loss": 0.84223336, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86592567, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12457275, + "step": 11873, + "time_per_iteration": 2.7865183353424072 + }, + { + "auxiliary_loss_clip": 0.01325191, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.22039771, + "balance_loss_mlp": 1.01783907, + "epoch": 0.713903502179468, + "flos": 17607193720920.0, + "grad_norm": 2.6692108723917327, + "language_loss": 0.86218488, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88573217, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11688232, + "step": 11874, + "time_per_iteration": 2.7501087188720703 + }, + { + "auxiliary_loss_clip": 0.01334026, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.22605753, + "balance_loss_mlp": 1.0140264, + "epoch": 0.7139636254321359, + "flos": 18044829284040.0, + "grad_norm": 1.760405547367403, + "language_loss": 0.82703197, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85064423, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13189697, + "step": 11875, + "time_per_iteration": 2.7494094371795654 + }, + { + "auxiliary_loss_clip": 0.0134241, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.22847283, + "balance_loss_mlp": 1.02121091, + "epoch": 0.7140237486848039, + "flos": 23336357078160.0, + "grad_norm": 1.753225970537927, + "language_loss": 0.69743502, + "learning_rate": 7.981522581568721e-07, + "loss": 0.72121888, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.14770508, + "step": 11876, + "time_per_iteration": 2.7720494270324707 + }, + { + "auxiliary_loss_clip": 0.01336381, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.22647905, + "balance_loss_mlp": 1.01858294, + "epoch": 0.7140838719374718, + "flos": 16841514223080.0, + "grad_norm": 1.7273049842771098, + "language_loss": 0.78479725, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80848271, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13580322, + "step": 11877, + "time_per_iteration": 2.7438414096832275 + }, + { + "auxiliary_loss_clip": 0.01325933, + "auxiliary_loss_mlp": 0.01039016, + "balance_loss_clip": 1.21980166, + "balance_loss_mlp": 1.02645171, + "epoch": 0.7141439951901398, + "flos": 21146961011760.0, + "grad_norm": 1.7941041375088973, + "language_loss": 0.69860989, + "learning_rate": 7.97529750998934e-07, + "loss": 0.7222594, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12573242, + "step": 11878, + "time_per_iteration": 2.720508337020874 + }, + { + "auxiliary_loss_clip": 0.01324565, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.2189393, + "balance_loss_mlp": 1.01704192, + "epoch": 0.7142041184428077, + "flos": 24723137020320.0, + "grad_norm": 1.8629321932618774, + "language_loss": 0.6749506, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69848853, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12194824, + "step": 11879, + "time_per_iteration": 2.8019096851348877 + }, + { + "auxiliary_loss_clip": 0.01335629, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.22660148, + "balance_loss_mlp": 1.01574409, + "epoch": 0.7142642416954758, + "flos": 21913249635000.0, + "grad_norm": 1.4340282567255687, + "language_loss": 0.6933372, + "learning_rate": 7.969074262321646e-07, + "loss": 0.7169885, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13775635, + "step": 11880, + "time_per_iteration": 2.7955150604248047 + }, + { + "auxiliary_loss_clip": 0.01340541, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.22922385, + "balance_loss_mlp": 1.01812744, + "epoch": 0.7143243649481437, + "flos": 20809495937520.0, + "grad_norm": 2.0841413734717706, + "language_loss": 0.80305958, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82678199, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13555908, + "step": 11881, + "time_per_iteration": 2.7939140796661377 + }, + { + "auxiliary_loss_clip": 0.01332516, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.22561646, + "balance_loss_mlp": 1.02095556, + "epoch": 0.7143844882008117, + "flos": 27240739454880.0, + "grad_norm": 13.85173047378124, + "language_loss": 0.64248127, + "learning_rate": 7.962852839509579e-07, + "loss": 0.66613543, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.11950684, + "step": 11882, + "time_per_iteration": 2.9047739505767822 + }, + { + "auxiliary_loss_clip": 0.01332013, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.22272968, + "balance_loss_mlp": 1.01593494, + "epoch": 0.7144446114534796, + "flos": 17933938188120.0, + "grad_norm": 1.5529547164581263, + "language_loss": 0.69286418, + "learning_rate": 7.959742812719304e-07, + "loss": 0.71647179, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12817383, + "step": 11883, + "time_per_iteration": 2.8838984966278076 + }, + { + "auxiliary_loss_clip": 0.01324644, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.21968412, + "balance_loss_mlp": 1.01379967, + "epoch": 0.7145047347061476, + "flos": 20745896599800.0, + "grad_norm": 1.7966724373050522, + "language_loss": 0.78501391, + "learning_rate": 7.956633242496788e-07, + "loss": 0.8085345, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.13616943, + "step": 11884, + "time_per_iteration": 2.7760753631591797 + }, + { + "auxiliary_loss_clip": 0.01341638, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.22690082, + "balance_loss_mlp": 1.01672101, + "epoch": 0.7145648579588155, + "flos": 21183572771280.0, + "grad_norm": 2.1017230081475065, + "language_loss": 0.74196112, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76568729, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14257812, + "step": 11885, + "time_per_iteration": 2.750755786895752 + }, + { + "auxiliary_loss_clip": 0.01148898, + "auxiliary_loss_mlp": 0.01002626, + "balance_loss_clip": 1.1054709, + "balance_loss_mlp": 0.9995864, + "epoch": 0.7146249812114835, + "flos": 64800675399240.0, + "grad_norm": 0.8924596149787328, + "language_loss": 0.66401404, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68552929, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.03039551, + "step": 11886, + "time_per_iteration": 3.3039143085479736 + }, + { + "auxiliary_loss_clip": 0.01331414, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.22431302, + "balance_loss_mlp": 1.01461291, + "epoch": 0.7146851044641516, + "flos": 18118458886680.0, + "grad_norm": 1.6061803858774493, + "language_loss": 0.75425619, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77784711, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13085938, + "step": 11887, + "time_per_iteration": 2.714444875717163 + }, + { + "auxiliary_loss_clip": 0.01336647, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.22794342, + "balance_loss_mlp": 1.01272321, + "epoch": 0.7147452277168195, + "flos": 19248428603520.0, + "grad_norm": 1.436449123055316, + "language_loss": 0.72026241, + "learning_rate": 7.944199529642372e-07, + "loss": 0.74387908, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1229248, + "step": 11888, + "time_per_iteration": 2.7302520275115967 + }, + { + "auxiliary_loss_clip": 0.01340673, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.22931099, + "balance_loss_mlp": 1.01930857, + "epoch": 0.7148053509694875, + "flos": 23769444504960.0, + "grad_norm": 1.9265496521359697, + "language_loss": 0.84951663, + "learning_rate": 7.941092244027041e-07, + "loss": 0.87324947, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13317871, + "step": 11889, + "time_per_iteration": 2.796032667160034 + }, + { + "auxiliary_loss_clip": 0.01332444, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.2240566, + "balance_loss_mlp": 1.01454949, + "epoch": 0.7148654742221554, + "flos": 22489576039440.0, + "grad_norm": 1.7664129145349075, + "language_loss": 0.76109993, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78469646, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12664795, + "step": 11890, + "time_per_iteration": 2.7581334114074707 + }, + { + "auxiliary_loss_clip": 0.01330716, + "auxiliary_loss_mlp": 0.01030373, + "balance_loss_clip": 1.22455621, + "balance_loss_mlp": 1.01789129, + "epoch": 0.7149255974748234, + "flos": 24684373017720.0, + "grad_norm": 1.4859388988779025, + "language_loss": 0.74266964, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76628053, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12481689, + "step": 11891, + "time_per_iteration": 2.8412113189697266 + }, + { + "auxiliary_loss_clip": 0.01336216, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.22663379, + "balance_loss_mlp": 1.02438617, + "epoch": 0.7149857207274913, + "flos": 18410175320400.0, + "grad_norm": 1.7926051528737605, + "language_loss": 0.68253422, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70627534, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1348877, + "step": 11892, + "time_per_iteration": 2.7211875915527344 + }, + { + "auxiliary_loss_clip": 0.01346716, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.23434854, + "balance_loss_mlp": 1.0198437, + "epoch": 0.7150458439801594, + "flos": 24974505725400.0, + "grad_norm": 1.9413667153558571, + "language_loss": 0.74241841, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76622808, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14398193, + "step": 11893, + "time_per_iteration": 2.867166519165039 + }, + { + "auxiliary_loss_clip": 0.01344477, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.23179138, + "balance_loss_mlp": 1.01585412, + "epoch": 0.7151059672328273, + "flos": 16695148401720.0, + "grad_norm": 2.083050329258354, + "language_loss": 0.66345334, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68719327, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13677979, + "step": 11894, + "time_per_iteration": 2.6911025047302246 + }, + { + "auxiliary_loss_clip": 0.01346368, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.2344507, + "balance_loss_mlp": 1.01649737, + "epoch": 0.7151660904854953, + "flos": 27277594864560.0, + "grad_norm": 1.642092269216486, + "language_loss": 0.77670175, + "learning_rate": 7.922458137232613e-07, + "loss": 0.80046201, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13171387, + "step": 11895, + "time_per_iteration": 2.826165199279785 + }, + { + "auxiliary_loss_clip": 0.01341282, + "auxiliary_loss_mlp": 0.01029898, + "balance_loss_clip": 1.23099291, + "balance_loss_mlp": 1.01584303, + "epoch": 0.7152262137381632, + "flos": 18336789367920.0, + "grad_norm": 1.8301636922280926, + "language_loss": 0.69644535, + "learning_rate": 7.919354055015643e-07, + "loss": 0.72015721, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 1.10400391, + "router_z_loss_mlp": 0.140625, + "step": 11896, + "time_per_iteration": 7.0779125690460205 + }, + { + "auxiliary_loss_clip": 0.01343446, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.23249316, + "balance_loss_mlp": 1.02610922, + "epoch": 0.7152863369908312, + "flos": 21804388957080.0, + "grad_norm": 1.6999635134435183, + "language_loss": 0.86774623, + "learning_rate": 7.91625043089798e-07, + "loss": 0.89158988, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14831543, + "step": 11897, + "time_per_iteration": 2.8652467727661133 + }, + { + "auxiliary_loss_clip": 0.01334466, + "auxiliary_loss_mlp": 0.01031973, + "balance_loss_clip": 1.22733998, + "balance_loss_mlp": 1.01872325, + "epoch": 0.7153464602434991, + "flos": 22162831572240.0, + "grad_norm": 1.7885729131157884, + "language_loss": 0.78357399, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80723846, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.13244629, + "step": 11898, + "time_per_iteration": 2.7237446308135986 + }, + { + "auxiliary_loss_clip": 0.01343502, + "auxiliary_loss_mlp": 0.01031576, + "balance_loss_clip": 1.23026156, + "balance_loss_mlp": 1.0174377, + "epoch": 0.7154065834961671, + "flos": 24721269035760.0, + "grad_norm": 1.66350624864177, + "language_loss": 0.73237616, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75612688, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14135742, + "step": 11899, + "time_per_iteration": 2.7903084754943848 + }, + { + "auxiliary_loss_clip": 0.01333216, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.22481084, + "balance_loss_mlp": 1.02086496, + "epoch": 0.7154667067488351, + "flos": 22606639606080.0, + "grad_norm": 2.1857125421754673, + "language_loss": 0.76111722, + "learning_rate": 7.906942308317614e-07, + "loss": 0.7847957, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13781738, + "step": 11900, + "time_per_iteration": 2.7478840351104736 + }, + { + "auxiliary_loss_clip": 0.01338798, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.22948551, + "balance_loss_mlp": 1.01675534, + "epoch": 0.7155268300015031, + "flos": 18775886832000.0, + "grad_norm": 1.7285118528615273, + "language_loss": 0.80988944, + "learning_rate": 7.903840517773886e-07, + "loss": 0.83357596, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.13098145, + "step": 11901, + "time_per_iteration": 2.7917778491973877 + }, + { + "auxiliary_loss_clip": 0.01342637, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.22926199, + "balance_loss_mlp": 1.02030802, + "epoch": 0.7155869532541711, + "flos": 18300949167240.0, + "grad_norm": 2.4958098746886535, + "language_loss": 0.81943846, + "learning_rate": 7.900739185917744e-07, + "loss": 0.84321249, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14459229, + "step": 11902, + "time_per_iteration": 2.9187393188476562 + }, + { + "auxiliary_loss_clip": 0.01338911, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.22885966, + "balance_loss_mlp": 1.01420188, + "epoch": 0.715647076506839, + "flos": 11984129673120.0, + "grad_norm": 1.6538374297859548, + "language_loss": 0.68101847, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70467693, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12738037, + "step": 11903, + "time_per_iteration": 2.76887583732605 + }, + { + "auxiliary_loss_clip": 0.01328338, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.22149587, + "balance_loss_mlp": 1.01699352, + "epoch": 0.715707199759507, + "flos": 18956143652760.0, + "grad_norm": 1.7118657564674093, + "language_loss": 0.76227319, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78584659, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12005615, + "step": 11904, + "time_per_iteration": 4.29207706451416 + }, + { + "auxiliary_loss_clip": 0.0133747, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.22798097, + "balance_loss_mlp": 1.02203679, + "epoch": 0.7157673230121749, + "flos": 15308612109720.0, + "grad_norm": 2.7251078015812453, + "language_loss": 0.72145313, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74518716, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13879395, + "step": 11905, + "time_per_iteration": 2.77518630027771 + }, + { + "auxiliary_loss_clip": 0.01331698, + "auxiliary_loss_mlp": 0.01032781, + "balance_loss_clip": 1.22304034, + "balance_loss_mlp": 1.02019298, + "epoch": 0.715827446264843, + "flos": 23227090316640.0, + "grad_norm": 1.4923932671538385, + "language_loss": 0.77991253, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80355734, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12597656, + "step": 11906, + "time_per_iteration": 2.752647638320923 + }, + { + "auxiliary_loss_clip": 0.01149431, + "auxiliary_loss_mlp": 0.01004435, + "balance_loss_clip": 1.104707, + "balance_loss_mlp": 1.00073969, + "epoch": 0.7158875695175109, + "flos": 60989154006600.0, + "grad_norm": 0.7361042266940409, + "language_loss": 0.5541811, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57571971, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.03686523, + "step": 11907, + "time_per_iteration": 3.1835896968841553 + }, + { + "auxiliary_loss_clip": 0.01331832, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.22290039, + "balance_loss_mlp": 1.01591122, + "epoch": 0.7159476927701789, + "flos": 17133230656800.0, + "grad_norm": 4.5542307996890665, + "language_loss": 0.69317245, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71678317, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13336182, + "step": 11908, + "time_per_iteration": 2.7968008518218994 + }, + { + "auxiliary_loss_clip": 0.01339852, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.22963142, + "balance_loss_mlp": 1.01760721, + "epoch": 0.7160078160228468, + "flos": 22495545468360.0, + "grad_norm": 1.642462774517489, + "language_loss": 0.71974599, + "learning_rate": 7.879042716053415e-07, + "loss": 0.74345613, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13531494, + "step": 11909, + "time_per_iteration": 2.884007215499878 + }, + { + "auxiliary_loss_clip": 0.01336681, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.22619128, + "balance_loss_mlp": 1.01912713, + "epoch": 0.7160679392755148, + "flos": 30597001256160.0, + "grad_norm": 1.4769042964372407, + "language_loss": 0.75197691, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77566814, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13330078, + "step": 11910, + "time_per_iteration": 2.8146004676818848 + }, + { + "auxiliary_loss_clip": 0.01335673, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.22800922, + "balance_loss_mlp": 1.01868832, + "epoch": 0.7161280625281827, + "flos": 21328517300040.0, + "grad_norm": 1.4107493834978826, + "language_loss": 0.76482785, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78849292, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12145996, + "step": 11911, + "time_per_iteration": 2.7582011222839355 + }, + { + "auxiliary_loss_clip": 0.0134151, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.23122501, + "balance_loss_mlp": 1.01774263, + "epoch": 0.7161881857808508, + "flos": 61870336972200.0, + "grad_norm": 2.4306145287540732, + "language_loss": 0.59100229, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61473715, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.14233398, + "step": 11912, + "time_per_iteration": 3.1698670387268066 + }, + { + "auxiliary_loss_clip": 0.01333752, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.2271142, + "balance_loss_mlp": 1.01754391, + "epoch": 0.7162483090335187, + "flos": 20816805442320.0, + "grad_norm": 1.7168467512438532, + "language_loss": 0.78708589, + "learning_rate": 7.866654842502376e-07, + "loss": 0.81073678, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.13787842, + "step": 11913, + "time_per_iteration": 2.7577602863311768 + }, + { + "auxiliary_loss_clip": 0.01329996, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.2237252, + "balance_loss_mlp": 1.01902342, + "epoch": 0.7163084322861867, + "flos": 24102889351560.0, + "grad_norm": 1.5737033402886516, + "language_loss": 0.74043846, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76405054, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12176514, + "step": 11914, + "time_per_iteration": 2.796192169189453 + }, + { + "auxiliary_loss_clip": 0.01331648, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.22652745, + "balance_loss_mlp": 1.0217855, + "epoch": 0.7163685555388547, + "flos": 20085301202400.0, + "grad_norm": 1.6593715916570497, + "language_loss": 0.74161649, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76528275, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.13183594, + "step": 11915, + "time_per_iteration": 2.719806432723999 + }, + { + "auxiliary_loss_clip": 0.01338778, + "auxiliary_loss_mlp": 0.01030276, + "balance_loss_clip": 1.2284143, + "balance_loss_mlp": 1.01736569, + "epoch": 0.7164286787915226, + "flos": 17461437024960.0, + "grad_norm": 1.628471642583332, + "language_loss": 0.80741382, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83110434, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12915039, + "step": 11916, + "time_per_iteration": 2.806051015853882 + }, + { + "auxiliary_loss_clip": 0.01338916, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.22961164, + "balance_loss_mlp": 1.01908016, + "epoch": 0.7164888020441906, + "flos": 19723650526800.0, + "grad_norm": 2.220718113387178, + "language_loss": 0.68611157, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70981455, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12316895, + "step": 11917, + "time_per_iteration": 2.757673978805542 + }, + { + "auxiliary_loss_clip": 0.01337709, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.22896767, + "balance_loss_mlp": 1.01636469, + "epoch": 0.7165489252968585, + "flos": 21475532855160.0, + "grad_norm": 1.5430242968054695, + "language_loss": 0.76548696, + "learning_rate": 7.851180353640896e-07, + "loss": 0.7891624, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13482666, + "step": 11918, + "time_per_iteration": 2.7261459827423096 + }, + { + "auxiliary_loss_clip": 0.01151134, + "auxiliary_loss_mlp": 0.00999801, + "balance_loss_clip": 1.10695958, + "balance_loss_mlp": 0.99692804, + "epoch": 0.7166090485495266, + "flos": 69944740946280.0, + "grad_norm": 0.6340336582306647, + "language_loss": 0.53943181, + "learning_rate": 7.848086837452639e-07, + "loss": 0.56094116, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02868652, + "step": 11919, + "time_per_iteration": 3.28525972366333 + }, + { + "auxiliary_loss_clip": 0.013439, + "auxiliary_loss_mlp": 0.01029377, + "balance_loss_clip": 1.23357129, + "balance_loss_mlp": 1.01682425, + "epoch": 0.7166691718021945, + "flos": 27349559524440.0, + "grad_norm": 1.8264538593360853, + "language_loss": 0.69144088, + "learning_rate": 7.844993782066132e-07, + "loss": 0.7151736, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12554932, + "step": 11920, + "time_per_iteration": 2.8229496479034424 + }, + { + "auxiliary_loss_clip": 0.01335049, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.22597146, + "balance_loss_mlp": 1.01823592, + "epoch": 0.7167292950548625, + "flos": 30414957667560.0, + "grad_norm": 1.8243902979952404, + "language_loss": 0.7519263, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77559245, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13330078, + "step": 11921, + "time_per_iteration": 2.939854860305786 + }, + { + "auxiliary_loss_clip": 0.01350142, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.23551154, + "balance_loss_mlp": 1.02095139, + "epoch": 0.7167894183075304, + "flos": 14574915018360.0, + "grad_norm": 2.2228026861509442, + "language_loss": 0.76507008, + "learning_rate": 7.83880905416755e-07, + "loss": 0.78894579, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.16491699, + "step": 11922, + "time_per_iteration": 2.7226428985595703 + }, + { + "auxiliary_loss_clip": 0.01150112, + "auxiliary_loss_mlp": 0.01006115, + "balance_loss_clip": 1.10629094, + "balance_loss_mlp": 1.00333714, + "epoch": 0.7168495415601984, + "flos": 64124486391600.0, + "grad_norm": 0.8086207805670685, + "language_loss": 0.55193317, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57349539, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02783203, + "step": 11923, + "time_per_iteration": 3.08730411529541 + }, + { + "auxiliary_loss_clip": 0.0134153, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.2317096, + "balance_loss_mlp": 1.02452326, + "epoch": 0.7169096648128663, + "flos": 24686890736040.0, + "grad_norm": 1.456716581024958, + "language_loss": 0.77102077, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79481298, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13165283, + "step": 11924, + "time_per_iteration": 2.7829482555389404 + }, + { + "auxiliary_loss_clip": 0.01332338, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.22558165, + "balance_loss_mlp": 1.02203536, + "epoch": 0.7169697880655344, + "flos": 20672388822240.0, + "grad_norm": 1.5706262254274719, + "language_loss": 0.68344867, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70711201, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11962891, + "step": 11925, + "time_per_iteration": 2.773482322692871 + }, + { + "auxiliary_loss_clip": 0.01324786, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.21971142, + "balance_loss_mlp": 1.0162921, + "epoch": 0.7170299113182023, + "flos": 21037653641880.0, + "grad_norm": 1.5398912442749169, + "language_loss": 0.77261329, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79614162, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11761475, + "step": 11926, + "time_per_iteration": 2.8590471744537354 + }, + { + "auxiliary_loss_clip": 0.01344736, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.23135102, + "balance_loss_mlp": 1.02613544, + "epoch": 0.7170900345708703, + "flos": 22898477864880.0, + "grad_norm": 1.8884714897002681, + "language_loss": 0.77219486, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79604423, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14056396, + "step": 11927, + "time_per_iteration": 2.8647525310516357 + }, + { + "auxiliary_loss_clip": 0.01335298, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.22825587, + "balance_loss_mlp": 1.01890755, + "epoch": 0.7171501578235383, + "flos": 15522313237920.0, + "grad_norm": 1.5992249119005757, + "language_loss": 0.69832629, + "learning_rate": 7.820265941908642e-07, + "loss": 0.72200894, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.14044189, + "step": 11928, + "time_per_iteration": 2.892146348953247 + }, + { + "auxiliary_loss_clip": 0.01329754, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.22476351, + "balance_loss_mlp": 1.01997375, + "epoch": 0.7172102810762062, + "flos": 26109916962480.0, + "grad_norm": 5.4452582553360225, + "language_loss": 0.65977591, + "learning_rate": 7.817177039013931e-07, + "loss": 0.68339914, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12597656, + "step": 11929, + "time_per_iteration": 2.895782947540283 + }, + { + "auxiliary_loss_clip": 0.01337885, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.22748959, + "balance_loss_mlp": 1.01768327, + "epoch": 0.7172704043288742, + "flos": 21511616706000.0, + "grad_norm": 3.1168648119668902, + "language_loss": 0.70136595, + "learning_rate": 7.81408859809308e-07, + "loss": 0.72504699, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12518311, + "step": 11930, + "time_per_iteration": 2.8145053386688232 + }, + { + "auxiliary_loss_clip": 0.01337773, + "auxiliary_loss_mlp": 0.01036121, + "balance_loss_clip": 1.22849512, + "balance_loss_mlp": 1.02285874, + "epoch": 0.7173305275815421, + "flos": 18775643181840.0, + "grad_norm": 1.6522334184847147, + "language_loss": 0.80375123, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82749021, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13256836, + "step": 11931, + "time_per_iteration": 2.7242603302001953 + }, + { + "auxiliary_loss_clip": 0.01331832, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.22453046, + "balance_loss_mlp": 1.018134, + "epoch": 0.7173906508342102, + "flos": 16184167494480.0, + "grad_norm": 1.9272652265595245, + "language_loss": 0.78760433, + "learning_rate": 7.80791310264143e-07, + "loss": 0.81122679, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12280273, + "step": 11932, + "time_per_iteration": 2.750415325164795 + }, + { + "auxiliary_loss_clip": 0.01332874, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.22594285, + "balance_loss_mlp": 1.01960146, + "epoch": 0.7174507740868781, + "flos": 26619395360400.0, + "grad_norm": 1.3958796162351677, + "language_loss": 0.75217414, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77582526, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12634277, + "step": 11933, + "time_per_iteration": 2.9046590328216553 + }, + { + "auxiliary_loss_clip": 0.01343664, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.23032069, + "balance_loss_mlp": 1.02150059, + "epoch": 0.7175108973395461, + "flos": 18436310123040.0, + "grad_norm": 2.6587629399381587, + "language_loss": 0.69886243, + "learning_rate": 7.801739456490388e-07, + "loss": 0.72266614, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 1.13232422, + "router_z_loss_mlp": 0.15203857, + "step": 11934, + "time_per_iteration": 5.63031005859375 + }, + { + "auxiliary_loss_clip": 0.01334324, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.22731137, + "balance_loss_mlp": 1.02373385, + "epoch": 0.717571020592214, + "flos": 23920277245920.0, + "grad_norm": 2.14023160071096, + "language_loss": 0.86270845, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88641661, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12768555, + "step": 11935, + "time_per_iteration": 4.24286413192749 + }, + { + "auxiliary_loss_clip": 0.01335737, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.22629642, + "balance_loss_mlp": 1.01682234, + "epoch": 0.717631143844882, + "flos": 38264516841600.0, + "grad_norm": 1.497757268903236, + "language_loss": 0.74117947, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76483655, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13153076, + "step": 11936, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.01151341, + "auxiliary_loss_mlp": 0.01006129, + "balance_loss_clip": 1.10732818, + "balance_loss_mlp": 1.00341129, + "epoch": 0.7176912670975499, + "flos": 65532342072600.0, + "grad_norm": 0.7772054364335896, + "language_loss": 0.55959165, + "learning_rate": 7.79248245675082e-07, + "loss": 0.58116639, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02722168, + "step": 11937, + "time_per_iteration": 3.2496249675750732 + }, + { + "auxiliary_loss_clip": 0.01339223, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.22811818, + "balance_loss_mlp": 1.02389836, + "epoch": 0.717751390350218, + "flos": 31286817691560.0, + "grad_norm": 2.303472909869916, + "language_loss": 0.55307543, + "learning_rate": 7.789397715835542e-07, + "loss": 0.57684326, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13647461, + "step": 11938, + "time_per_iteration": 2.801036834716797 + }, + { + "auxiliary_loss_clip": 0.01325629, + "auxiliary_loss_mlp": 0.01031159, + "balance_loss_clip": 1.22060096, + "balance_loss_mlp": 1.01966715, + "epoch": 0.7178115136028859, + "flos": 19863031710240.0, + "grad_norm": 1.6730334661878379, + "language_loss": 0.77232915, + "learning_rate": 7.786313437947527e-07, + "loss": 0.79589701, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.1149292, + "step": 11939, + "time_per_iteration": 2.8357410430908203 + }, + { + "auxiliary_loss_clip": 0.01149773, + "auxiliary_loss_mlp": 0.01001559, + "balance_loss_clip": 1.10601115, + "balance_loss_mlp": 0.99910301, + "epoch": 0.7178716368555539, + "flos": 64364989037400.0, + "grad_norm": 0.7663267245484879, + "language_loss": 0.61467695, + "learning_rate": 7.783229623203738e-07, + "loss": 0.6361903, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02453613, + "step": 11940, + "time_per_iteration": 3.212916851043701 + }, + { + "auxiliary_loss_clip": 0.01328819, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.22289634, + "balance_loss_mlp": 1.0218606, + "epoch": 0.7179317601082219, + "flos": 26768522550240.0, + "grad_norm": 1.5727915859085706, + "language_loss": 0.59093106, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61456168, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12384033, + "step": 11941, + "time_per_iteration": 2.836566925048828 + }, + { + "auxiliary_loss_clip": 0.01331761, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.22600424, + "balance_loss_mlp": 1.01787472, + "epoch": 0.7179918833608898, + "flos": 23519091008880.0, + "grad_norm": 1.800783632475805, + "language_loss": 0.78999567, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81362689, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1350708, + "step": 11942, + "time_per_iteration": 4.313767910003662 + }, + { + "auxiliary_loss_clip": 0.01333059, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.22552502, + "balance_loss_mlp": 1.02337193, + "epoch": 0.7180520066135578, + "flos": 17170086066480.0, + "grad_norm": 1.8929355920307325, + "language_loss": 0.66405141, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68774778, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13201904, + "step": 11943, + "time_per_iteration": 2.89489483833313 + }, + { + "auxiliary_loss_clip": 0.01328644, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.22180438, + "balance_loss_mlp": 1.01878262, + "epoch": 0.7181121298662257, + "flos": 17571150478440.0, + "grad_norm": 1.6867579523873653, + "language_loss": 0.7905345, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81413937, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.1305542, + "step": 11944, + "time_per_iteration": 2.7399492263793945 + }, + { + "auxiliary_loss_clip": 0.01340439, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.22941589, + "balance_loss_mlp": 1.0204618, + "epoch": 0.7181722531188938, + "flos": 11951416316160.0, + "grad_norm": 4.425788179626332, + "language_loss": 0.63331211, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65706241, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14141846, + "step": 11945, + "time_per_iteration": 2.8105404376983643 + }, + { + "auxiliary_loss_clip": 0.01152071, + "auxiliary_loss_mlp": 0.01003377, + "balance_loss_clip": 1.10771966, + "balance_loss_mlp": 1.00046873, + "epoch": 0.7182323763715617, + "flos": 65518494621840.0, + "grad_norm": 0.6996604530027586, + "language_loss": 0.51071548, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53226995, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.02905273, + "step": 11946, + "time_per_iteration": 3.1822688579559326 + }, + { + "auxiliary_loss_clip": 0.01338369, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.2264123, + "balance_loss_mlp": 1.01987624, + "epoch": 0.7182924996242297, + "flos": 20635777062720.0, + "grad_norm": 1.748855720490204, + "language_loss": 0.74869591, + "learning_rate": 7.761655897855925e-07, + "loss": 0.77242547, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14733887, + "step": 11947, + "time_per_iteration": 2.7636702060699463 + }, + { + "auxiliary_loss_clip": 0.01331139, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.22477245, + "balance_loss_mlp": 1.01403451, + "epoch": 0.7183526228768976, + "flos": 16220738645640.0, + "grad_norm": 1.5089497043205287, + "language_loss": 0.73012936, + "learning_rate": 7.758575792474187e-07, + "loss": 0.753708, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12677002, + "step": 11948, + "time_per_iteration": 2.742189407348633 + }, + { + "auxiliary_loss_clip": 0.01335827, + "auxiliary_loss_mlp": 0.01035661, + "balance_loss_clip": 1.22701323, + "balance_loss_mlp": 1.02229762, + "epoch": 0.7184127461295656, + "flos": 22237232733720.0, + "grad_norm": 1.5150413565534804, + "language_loss": 0.71621311, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73992807, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13366699, + "step": 11949, + "time_per_iteration": 2.8450770378112793 + }, + { + "auxiliary_loss_clip": 0.01334248, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.22644722, + "balance_loss_mlp": 1.01835513, + "epoch": 0.7184728693822335, + "flos": 27350249866560.0, + "grad_norm": 1.7175275794545377, + "language_loss": 0.76577914, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78943712, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13201904, + "step": 11950, + "time_per_iteration": 2.8162312507629395 + }, + { + "auxiliary_loss_clip": 0.01338436, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.22898424, + "balance_loss_mlp": 1.01677454, + "epoch": 0.7185329926349016, + "flos": 16512901771320.0, + "grad_norm": 2.263597815654234, + "language_loss": 0.68410629, + "learning_rate": 7.749338261972282e-07, + "loss": 0.70779538, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13708496, + "step": 11951, + "time_per_iteration": 2.730679750442505 + }, + { + "auxiliary_loss_clip": 0.01339126, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.22700608, + "balance_loss_mlp": 1.02107573, + "epoch": 0.7185931158875695, + "flos": 23956320488400.0, + "grad_norm": 1.6460849586945456, + "language_loss": 0.7820397, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80579364, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1517334, + "step": 11952, + "time_per_iteration": 2.81113600730896 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.22912812, + "balance_loss_mlp": 1.01727855, + "epoch": 0.7186532391402375, + "flos": 26547674350680.0, + "grad_norm": 1.893555167822872, + "language_loss": 0.7543484, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77805948, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13726807, + "step": 11953, + "time_per_iteration": 2.820072889328003 + }, + { + "auxiliary_loss_clip": 0.01336374, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.226372, + "balance_loss_mlp": 1.01519322, + "epoch": 0.7187133623929055, + "flos": 22388227908120.0, + "grad_norm": 1.8887197521822896, + "language_loss": 0.73818803, + "learning_rate": 7.740104912387164e-07, + "loss": 0.76183987, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13623047, + "step": 11954, + "time_per_iteration": 2.814194917678833 + }, + { + "auxiliary_loss_clip": 0.01337724, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.22844887, + "balance_loss_mlp": 1.02032447, + "epoch": 0.7187734856455734, + "flos": 15783955858080.0, + "grad_norm": 1.8224510744285014, + "language_loss": 0.74287593, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76659566, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13934326, + "step": 11955, + "time_per_iteration": 2.801363229751587 + }, + { + "auxiliary_loss_clip": 0.01335552, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.22624052, + "balance_loss_mlp": 1.01835656, + "epoch": 0.7188336088982414, + "flos": 31765978625760.0, + "grad_norm": 1.8927209760935293, + "language_loss": 0.73781741, + "learning_rate": 7.733951670284817e-07, + "loss": 0.76147991, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12347412, + "step": 11956, + "time_per_iteration": 2.8359339237213135 + }, + { + "auxiliary_loss_clip": 0.01339117, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.22787416, + "balance_loss_mlp": 1.01912832, + "epoch": 0.7188937321509093, + "flos": 21469522817880.0, + "grad_norm": 1.7330822179497254, + "language_loss": 0.71305138, + "learning_rate": 7.730875746869987e-07, + "loss": 0.73677182, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13793945, + "step": 11957, + "time_per_iteration": 2.7384696006774902 + }, + { + "auxiliary_loss_clip": 0.01337884, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.22630692, + "balance_loss_mlp": 1.02133977, + "epoch": 0.7189538554035774, + "flos": 27277229389320.0, + "grad_norm": 1.8206255483339884, + "language_loss": 0.73642588, + "learning_rate": 7.727800288701582e-07, + "loss": 0.76016486, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14672852, + "step": 11958, + "time_per_iteration": 2.7726922035217285 + }, + { + "auxiliary_loss_clip": 0.01327126, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.22030091, + "balance_loss_mlp": 1.01940703, + "epoch": 0.7190139786562453, + "flos": 21585733608960.0, + "grad_norm": 1.9705868227696415, + "language_loss": 0.84283543, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86643612, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13525391, + "step": 11959, + "time_per_iteration": 2.7880258560180664 + }, + { + "auxiliary_loss_clip": 0.01341307, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.22976923, + "balance_loss_mlp": 1.01851952, + "epoch": 0.7190741019089133, + "flos": 26726509878840.0, + "grad_norm": 1.5046378315663655, + "language_loss": 0.82145524, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84519148, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13812256, + "step": 11960, + "time_per_iteration": 2.7869107723236084 + }, + { + "auxiliary_loss_clip": 0.01331983, + "auxiliary_loss_mlp": 0.01041566, + "balance_loss_clip": 1.22421265, + "balance_loss_mlp": 1.0270468, + "epoch": 0.7191342251615812, + "flos": 26110444871160.0, + "grad_norm": 1.8485879233731108, + "language_loss": 0.78194356, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80567908, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.14520264, + "step": 11961, + "time_per_iteration": 2.799975872039795 + }, + { + "auxiliary_loss_clip": 0.01326733, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.22236991, + "balance_loss_mlp": 1.021384, + "epoch": 0.7191943484142492, + "flos": 22972310509320.0, + "grad_norm": 1.3097516682318455, + "language_loss": 0.75099957, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77460277, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12213135, + "step": 11962, + "time_per_iteration": 2.7561001777648926 + }, + { + "auxiliary_loss_clip": 0.01336501, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.22691536, + "balance_loss_mlp": 1.01454413, + "epoch": 0.7192544716669171, + "flos": 22570515146880.0, + "grad_norm": 1.6175179744562511, + "language_loss": 0.75414264, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77779245, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13934326, + "step": 11963, + "time_per_iteration": 2.8567934036254883 + }, + { + "auxiliary_loss_clip": 0.01347728, + "auxiliary_loss_mlp": 0.01035173, + "balance_loss_clip": 1.23438776, + "balance_loss_mlp": 1.02074265, + "epoch": 0.7193145949195852, + "flos": 18985933207800.0, + "grad_norm": 2.538620014195928, + "language_loss": 0.81441635, + "learning_rate": 7.709357316395564e-07, + "loss": 0.83824527, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14440918, + "step": 11964, + "time_per_iteration": 2.815595865249634 + }, + { + "auxiliary_loss_clip": 0.01332444, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.22532916, + "balance_loss_mlp": 1.01740253, + "epoch": 0.7193747181722531, + "flos": 18008786041560.0, + "grad_norm": 1.7606295831862202, + "language_loss": 0.75464308, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77827549, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1340332, + "step": 11965, + "time_per_iteration": 2.746194362640381 + }, + { + "auxiliary_loss_clip": 0.01341654, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.23055744, + "balance_loss_mlp": 1.01967442, + "epoch": 0.7194348414249211, + "flos": 24394808827080.0, + "grad_norm": 1.5211257556740803, + "language_loss": 0.77696884, + "learning_rate": 7.703213386216377e-07, + "loss": 0.80071771, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13555908, + "step": 11966, + "time_per_iteration": 2.7551097869873047 + }, + { + "auxiliary_loss_clip": 0.013369, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.22731435, + "balance_loss_mlp": 1.01827705, + "epoch": 0.7194949646775891, + "flos": 22168516742640.0, + "grad_norm": 1.79964657879289, + "language_loss": 0.73446536, + "learning_rate": 7.700142120511619e-07, + "loss": 0.7581498, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13250732, + "step": 11967, + "time_per_iteration": 2.792820930480957 + }, + { + "auxiliary_loss_clip": 0.01324505, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.22221112, + "balance_loss_mlp": 1.01784801, + "epoch": 0.719555087930257, + "flos": 20271202585200.0, + "grad_norm": 1.5512955433052744, + "language_loss": 0.82221782, + "learning_rate": 7.6970713212187e-07, + "loss": 0.84575689, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.11566162, + "step": 11968, + "time_per_iteration": 2.896728754043579 + }, + { + "auxiliary_loss_clip": 0.0133143, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.22393465, + "balance_loss_mlp": 1.01728618, + "epoch": 0.719615211182925, + "flos": 24721553294280.0, + "grad_norm": 1.7374778577595211, + "language_loss": 0.76928949, + "learning_rate": 7.69400098845407e-07, + "loss": 0.79290235, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12567139, + "step": 11969, + "time_per_iteration": 2.722651720046997 + }, + { + "auxiliary_loss_clip": 0.01333309, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.22501731, + "balance_loss_mlp": 1.01873887, + "epoch": 0.719675334435593, + "flos": 20014270534800.0, + "grad_norm": 1.593217366921142, + "language_loss": 0.70850134, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73215395, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.13220215, + "step": 11970, + "time_per_iteration": 2.759126663208008 + }, + { + "auxiliary_loss_clip": 0.01154197, + "auxiliary_loss_mlp": 0.01001329, + "balance_loss_clip": 1.10878026, + "balance_loss_mlp": 0.99862337, + "epoch": 0.719735457688261, + "flos": 44212433744880.0, + "grad_norm": 0.913898909331963, + "language_loss": 0.60905933, + "learning_rate": 7.68786172297538e-07, + "loss": 0.63061464, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02709961, + "step": 11971, + "time_per_iteration": 4.5683794021606445 + }, + { + "auxiliary_loss_clip": 0.01343491, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.22925496, + "balance_loss_mlp": 1.01934338, + "epoch": 0.7197955809409289, + "flos": 16807623223680.0, + "grad_norm": 1.8115033186378733, + "language_loss": 0.80203754, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82580811, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14221191, + "step": 11972, + "time_per_iteration": 2.853567123413086 + }, + { + "auxiliary_loss_clip": 0.01334305, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.22505999, + "balance_loss_mlp": 1.02084553, + "epoch": 0.7198557041935969, + "flos": 24540809173200.0, + "grad_norm": 1.9036107105509275, + "language_loss": 0.7617389, + "learning_rate": 7.681724325006733e-07, + "loss": 0.78541905, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12872314, + "step": 11973, + "time_per_iteration": 2.8296637535095215 + }, + { + "auxiliary_loss_clip": 0.01153716, + "auxiliary_loss_mlp": 0.01003141, + "balance_loss_clip": 1.10848606, + "balance_loss_mlp": 1.00031567, + "epoch": 0.7199158274462648, + "flos": 70725022472520.0, + "grad_norm": 0.8602984070486035, + "language_loss": 0.5731051, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59467363, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02819824, + "step": 11974, + "time_per_iteration": 4.553699016571045 + }, + { + "auxiliary_loss_clip": 0.01337868, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.2266916, + "balance_loss_mlp": 1.01759136, + "epoch": 0.7199759506989328, + "flos": 29353582116720.0, + "grad_norm": 2.3519769642301758, + "language_loss": 0.61538666, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63907599, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13452148, + "step": 11975, + "time_per_iteration": 2.790799617767334 + }, + { + "auxiliary_loss_clip": 0.01332746, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.22405183, + "balance_loss_mlp": 1.02092552, + "epoch": 0.7200360739516007, + "flos": 24645324756600.0, + "grad_norm": 1.9053884577391422, + "language_loss": 0.68002838, + "learning_rate": 7.672521731671425e-07, + "loss": 0.70369446, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.1293335, + "step": 11976, + "time_per_iteration": 2.827385663986206 + }, + { + "auxiliary_loss_clip": 0.01334633, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.22601628, + "balance_loss_mlp": 1.02182937, + "epoch": 0.7200961972042688, + "flos": 20817658217880.0, + "grad_norm": 1.7273533609566651, + "language_loss": 0.67693436, + "learning_rate": 7.669455135323004e-07, + "loss": 0.7006259, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.12695312, + "step": 11977, + "time_per_iteration": 2.767484426498413 + }, + { + "auxiliary_loss_clip": 0.01339206, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.22871494, + "balance_loss_mlp": 1.01722217, + "epoch": 0.7201563204569367, + "flos": 31251830266440.0, + "grad_norm": 1.5355176920903981, + "language_loss": 0.75611806, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77981615, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1338501, + "step": 11978, + "time_per_iteration": 2.8754491806030273 + }, + { + "auxiliary_loss_clip": 0.01330767, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.2237289, + "balance_loss_mlp": 1.02209616, + "epoch": 0.7202164437096047, + "flos": 26657062937280.0, + "grad_norm": 2.083050475832572, + "language_loss": 0.78984195, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81350225, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1315918, + "step": 11979, + "time_per_iteration": 2.912299394607544 + }, + { + "auxiliary_loss_clip": 0.01338351, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.23039365, + "balance_loss_mlp": 1.01702523, + "epoch": 0.7202765669622727, + "flos": 25965378517320.0, + "grad_norm": 1.5537413441066164, + "language_loss": 0.65320861, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67689127, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12896729, + "step": 11980, + "time_per_iteration": 2.81697154045105 + }, + { + "auxiliary_loss_clip": 0.01338011, + "auxiliary_loss_mlp": 0.01038423, + "balance_loss_clip": 1.22770882, + "balance_loss_mlp": 1.02395749, + "epoch": 0.7203366902149406, + "flos": 28518293243880.0, + "grad_norm": 2.196339254085369, + "language_loss": 0.67454642, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69831073, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14477539, + "step": 11981, + "time_per_iteration": 2.8705198764801025 + }, + { + "auxiliary_loss_clip": 0.01337672, + "auxiliary_loss_mlp": 0.0102996, + "balance_loss_clip": 1.22847533, + "balance_loss_mlp": 1.01700187, + "epoch": 0.7203968134676086, + "flos": 21111039594360.0, + "grad_norm": 1.7128830856938873, + "language_loss": 0.74299908, + "learning_rate": 7.65412916953843e-07, + "loss": 0.76667541, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12957764, + "step": 11982, + "time_per_iteration": 4.281341552734375 + }, + { + "auxiliary_loss_clip": 0.01335632, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.22579384, + "balance_loss_mlp": 1.02028179, + "epoch": 0.7204569367202766, + "flos": 18336667542840.0, + "grad_norm": 1.849946617010389, + "language_loss": 0.6586144, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68229699, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12341309, + "step": 11983, + "time_per_iteration": 2.753516912460327 + }, + { + "auxiliary_loss_clip": 0.0133847, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.22920346, + "balance_loss_mlp": 1.01966488, + "epoch": 0.7205170599729446, + "flos": 23259925498680.0, + "grad_norm": 1.5916582902443046, + "language_loss": 0.66586065, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68957877, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13696289, + "step": 11984, + "time_per_iteration": 2.874528408050537 + }, + { + "auxiliary_loss_clip": 0.01340633, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.22941303, + "balance_loss_mlp": 1.01754045, + "epoch": 0.7205771832256125, + "flos": 20125770756120.0, + "grad_norm": 2.0537473797104844, + "language_loss": 0.7450757, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76879072, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13311768, + "step": 11985, + "time_per_iteration": 2.743013620376587 + }, + { + "auxiliary_loss_clip": 0.01336609, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.229141, + "balance_loss_mlp": 1.01892495, + "epoch": 0.7206373064782805, + "flos": 27708245789760.0, + "grad_norm": 1.6162235031967997, + "language_loss": 0.63197154, + "learning_rate": 7.641876823032977e-07, + "loss": 0.65565175, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12475586, + "step": 11986, + "time_per_iteration": 2.85601544380188 + }, + { + "auxiliary_loss_clip": 0.0134014, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.23018622, + "balance_loss_mlp": 1.02161348, + "epoch": 0.7206974297309484, + "flos": 17972742799080.0, + "grad_norm": 2.0066293188746838, + "language_loss": 0.73238724, + "learning_rate": 7.638814907669455e-07, + "loss": 0.75614482, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.14001465, + "step": 11987, + "time_per_iteration": 2.7276313304901123 + }, + { + "auxiliary_loss_clip": 0.01343229, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.23192751, + "balance_loss_mlp": 1.02483511, + "epoch": 0.7207575529836164, + "flos": 16987636394280.0, + "grad_norm": 1.753243331035837, + "language_loss": 0.78413022, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80794698, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 1.11474609, + "router_z_loss_mlp": 0.13616943, + "step": 11988, + "time_per_iteration": 2.782210111618042 + }, + { + "auxiliary_loss_clip": 0.01336958, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.22801542, + "balance_loss_mlp": 1.02274764, + "epoch": 0.7208176762362843, + "flos": 18731397050640.0, + "grad_norm": 1.77263252052442, + "language_loss": 0.7930339, + "learning_rate": 7.632692483270618e-07, + "loss": 0.8167628, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.13195801, + "step": 11989, + "time_per_iteration": 2.6976559162139893 + }, + { + "auxiliary_loss_clip": 0.01326846, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.22115362, + "balance_loss_mlp": 1.01593709, + "epoch": 0.7208777994889524, + "flos": 18739193855760.0, + "grad_norm": 1.6945497018921465, + "language_loss": 0.82830346, + "learning_rate": 7.629631974467481e-07, + "loss": 0.85186541, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13415527, + "step": 11990, + "time_per_iteration": 2.822138786315918 + }, + { + "auxiliary_loss_clip": 0.01330546, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.22328544, + "balance_loss_mlp": 1.02038825, + "epoch": 0.7209379227416203, + "flos": 14797549985760.0, + "grad_norm": 2.073714685829332, + "language_loss": 0.7611295, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78476655, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12780762, + "step": 11991, + "time_per_iteration": 2.881086826324463 + }, + { + "auxiliary_loss_clip": 0.01326794, + "auxiliary_loss_mlp": 0.01025554, + "balance_loss_clip": 1.22170377, + "balance_loss_mlp": 1.013502, + "epoch": 0.7209980459942883, + "flos": 29642090490000.0, + "grad_norm": 1.4856596828876885, + "language_loss": 0.7244041, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74792761, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12054443, + "step": 11992, + "time_per_iteration": 2.770935297012329 + }, + { + "auxiliary_loss_clip": 0.01340602, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.23020697, + "balance_loss_mlp": 1.02169776, + "epoch": 0.7210581692469563, + "flos": 23482438641000.0, + "grad_norm": 2.295129124908647, + "language_loss": 0.66625434, + "learning_rate": 7.620453263035755e-07, + "loss": 0.69000638, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12915039, + "step": 11993, + "time_per_iteration": 2.777801513671875 + }, + { + "auxiliary_loss_clip": 0.01336241, + "auxiliary_loss_mlp": 0.01025613, + "balance_loss_clip": 1.22842574, + "balance_loss_mlp": 1.01315594, + "epoch": 0.7211182924996242, + "flos": 26104881525840.0, + "grad_norm": 2.737938215864662, + "language_loss": 0.66000617, + "learning_rate": 7.61739463127115e-07, + "loss": 0.68362468, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12463379, + "step": 11994, + "time_per_iteration": 2.7556962966918945 + }, + { + "auxiliary_loss_clip": 0.0134392, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.23346114, + "balance_loss_mlp": 1.01642776, + "epoch": 0.7211784157522922, + "flos": 17716541699160.0, + "grad_norm": 1.5925744405846878, + "language_loss": 0.67297387, + "learning_rate": 7.614336469056172e-07, + "loss": 0.69672441, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.14697266, + "step": 11995, + "time_per_iteration": 2.798102855682373 + }, + { + "auxiliary_loss_clip": 0.01327786, + "auxiliary_loss_mlp": 0.01026644, + "balance_loss_clip": 1.22306573, + "balance_loss_mlp": 1.01243412, + "epoch": 0.7212385390049602, + "flos": 24428740434840.0, + "grad_norm": 1.5810374556058047, + "language_loss": 0.79505479, + "learning_rate": 7.6112787765068e-07, + "loss": 0.8185991, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.14202881, + "step": 11996, + "time_per_iteration": 2.765956163406372 + }, + { + "auxiliary_loss_clip": 0.0134, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.23125768, + "balance_loss_mlp": 1.01677704, + "epoch": 0.7212986622576282, + "flos": 28153109640960.0, + "grad_norm": 2.479419242229134, + "language_loss": 0.81561202, + "learning_rate": 7.60822155373899e-07, + "loss": 0.8393048, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.12512207, + "step": 11997, + "time_per_iteration": 2.7959182262420654 + }, + { + "auxiliary_loss_clip": 0.01343637, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.23237097, + "balance_loss_mlp": 1.02165949, + "epoch": 0.7213587855102961, + "flos": 21840878891520.0, + "grad_norm": 2.100330414545503, + "language_loss": 0.67294168, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69673949, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.1449585, + "step": 11998, + "time_per_iteration": 2.7339377403259277 + }, + { + "auxiliary_loss_clip": 0.01336846, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.22885072, + "balance_loss_mlp": 1.0209384, + "epoch": 0.7214189087629641, + "flos": 14615709438960.0, + "grad_norm": 2.294143791732789, + "language_loss": 0.72503257, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74873036, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12005615, + "step": 11999, + "time_per_iteration": 2.7510836124420166 + }, + { + "auxiliary_loss_clip": 0.01335881, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.22643518, + "balance_loss_mlp": 1.01432657, + "epoch": 0.721479032015632, + "flos": 19395606592080.0, + "grad_norm": 3.5073276607373716, + "language_loss": 0.83194566, + "learning_rate": 7.599052705284039e-07, + "loss": 0.8555786, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13085938, + "step": 12000, + "time_per_iteration": 2.755080461502075 + }, + { + "auxiliary_loss_clip": 0.01343698, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.23403704, + "balance_loss_mlp": 1.02073884, + "epoch": 0.7215391552683, + "flos": 18517208622120.0, + "grad_norm": 1.9227036511619788, + "language_loss": 0.77211124, + "learning_rate": 7.59599736280154e-07, + "loss": 0.7958914, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.13562012, + "step": 12001, + "time_per_iteration": 2.754528760910034 + }, + { + "auxiliary_loss_clip": 0.01329709, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.22419369, + "balance_loss_mlp": 1.02112043, + "epoch": 0.721599278520968, + "flos": 23264026943040.0, + "grad_norm": 1.6375205322477526, + "language_loss": 0.82137573, + "learning_rate": 7.592942490680066e-07, + "loss": 0.84501058, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12670898, + "step": 12002, + "time_per_iteration": 2.757970094680786 + }, + { + "auxiliary_loss_clip": 0.01343885, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.23299444, + "balance_loss_mlp": 1.01880956, + "epoch": 0.721659401773636, + "flos": 39205905023880.0, + "grad_norm": 2.007743449016211, + "language_loss": 0.62747866, + "learning_rate": 7.589888089035462e-07, + "loss": 0.65124291, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13739014, + "step": 12003, + "time_per_iteration": 2.977001905441284 + }, + { + "auxiliary_loss_clip": 0.01347522, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.23690605, + "balance_loss_mlp": 1.01939154, + "epoch": 0.7217195250263039, + "flos": 14944118848920.0, + "grad_norm": 2.216664796717982, + "language_loss": 0.68815029, + "learning_rate": 7.586834157983544e-07, + "loss": 0.71195936, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13989258, + "step": 12004, + "time_per_iteration": 2.767348289489746 + }, + { + "auxiliary_loss_clip": 0.01154029, + "auxiliary_loss_mlp": 0.01005794, + "balance_loss_clip": 1.11138678, + "balance_loss_mlp": 1.00293255, + "epoch": 0.7217796482789719, + "flos": 70885032702840.0, + "grad_norm": 0.8507749087229026, + "language_loss": 0.54196584, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56356406, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02856445, + "step": 12005, + "time_per_iteration": 3.15238356590271 + }, + { + "auxiliary_loss_clip": 0.01330637, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.22346485, + "balance_loss_mlp": 1.02010727, + "epoch": 0.7218397715316398, + "flos": 37458652048560.0, + "grad_norm": 1.438331796012971, + "language_loss": 0.63513589, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65877759, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13415527, + "step": 12006, + "time_per_iteration": 2.9121527671813965 + }, + { + "auxiliary_loss_clip": 0.01339882, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.23051953, + "balance_loss_mlp": 1.0234164, + "epoch": 0.7218998947843078, + "flos": 22715784542520.0, + "grad_norm": 1.6156005097197208, + "language_loss": 0.92166626, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94542414, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12493896, + "step": 12007, + "time_per_iteration": 2.824230194091797 + }, + { + "auxiliary_loss_clip": 0.01339292, + "auxiliary_loss_mlp": 0.01037387, + "balance_loss_clip": 1.22868752, + "balance_loss_mlp": 1.02308154, + "epoch": 0.7219600180369758, + "flos": 12171046264920.0, + "grad_norm": 2.4756030182856845, + "language_loss": 0.64198756, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66575432, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14312744, + "step": 12008, + "time_per_iteration": 2.6984689235687256 + }, + { + "auxiliary_loss_clip": 0.01341478, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.23062122, + "balance_loss_mlp": 1.01541042, + "epoch": 0.7220201412896438, + "flos": 22601076260760.0, + "grad_norm": 1.9486388593326025, + "language_loss": 0.78404057, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80774403, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13470459, + "step": 12009, + "time_per_iteration": 2.772285223007202 + }, + { + "auxiliary_loss_clip": 0.01345783, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.23425055, + "balance_loss_mlp": 1.01556778, + "epoch": 0.7220802645423118, + "flos": 26723586076920.0, + "grad_norm": 1.7282166445514484, + "language_loss": 0.63947368, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66323131, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14404297, + "step": 12010, + "time_per_iteration": 4.213931560516357 + }, + { + "auxiliary_loss_clip": 0.01340121, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_clip": 1.23080182, + "balance_loss_mlp": 1.01566172, + "epoch": 0.7221403877949797, + "flos": 24425248115880.0, + "grad_norm": 1.6992484731066257, + "language_loss": 0.76896024, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79265511, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13690186, + "step": 12011, + "time_per_iteration": 2.7420871257781982 + }, + { + "auxiliary_loss_clip": 0.01337054, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.22931564, + "balance_loss_mlp": 1.01950121, + "epoch": 0.7222005110476477, + "flos": 23519740742640.0, + "grad_norm": 1.7133571246174526, + "language_loss": 0.79425454, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81794298, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12280273, + "step": 12012, + "time_per_iteration": 5.697652339935303 + }, + { + "auxiliary_loss_clip": 0.01336044, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.2272296, + "balance_loss_mlp": 1.01707482, + "epoch": 0.7222606343003156, + "flos": 23117782946760.0, + "grad_norm": 1.8625808869931273, + "language_loss": 0.76099974, + "learning_rate": 7.559369974289171e-07, + "loss": 0.78465831, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12738037, + "step": 12013, + "time_per_iteration": 2.7819981575012207 + }, + { + "auxiliary_loss_clip": 0.01330281, + "auxiliary_loss_mlp": 0.01024599, + "balance_loss_clip": 1.22486424, + "balance_loss_mlp": 1.01213551, + "epoch": 0.7223207575529836, + "flos": 24356329083000.0, + "grad_norm": 2.8345417631139984, + "language_loss": 0.76256704, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78611577, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12445068, + "step": 12014, + "time_per_iteration": 2.8293700218200684 + }, + { + "auxiliary_loss_clip": 0.01343332, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.23295605, + "balance_loss_mlp": 1.01749229, + "epoch": 0.7223808808056515, + "flos": 28336127830200.0, + "grad_norm": 1.5291380550130949, + "language_loss": 0.86988431, + "learning_rate": 7.553272008637346e-07, + "loss": 0.89362526, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.1328125, + "step": 12015, + "time_per_iteration": 2.847264528274536 + }, + { + "auxiliary_loss_clip": 0.01332613, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.22651148, + "balance_loss_mlp": 1.01996529, + "epoch": 0.7224410040583196, + "flos": 21074427834840.0, + "grad_norm": 1.9070456186693472, + "language_loss": 0.78215307, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80580097, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12207031, + "step": 12016, + "time_per_iteration": 2.8070011138916016 + }, + { + "auxiliary_loss_clip": 0.01338314, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.2314595, + "balance_loss_mlp": 1.02237856, + "epoch": 0.7225011273109875, + "flos": 26800870431960.0, + "grad_norm": 1.4665731776489337, + "language_loss": 0.78117955, + "learning_rate": 7.547175930910186e-07, + "loss": 0.8049174, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13098145, + "step": 12017, + "time_per_iteration": 2.8358263969421387 + }, + { + "auxiliary_loss_clip": 0.01329811, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.22515178, + "balance_loss_mlp": 1.01576626, + "epoch": 0.7225612505636555, + "flos": 23588456733720.0, + "grad_norm": 2.175777303064741, + "language_loss": 0.74553227, + "learning_rate": 7.54412860030732e-07, + "loss": 0.76911384, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12579346, + "step": 12018, + "time_per_iteration": 2.7656733989715576 + }, + { + "auxiliary_loss_clip": 0.01326282, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.22329414, + "balance_loss_mlp": 1.01640368, + "epoch": 0.7226213738163234, + "flos": 20782589576040.0, + "grad_norm": 1.5739466250047631, + "language_loss": 0.77965754, + "learning_rate": 7.541081742032347e-07, + "loss": 0.8031981, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.1137085, + "step": 12019, + "time_per_iteration": 4.239832401275635 + }, + { + "auxiliary_loss_clip": 0.01332046, + "auxiliary_loss_mlp": 0.01024736, + "balance_loss_clip": 1.22443008, + "balance_loss_mlp": 1.01213562, + "epoch": 0.7226814970689914, + "flos": 32642386786080.0, + "grad_norm": 1.7185912744097513, + "language_loss": 0.74058217, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76414996, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12585449, + "step": 12020, + "time_per_iteration": 2.828840494155884 + }, + { + "auxiliary_loss_clip": 0.01336707, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.22629762, + "balance_loss_mlp": 1.01749444, + "epoch": 0.7227416203216595, + "flos": 22459339792440.0, + "grad_norm": 1.5197673548763866, + "language_loss": 0.77386177, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79752421, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.1204834, + "step": 12021, + "time_per_iteration": 2.7832207679748535 + }, + { + "auxiliary_loss_clip": 0.01335266, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.22823882, + "balance_loss_mlp": 1.01618767, + "epoch": 0.7228017435743274, + "flos": 21657251576880.0, + "grad_norm": 2.607148721154432, + "language_loss": 0.68886238, + "learning_rate": 7.531944002330073e-07, + "loss": 0.71250528, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.128479, + "step": 12022, + "time_per_iteration": 2.7324931621551514 + }, + { + "auxiliary_loss_clip": 0.01334357, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.22522712, + "balance_loss_mlp": 1.01774955, + "epoch": 0.7228618668269954, + "flos": 29539483499520.0, + "grad_norm": 1.645989480982953, + "language_loss": 0.69203752, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71569306, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13439941, + "step": 12023, + "time_per_iteration": 2.871466636657715 + }, + { + "auxiliary_loss_clip": 0.01330798, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.22364187, + "balance_loss_mlp": 1.01575279, + "epoch": 0.7229219900796633, + "flos": 27459557236440.0, + "grad_norm": 1.9737201783198561, + "language_loss": 0.71297151, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73656249, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12542725, + "step": 12024, + "time_per_iteration": 2.827411413192749 + }, + { + "auxiliary_loss_clip": 0.01334369, + "auxiliary_loss_mlp": 0.01028649, + "balance_loss_clip": 1.22722697, + "balance_loss_mlp": 1.01637638, + "epoch": 0.7229821133323313, + "flos": 16293677906160.0, + "grad_norm": 1.794377730987406, + "language_loss": 0.76123387, + "learning_rate": 7.522810517737089e-07, + "loss": 0.78486407, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12268066, + "step": 12025, + "time_per_iteration": 2.7639853954315186 + }, + { + "auxiliary_loss_clip": 0.01330381, + "auxiliary_loss_mlp": 0.01023502, + "balance_loss_clip": 1.22503614, + "balance_loss_mlp": 1.01080036, + "epoch": 0.7230422365849992, + "flos": 20417202931320.0, + "grad_norm": 1.9163550765320398, + "language_loss": 0.76829296, + "learning_rate": 7.519766968991395e-07, + "loss": 0.79183179, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12701416, + "step": 12026, + "time_per_iteration": 2.9893627166748047 + }, + { + "auxiliary_loss_clip": 0.01338213, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.22927737, + "balance_loss_mlp": 1.02235925, + "epoch": 0.7231023598376672, + "flos": 25599057880320.0, + "grad_norm": 8.8191034483124, + "language_loss": 0.68241572, + "learning_rate": 7.516723893497388e-07, + "loss": 0.70615166, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13018799, + "step": 12027, + "time_per_iteration": 2.8062961101531982 + }, + { + "auxiliary_loss_clip": 0.01343799, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.23378468, + "balance_loss_mlp": 1.01700473, + "epoch": 0.7231624830903352, + "flos": 25154315854200.0, + "grad_norm": 2.0206867419151306, + "language_loss": 0.79258275, + "learning_rate": 7.513681291370469e-07, + "loss": 0.81632519, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13458252, + "step": 12028, + "time_per_iteration": 2.787252187728882 + }, + { + "auxiliary_loss_clip": 0.01333774, + "auxiliary_loss_mlp": 0.01026685, + "balance_loss_clip": 1.22648406, + "balance_loss_mlp": 1.01348257, + "epoch": 0.7232226063430032, + "flos": 21730759354440.0, + "grad_norm": 1.7905024714793658, + "language_loss": 0.82187223, + "learning_rate": 7.510639162726e-07, + "loss": 0.84547681, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13189697, + "step": 12029, + "time_per_iteration": 2.776987075805664 + }, + { + "auxiliary_loss_clip": 0.01157735, + "auxiliary_loss_mlp": 0.01002931, + "balance_loss_clip": 1.11405849, + "balance_loss_mlp": 1.00035572, + "epoch": 0.7232827295956711, + "flos": 68453892112680.0, + "grad_norm": 0.8096866764866879, + "language_loss": 0.61808562, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63969231, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02575684, + "step": 12030, + "time_per_iteration": 3.3214845657348633 + }, + { + "auxiliary_loss_clip": 0.01332571, + "auxiliary_loss_mlp": 0.01028298, + "balance_loss_clip": 1.22614193, + "balance_loss_mlp": 1.01546562, + "epoch": 0.7233428528483391, + "flos": 20197369940760.0, + "grad_norm": 1.6918813671646407, + "language_loss": 0.78293061, + "learning_rate": 7.504556326345859e-07, + "loss": 0.8065393, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12811279, + "step": 12031, + "time_per_iteration": 2.7646090984344482 + }, + { + "auxiliary_loss_clip": 0.01340437, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.2309041, + "balance_loss_mlp": 1.01986694, + "epoch": 0.723402976101007, + "flos": 23954533720560.0, + "grad_norm": 1.7289024459828237, + "language_loss": 0.81418824, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83792877, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13757324, + "step": 12032, + "time_per_iteration": 2.824847936630249 + }, + { + "auxiliary_loss_clip": 0.01345177, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.23192298, + "balance_loss_mlp": 1.02110183, + "epoch": 0.723463099353675, + "flos": 20818267343280.0, + "grad_norm": 1.8115341477227496, + "language_loss": 0.75624347, + "learning_rate": 7.498475385279592e-07, + "loss": 0.78003645, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 1.13232422, + "router_z_loss_mlp": 0.13031006, + "step": 12033, + "time_per_iteration": 2.738748550415039 + }, + { + "auxiliary_loss_clip": 0.01333011, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.22747815, + "balance_loss_mlp": 1.01704133, + "epoch": 0.723523222606343, + "flos": 19102590690840.0, + "grad_norm": 1.6482518181721912, + "language_loss": 0.7549156, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77854311, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12695312, + "step": 12034, + "time_per_iteration": 2.853135108947754 + }, + { + "auxiliary_loss_clip": 0.01335249, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.22676539, + "balance_loss_mlp": 1.01543975, + "epoch": 0.723583345859011, + "flos": 26512849359000.0, + "grad_norm": 1.6392371249298614, + "language_loss": 0.81150407, + "learning_rate": 7.492396340449578e-07, + "loss": 0.83513224, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12127686, + "step": 12035, + "time_per_iteration": 2.8439695835113525 + }, + { + "auxiliary_loss_clip": 0.0134303, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.23355782, + "balance_loss_mlp": 1.01998043, + "epoch": 0.723643469111679, + "flos": 16038126540000.0, + "grad_norm": 1.7152742024791636, + "language_loss": 0.61808395, + "learning_rate": 7.489357529411326e-07, + "loss": 0.64184225, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12811279, + "step": 12036, + "time_per_iteration": 2.794584274291992 + }, + { + "auxiliary_loss_clip": 0.0132454, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.22089577, + "balance_loss_mlp": 1.02187228, + "epoch": 0.7237035923643469, + "flos": 21950714170080.0, + "grad_norm": 1.5963103208871754, + "language_loss": 0.67721814, + "learning_rate": 7.486319192777883e-07, + "loss": 0.70080119, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.11901855, + "step": 12037, + "time_per_iteration": 2.9298582077026367 + }, + { + "auxiliary_loss_clip": 0.01332274, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.22574353, + "balance_loss_mlp": 1.01482427, + "epoch": 0.7237637156170149, + "flos": 23587928825040.0, + "grad_norm": 2.097681519370025, + "language_loss": 0.72529745, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74890137, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13299561, + "step": 12038, + "time_per_iteration": 2.930589199066162 + }, + { + "auxiliary_loss_clip": 0.01337746, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.22978449, + "balance_loss_mlp": 1.01909304, + "epoch": 0.7238238388696828, + "flos": 20599571386800.0, + "grad_norm": 1.6318657088320163, + "language_loss": 0.72779322, + "learning_rate": 7.480243943186293e-07, + "loss": 0.75149894, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13745117, + "step": 12039, + "time_per_iteration": 2.7697112560272217 + }, + { + "auxiliary_loss_clip": 0.01333013, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.22480774, + "balance_loss_mlp": 1.01797318, + "epoch": 0.7238839621223508, + "flos": 24212399763240.0, + "grad_norm": 1.7712949208984092, + "language_loss": 0.76546723, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78909868, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1217041, + "step": 12040, + "time_per_iteration": 2.795570135116577 + }, + { + "auxiliary_loss_clip": 0.01336052, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.22707772, + "balance_loss_mlp": 1.01708794, + "epoch": 0.7239440853750188, + "flos": 14213792251440.0, + "grad_norm": 1.4687134116532368, + "language_loss": 0.76816666, + "learning_rate": 7.474170592596301e-07, + "loss": 0.79182875, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13061523, + "step": 12041, + "time_per_iteration": 2.765392303466797 + }, + { + "auxiliary_loss_clip": 0.01336231, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.22636223, + "balance_loss_mlp": 1.01874828, + "epoch": 0.7240042086276868, + "flos": 21619624608360.0, + "grad_norm": 2.1182428308656305, + "language_loss": 0.6405285, + "learning_rate": 7.471134629714797e-07, + "loss": 0.664204, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12548828, + "step": 12042, + "time_per_iteration": 2.7396187782287598 + }, + { + "auxiliary_loss_clip": 0.01337031, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.22695673, + "balance_loss_mlp": 1.02111018, + "epoch": 0.7240643318803547, + "flos": 23336722553400.0, + "grad_norm": 1.8111767658526103, + "language_loss": 0.84014541, + "learning_rate": 7.468099141929116e-07, + "loss": 0.86386549, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13861084, + "step": 12043, + "time_per_iteration": 2.76492977142334 + }, + { + "auxiliary_loss_clip": 0.01339205, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.22993279, + "balance_loss_mlp": 1.01528955, + "epoch": 0.7241244551330227, + "flos": 24030234349560.0, + "grad_norm": 1.606492429821619, + "language_loss": 0.6439513, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66764176, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.14538574, + "step": 12044, + "time_per_iteration": 2.791826009750366 + }, + { + "auxiliary_loss_clip": 0.01341598, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.2325778, + "balance_loss_mlp": 1.0188483, + "epoch": 0.7241845783856906, + "flos": 18734483286000.0, + "grad_norm": 1.7232450862916475, + "language_loss": 0.8171944, + "learning_rate": 7.462029592105658e-07, + "loss": 0.84093618, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.13726807, + "step": 12045, + "time_per_iteration": 2.7109124660491943 + }, + { + "auxiliary_loss_clip": 0.01327837, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.2230587, + "balance_loss_mlp": 1.01947522, + "epoch": 0.7242447016383586, + "flos": 19503208410840.0, + "grad_norm": 1.5385299596993764, + "language_loss": 0.72127938, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74488342, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.13110352, + "step": 12046, + "time_per_iteration": 2.8331587314605713 + }, + { + "auxiliary_loss_clip": 0.01339015, + "auxiliary_loss_mlp": 0.01024286, + "balance_loss_clip": 1.23011565, + "balance_loss_mlp": 1.01111317, + "epoch": 0.7243048248910267, + "flos": 22168557351000.0, + "grad_norm": 1.7744433725109965, + "language_loss": 0.71347994, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73711294, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13165283, + "step": 12047, + "time_per_iteration": 2.74845552444458 + }, + { + "auxiliary_loss_clip": 0.01345445, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.23438239, + "balance_loss_mlp": 1.02073026, + "epoch": 0.7243649481436946, + "flos": 27678537451440.0, + "grad_norm": 2.2428547496107187, + "language_loss": 0.7001313, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72392571, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13262939, + "step": 12048, + "time_per_iteration": 2.95074200630188 + }, + { + "auxiliary_loss_clip": 0.01157539, + "auxiliary_loss_mlp": 0.01004835, + "balance_loss_clip": 1.11366808, + "balance_loss_mlp": 1.00214052, + "epoch": 0.7244250713963626, + "flos": 63259489180440.0, + "grad_norm": 0.8246407480668738, + "language_loss": 0.53767151, + "learning_rate": 7.449896198672168e-07, + "loss": 0.5592953, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.02697754, + "step": 12049, + "time_per_iteration": 3.269505500793457 + }, + { + "auxiliary_loss_clip": 0.01350799, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.23637605, + "balance_loss_mlp": 1.02172291, + "epoch": 0.7244851946490305, + "flos": 17971565156640.0, + "grad_norm": 2.8523620423822305, + "language_loss": 0.60720599, + "learning_rate": 7.446864039779258e-07, + "loss": 0.631078, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.14678955, + "step": 12050, + "time_per_iteration": 4.155273914337158 + }, + { + "auxiliary_loss_clip": 0.01158327, + "auxiliary_loss_mlp": 0.00997872, + "balance_loss_clip": 1.11437511, + "balance_loss_mlp": 0.99517757, + "epoch": 0.7245453179016985, + "flos": 70959677514480.0, + "grad_norm": 0.7125875243146215, + "language_loss": 0.53265589, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55421788, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.02697754, + "step": 12051, + "time_per_iteration": 6.1848554611206055 + }, + { + "auxiliary_loss_clip": 0.01336716, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.23105359, + "balance_loss_mlp": 1.01862168, + "epoch": 0.7246054411543664, + "flos": 24573441313440.0, + "grad_norm": 1.4785395691525416, + "language_loss": 0.72198784, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74566805, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12695312, + "step": 12052, + "time_per_iteration": 2.7709076404571533 + }, + { + "auxiliary_loss_clip": 0.01337628, + "auxiliary_loss_mlp": 0.01029529, + "balance_loss_clip": 1.22892296, + "balance_loss_mlp": 1.01594543, + "epoch": 0.7246655644070344, + "flos": 32344091798040.0, + "grad_norm": 2.1411810979197914, + "language_loss": 0.74612755, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76979911, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13568115, + "step": 12053, + "time_per_iteration": 2.8290069103240967 + }, + { + "auxiliary_loss_clip": 0.01331814, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.22463703, + "balance_loss_mlp": 1.01709676, + "epoch": 0.7247256876597024, + "flos": 21877612476120.0, + "grad_norm": 1.9995326034576788, + "language_loss": 0.78670406, + "learning_rate": 7.434740165518898e-07, + "loss": 0.81032455, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13140869, + "step": 12054, + "time_per_iteration": 2.7230470180511475 + }, + { + "auxiliary_loss_clip": 0.01333182, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.22630537, + "balance_loss_mlp": 1.02093291, + "epoch": 0.7247858109123704, + "flos": 16216962068160.0, + "grad_norm": 3.1557373419134427, + "language_loss": 0.68363947, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70730919, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.128479, + "step": 12055, + "time_per_iteration": 2.7006514072418213 + }, + { + "auxiliary_loss_clip": 0.01335196, + "auxiliary_loss_mlp": 0.01035286, + "balance_loss_clip": 1.22831035, + "balance_loss_mlp": 1.02297211, + "epoch": 0.7248459341650383, + "flos": 20855691270000.0, + "grad_norm": 1.6150998267921834, + "language_loss": 0.74354881, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76725364, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12329102, + "step": 12056, + "time_per_iteration": 2.704577684402466 + }, + { + "auxiliary_loss_clip": 0.0132796, + "auxiliary_loss_mlp": 0.01025977, + "balance_loss_clip": 1.22331047, + "balance_loss_mlp": 1.013448, + "epoch": 0.7249060574177063, + "flos": 25927386073560.0, + "grad_norm": 1.4336607185322217, + "language_loss": 0.70842409, + "learning_rate": 7.425652262418368e-07, + "loss": 0.73196346, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12524414, + "step": 12057, + "time_per_iteration": 2.858797311782837 + }, + { + "auxiliary_loss_clip": 0.01345384, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.23403323, + "balance_loss_mlp": 1.0208354, + "epoch": 0.7249661806703742, + "flos": 17349733761840.0, + "grad_norm": 1.825185991119688, + "language_loss": 0.6232723, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64707232, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13793945, + "step": 12058, + "time_per_iteration": 4.265557289123535 + }, + { + "auxiliary_loss_clip": 0.01334966, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.22636902, + "balance_loss_mlp": 1.01532388, + "epoch": 0.7250263039230422, + "flos": 19579030864920.0, + "grad_norm": 1.836635593676992, + "language_loss": 0.74844056, + "learning_rate": 7.419596044262535e-07, + "loss": 0.7720781, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13458252, + "step": 12059, + "time_per_iteration": 2.7279250621795654 + }, + { + "auxiliary_loss_clip": 0.01325442, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.22128463, + "balance_loss_mlp": 1.02311015, + "epoch": 0.7250864271757103, + "flos": 21981112850520.0, + "grad_norm": 1.6623309042427192, + "language_loss": 0.79517817, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81878304, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.1192627, + "step": 12060, + "time_per_iteration": 3.066150426864624 + }, + { + "auxiliary_loss_clip": 0.01335727, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.22727346, + "balance_loss_mlp": 1.01261783, + "epoch": 0.7251465504283782, + "flos": 25018467598080.0, + "grad_norm": 2.228588858780468, + "language_loss": 0.76420403, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78782117, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.13366699, + "step": 12061, + "time_per_iteration": 2.812319040298462 + }, + { + "auxiliary_loss_clip": 0.0132715, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.22323847, + "balance_loss_mlp": 1.01450336, + "epoch": 0.7252066736810462, + "flos": 16694417451240.0, + "grad_norm": 1.5690851847137743, + "language_loss": 0.81070566, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83424675, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12451172, + "step": 12062, + "time_per_iteration": 2.7042276859283447 + }, + { + "auxiliary_loss_clip": 0.01343696, + "auxiliary_loss_mlp": 0.01026665, + "balance_loss_clip": 1.23311949, + "balance_loss_mlp": 1.01265752, + "epoch": 0.7252667969337141, + "flos": 25708608900360.0, + "grad_norm": 2.1746252907206, + "language_loss": 0.69384873, + "learning_rate": 7.407489333471262e-07, + "loss": 0.7175523, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14013672, + "step": 12063, + "time_per_iteration": 2.7757298946380615 + }, + { + "auxiliary_loss_clip": 0.01324131, + "auxiliary_loss_mlp": 0.01029224, + "balance_loss_clip": 1.22036028, + "balance_loss_mlp": 1.01698756, + "epoch": 0.7253269201863821, + "flos": 18264540449520.0, + "grad_norm": 1.4389227569475938, + "language_loss": 0.70685631, + "learning_rate": 7.40446384925973e-07, + "loss": 0.73038983, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.12237549, + "step": 12064, + "time_per_iteration": 2.7013516426086426 + }, + { + "auxiliary_loss_clip": 0.013365, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.22916532, + "balance_loss_mlp": 1.01630044, + "epoch": 0.72538704343905, + "flos": 20416593805920.0, + "grad_norm": 1.7575064313052609, + "language_loss": 0.91053116, + "learning_rate": 7.401438842672192e-07, + "loss": 0.93418872, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12939453, + "step": 12065, + "time_per_iteration": 2.772876739501953 + }, + { + "auxiliary_loss_clip": 0.01155804, + "auxiliary_loss_mlp": 0.01000327, + "balance_loss_clip": 1.11199439, + "balance_loss_mlp": 0.99768019, + "epoch": 0.725447166691718, + "flos": 70168472339400.0, + "grad_norm": 0.663912392409185, + "language_loss": 0.56101656, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58257782, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02648926, + "step": 12066, + "time_per_iteration": 3.369253635406494 + }, + { + "auxiliary_loss_clip": 0.01336098, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.22984004, + "balance_loss_mlp": 1.01930285, + "epoch": 0.725507289944386, + "flos": 27058005524160.0, + "grad_norm": 1.8618074833239044, + "language_loss": 0.76207912, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78575492, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12188721, + "step": 12067, + "time_per_iteration": 2.878486156463623 + }, + { + "auxiliary_loss_clip": 0.01156483, + "auxiliary_loss_mlp": 0.01001899, + "balance_loss_clip": 1.11230958, + "balance_loss_mlp": 0.99885958, + "epoch": 0.725567413197054, + "flos": 62937577108080.0, + "grad_norm": 0.7265597095558046, + "language_loss": 0.57075, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59233385, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.03039551, + "step": 12068, + "time_per_iteration": 3.10125732421875 + }, + { + "auxiliary_loss_clip": 0.01157279, + "auxiliary_loss_mlp": 0.01006335, + "balance_loss_clip": 1.11316895, + "balance_loss_mlp": 1.00360477, + "epoch": 0.7256275364497219, + "flos": 60311398254120.0, + "grad_norm": 0.6634878271849255, + "language_loss": 0.55507702, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57671314, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02734375, + "step": 12069, + "time_per_iteration": 3.202489137649536 + }, + { + "auxiliary_loss_clip": 0.01328139, + "auxiliary_loss_mlp": 0.01025488, + "balance_loss_clip": 1.22481954, + "balance_loss_mlp": 1.01412153, + "epoch": 0.7256876597023899, + "flos": 24503872546800.0, + "grad_norm": 1.7342888868498973, + "language_loss": 0.80248201, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82601827, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.1137085, + "step": 12070, + "time_per_iteration": 2.801908016204834 + }, + { + "auxiliary_loss_clip": 0.01326612, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.22356749, + "balance_loss_mlp": 1.01834702, + "epoch": 0.7257477829550578, + "flos": 24358115850840.0, + "grad_norm": 1.7205333564279812, + "language_loss": 0.72441977, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74799383, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.12451172, + "step": 12071, + "time_per_iteration": 2.8661649227142334 + }, + { + "auxiliary_loss_clip": 0.0133379, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.22895503, + "balance_loss_mlp": 1.02426147, + "epoch": 0.7258079062077258, + "flos": 17206738434360.0, + "grad_norm": 1.806686822441876, + "language_loss": 0.7017355, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72543597, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11999512, + "step": 12072, + "time_per_iteration": 2.9202940464019775 + }, + { + "auxiliary_loss_clip": 0.01338327, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.22807133, + "balance_loss_mlp": 1.01560521, + "epoch": 0.7258680294603939, + "flos": 21585408742080.0, + "grad_norm": 1.826372424246699, + "language_loss": 0.78556103, + "learning_rate": 7.377255998196821e-07, + "loss": 0.8092376, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.1373291, + "step": 12073, + "time_per_iteration": 2.71801495552063 + }, + { + "auxiliary_loss_clip": 0.01331615, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.22722328, + "balance_loss_mlp": 1.01265943, + "epoch": 0.7259281527130618, + "flos": 34861450582440.0, + "grad_norm": 1.6170459943832025, + "language_loss": 0.70337701, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72694713, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12731934, + "step": 12074, + "time_per_iteration": 2.829235553741455 + }, + { + "auxiliary_loss_clip": 0.01340666, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.23113191, + "balance_loss_mlp": 1.02066827, + "epoch": 0.7259882759657298, + "flos": 25409339311680.0, + "grad_norm": 2.9464988215040537, + "language_loss": 0.74379408, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76754069, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13330078, + "step": 12075, + "time_per_iteration": 2.8059744834899902 + }, + { + "auxiliary_loss_clip": 0.01332994, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.22598124, + "balance_loss_mlp": 1.01239097, + "epoch": 0.7260483992183977, + "flos": 62963897971320.0, + "grad_norm": 1.9427855561840186, + "language_loss": 0.63775551, + "learning_rate": 7.368195326186458e-07, + "loss": 0.66134322, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1338501, + "step": 12076, + "time_per_iteration": 3.116909980773926 + }, + { + "auxiliary_loss_clip": 0.013369, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.22927964, + "balance_loss_mlp": 1.01776576, + "epoch": 0.7261085224710657, + "flos": 26472988930680.0, + "grad_norm": 1.6982379441078659, + "language_loss": 0.793607, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81728327, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12963867, + "step": 12077, + "time_per_iteration": 2.8149757385253906 + }, + { + "auxiliary_loss_clip": 0.01155268, + "auxiliary_loss_mlp": 0.01004995, + "balance_loss_clip": 1.11187029, + "balance_loss_mlp": 1.0023005, + "epoch": 0.7261686457237336, + "flos": 66786708488040.0, + "grad_norm": 0.8879406629843899, + "language_loss": 0.65018415, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67178679, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02697754, + "step": 12078, + "time_per_iteration": 3.3003971576690674 + }, + { + "auxiliary_loss_clip": 0.01156182, + "auxiliary_loss_mlp": 0.01004323, + "balance_loss_clip": 1.11299443, + "balance_loss_mlp": 1.00191462, + "epoch": 0.7262287689764017, + "flos": 70016096480760.0, + "grad_norm": 0.7381543076370531, + "language_loss": 0.5926311, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61423612, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02404785, + "step": 12079, + "time_per_iteration": 3.3671531677246094 + }, + { + "auxiliary_loss_clip": 0.01331353, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.2250309, + "balance_loss_mlp": 1.02114892, + "epoch": 0.7262888922290696, + "flos": 23810726225880.0, + "grad_norm": 2.28891993923531, + "language_loss": 0.64722115, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67087609, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13000488, + "step": 12080, + "time_per_iteration": 2.8374698162078857 + }, + { + "auxiliary_loss_clip": 0.01335994, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.22632098, + "balance_loss_mlp": 1.01616144, + "epoch": 0.7263490154817376, + "flos": 19505076395400.0, + "grad_norm": 5.336898276781987, + "language_loss": 0.69560897, + "learning_rate": 7.35310378768128e-07, + "loss": 0.71926618, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13586426, + "step": 12081, + "time_per_iteration": 2.740811586380005 + }, + { + "auxiliary_loss_clip": 0.01345175, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.2343061, + "balance_loss_mlp": 1.01739407, + "epoch": 0.7264091387344055, + "flos": 16290144978840.0, + "grad_norm": 1.7823689670057465, + "language_loss": 0.81373054, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83749658, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14044189, + "step": 12082, + "time_per_iteration": 2.7813658714294434 + }, + { + "auxiliary_loss_clip": 0.01350095, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.23529577, + "balance_loss_mlp": 1.0230813, + "epoch": 0.7264692619870735, + "flos": 24357303683640.0, + "grad_norm": 1.5447089190768635, + "language_loss": 0.77192837, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79580486, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14471436, + "step": 12083, + "time_per_iteration": 2.8072853088378906 + }, + { + "auxiliary_loss_clip": 0.01337398, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.2270875, + "balance_loss_mlp": 1.0217104, + "epoch": 0.7265293852397414, + "flos": 25125013599480.0, + "grad_norm": 3.6658113098592984, + "language_loss": 0.7328822, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75660789, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13452148, + "step": 12084, + "time_per_iteration": 2.957005023956299 + }, + { + "auxiliary_loss_clip": 0.01341962, + "auxiliary_loss_mlp": 0.0103311, + "balance_loss_clip": 1.23110414, + "balance_loss_mlp": 1.02020001, + "epoch": 0.7265895084924094, + "flos": 22643413799040.0, + "grad_norm": 3.9486625553709636, + "language_loss": 0.78164953, + "learning_rate": 7.34103918847843e-07, + "loss": 0.80540025, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 1.10888672, + "router_z_loss_mlp": 0.12915039, + "step": 12085, + "time_per_iteration": 2.7694640159606934 + }, + { + "auxiliary_loss_clip": 0.01340381, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.23075247, + "balance_loss_mlp": 1.02013183, + "epoch": 0.7266496317450775, + "flos": 23373537354720.0, + "grad_norm": 1.7002326224442164, + "language_loss": 0.72650594, + "learning_rate": 7.338024238464493e-07, + "loss": 0.75024122, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.13012695, + "step": 12086, + "time_per_iteration": 2.7966113090515137 + }, + { + "auxiliary_loss_clip": 0.01332209, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.22633696, + "balance_loss_mlp": 1.01986766, + "epoch": 0.7267097549977454, + "flos": 28080495247320.0, + "grad_norm": 1.682173810628048, + "language_loss": 0.6992994, + "learning_rate": 7.335009768593938e-07, + "loss": 0.72294658, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12646484, + "step": 12087, + "time_per_iteration": 2.7986998558044434 + }, + { + "auxiliary_loss_clip": 0.01342339, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.2314167, + "balance_loss_mlp": 1.02067304, + "epoch": 0.7267698782504134, + "flos": 22200011848800.0, + "grad_norm": 1.9199770709210477, + "language_loss": 0.79029381, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81406295, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13909912, + "step": 12088, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.01340565, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.22990811, + "balance_loss_mlp": 1.01937068, + "epoch": 0.7268300015030813, + "flos": 18519442081920.0, + "grad_norm": 1.7134363041183411, + "language_loss": 0.74230361, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76603639, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 1.10693359, + "router_z_loss_mlp": 0.13348389, + "step": 12089, + "time_per_iteration": 4.227902173995972 + }, + { + "auxiliary_loss_clip": 0.01342351, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.23246908, + "balance_loss_mlp": 1.02082944, + "epoch": 0.7268901247557493, + "flos": 23991104871720.0, + "grad_norm": 2.0142934604820897, + "language_loss": 0.71458912, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73835075, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13000488, + "step": 12090, + "time_per_iteration": 4.205451488494873 + }, + { + "auxiliary_loss_clip": 0.01346413, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.23672092, + "balance_loss_mlp": 1.0203402, + "epoch": 0.7269502480084172, + "flos": 32094509860800.0, + "grad_norm": 1.7276677771025388, + "language_loss": 0.77407312, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79788601, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.14532471, + "step": 12091, + "time_per_iteration": 2.8235528469085693 + }, + { + "auxiliary_loss_clip": 0.01336907, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.22779596, + "balance_loss_mlp": 1.01829493, + "epoch": 0.7270103712610853, + "flos": 19067522049000.0, + "grad_norm": 1.887348980591116, + "language_loss": 0.71647298, + "learning_rate": 7.319944625392205e-07, + "loss": 0.74015248, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12762451, + "step": 12092, + "time_per_iteration": 2.704334020614624 + }, + { + "auxiliary_loss_clip": 0.01337455, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.23002732, + "balance_loss_mlp": 1.01685643, + "epoch": 0.7270704945137532, + "flos": 34539700943520.0, + "grad_norm": 1.777398568309418, + "language_loss": 0.61225772, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63593495, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13421631, + "step": 12093, + "time_per_iteration": 2.858299493789673 + }, + { + "auxiliary_loss_clip": 0.01333695, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.22774088, + "balance_loss_mlp": 1.01657248, + "epoch": 0.7271306177664212, + "flos": 21512672523360.0, + "grad_norm": 1.6026322540908875, + "language_loss": 0.75464851, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77828276, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13153076, + "step": 12094, + "time_per_iteration": 2.745896100997925 + }, + { + "auxiliary_loss_clip": 0.01329809, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.22402346, + "balance_loss_mlp": 1.02272391, + "epoch": 0.7271907410190891, + "flos": 22277458637280.0, + "grad_norm": 1.925662067842587, + "language_loss": 0.85016227, + "learning_rate": 7.310911308504808e-07, + "loss": 0.87381792, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13037109, + "step": 12095, + "time_per_iteration": 2.767411708831787 + }, + { + "auxiliary_loss_clip": 0.01337088, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.22780263, + "balance_loss_mlp": 1.02116418, + "epoch": 0.7272508642717571, + "flos": 22898274823080.0, + "grad_norm": 1.5940616201686817, + "language_loss": 0.78201509, + "learning_rate": 7.307901165066479e-07, + "loss": 0.80573159, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.13397217, + "step": 12096, + "time_per_iteration": 2.8999831676483154 + }, + { + "auxiliary_loss_clip": 0.01341601, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.23343015, + "balance_loss_mlp": 1.01844501, + "epoch": 0.727310987524425, + "flos": 11659618665720.0, + "grad_norm": 1.8346507371538994, + "language_loss": 0.72847581, + "learning_rate": 7.30489150291381e-07, + "loss": 0.75220442, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12817383, + "step": 12097, + "time_per_iteration": 4.24367880821228 + }, + { + "auxiliary_loss_clip": 0.01341568, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.23183095, + "balance_loss_mlp": 1.02055764, + "epoch": 0.727371110777093, + "flos": 24540727956480.0, + "grad_norm": 1.7030782467843932, + "language_loss": 0.76768357, + "learning_rate": 7.301882322160935e-07, + "loss": 0.79144341, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.13867188, + "step": 12098, + "time_per_iteration": 2.84494948387146 + }, + { + "auxiliary_loss_clip": 0.01344807, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.23178029, + "balance_loss_mlp": 1.0184232, + "epoch": 0.7274312340297611, + "flos": 74751811738200.0, + "grad_norm": 1.5942943091421675, + "language_loss": 0.67601556, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69978672, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13879395, + "step": 12099, + "time_per_iteration": 3.1199347972869873 + }, + { + "auxiliary_loss_clip": 0.01351245, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.23635769, + "balance_loss_mlp": 1.02191615, + "epoch": 0.727491357282429, + "flos": 22347555312600.0, + "grad_norm": 1.7453977486812238, + "language_loss": 0.7291109, + "learning_rate": 7.29586540531095e-07, + "loss": 0.75299352, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.15100098, + "step": 12100, + "time_per_iteration": 2.8538131713867188 + }, + { + "auxiliary_loss_clip": 0.01340051, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.23220778, + "balance_loss_mlp": 1.02183986, + "epoch": 0.727551480535097, + "flos": 23303156420880.0, + "grad_norm": 1.4022908765266302, + "language_loss": 0.74880958, + "learning_rate": 7.292857669442005e-07, + "loss": 0.77255738, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12878418, + "step": 12101, + "time_per_iteration": 2.7786166667938232 + }, + { + "auxiliary_loss_clip": 0.01332282, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.22703171, + "balance_loss_mlp": 1.01725292, + "epoch": 0.7276116037877649, + "flos": 21475573463520.0, + "grad_norm": 1.7675765804600683, + "language_loss": 0.82834733, + "learning_rate": 7.289850415429177e-07, + "loss": 0.85195982, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1171875, + "step": 12102, + "time_per_iteration": 2.751966714859009 + }, + { + "auxiliary_loss_clip": 0.01339699, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.23248172, + "balance_loss_mlp": 1.01580024, + "epoch": 0.7276717270404329, + "flos": 21468223350360.0, + "grad_norm": 2.301515284756389, + "language_loss": 0.81936568, + "learning_rate": 7.286843643386495e-07, + "loss": 0.84304023, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.11956787, + "step": 12103, + "time_per_iteration": 2.7455837726593018 + }, + { + "auxiliary_loss_clip": 0.01339901, + "auxiliary_loss_mlp": 0.01026554, + "balance_loss_clip": 1.23121643, + "balance_loss_mlp": 1.01245201, + "epoch": 0.7277318502931008, + "flos": 16841676656520.0, + "grad_norm": 1.579333343179929, + "language_loss": 0.66475868, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68842322, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.14093018, + "step": 12104, + "time_per_iteration": 2.765686273574829 + }, + { + "auxiliary_loss_clip": 0.01330097, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.22522402, + "balance_loss_mlp": 1.02022099, + "epoch": 0.7277919735457689, + "flos": 33407010466560.0, + "grad_norm": 1.8588260132930157, + "language_loss": 0.66162574, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68526125, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.13232422, + "step": 12105, + "time_per_iteration": 2.932178020477295 + }, + { + "auxiliary_loss_clip": 0.01340194, + "auxiliary_loss_mlp": 0.01034419, + "balance_loss_clip": 1.23313344, + "balance_loss_mlp": 1.02105558, + "epoch": 0.7278520967984368, + "flos": 19210964068440.0, + "grad_norm": 1.9374740197406215, + "language_loss": 0.76111835, + "learning_rate": 7.27782622021939e-07, + "loss": 0.78486449, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13354492, + "step": 12106, + "time_per_iteration": 2.8722991943359375 + }, + { + "auxiliary_loss_clip": 0.0135096, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.2394948, + "balance_loss_mlp": 1.02057672, + "epoch": 0.7279122200511048, + "flos": 34101334429920.0, + "grad_norm": 2.2730841607302668, + "language_loss": 0.71197295, + "learning_rate": 7.274821377197273e-07, + "loss": 0.73583019, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14190674, + "step": 12107, + "time_per_iteration": 3.024907350540161 + }, + { + "auxiliary_loss_clip": 0.01337915, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.22987604, + "balance_loss_mlp": 1.01740086, + "epoch": 0.7279723433037727, + "flos": 54608312109960.0, + "grad_norm": 1.4187956419924885, + "language_loss": 0.75468194, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77836704, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13189697, + "step": 12108, + "time_per_iteration": 3.1366989612579346 + }, + { + "auxiliary_loss_clip": 0.01342017, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.23356783, + "balance_loss_mlp": 1.01812768, + "epoch": 0.7280324665564407, + "flos": 36144364674960.0, + "grad_norm": 1.5398610963619423, + "language_loss": 0.67374569, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69748092, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.13366699, + "step": 12109, + "time_per_iteration": 2.8956198692321777 + }, + { + "auxiliary_loss_clip": 0.01337322, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.23046148, + "balance_loss_mlp": 1.01843297, + "epoch": 0.7280925898091086, + "flos": 11622154130640.0, + "grad_norm": 2.1380279887734264, + "language_loss": 0.641599, + "learning_rate": 7.265809743826912e-07, + "loss": 0.66529822, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.14160156, + "step": 12110, + "time_per_iteration": 2.7474381923675537 + }, + { + "auxiliary_loss_clip": 0.01343417, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.23257923, + "balance_loss_mlp": 1.01461315, + "epoch": 0.7281527130617766, + "flos": 34283784102120.0, + "grad_norm": 1.6416505016214658, + "language_loss": 0.58909059, + "learning_rate": 7.26280683164847e-07, + "loss": 0.61281449, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14379883, + "step": 12111, + "time_per_iteration": 2.837986469268799 + }, + { + "auxiliary_loss_clip": 0.0134646, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.23510444, + "balance_loss_mlp": 1.02063417, + "epoch": 0.7282128363144446, + "flos": 13922075817720.0, + "grad_norm": 3.667260005800622, + "language_loss": 0.74008596, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76388961, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13275146, + "step": 12112, + "time_per_iteration": 2.7443487644195557 + }, + { + "auxiliary_loss_clip": 0.01337919, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.23088312, + "balance_loss_mlp": 1.01706123, + "epoch": 0.7282729595671126, + "flos": 20782589576040.0, + "grad_norm": 1.8663055211214499, + "language_loss": 0.66737127, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69104671, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12554932, + "step": 12113, + "time_per_iteration": 2.7102370262145996 + }, + { + "auxiliary_loss_clip": 0.01340831, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.23188984, + "balance_loss_mlp": 1.01706076, + "epoch": 0.7283330828197806, + "flos": 16329680540280.0, + "grad_norm": 1.8476831421164874, + "language_loss": 0.73059297, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75430655, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13464355, + "step": 12114, + "time_per_iteration": 2.7594382762908936 + }, + { + "auxiliary_loss_clip": 0.01336874, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.23003316, + "balance_loss_mlp": 1.01642179, + "epoch": 0.7283932060724485, + "flos": 27495925345800.0, + "grad_norm": 1.7700485718326961, + "language_loss": 0.68271548, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70638454, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.13598633, + "step": 12115, + "time_per_iteration": 2.794077157974243 + }, + { + "auxiliary_loss_clip": 0.01348085, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.23656178, + "balance_loss_mlp": 1.01922035, + "epoch": 0.7284533293251165, + "flos": 18372304701720.0, + "grad_norm": 1.5488148818015062, + "language_loss": 0.60111868, + "learning_rate": 7.247799517967674e-07, + "loss": 0.62492955, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 1.11474609, + "router_z_loss_mlp": 0.13781738, + "step": 12116, + "time_per_iteration": 2.734417676925659 + }, + { + "auxiliary_loss_clip": 0.01340682, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.23351157, + "balance_loss_mlp": 1.01902962, + "epoch": 0.7285134525777844, + "flos": 21730515704280.0, + "grad_norm": 1.9820939725332183, + "language_loss": 0.73301619, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75673836, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12506104, + "step": 12117, + "time_per_iteration": 2.7395846843719482 + }, + { + "auxiliary_loss_clip": 0.01340944, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.23226559, + "balance_loss_mlp": 1.01575565, + "epoch": 0.7285735758304525, + "flos": 20746343291760.0, + "grad_norm": 2.71070189590706, + "language_loss": 0.69933146, + "learning_rate": 7.241799976651807e-07, + "loss": 0.72303271, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13433838, + "step": 12118, + "time_per_iteration": 2.7746317386627197 + }, + { + "auxiliary_loss_clip": 0.01328606, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.22502148, + "balance_loss_mlp": 1.01511717, + "epoch": 0.7286336990831204, + "flos": 17315396070480.0, + "grad_norm": 1.6882675437472368, + "language_loss": 0.84534097, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86890554, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.12731934, + "step": 12119, + "time_per_iteration": 2.743806838989258 + }, + { + "auxiliary_loss_clip": 0.01343378, + "auxiliary_loss_mlp": 0.01027164, + "balance_loss_clip": 1.23407114, + "balance_loss_mlp": 1.0146414, + "epoch": 0.7286938223357884, + "flos": 19791554350680.0, + "grad_norm": 2.4933274557020595, + "language_loss": 0.81856596, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84227145, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12530518, + "step": 12120, + "time_per_iteration": 2.922222137451172 + }, + { + "auxiliary_loss_clip": 0.01341816, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.23273897, + "balance_loss_mlp": 1.02290154, + "epoch": 0.7287539455884563, + "flos": 15345061435800.0, + "grad_norm": 2.0655613741230385, + "language_loss": 0.78743947, + "learning_rate": 7.232804293403963e-07, + "loss": 0.81121367, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12731934, + "step": 12121, + "time_per_iteration": 2.7858023643493652 + }, + { + "auxiliary_loss_clip": 0.01343403, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.23173273, + "balance_loss_mlp": 1.01718009, + "epoch": 0.7288140688411243, + "flos": 25198034076720.0, + "grad_norm": 1.739766150975988, + "language_loss": 0.69348115, + "learning_rate": 7.229806700436441e-07, + "loss": 0.7172246, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13769531, + "step": 12122, + "time_per_iteration": 2.7935192584991455 + }, + { + "auxiliary_loss_clip": 0.01333573, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.22764611, + "balance_loss_mlp": 1.0200882, + "epoch": 0.7288741920937922, + "flos": 23988993237000.0, + "grad_norm": 1.8113043285233277, + "language_loss": 0.87360734, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89726615, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12225342, + "step": 12123, + "time_per_iteration": 2.874966621398926 + }, + { + "auxiliary_loss_clip": 0.01333906, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.22810471, + "balance_loss_mlp": 1.01871145, + "epoch": 0.7289343153464602, + "flos": 22749756758640.0, + "grad_norm": 4.360626581203551, + "language_loss": 0.82831371, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85196483, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12481689, + "step": 12124, + "time_per_iteration": 2.7618398666381836 + }, + { + "auxiliary_loss_clip": 0.01337503, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.2306385, + "balance_loss_mlp": 1.01997423, + "epoch": 0.7289944385991282, + "flos": 24905464867440.0, + "grad_norm": 1.8249632293674372, + "language_loss": 0.67557502, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69927561, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12567139, + "step": 12125, + "time_per_iteration": 2.8120131492614746 + }, + { + "auxiliary_loss_clip": 0.0134546, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.2326324, + "balance_loss_mlp": 1.02727425, + "epoch": 0.7290545618517962, + "flos": 22972351117680.0, + "grad_norm": 2.0563856567554355, + "language_loss": 0.75944078, + "learning_rate": 7.217821172172855e-07, + "loss": 0.78331399, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14593506, + "step": 12126, + "time_per_iteration": 2.7937090396881104 + }, + { + "auxiliary_loss_clip": 0.01156269, + "auxiliary_loss_mlp": 0.01010016, + "balance_loss_clip": 1.11421824, + "balance_loss_mlp": 1.00782228, + "epoch": 0.7291146851044642, + "flos": 61917199019640.0, + "grad_norm": 0.8299747109677581, + "language_loss": 0.58681405, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60847694, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.02197266, + "step": 12127, + "time_per_iteration": 4.620906829833984 + }, + { + "auxiliary_loss_clip": 0.01335246, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.22972107, + "balance_loss_mlp": 1.01882172, + "epoch": 0.7291748083571321, + "flos": 23336154036360.0, + "grad_norm": 1.9763046045282489, + "language_loss": 0.69068491, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71434808, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12249756, + "step": 12128, + "time_per_iteration": 4.2821526527404785 + }, + { + "auxiliary_loss_clip": 0.01343746, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.23412704, + "balance_loss_mlp": 1.02125835, + "epoch": 0.7292349316098001, + "flos": 28335924788400.0, + "grad_norm": 1.9849935925803472, + "language_loss": 0.65616018, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67994732, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13720703, + "step": 12129, + "time_per_iteration": 2.867527484893799 + }, + { + "auxiliary_loss_clip": 0.01331875, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.22685313, + "balance_loss_mlp": 1.01498127, + "epoch": 0.729295054862468, + "flos": 24137348868000.0, + "grad_norm": 1.9377530419995384, + "language_loss": 0.74486065, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76845545, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.1262207, + "step": 12130, + "time_per_iteration": 2.7524125576019287 + }, + { + "auxiliary_loss_clip": 0.01338798, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.23022389, + "balance_loss_mlp": 1.01835275, + "epoch": 0.7293551781151361, + "flos": 22820868642960.0, + "grad_norm": 1.5821962793095734, + "language_loss": 0.70371395, + "learning_rate": 7.202850168478374e-07, + "loss": 0.72741622, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1307373, + "step": 12131, + "time_per_iteration": 2.9184458255767822 + }, + { + "auxiliary_loss_clip": 0.01334172, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.22832763, + "balance_loss_mlp": 1.01766253, + "epoch": 0.729415301367804, + "flos": 22131701941320.0, + "grad_norm": 1.4952207699356515, + "language_loss": 0.77510321, + "learning_rate": 7.199857423093025e-07, + "loss": 0.7987439, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12231445, + "step": 12132, + "time_per_iteration": 2.842106342315674 + }, + { + "auxiliary_loss_clip": 0.01343285, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.23609483, + "balance_loss_mlp": 1.02551389, + "epoch": 0.729475424620472, + "flos": 12353495937120.0, + "grad_norm": 2.1659099872832397, + "language_loss": 0.80025202, + "learning_rate": 7.196865163090358e-07, + "loss": 0.82406563, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12561035, + "step": 12133, + "time_per_iteration": 2.7485969066619873 + }, + { + "auxiliary_loss_clip": 0.01339591, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.23256636, + "balance_loss_mlp": 1.01783895, + "epoch": 0.7295355478731399, + "flos": 22199768198640.0, + "grad_norm": 1.8945519578352574, + "language_loss": 0.72345817, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74715799, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12567139, + "step": 12134, + "time_per_iteration": 2.77654767036438 + }, + { + "auxiliary_loss_clip": 0.0134256, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.23411465, + "balance_loss_mlp": 1.02525663, + "epoch": 0.7295956711258079, + "flos": 23227293358440.0, + "grad_norm": 1.6011220991717658, + "language_loss": 0.71397364, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73778701, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13525391, + "step": 12135, + "time_per_iteration": 4.32539439201355 + }, + { + "auxiliary_loss_clip": 0.01343404, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.23418796, + "balance_loss_mlp": 1.01977539, + "epoch": 0.7296557943784758, + "flos": 31875489037440.0, + "grad_norm": 1.8527049030973466, + "language_loss": 0.63007963, + "learning_rate": 7.187891296513075e-07, + "loss": 0.65383899, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12756348, + "step": 12136, + "time_per_iteration": 2.8372955322265625 + }, + { + "auxiliary_loss_clip": 0.01339251, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.23177612, + "balance_loss_mlp": 1.02597427, + "epoch": 0.7297159176311439, + "flos": 26657428412520.0, + "grad_norm": 1.8468517039869874, + "language_loss": 0.7485882, + "learning_rate": 7.184900979175654e-07, + "loss": 0.77237093, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13049316, + "step": 12137, + "time_per_iteration": 2.831974983215332 + }, + { + "auxiliary_loss_clip": 0.01341587, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.2335676, + "balance_loss_mlp": 1.01963615, + "epoch": 0.7297760408838118, + "flos": 24754388476320.0, + "grad_norm": 1.5568725483597576, + "language_loss": 0.74215543, + "learning_rate": 7.181911147788069e-07, + "loss": 0.7658965, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12896729, + "step": 12138, + "time_per_iteration": 2.784648895263672 + }, + { + "auxiliary_loss_clip": 0.01336478, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.2300235, + "balance_loss_mlp": 1.01629829, + "epoch": 0.7298361641364798, + "flos": 18077867507880.0, + "grad_norm": 2.907778840736602, + "language_loss": 0.71614814, + "learning_rate": 7.178921802463702e-07, + "loss": 0.73980761, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.13171387, + "step": 12139, + "time_per_iteration": 2.771308183670044 + }, + { + "auxiliary_loss_clip": 0.01337852, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.23290682, + "balance_loss_mlp": 1.01850367, + "epoch": 0.7298962873891478, + "flos": 29901296608560.0, + "grad_norm": 1.7502552546754295, + "language_loss": 0.73740548, + "learning_rate": 7.175932943315898e-07, + "loss": 0.76108992, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12084961, + "step": 12140, + "time_per_iteration": 2.839245557785034 + }, + { + "auxiliary_loss_clip": 0.01342882, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.23320508, + "balance_loss_mlp": 1.01803231, + "epoch": 0.7299564106418157, + "flos": 32271964704720.0, + "grad_norm": 1.4451908161596347, + "language_loss": 0.55481875, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57855916, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13122559, + "step": 12141, + "time_per_iteration": 2.8532919883728027 + }, + { + "auxiliary_loss_clip": 0.01331868, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.22809052, + "balance_loss_mlp": 1.01672089, + "epoch": 0.7300165338944837, + "flos": 22935536316360.0, + "grad_norm": 1.4693778465067786, + "language_loss": 0.72698838, + "learning_rate": 7.169956684003342e-07, + "loss": 0.75059742, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.12310791, + "step": 12142, + "time_per_iteration": 2.7704854011535645 + }, + { + "auxiliary_loss_clip": 0.01336152, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.22982323, + "balance_loss_mlp": 1.01760137, + "epoch": 0.7300766571471516, + "flos": 19833607630440.0, + "grad_norm": 1.7741198853958255, + "language_loss": 0.73692656, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76058578, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.1217041, + "step": 12143, + "time_per_iteration": 2.893864393234253 + }, + { + "auxiliary_loss_clip": 0.01341942, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.2343781, + "balance_loss_mlp": 1.01686823, + "epoch": 0.7301367803998197, + "flos": 24352552505520.0, + "grad_norm": 1.9395388568753038, + "language_loss": 0.67382646, + "learning_rate": 7.163982370756882e-07, + "loss": 0.69754708, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13262939, + "step": 12144, + "time_per_iteration": 2.8871405124664307 + }, + { + "auxiliary_loss_clip": 0.01342321, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.23340309, + "balance_loss_mlp": 1.01702631, + "epoch": 0.7301969036524876, + "flos": 15308733934800.0, + "grad_norm": 1.8430593312875612, + "language_loss": 0.7922011, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81592435, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12969971, + "step": 12145, + "time_per_iteration": 2.740816831588745 + }, + { + "auxiliary_loss_clip": 0.01335845, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.22885609, + "balance_loss_mlp": 1.01919699, + "epoch": 0.7302570269051556, + "flos": 23511700287360.0, + "grad_norm": 1.7506057823254564, + "language_loss": 0.91699749, + "learning_rate": 7.158010004482702e-07, + "loss": 0.94068491, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13684082, + "step": 12146, + "time_per_iteration": 2.7984001636505127 + }, + { + "auxiliary_loss_clip": 0.01332691, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.22838259, + "balance_loss_mlp": 1.01509905, + "epoch": 0.7303171501578235, + "flos": 20528094027240.0, + "grad_norm": 1.5451256658876795, + "language_loss": 0.62217784, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64577961, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12384033, + "step": 12147, + "time_per_iteration": 2.792473793029785 + }, + { + "auxiliary_loss_clip": 0.01345779, + "auxiliary_loss_mlp": 0.01035146, + "balance_loss_clip": 1.23656321, + "balance_loss_mlp": 1.02135992, + "epoch": 0.7303772734104915, + "flos": 18337195451520.0, + "grad_norm": 1.8759268623788803, + "language_loss": 0.75451577, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77832508, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.13781738, + "step": 12148, + "time_per_iteration": 2.7946391105651855 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.00999001, + "balance_loss_clip": 1.11475897, + "balance_loss_mlp": 0.99678355, + "epoch": 0.7304373966631594, + "flos": 60669475394040.0, + "grad_norm": 0.7999736356580981, + "language_loss": 0.5673781, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58893859, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.0222168, + "step": 12149, + "time_per_iteration": 3.1963276863098145 + }, + { + "auxiliary_loss_clip": 0.01345794, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.23639059, + "balance_loss_mlp": 1.02004361, + "epoch": 0.7304975199158275, + "flos": 19832795463240.0, + "grad_norm": 1.685616821527034, + "language_loss": 0.74311829, + "learning_rate": 7.146071116474451e-07, + "loss": 0.7669121, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13555908, + "step": 12150, + "time_per_iteration": 2.8442978858947754 + }, + { + "auxiliary_loss_clip": 0.01346453, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.23653591, + "balance_loss_mlp": 1.02068043, + "epoch": 0.7305576431684954, + "flos": 13227833071080.0, + "grad_norm": 2.0323037610444827, + "language_loss": 0.84324563, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86704719, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13012695, + "step": 12151, + "time_per_iteration": 2.9518380165100098 + }, + { + "auxiliary_loss_clip": 0.01342533, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.23353672, + "balance_loss_mlp": 1.02260578, + "epoch": 0.7306177664211634, + "flos": 24065749683360.0, + "grad_norm": 1.8387875147506305, + "language_loss": 0.78274375, + "learning_rate": 7.14010459655127e-07, + "loss": 0.8065291, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13415527, + "step": 12152, + "time_per_iteration": 2.784770965576172 + }, + { + "auxiliary_loss_clip": 0.01341496, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.2328999, + "balance_loss_mlp": 1.02040112, + "epoch": 0.7306778896738314, + "flos": 27094820325480.0, + "grad_norm": 1.4690895766047103, + "language_loss": 0.79843175, + "learning_rate": 7.137122068005919e-07, + "loss": 0.82218343, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13256836, + "step": 12153, + "time_per_iteration": 2.8114922046661377 + }, + { + "auxiliary_loss_clip": 0.01346959, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.23575783, + "balance_loss_mlp": 1.02151155, + "epoch": 0.7307380129264993, + "flos": 16695026576640.0, + "grad_norm": 1.790317577333078, + "language_loss": 0.67825854, + "learning_rate": 7.134140027222173e-07, + "loss": 0.70207596, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13275146, + "step": 12154, + "time_per_iteration": 2.711789846420288 + }, + { + "auxiliary_loss_clip": 0.0134588, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.23579443, + "balance_loss_mlp": 1.01656103, + "epoch": 0.7307981361791673, + "flos": 21730759354440.0, + "grad_norm": 1.920294909283068, + "language_loss": 0.66267681, + "learning_rate": 7.131158474313128e-07, + "loss": 0.6864351, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.13378906, + "step": 12155, + "time_per_iteration": 2.8437657356262207 + }, + { + "auxiliary_loss_clip": 0.01332841, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.22805119, + "balance_loss_mlp": 1.01810718, + "epoch": 0.7308582594318352, + "flos": 18045194759280.0, + "grad_norm": 1.790925136794703, + "language_loss": 0.82022452, + "learning_rate": 7.128177409391851e-07, + "loss": 0.84386426, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.13031006, + "step": 12156, + "time_per_iteration": 2.725797653198242 + }, + { + "auxiliary_loss_clip": 0.01334032, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.22794795, + "balance_loss_mlp": 1.02236772, + "epoch": 0.7309183826845033, + "flos": 13848974123760.0, + "grad_norm": 2.0470889948246827, + "language_loss": 0.75469053, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77837658, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12219238, + "step": 12157, + "time_per_iteration": 2.7564027309417725 + }, + { + "auxiliary_loss_clip": 0.01330478, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.22656083, + "balance_loss_mlp": 1.01685596, + "epoch": 0.7309785059371712, + "flos": 17023639028400.0, + "grad_norm": 2.1655362787841512, + "language_loss": 0.73055375, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75414377, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11682129, + "step": 12158, + "time_per_iteration": 2.7856903076171875 + }, + { + "auxiliary_loss_clip": 0.01343393, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.2343322, + "balance_loss_mlp": 1.01688087, + "epoch": 0.7310386291898392, + "flos": 26507651488920.0, + "grad_norm": 1.9195312727280438, + "language_loss": 0.85592067, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87965506, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1317749, + "step": 12159, + "time_per_iteration": 2.82037091255188 + }, + { + "auxiliary_loss_clip": 0.0135152, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.23929679, + "balance_loss_mlp": 1.01718712, + "epoch": 0.7310987524425071, + "flos": 16950293684280.0, + "grad_norm": 2.2673578887149346, + "language_loss": 0.73693931, + "learning_rate": 7.116258031844895e-07, + "loss": 0.76076853, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14227295, + "step": 12160, + "time_per_iteration": 2.784881830215454 + }, + { + "auxiliary_loss_clip": 0.01348297, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.23622239, + "balance_loss_mlp": 1.01600194, + "epoch": 0.7311588756951751, + "flos": 13849339599000.0, + "grad_norm": 1.8045768972241567, + "language_loss": 0.73089421, + "learning_rate": 7.113279408557675e-07, + "loss": 0.7546761, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13879395, + "step": 12161, + "time_per_iteration": 2.79365873336792 + }, + { + "auxiliary_loss_clip": 0.01354737, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.24039137, + "balance_loss_mlp": 1.01684618, + "epoch": 0.731218998947843, + "flos": 28773885218400.0, + "grad_norm": 1.7770999723529317, + "language_loss": 0.70237267, + "learning_rate": 7.110301273936192e-07, + "loss": 0.72622609, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.13757324, + "step": 12162, + "time_per_iteration": 2.8671023845672607 + }, + { + "auxiliary_loss_clip": 0.01345993, + "auxiliary_loss_mlp": 0.0102671, + "balance_loss_clip": 1.23568189, + "balance_loss_mlp": 1.0134778, + "epoch": 0.7312791222005111, + "flos": 27094454850240.0, + "grad_norm": 1.9491205742362965, + "language_loss": 0.67632598, + "learning_rate": 7.107323628093382e-07, + "loss": 0.70005298, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13238525, + "step": 12163, + "time_per_iteration": 2.811547040939331 + }, + { + "auxiliary_loss_clip": 0.01340129, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.23167872, + "balance_loss_mlp": 1.01794338, + "epoch": 0.731339245453179, + "flos": 20929077222480.0, + "grad_norm": 1.5416907824418624, + "language_loss": 0.68772483, + "learning_rate": 7.104346471142153e-07, + "loss": 0.71144438, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13897705, + "step": 12164, + "time_per_iteration": 2.807384490966797 + }, + { + "auxiliary_loss_clip": 0.01336394, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.23223615, + "balance_loss_mlp": 1.01805604, + "epoch": 0.731399368705847, + "flos": 23080886928720.0, + "grad_norm": 1.6620459182085119, + "language_loss": 0.73324966, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75691581, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12164307, + "step": 12165, + "time_per_iteration": 4.20534610748291 + }, + { + "auxiliary_loss_clip": 0.01343008, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.23404849, + "balance_loss_mlp": 1.02026379, + "epoch": 0.731459491958515, + "flos": 23587441524720.0, + "grad_norm": 1.985172801057102, + "language_loss": 0.76999015, + "learning_rate": 7.098393624365988e-07, + "loss": 0.79375738, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13452148, + "step": 12166, + "time_per_iteration": 4.511265754699707 + }, + { + "auxiliary_loss_clip": 0.01330678, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.22539186, + "balance_loss_mlp": 1.0183804, + "epoch": 0.7315196152111829, + "flos": 22383964030320.0, + "grad_norm": 1.8617580325832848, + "language_loss": 0.80225933, + "learning_rate": 7.095417934766781e-07, + "loss": 0.82588017, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.13031006, + "step": 12167, + "time_per_iteration": 4.26976203918457 + }, + { + "auxiliary_loss_clip": 0.01330399, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.22472072, + "balance_loss_mlp": 1.02395701, + "epoch": 0.7315797384638509, + "flos": 26182653181200.0, + "grad_norm": 1.5720146216473436, + "language_loss": 0.77242148, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79609174, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12677002, + "step": 12168, + "time_per_iteration": 2.8786370754241943 + }, + { + "auxiliary_loss_clip": 0.01342218, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.23177922, + "balance_loss_mlp": 1.0183742, + "epoch": 0.7316398617165188, + "flos": 21511251230760.0, + "grad_norm": 1.714228684490103, + "language_loss": 0.82082117, + "learning_rate": 7.089468023710326e-07, + "loss": 0.84456068, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13342285, + "step": 12169, + "time_per_iteration": 2.7905843257904053 + }, + { + "auxiliary_loss_clip": 0.01340895, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.23079276, + "balance_loss_mlp": 1.02113867, + "epoch": 0.7316999849691869, + "flos": 30488749703640.0, + "grad_norm": 1.6232165733180417, + "language_loss": 0.70569777, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72944868, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13037109, + "step": 12170, + "time_per_iteration": 2.817500114440918 + }, + { + "auxiliary_loss_clip": 0.01333042, + "auxiliary_loss_mlp": 0.01027307, + "balance_loss_clip": 1.22519112, + "balance_loss_mlp": 1.01300788, + "epoch": 0.7317601082218548, + "flos": 21548837590920.0, + "grad_norm": 1.9095249467931268, + "language_loss": 0.69925171, + "learning_rate": 7.083520070928533e-07, + "loss": 0.72285521, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.14282227, + "step": 12171, + "time_per_iteration": 2.832465171813965 + }, + { + "auxiliary_loss_clip": 0.01333762, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.22633624, + "balance_loss_mlp": 1.01969647, + "epoch": 0.7318202314745228, + "flos": 33257639626560.0, + "grad_norm": 1.8916080496906373, + "language_loss": 0.66215897, + "learning_rate": 7.080546829172564e-07, + "loss": 0.68582201, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.128479, + "step": 12172, + "time_per_iteration": 2.8483588695526123 + }, + { + "auxiliary_loss_clip": 0.01336472, + "auxiliary_loss_mlp": 0.01024995, + "balance_loss_clip": 1.22775471, + "balance_loss_mlp": 1.01225102, + "epoch": 0.7318803547271907, + "flos": 20161895215320.0, + "grad_norm": 2.461536417408324, + "language_loss": 0.61663061, + "learning_rate": 7.077574077323564e-07, + "loss": 0.64024532, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12750244, + "step": 12173, + "time_per_iteration": 4.2757508754730225 + }, + { + "auxiliary_loss_clip": 0.01331063, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.22423077, + "balance_loss_mlp": 1.01261759, + "epoch": 0.7319404779798587, + "flos": 20563446927600.0, + "grad_norm": 4.88784407196692, + "language_loss": 0.7477774, + "learning_rate": 7.074601815494243e-07, + "loss": 0.77134097, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12670898, + "step": 12174, + "time_per_iteration": 2.7977776527404785 + }, + { + "auxiliary_loss_clip": 0.01334302, + "auxiliary_loss_mlp": 0.01024165, + "balance_loss_clip": 1.22920513, + "balance_loss_mlp": 1.01186836, + "epoch": 0.7320006012325266, + "flos": 28701433258200.0, + "grad_norm": 1.5169152570950037, + "language_loss": 0.80877078, + "learning_rate": 7.071630043797317e-07, + "loss": 0.8323555, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.1229248, + "step": 12175, + "time_per_iteration": 2.785287618637085 + }, + { + "auxiliary_loss_clip": 0.01335123, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.22722864, + "balance_loss_mlp": 1.0155381, + "epoch": 0.7320607244851947, + "flos": 16367063858640.0, + "grad_norm": 2.0477450161671737, + "language_loss": 0.76862651, + "learning_rate": 7.068658762345488e-07, + "loss": 0.79226196, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12884521, + "step": 12176, + "time_per_iteration": 2.749937057495117 + }, + { + "auxiliary_loss_clip": 0.0133281, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.22681391, + "balance_loss_mlp": 1.02014267, + "epoch": 0.7321208477378626, + "flos": 20959435294560.0, + "grad_norm": 1.4536556616282237, + "language_loss": 0.76699924, + "learning_rate": 7.065687971251399e-07, + "loss": 0.79065937, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.13067627, + "step": 12177, + "time_per_iteration": 2.750506639480591 + }, + { + "auxiliary_loss_clip": 0.01326664, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.22001505, + "balance_loss_mlp": 1.02084684, + "epoch": 0.7321809709905306, + "flos": 13849095948840.0, + "grad_norm": 1.9198283391653488, + "language_loss": 0.74318641, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76678228, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12078857, + "step": 12178, + "time_per_iteration": 2.90246319770813 + }, + { + "auxiliary_loss_clip": 0.01337042, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.22760248, + "balance_loss_mlp": 1.0236944, + "epoch": 0.7322410942431986, + "flos": 26985675389040.0, + "grad_norm": 1.914912247441805, + "language_loss": 0.82511866, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84885323, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12744141, + "step": 12179, + "time_per_iteration": 2.8725807666778564 + }, + { + "auxiliary_loss_clip": 0.01325614, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.22401345, + "balance_loss_mlp": 1.01834965, + "epoch": 0.7323012174958665, + "flos": 17644252172400.0, + "grad_norm": 1.6367645427200026, + "language_loss": 0.74744928, + "learning_rate": 7.056778541242115e-07, + "loss": 0.77100515, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.1161499, + "step": 12180, + "time_per_iteration": 2.747018814086914 + }, + { + "auxiliary_loss_clip": 0.01338983, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.22689342, + "balance_loss_mlp": 1.01836991, + "epoch": 0.7323613407485345, + "flos": 32349127234680.0, + "grad_norm": 2.270705386919084, + "language_loss": 0.79635251, + "learning_rate": 7.053809712705396e-07, + "loss": 0.82006145, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13555908, + "step": 12181, + "time_per_iteration": 2.91794753074646 + }, + { + "auxiliary_loss_clip": 0.01343851, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.2342124, + "balance_loss_mlp": 1.01959562, + "epoch": 0.7324214640012024, + "flos": 18366538314600.0, + "grad_norm": 1.6802108078935534, + "language_loss": 0.7232213, + "learning_rate": 7.050841375089506e-07, + "loss": 0.74698418, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12835693, + "step": 12182, + "time_per_iteration": 2.7416045665740967 + }, + { + "auxiliary_loss_clip": 0.01336702, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.22848213, + "balance_loss_mlp": 1.02067327, + "epoch": 0.7324815872538705, + "flos": 30818702231280.0, + "grad_norm": 1.501361021905092, + "language_loss": 0.71441638, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73811722, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12719727, + "step": 12183, + "time_per_iteration": 2.877697467803955 + }, + { + "auxiliary_loss_clip": 0.01344973, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.23439181, + "balance_loss_mlp": 1.0222404, + "epoch": 0.7325417105065384, + "flos": 21509829938160.0, + "grad_norm": 1.9433935405807479, + "language_loss": 0.72974062, + "learning_rate": 7.04490617307045e-07, + "loss": 0.75355268, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13977051, + "step": 12184, + "time_per_iteration": 2.7586452960968018 + }, + { + "auxiliary_loss_clip": 0.01157049, + "auxiliary_loss_mlp": 0.01000462, + "balance_loss_clip": 1.11227119, + "balance_loss_mlp": 0.99795818, + "epoch": 0.7326018337592064, + "flos": 67273178927040.0, + "grad_norm": 0.762615071271842, + "language_loss": 0.65279067, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67436576, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02502441, + "step": 12185, + "time_per_iteration": 3.2472290992736816 + }, + { + "auxiliary_loss_clip": 0.01338428, + "auxiliary_loss_mlp": 0.01024818, + "balance_loss_clip": 1.22840428, + "balance_loss_mlp": 1.01179993, + "epoch": 0.7326619570118743, + "flos": 22862272188960.0, + "grad_norm": 1.8491926996739543, + "language_loss": 0.8046959, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82832837, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13024902, + "step": 12186, + "time_per_iteration": 2.809157371520996 + }, + { + "auxiliary_loss_clip": 0.01337503, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.22655225, + "balance_loss_mlp": 1.01861525, + "epoch": 0.7327220802645423, + "flos": 23332255633800.0, + "grad_norm": 1.5821329185479742, + "language_loss": 0.73664218, + "learning_rate": 7.036007054761508e-07, + "loss": 0.76034415, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.140625, + "step": 12187, + "time_per_iteration": 2.7873952388763428 + }, + { + "auxiliary_loss_clip": 0.01337789, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.22885489, + "balance_loss_mlp": 1.02253246, + "epoch": 0.7327822035172102, + "flos": 23185280687040.0, + "grad_norm": 2.0119591846374094, + "language_loss": 0.88967896, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91341245, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13024902, + "step": 12188, + "time_per_iteration": 2.84440016746521 + }, + { + "auxiliary_loss_clip": 0.01341092, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.23018217, + "balance_loss_mlp": 1.01864994, + "epoch": 0.7328423267698783, + "flos": 21071219774400.0, + "grad_norm": 1.8419868571242148, + "language_loss": 0.75216466, + "learning_rate": 7.030076767014284e-07, + "loss": 0.77589655, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13452148, + "step": 12189, + "time_per_iteration": 2.771441698074341 + }, + { + "auxiliary_loss_clip": 0.01342191, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.23135614, + "balance_loss_mlp": 1.01938677, + "epoch": 0.7329024500225462, + "flos": 21694634895240.0, + "grad_norm": 1.4965792810519438, + "language_loss": 0.82416785, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84791303, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12927246, + "step": 12190, + "time_per_iteration": 2.877183437347412 + }, + { + "auxiliary_loss_clip": 0.01337915, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.22831595, + "balance_loss_mlp": 1.02040768, + "epoch": 0.7329625732752142, + "flos": 24168559715640.0, + "grad_norm": 1.626300944255319, + "language_loss": 0.71933341, + "learning_rate": 7.024148446550204e-07, + "loss": 0.74305922, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14251709, + "step": 12191, + "time_per_iteration": 2.752126455307007 + }, + { + "auxiliary_loss_clip": 0.01333616, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.22552228, + "balance_loss_mlp": 1.01970744, + "epoch": 0.7330226965278822, + "flos": 30083624455680.0, + "grad_norm": 1.6537181266302077, + "language_loss": 0.69540071, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71906948, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13574219, + "step": 12192, + "time_per_iteration": 2.854884386062622 + }, + { + "auxiliary_loss_clip": 0.01334761, + "auxiliary_loss_mlp": 0.01027556, + "balance_loss_clip": 1.22664309, + "balance_loss_mlp": 1.01492596, + "epoch": 0.7330828197805501, + "flos": 23373618571440.0, + "grad_norm": 1.519566314264574, + "language_loss": 0.73303521, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75665838, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12640381, + "step": 12193, + "time_per_iteration": 2.7844045162200928 + }, + { + "auxiliary_loss_clip": 0.01337107, + "auxiliary_loss_mlp": 0.01033789, + "balance_loss_clip": 1.22692883, + "balance_loss_mlp": 1.02079487, + "epoch": 0.7331429430332181, + "flos": 21037775466960.0, + "grad_norm": 1.5951295751292476, + "language_loss": 0.77547765, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79918653, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13006592, + "step": 12194, + "time_per_iteration": 2.8141603469848633 + }, + { + "auxiliary_loss_clip": 0.01333494, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.22611105, + "balance_loss_mlp": 1.01829624, + "epoch": 0.733203066285886, + "flos": 14652036939960.0, + "grad_norm": 1.7039651808926506, + "language_loss": 0.70695102, + "learning_rate": 7.012297711067998e-07, + "loss": 0.73060858, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13946533, + "step": 12195, + "time_per_iteration": 2.777909755706787 + }, + { + "auxiliary_loss_clip": 0.01335449, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.22616637, + "balance_loss_mlp": 1.02192998, + "epoch": 0.7332631895385541, + "flos": 17169761199600.0, + "grad_norm": 1.8294411413432397, + "language_loss": 0.72285098, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74655151, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12677002, + "step": 12196, + "time_per_iteration": 2.8141491413116455 + }, + { + "auxiliary_loss_clip": 0.01329459, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.22281909, + "balance_loss_mlp": 1.01842237, + "epoch": 0.733323312791222, + "flos": 28664090548200.0, + "grad_norm": 1.5743486407702951, + "language_loss": 0.71921295, + "learning_rate": 7.006375297847394e-07, + "loss": 0.74282366, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13195801, + "step": 12197, + "time_per_iteration": 2.8526175022125244 + }, + { + "auxiliary_loss_clip": 0.01348415, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.23420203, + "balance_loss_mlp": 1.02152193, + "epoch": 0.73338343604389, + "flos": 16622858874960.0, + "grad_norm": 1.9328813159358393, + "language_loss": 0.78459054, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80843687, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14697266, + "step": 12198, + "time_per_iteration": 2.7625930309295654 + }, + { + "auxiliary_loss_clip": 0.0133541, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.22699022, + "balance_loss_mlp": 1.02000999, + "epoch": 0.7334435592965579, + "flos": 21147042228480.0, + "grad_norm": 1.7994873703377132, + "language_loss": 0.7524451, + "learning_rate": 7.000454855504974e-07, + "loss": 0.77612865, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12939453, + "step": 12199, + "time_per_iteration": 2.846630096435547 + }, + { + "auxiliary_loss_clip": 0.01342556, + "auxiliary_loss_mlp": 0.01035433, + "balance_loss_clip": 1.23060226, + "balance_loss_mlp": 1.02161098, + "epoch": 0.7335036825492259, + "flos": 17129332254240.0, + "grad_norm": 2.295123568360772, + "language_loss": 0.77068484, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79446471, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13818359, + "step": 12200, + "time_per_iteration": 2.7824485301971436 + }, + { + "auxiliary_loss_clip": 0.01336316, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.22854257, + "balance_loss_mlp": 1.02211809, + "epoch": 0.7335638058018938, + "flos": 23737015406520.0, + "grad_norm": 1.5990626245309776, + "language_loss": 0.61328489, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63699996, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13079834, + "step": 12201, + "time_per_iteration": 2.8858416080474854 + }, + { + "auxiliary_loss_clip": 0.01328389, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.22208679, + "balance_loss_mlp": 1.01340652, + "epoch": 0.7336239290545619, + "flos": 34939993796640.0, + "grad_norm": 1.7881325949596911, + "language_loss": 0.52307379, + "learning_rate": 6.991577889352264e-07, + "loss": 0.54661435, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12255859, + "step": 12202, + "time_per_iteration": 3.0624330043792725 + }, + { + "auxiliary_loss_clip": 0.01333446, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.22583723, + "balance_loss_mlp": 1.01544178, + "epoch": 0.7336840523072298, + "flos": 21107669100480.0, + "grad_norm": 1.8108673116062324, + "language_loss": 0.688196, + "learning_rate": 6.98861988704645e-07, + "loss": 0.71181667, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13171387, + "step": 12203, + "time_per_iteration": 2.8284473419189453 + }, + { + "auxiliary_loss_clip": 0.01347914, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.23465323, + "balance_loss_mlp": 1.02103043, + "epoch": 0.7337441755598978, + "flos": 24029747049240.0, + "grad_norm": 1.9214992651415321, + "language_loss": 0.65798301, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68180585, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13342285, + "step": 12204, + "time_per_iteration": 4.206365585327148 + }, + { + "auxiliary_loss_clip": 0.01331187, + "auxiliary_loss_mlp": 0.01033479, + "balance_loss_clip": 1.22533393, + "balance_loss_mlp": 1.02131402, + "epoch": 0.7338042988125658, + "flos": 22716515493000.0, + "grad_norm": 1.7778028628352232, + "language_loss": 0.77662194, + "learning_rate": 6.982705362725479e-07, + "loss": 0.80026853, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1217041, + "step": 12205, + "time_per_iteration": 4.372737884521484 + }, + { + "auxiliary_loss_clip": 0.01331804, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.22633982, + "balance_loss_mlp": 1.0172447, + "epoch": 0.7338644220652337, + "flos": 21365981835120.0, + "grad_norm": 1.473981938780351, + "language_loss": 0.80079508, + "learning_rate": 6.979748840934601e-07, + "loss": 0.8244068, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12133789, + "step": 12206, + "time_per_iteration": 2.780944585800171 + }, + { + "auxiliary_loss_clip": 0.01335232, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.22625589, + "balance_loss_mlp": 1.01629901, + "epoch": 0.7339245453179017, + "flos": 30926466483480.0, + "grad_norm": 1.9219866772980423, + "language_loss": 0.72183663, + "learning_rate": 6.976792812872958e-07, + "loss": 0.74547911, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12719727, + "step": 12207, + "time_per_iteration": 2.8678977489471436 + }, + { + "auxiliary_loss_clip": 0.01161315, + "auxiliary_loss_mlp": 0.01009826, + "balance_loss_clip": 1.11472404, + "balance_loss_mlp": 1.0074414, + "epoch": 0.7339846685705697, + "flos": 67912982844120.0, + "grad_norm": 0.7753713976535536, + "language_loss": 0.54876792, + "learning_rate": 6.97383727865263e-07, + "loss": 0.57047927, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.02380371, + "step": 12208, + "time_per_iteration": 3.3165431022644043 + }, + { + "auxiliary_loss_clip": 0.01334103, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.22558987, + "balance_loss_mlp": 1.01913953, + "epoch": 0.7340447918232377, + "flos": 22241862086760.0, + "grad_norm": 1.3317101754516285, + "language_loss": 0.80611646, + "learning_rate": 6.970882238385703e-07, + "loss": 0.8297655, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.11651611, + "step": 12209, + "time_per_iteration": 2.749173164367676 + }, + { + "auxiliary_loss_clip": 0.0133186, + "auxiliary_loss_mlp": 0.01028233, + "balance_loss_clip": 1.22488046, + "balance_loss_mlp": 1.01559079, + "epoch": 0.7341049150759056, + "flos": 23769525721680.0, + "grad_norm": 1.3825590127584846, + "language_loss": 0.78904724, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81264818, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12658691, + "step": 12210, + "time_per_iteration": 2.8152878284454346 + }, + { + "auxiliary_loss_clip": 0.01332141, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.22478914, + "balance_loss_mlp": 1.01421928, + "epoch": 0.7341650383285736, + "flos": 17240791867200.0, + "grad_norm": 1.6162442655490958, + "language_loss": 0.76600969, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78960681, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13354492, + "step": 12211, + "time_per_iteration": 2.8121533393859863 + }, + { + "auxiliary_loss_clip": 0.01336744, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.22815847, + "balance_loss_mlp": 1.0196861, + "epoch": 0.7342251615812415, + "flos": 23409174513600.0, + "grad_norm": 2.1794457997190957, + "language_loss": 0.72857225, + "learning_rate": 6.962020082425748e-07, + "loss": 0.75226629, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13000488, + "step": 12212, + "time_per_iteration": 4.33378267288208 + }, + { + "auxiliary_loss_clip": 0.01334578, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.22701299, + "balance_loss_mlp": 1.01787865, + "epoch": 0.7342852848339095, + "flos": 22752315085320.0, + "grad_norm": 1.6933230724757657, + "language_loss": 0.68863612, + "learning_rate": 6.959067019092766e-07, + "loss": 0.71228594, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12512207, + "step": 12213, + "time_per_iteration": 2.8600287437438965 + }, + { + "auxiliary_loss_clip": 0.01160108, + "auxiliary_loss_mlp": 0.01007125, + "balance_loss_clip": 1.11361814, + "balance_loss_mlp": 1.0045737, + "epoch": 0.7343454080865774, + "flos": 53956302057720.0, + "grad_norm": 0.7806451429207524, + "language_loss": 0.54285991, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56453216, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.0255127, + "step": 12214, + "time_per_iteration": 3.123809337615967 + }, + { + "auxiliary_loss_clip": 0.01344347, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.23146939, + "balance_loss_mlp": 1.01706219, + "epoch": 0.7344055313392455, + "flos": 12170680789680.0, + "grad_norm": 1.925324473726449, + "language_loss": 0.70460558, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72834915, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.12963867, + "step": 12215, + "time_per_iteration": 2.8225371837615967 + }, + { + "auxiliary_loss_clip": 0.01327611, + "auxiliary_loss_mlp": 0.01026548, + "balance_loss_clip": 1.22236419, + "balance_loss_mlp": 1.01454318, + "epoch": 0.7344656545919134, + "flos": 18554470115400.0, + "grad_norm": 1.6179304149296083, + "language_loss": 0.73155117, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75509274, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12005615, + "step": 12216, + "time_per_iteration": 2.7490968704223633 + }, + { + "auxiliary_loss_clip": 0.01346763, + "auxiliary_loss_mlp": 0.01035013, + "balance_loss_clip": 1.23169136, + "balance_loss_mlp": 1.02019489, + "epoch": 0.7345257778445814, + "flos": 23667202989720.0, + "grad_norm": 1.666323777176504, + "language_loss": 0.78149194, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80530965, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14807129, + "step": 12217, + "time_per_iteration": 2.8151440620422363 + }, + { + "auxiliary_loss_clip": 0.01326143, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.22002232, + "balance_loss_mlp": 1.01604962, + "epoch": 0.7345859010972494, + "flos": 13812687231120.0, + "grad_norm": 1.7437304307172208, + "language_loss": 0.77789497, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80143642, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.11968994, + "step": 12218, + "time_per_iteration": 2.7805657386779785 + }, + { + "auxiliary_loss_clip": 0.01330047, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.22389483, + "balance_loss_mlp": 1.0150646, + "epoch": 0.7346460243499173, + "flos": 22277783504160.0, + "grad_norm": 1.6082215791205825, + "language_loss": 0.72584879, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74943435, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13439941, + "step": 12219, + "time_per_iteration": 2.747467279434204 + }, + { + "auxiliary_loss_clip": 0.01329688, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.2244091, + "balance_loss_mlp": 1.0157665, + "epoch": 0.7347061476025853, + "flos": 23260087932120.0, + "grad_norm": 1.6718032140176513, + "language_loss": 0.7512961, + "learning_rate": 6.938409428408061e-07, + "loss": 0.77487648, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12597656, + "step": 12220, + "time_per_iteration": 2.851153612136841 + }, + { + "auxiliary_loss_clip": 0.0134186, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.22986448, + "balance_loss_mlp": 1.01248932, + "epoch": 0.7347662708552533, + "flos": 15271716091680.0, + "grad_norm": 2.523017687274033, + "language_loss": 0.66163063, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68530023, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 1.11865234, + "router_z_loss_mlp": 0.12615967, + "step": 12221, + "time_per_iteration": 2.7499709129333496 + }, + { + "auxiliary_loss_clip": 0.01333418, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.22487402, + "balance_loss_mlp": 1.01753592, + "epoch": 0.7348263941079213, + "flos": 24864954705360.0, + "grad_norm": 1.8867650973320522, + "language_loss": 0.69401121, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71764493, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12408447, + "step": 12222, + "time_per_iteration": 2.840850591659546 + }, + { + "auxiliary_loss_clip": 0.0132834, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.22171319, + "balance_loss_mlp": 1.01903009, + "epoch": 0.7348865173605892, + "flos": 24357384900360.0, + "grad_norm": 1.568476443253762, + "language_loss": 0.66175914, + "learning_rate": 6.92956360247217e-07, + "loss": 0.68535185, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11907959, + "step": 12223, + "time_per_iteration": 2.8052003383636475 + }, + { + "auxiliary_loss_clip": 0.01333375, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.22568166, + "balance_loss_mlp": 1.01754165, + "epoch": 0.7349466406132572, + "flos": 20008585364400.0, + "grad_norm": 1.7057051388709787, + "language_loss": 0.72321343, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74684983, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12719727, + "step": 12224, + "time_per_iteration": 2.751828908920288 + }, + { + "auxiliary_loss_clip": 0.01339377, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.22979617, + "balance_loss_mlp": 1.01878595, + "epoch": 0.7350067638659251, + "flos": 29831078108160.0, + "grad_norm": 1.6748100041592122, + "language_loss": 0.72619271, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74991053, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1361084, + "step": 12225, + "time_per_iteration": 2.9166817665100098 + }, + { + "auxiliary_loss_clip": 0.01346849, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.23297739, + "balance_loss_mlp": 1.01701713, + "epoch": 0.7350668871185931, + "flos": 21870465404760.0, + "grad_norm": 1.8368564533295648, + "language_loss": 0.76726675, + "learning_rate": 6.920722237226501e-07, + "loss": 0.7910468, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14135742, + "step": 12226, + "time_per_iteration": 2.7420833110809326 + }, + { + "auxiliary_loss_clip": 0.01337968, + "auxiliary_loss_mlp": 0.01023645, + "balance_loss_clip": 1.22863531, + "balance_loss_mlp": 1.01090741, + "epoch": 0.735127010371261, + "flos": 22571408530800.0, + "grad_norm": 1.4813630863446714, + "language_loss": 0.66912401, + "learning_rate": 6.917776107264008e-07, + "loss": 0.69274014, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12738037, + "step": 12227, + "time_per_iteration": 2.80167293548584 + }, + { + "auxiliary_loss_clip": 0.01336495, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.22658992, + "balance_loss_mlp": 1.01838923, + "epoch": 0.7351871336239291, + "flos": 25889799713400.0, + "grad_norm": 1.514153182456888, + "language_loss": 0.63914418, + "learning_rate": 6.914830473380749e-07, + "loss": 0.66281712, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12408447, + "step": 12228, + "time_per_iteration": 2.747699737548828 + }, + { + "auxiliary_loss_clip": 0.01338556, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.22935939, + "balance_loss_mlp": 1.02279401, + "epoch": 0.735247256876597, + "flos": 17936821381680.0, + "grad_norm": 1.5381642742303074, + "language_loss": 0.63189948, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65563649, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1237793, + "step": 12229, + "time_per_iteration": 2.7866671085357666 + }, + { + "auxiliary_loss_clip": 0.01336155, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.22504759, + "balance_loss_mlp": 1.02192628, + "epoch": 0.735307380129265, + "flos": 28880877911760.0, + "grad_norm": 1.6198809313837368, + "language_loss": 0.73976326, + "learning_rate": 6.908940694298726e-07, + "loss": 0.76348245, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13848877, + "step": 12230, + "time_per_iteration": 2.76577091217041 + }, + { + "auxiliary_loss_clip": 0.01334648, + "auxiliary_loss_mlp": 0.01032247, + "balance_loss_clip": 1.22499752, + "balance_loss_mlp": 1.01860917, + "epoch": 0.7353675033819329, + "flos": 13629587825160.0, + "grad_norm": 2.2140768331345972, + "language_loss": 0.72297657, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74664545, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13653564, + "step": 12231, + "time_per_iteration": 2.7154083251953125 + }, + { + "auxiliary_loss_clip": 0.01341382, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.23068798, + "balance_loss_mlp": 1.01984763, + "epoch": 0.7354276266346009, + "flos": 19467733685400.0, + "grad_norm": 2.2552752705926946, + "language_loss": 0.64211971, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66587329, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14135742, + "step": 12232, + "time_per_iteration": 2.702512264251709 + }, + { + "auxiliary_loss_clip": 0.01339797, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.23026705, + "balance_loss_mlp": 1.01730108, + "epoch": 0.735487749887269, + "flos": 15775225060680.0, + "grad_norm": 1.7274299960845154, + "language_loss": 0.75922298, + "learning_rate": 6.900109749061874e-07, + "loss": 0.78292567, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13165283, + "step": 12233, + "time_per_iteration": 2.726808547973633 + }, + { + "auxiliary_loss_clip": 0.01333406, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.22446275, + "balance_loss_mlp": 1.0137341, + "epoch": 0.7355478731399369, + "flos": 18265352616720.0, + "grad_norm": 1.7266945116645622, + "language_loss": 0.73809826, + "learning_rate": 6.897167093999079e-07, + "loss": 0.76170075, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13104248, + "step": 12234, + "time_per_iteration": 2.7391443252563477 + }, + { + "auxiliary_loss_clip": 0.01340634, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.22986722, + "balance_loss_mlp": 1.01817513, + "epoch": 0.7356079963926049, + "flos": 26547714959040.0, + "grad_norm": 1.9104581713654125, + "language_loss": 0.59612608, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61984634, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13201904, + "step": 12235, + "time_per_iteration": 2.752615213394165 + }, + { + "auxiliary_loss_clip": 0.01333257, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.22680044, + "balance_loss_mlp": 1.01667905, + "epoch": 0.7356681196452728, + "flos": 10782032862960.0, + "grad_norm": 2.092753969486097, + "language_loss": 0.8612107, + "learning_rate": 6.891283274567259e-07, + "loss": 0.88483423, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12426758, + "step": 12236, + "time_per_iteration": 2.6995396614074707 + }, + { + "auxiliary_loss_clip": 0.01335941, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.22506571, + "balance_loss_mlp": 1.01130784, + "epoch": 0.7357282428979408, + "flos": 19723406876640.0, + "grad_norm": 1.748755838400917, + "language_loss": 0.69625258, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71985757, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13256836, + "step": 12237, + "time_per_iteration": 2.7047431468963623 + }, + { + "auxiliary_loss_clip": 0.01336332, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.22775149, + "balance_loss_mlp": 1.01726079, + "epoch": 0.7357883661506087, + "flos": 19468789502760.0, + "grad_norm": 1.6694776126656328, + "language_loss": 0.72522026, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74888074, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12451172, + "step": 12238, + "time_per_iteration": 2.8546993732452393 + }, + { + "auxiliary_loss_clip": 0.01347134, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.23170137, + "balance_loss_mlp": 1.01643181, + "epoch": 0.7358484894032767, + "flos": 27128386458000.0, + "grad_norm": 1.620076344735698, + "language_loss": 0.72479904, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74856728, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.13262939, + "step": 12239, + "time_per_iteration": 2.780795097351074 + }, + { + "auxiliary_loss_clip": 0.01325517, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.22002769, + "balance_loss_mlp": 1.01499104, + "epoch": 0.7359086126559446, + "flos": 24508136424600.0, + "grad_norm": 2.0427984293741215, + "language_loss": 0.79326481, + "learning_rate": 6.879521601601954e-07, + "loss": 0.8167938, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1239624, + "step": 12240, + "time_per_iteration": 2.8392090797424316 + }, + { + "auxiliary_loss_clip": 0.01332073, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.22513652, + "balance_loss_mlp": 1.01910245, + "epoch": 0.7359687359086127, + "flos": 23336519511600.0, + "grad_norm": 2.152697227690958, + "language_loss": 0.83625323, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85989189, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1270752, + "step": 12241, + "time_per_iteration": 2.8439271450042725 + }, + { + "auxiliary_loss_clip": 0.0132848, + "auxiliary_loss_mlp": 0.01025683, + "balance_loss_clip": 1.22170222, + "balance_loss_mlp": 1.01330876, + "epoch": 0.7360288591612806, + "flos": 20198222716320.0, + "grad_norm": 1.8008825959436359, + "language_loss": 0.78890604, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81244767, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12384033, + "step": 12242, + "time_per_iteration": 2.8336663246154785 + }, + { + "auxiliary_loss_clip": 0.01332181, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.22582638, + "balance_loss_mlp": 1.01710773, + "epoch": 0.7360889824139486, + "flos": 24978201086160.0, + "grad_norm": 1.9790813549083819, + "language_loss": 0.79501581, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81862879, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12011719, + "step": 12243, + "time_per_iteration": 4.264260292053223 + }, + { + "auxiliary_loss_clip": 0.01344899, + "auxiliary_loss_mlp": 0.01031443, + "balance_loss_clip": 1.23246264, + "balance_loss_mlp": 1.01720881, + "epoch": 0.7361491056666165, + "flos": 15016448984040.0, + "grad_norm": 2.070946718381447, + "language_loss": 0.74207306, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76583654, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14227295, + "step": 12244, + "time_per_iteration": 4.1485655307769775 + }, + { + "auxiliary_loss_clip": 0.01339525, + "auxiliary_loss_mlp": 0.01026278, + "balance_loss_clip": 1.22960889, + "balance_loss_mlp": 1.01362395, + "epoch": 0.7362092289192845, + "flos": 22935658141440.0, + "grad_norm": 1.6305267069219638, + "language_loss": 0.69460917, + "learning_rate": 6.864830705652347e-07, + "loss": 0.7182672, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12646484, + "step": 12245, + "time_per_iteration": 2.715435266494751 + }, + { + "auxiliary_loss_clip": 0.01325246, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.22191238, + "balance_loss_mlp": 1.01520276, + "epoch": 0.7362693521719526, + "flos": 20707376247360.0, + "grad_norm": 1.4593671421703538, + "language_loss": 0.73056293, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75409901, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.1315918, + "step": 12246, + "time_per_iteration": 2.7988758087158203 + }, + { + "auxiliary_loss_clip": 0.01326801, + "auxiliary_loss_mlp": 0.01022565, + "balance_loss_clip": 1.22230148, + "balance_loss_mlp": 1.0099411, + "epoch": 0.7363294754246205, + "flos": 13114789732080.0, + "grad_norm": 1.8997186133747352, + "language_loss": 0.73894787, + "learning_rate": 6.858957833101266e-07, + "loss": 0.76244164, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12634277, + "step": 12247, + "time_per_iteration": 2.6361677646636963 + }, + { + "auxiliary_loss_clip": 0.01329897, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.22472167, + "balance_loss_mlp": 1.0185833, + "epoch": 0.7363895986772885, + "flos": 14031464404320.0, + "grad_norm": 1.7099572025085614, + "language_loss": 0.74822187, + "learning_rate": 6.856022144234526e-07, + "loss": 0.77183145, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.125, + "step": 12248, + "time_per_iteration": 2.711636543273926 + }, + { + "auxiliary_loss_clip": 0.0133898, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.22940326, + "balance_loss_mlp": 1.01783836, + "epoch": 0.7364497219299564, + "flos": 19724868777600.0, + "grad_norm": 1.7391925570688587, + "language_loss": 0.73438871, + "learning_rate": 6.853086953788727e-07, + "loss": 0.75808668, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12976074, + "step": 12249, + "time_per_iteration": 2.781742811203003 + }, + { + "auxiliary_loss_clip": 0.01333628, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.22578204, + "balance_loss_mlp": 1.01689982, + "epoch": 0.7365098451826244, + "flos": 21366428527080.0, + "grad_norm": 1.755797568693123, + "language_loss": 0.77321899, + "learning_rate": 6.850152261875189e-07, + "loss": 0.7968573, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13305664, + "step": 12250, + "time_per_iteration": 2.7362351417541504 + }, + { + "auxiliary_loss_clip": 0.01337831, + "auxiliary_loss_mlp": 0.01027433, + "balance_loss_clip": 1.22772527, + "balance_loss_mlp": 1.01429069, + "epoch": 0.7365699684352923, + "flos": 23373537354720.0, + "grad_norm": 1.521838123572685, + "language_loss": 0.71157223, + "learning_rate": 6.8472180686052e-07, + "loss": 0.7352249, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.13140869, + "step": 12251, + "time_per_iteration": 4.234380483627319 + }, + { + "auxiliary_loss_clip": 0.01333785, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.2271359, + "balance_loss_mlp": 1.01954579, + "epoch": 0.7366300916879603, + "flos": 59534818734600.0, + "grad_norm": 1.5109519322296794, + "language_loss": 0.65890485, + "learning_rate": 6.844284374090015e-07, + "loss": 0.6825648, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12664795, + "step": 12252, + "time_per_iteration": 3.0590248107910156 + }, + { + "auxiliary_loss_clip": 0.0134373, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.23383224, + "balance_loss_mlp": 1.01875091, + "epoch": 0.7366902149406283, + "flos": 20928062013480.0, + "grad_norm": 1.5148325110982526, + "language_loss": 0.79068696, + "learning_rate": 6.841351178440884e-07, + "loss": 0.8144418, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13000488, + "step": 12253, + "time_per_iteration": 2.776522397994995 + }, + { + "auxiliary_loss_clip": 0.01331493, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.22642112, + "balance_loss_mlp": 1.01838565, + "epoch": 0.7367503381932963, + "flos": 17352698172120.0, + "grad_norm": 2.0063396314478594, + "language_loss": 0.76631916, + "learning_rate": 6.83841848176905e-07, + "loss": 0.78994519, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.1272583, + "step": 12254, + "time_per_iteration": 2.774160861968994 + }, + { + "auxiliary_loss_clip": 0.01335989, + "auxiliary_loss_mlp": 0.01034333, + "balance_loss_clip": 1.22789192, + "balance_loss_mlp": 1.02107668, + "epoch": 0.7368104614459642, + "flos": 17825767852320.0, + "grad_norm": 2.3293613227761663, + "language_loss": 0.69698846, + "learning_rate": 6.835486284185692e-07, + "loss": 0.72069168, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13262939, + "step": 12255, + "time_per_iteration": 2.6693830490112305 + }, + { + "auxiliary_loss_clip": 0.01335712, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.22631097, + "balance_loss_mlp": 1.0167321, + "epoch": 0.7368705846986322, + "flos": 24611230715400.0, + "grad_norm": 1.7251637936845454, + "language_loss": 0.75470436, + "learning_rate": 6.832554585802012e-07, + "loss": 0.778368, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13916016, + "step": 12256, + "time_per_iteration": 2.862851619720459 + }, + { + "auxiliary_loss_clip": 0.013395, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.23081434, + "balance_loss_mlp": 1.01694441, + "epoch": 0.7369307079513001, + "flos": 34976930423040.0, + "grad_norm": 1.6617872214026659, + "language_loss": 0.73745501, + "learning_rate": 6.829623386729182e-07, + "loss": 0.76115143, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13208008, + "step": 12257, + "time_per_iteration": 2.844517946243286 + }, + { + "auxiliary_loss_clip": 0.01332662, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.22573817, + "balance_loss_mlp": 1.02233231, + "epoch": 0.7369908312039681, + "flos": 21219494188680.0, + "grad_norm": 1.5668287873098508, + "language_loss": 0.78463227, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80830705, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12487793, + "step": 12258, + "time_per_iteration": 2.748296022415161 + }, + { + "auxiliary_loss_clip": 0.01341563, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.23097587, + "balance_loss_mlp": 1.0148623, + "epoch": 0.7370509544566362, + "flos": 23629007504160.0, + "grad_norm": 1.8775796361190054, + "language_loss": 0.66120398, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68489623, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12817383, + "step": 12259, + "time_per_iteration": 2.7605931758880615 + }, + { + "auxiliary_loss_clip": 0.01336087, + "auxiliary_loss_mlp": 0.01036329, + "balance_loss_clip": 1.2277534, + "balance_loss_mlp": 1.02271533, + "epoch": 0.7371110777093041, + "flos": 24833215949040.0, + "grad_norm": 1.6216525178558463, + "language_loss": 0.73272693, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75645113, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13616943, + "step": 12260, + "time_per_iteration": 2.7663440704345703 + }, + { + "auxiliary_loss_clip": 0.0133772, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.22859514, + "balance_loss_mlp": 1.01980531, + "epoch": 0.7371712009619721, + "flos": 23154800789880.0, + "grad_norm": 1.6537043002359944, + "language_loss": 0.74233913, + "learning_rate": 6.817903585769125e-07, + "loss": 0.76604605, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13165283, + "step": 12261, + "time_per_iteration": 2.8170156478881836 + }, + { + "auxiliary_loss_clip": 0.01342079, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.23070908, + "balance_loss_mlp": 1.02380681, + "epoch": 0.73723132421464, + "flos": 23118189030360.0, + "grad_norm": 6.041935105993194, + "language_loss": 0.67101038, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69481826, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14904785, + "step": 12262, + "time_per_iteration": 2.7523553371429443 + }, + { + "auxiliary_loss_clip": 0.01338398, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.22798944, + "balance_loss_mlp": 1.0170002, + "epoch": 0.737291447467308, + "flos": 19276999907760.0, + "grad_norm": 1.7164827673477816, + "language_loss": 0.88922763, + "learning_rate": 6.81204668404322e-07, + "loss": 0.91291547, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13372803, + "step": 12263, + "time_per_iteration": 2.731470823287964 + }, + { + "auxiliary_loss_clip": 0.01322986, + "auxiliary_loss_mlp": 0.01023764, + "balance_loss_clip": 1.2203033, + "balance_loss_mlp": 1.01295793, + "epoch": 0.7373515707199759, + "flos": 25123632915240.0, + "grad_norm": 2.9399978915751643, + "language_loss": 0.67654252, + "learning_rate": 6.809118983257522e-07, + "loss": 0.70001006, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.10803223, + "step": 12264, + "time_per_iteration": 2.784313440322876 + }, + { + "auxiliary_loss_clip": 0.01331179, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.22521591, + "balance_loss_mlp": 1.01456666, + "epoch": 0.737411693972644, + "flos": 32413904214840.0, + "grad_norm": 1.988364288824434, + "language_loss": 0.80523968, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82882434, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12738037, + "step": 12265, + "time_per_iteration": 2.936742067337036 + }, + { + "auxiliary_loss_clip": 0.01348023, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.23324573, + "balance_loss_mlp": 1.02055264, + "epoch": 0.7374718172253119, + "flos": 24321098007720.0, + "grad_norm": 1.7169125433331829, + "language_loss": 0.74786335, + "learning_rate": 6.803265082395711e-07, + "loss": 0.77168369, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13464355, + "step": 12266, + "time_per_iteration": 2.831798791885376 + }, + { + "auxiliary_loss_clip": 0.01340003, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.23099756, + "balance_loss_mlp": 1.02114558, + "epoch": 0.7375319404779799, + "flos": 27161059206600.0, + "grad_norm": 1.568081405036709, + "language_loss": 0.73540735, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75915194, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13299561, + "step": 12267, + "time_per_iteration": 2.7895522117614746 + }, + { + "auxiliary_loss_clip": 0.01335706, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.22732615, + "balance_loss_mlp": 1.02210951, + "epoch": 0.7375920637306478, + "flos": 18884666293200.0, + "grad_norm": 2.031513556149938, + "language_loss": 0.83527696, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85898089, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12573242, + "step": 12268, + "time_per_iteration": 2.695918083190918 + }, + { + "auxiliary_loss_clip": 0.013345, + "auxiliary_loss_mlp": 0.01042516, + "balance_loss_clip": 1.22760725, + "balance_loss_mlp": 1.02974296, + "epoch": 0.7376521869833158, + "flos": 15674079971160.0, + "grad_norm": 1.7345540713147332, + "language_loss": 0.73693109, + "learning_rate": 6.794487984541677e-07, + "loss": 0.76070118, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12780762, + "step": 12269, + "time_per_iteration": 2.717385768890381 + }, + { + "auxiliary_loss_clip": 0.01346096, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.23523915, + "balance_loss_mlp": 1.01801586, + "epoch": 0.7377123102359837, + "flos": 36978354080280.0, + "grad_norm": 1.8925524801128493, + "language_loss": 0.70760334, + "learning_rate": 6.791563286617776e-07, + "loss": 0.73138034, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 1.10888672, + "router_z_loss_mlp": 0.13604736, + "step": 12270, + "time_per_iteration": 2.853769302368164 + }, + { + "auxiliary_loss_clip": 0.01334447, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.22639942, + "balance_loss_mlp": 1.01498604, + "epoch": 0.7377724334886517, + "flos": 24501354828480.0, + "grad_norm": 1.861221201115418, + "language_loss": 0.69691133, + "learning_rate": 6.788639089559119e-07, + "loss": 0.72052413, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.11846924, + "step": 12271, + "time_per_iteration": 2.7650089263916016 + }, + { + "auxiliary_loss_clip": 0.01338258, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.22858405, + "balance_loss_mlp": 1.01673484, + "epoch": 0.7378325567413198, + "flos": 24395499169200.0, + "grad_norm": 2.0129066927123467, + "language_loss": 0.68122649, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70490903, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13269043, + "step": 12272, + "time_per_iteration": 2.766484022140503 + }, + { + "auxiliary_loss_clip": 0.01334154, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.2280364, + "balance_loss_mlp": 1.01812696, + "epoch": 0.7378926799939877, + "flos": 17420317737480.0, + "grad_norm": 1.8412218727701966, + "language_loss": 0.78610158, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80975735, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.13311768, + "step": 12273, + "time_per_iteration": 2.712278127670288 + }, + { + "auxiliary_loss_clip": 0.01333291, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.22620332, + "balance_loss_mlp": 1.01694798, + "epoch": 0.7379528032466557, + "flos": 18478241577720.0, + "grad_norm": 1.7599685483240097, + "language_loss": 0.83745086, + "learning_rate": 6.779869504683355e-07, + "loss": 0.86108315, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13000488, + "step": 12274, + "time_per_iteration": 2.7986490726470947 + }, + { + "auxiliary_loss_clip": 0.01346978, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.23306489, + "balance_loss_mlp": 1.01745987, + "epoch": 0.7380129264993236, + "flos": 17826539411160.0, + "grad_norm": 2.2049830016716294, + "language_loss": 0.74351656, + "learning_rate": 6.776947312194341e-07, + "loss": 0.76730037, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13952637, + "step": 12275, + "time_per_iteration": 2.6735377311706543 + }, + { + "auxiliary_loss_clip": 0.01343618, + "auxiliary_loss_mlp": 0.01036107, + "balance_loss_clip": 1.23172522, + "balance_loss_mlp": 1.02177763, + "epoch": 0.7380730497519916, + "flos": 23001856414200.0, + "grad_norm": 1.7399664870856268, + "language_loss": 0.74152863, + "learning_rate": 6.774025621124813e-07, + "loss": 0.7653259, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14331055, + "step": 12276, + "time_per_iteration": 2.7437000274658203 + }, + { + "auxiliary_loss_clip": 0.01334891, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.22581458, + "balance_loss_mlp": 1.01617646, + "epoch": 0.7381331730046595, + "flos": 20271080760120.0, + "grad_norm": 1.8533087087161646, + "language_loss": 0.78034741, + "learning_rate": 6.771104431585551e-07, + "loss": 0.80398321, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.125, + "step": 12277, + "time_per_iteration": 2.7126452922821045 + }, + { + "auxiliary_loss_clip": 0.0133153, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.2256453, + "balance_loss_mlp": 1.0270189, + "epoch": 0.7381932962573275, + "flos": 19759165860600.0, + "grad_norm": 1.792941645850599, + "language_loss": 0.79006892, + "learning_rate": 6.768183743687338e-07, + "loss": 0.81378078, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12646484, + "step": 12278, + "time_per_iteration": 2.7792234420776367 + }, + { + "auxiliary_loss_clip": 0.01343723, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.23324752, + "balance_loss_mlp": 1.01902699, + "epoch": 0.7382534195099955, + "flos": 17309020557960.0, + "grad_norm": 1.967565385729755, + "language_loss": 0.72232515, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74608064, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12799072, + "step": 12279, + "time_per_iteration": 2.715315818786621 + }, + { + "auxiliary_loss_clip": 0.01342958, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.23156071, + "balance_loss_mlp": 1.02290511, + "epoch": 0.7383135427626635, + "flos": 18701972970840.0, + "grad_norm": 2.1939150757507093, + "language_loss": 0.86188132, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88567996, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14007568, + "step": 12280, + "time_per_iteration": 2.7026257514953613 + }, + { + "auxiliary_loss_clip": 0.01341032, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.23082709, + "balance_loss_mlp": 1.01944113, + "epoch": 0.7383736660153314, + "flos": 20885521433400.0, + "grad_norm": 2.145072378009444, + "language_loss": 0.7278192, + "learning_rate": 6.759424690946408e-07, + "loss": 0.7515586, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13470459, + "step": 12281, + "time_per_iteration": 2.7308530807495117 + }, + { + "auxiliary_loss_clip": 0.0134661, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.23550344, + "balance_loss_mlp": 1.02157307, + "epoch": 0.7384337892679994, + "flos": 20667515819040.0, + "grad_norm": 1.7638098661835757, + "language_loss": 0.61203957, + "learning_rate": 6.756506010719711e-07, + "loss": 0.6358515, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 1.11083984, + "router_z_loss_mlp": 0.13006592, + "step": 12282, + "time_per_iteration": 5.7915260791778564 + }, + { + "auxiliary_loss_clip": 0.01345738, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.23447669, + "balance_loss_mlp": 1.01947474, + "epoch": 0.7384939125206673, + "flos": 29175721189200.0, + "grad_norm": 1.8257313719663306, + "language_loss": 0.68122423, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70501065, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 1.11376953, + "router_z_loss_mlp": 0.13433838, + "step": 12283, + "time_per_iteration": 2.7763891220092773 + }, + { + "auxiliary_loss_clip": 0.01336904, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.23028731, + "balance_loss_mlp": 1.01838756, + "epoch": 0.7385540357733353, + "flos": 36318814500240.0, + "grad_norm": 1.5698209270862133, + "language_loss": 0.7620374, + "learning_rate": 6.750670156960832e-07, + "loss": 0.7857151, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12487793, + "step": 12284, + "time_per_iteration": 2.8687491416931152 + }, + { + "auxiliary_loss_clip": 0.01341842, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.23104441, + "balance_loss_mlp": 1.01829863, + "epoch": 0.7386141590260034, + "flos": 20307408261120.0, + "grad_norm": 1.674378650864853, + "language_loss": 0.68847418, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71220845, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.1328125, + "step": 12285, + "time_per_iteration": 2.803865671157837 + }, + { + "auxiliary_loss_clip": 0.01351204, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.23742723, + "balance_loss_mlp": 1.01705313, + "epoch": 0.7386742822786713, + "flos": 25489303818480.0, + "grad_norm": 1.796284655009658, + "language_loss": 0.80126601, + "learning_rate": 6.744836312865602e-07, + "loss": 0.82508904, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14056396, + "step": 12286, + "time_per_iteration": 2.8967514038085938 + }, + { + "auxiliary_loss_clip": 0.01340608, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.23352408, + "balance_loss_mlp": 1.01678443, + "epoch": 0.7387344055313393, + "flos": 13775953646520.0, + "grad_norm": 2.093042862834912, + "language_loss": 0.66065538, + "learning_rate": 6.741920144718396e-07, + "loss": 0.68436354, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13415527, + "step": 12287, + "time_per_iteration": 2.6754188537597656 + }, + { + "auxiliary_loss_clip": 0.01332486, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.22640181, + "balance_loss_mlp": 1.01530457, + "epoch": 0.7387945287840072, + "flos": 27860987123640.0, + "grad_norm": 1.6269741317867406, + "language_loss": 0.76737988, + "learning_rate": 6.739004479318903e-07, + "loss": 0.79097867, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12078857, + "step": 12288, + "time_per_iteration": 2.7540149688720703 + }, + { + "auxiliary_loss_clip": 0.01352618, + "auxiliary_loss_mlp": 0.01042379, + "balance_loss_clip": 1.23994541, + "balance_loss_mlp": 1.02807331, + "epoch": 0.7388546520366752, + "flos": 44240419550880.0, + "grad_norm": 1.606588709447856, + "language_loss": 0.58257771, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60652763, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14300537, + "step": 12289, + "time_per_iteration": 2.9392850399017334 + }, + { + "auxiliary_loss_clip": 0.01154759, + "auxiliary_loss_mlp": 0.01006051, + "balance_loss_clip": 1.10973787, + "balance_loss_mlp": 1.00369036, + "epoch": 0.7389147752893431, + "flos": 70695923259600.0, + "grad_norm": 0.6518775745768325, + "language_loss": 0.49347857, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51508665, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02355957, + "step": 12290, + "time_per_iteration": 4.92020845413208 + }, + { + "auxiliary_loss_clip": 0.01338427, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.22926843, + "balance_loss_mlp": 1.02115297, + "epoch": 0.7389748985420111, + "flos": 26000609592600.0, + "grad_norm": 1.8143461235078586, + "language_loss": 0.67922521, + "learning_rate": 6.730260500712237e-07, + "loss": 0.7029618, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.140625, + "step": 12291, + "time_per_iteration": 2.79483699798584 + }, + { + "auxiliary_loss_clip": 0.0115551, + "auxiliary_loss_mlp": 0.01006267, + "balance_loss_clip": 1.11098242, + "balance_loss_mlp": 1.00394297, + "epoch": 0.7390350217946791, + "flos": 54415970979120.0, + "grad_norm": 1.920279077494209, + "language_loss": 0.60870618, + "learning_rate": 6.727346847409052e-07, + "loss": 0.63032395, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02319336, + "step": 12292, + "time_per_iteration": 2.867918014526367 + }, + { + "auxiliary_loss_clip": 0.0134021, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.23260617, + "balance_loss_mlp": 1.02179396, + "epoch": 0.7390951450473471, + "flos": 32203329930360.0, + "grad_norm": 2.1435471331148697, + "language_loss": 0.67519569, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69893283, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.11724854, + "step": 12293, + "time_per_iteration": 2.787614107131958 + }, + { + "auxiliary_loss_clip": 0.0133791, + "auxiliary_loss_mlp": 0.01038544, + "balance_loss_clip": 1.23050213, + "balance_loss_mlp": 1.02596784, + "epoch": 0.739155268300015, + "flos": 16687757680200.0, + "grad_norm": 1.7214039069587548, + "language_loss": 0.83413386, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85789835, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12573242, + "step": 12294, + "time_per_iteration": 2.7288191318511963 + }, + { + "auxiliary_loss_clip": 0.01330597, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.22531509, + "balance_loss_mlp": 1.02057886, + "epoch": 0.739215391552683, + "flos": 31656062130480.0, + "grad_norm": 1.5980785895624516, + "language_loss": 0.73123693, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75488031, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.13171387, + "step": 12295, + "time_per_iteration": 2.787111520767212 + }, + { + "auxiliary_loss_clip": 0.01330686, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.22604191, + "balance_loss_mlp": 1.02790022, + "epoch": 0.7392755148053509, + "flos": 29724897582000.0, + "grad_norm": 1.8137248248957958, + "language_loss": 0.78815603, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81186044, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11859131, + "step": 12296, + "time_per_iteration": 2.820781707763672 + }, + { + "auxiliary_loss_clip": 0.01335129, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.22762942, + "balance_loss_mlp": 1.01698613, + "epoch": 0.7393356380580189, + "flos": 37057587636600.0, + "grad_norm": 2.0538109189338103, + "language_loss": 0.66437745, + "learning_rate": 6.712786132607182e-07, + "loss": 0.688039, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.14031982, + "step": 12297, + "time_per_iteration": 2.940694808959961 + }, + { + "auxiliary_loss_clip": 0.01343611, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.23460138, + "balance_loss_mlp": 1.02514887, + "epoch": 0.739395761310687, + "flos": 19724462694000.0, + "grad_norm": 1.612467606143421, + "language_loss": 0.68894011, + "learning_rate": 6.709875500762645e-07, + "loss": 0.71276742, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13964844, + "step": 12298, + "time_per_iteration": 2.748561382293701 + }, + { + "auxiliary_loss_clip": 0.01337352, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.22855616, + "balance_loss_mlp": 1.02023554, + "epoch": 0.7394558845633549, + "flos": 11805212928240.0, + "grad_norm": 1.9076340808590375, + "language_loss": 0.74477834, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76848805, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13378906, + "step": 12299, + "time_per_iteration": 2.739368200302124 + }, + { + "auxiliary_loss_clip": 0.01158499, + "auxiliary_loss_mlp": 0.01003373, + "balance_loss_clip": 1.11350441, + "balance_loss_mlp": 1.00091743, + "epoch": 0.7395160078160229, + "flos": 66210625733760.0, + "grad_norm": 0.739932236488455, + "language_loss": 0.60937971, + "learning_rate": 6.704055749072455e-07, + "loss": 0.63099843, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02453613, + "step": 12300, + "time_per_iteration": 3.2877068519592285 + }, + { + "auxiliary_loss_clip": 0.01341763, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.23324192, + "balance_loss_mlp": 1.02100468, + "epoch": 0.7395761310686908, + "flos": 21254441005440.0, + "grad_norm": 1.4828161706923177, + "language_loss": 0.80798239, + "learning_rate": 6.7011466294475e-07, + "loss": 0.83173859, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12854004, + "step": 12301, + "time_per_iteration": 2.93666934967041 + }, + { + "auxiliary_loss_clip": 0.01329216, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.22281969, + "balance_loss_mlp": 1.01487184, + "epoch": 0.7396362543213588, + "flos": 25960505514120.0, + "grad_norm": 1.5767378282396012, + "language_loss": 0.73518133, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75873768, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11547852, + "step": 12302, + "time_per_iteration": 2.8244707584381104 + }, + { + "auxiliary_loss_clip": 0.01347126, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.23710227, + "balance_loss_mlp": 1.0210979, + "epoch": 0.7396963775740267, + "flos": 27383288090400.0, + "grad_norm": 1.6880771915351103, + "language_loss": 0.74256748, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76638114, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13140869, + "step": 12303, + "time_per_iteration": 2.8314952850341797 + }, + { + "auxiliary_loss_clip": 0.01335583, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.2288512, + "balance_loss_mlp": 1.02006435, + "epoch": 0.7397565008266948, + "flos": 25525631319480.0, + "grad_norm": 2.328950511079412, + "language_loss": 0.54417431, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56785679, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.1260376, + "step": 12304, + "time_per_iteration": 2.798781156539917 + }, + { + "auxiliary_loss_clip": 0.01340352, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.23172677, + "balance_loss_mlp": 1.02161288, + "epoch": 0.7398166240793627, + "flos": 23732426661840.0, + "grad_norm": 1.884924138685937, + "language_loss": 0.84956563, + "learning_rate": 6.689515194989084e-07, + "loss": 0.87331694, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13183594, + "step": 12305, + "time_per_iteration": 2.7826335430145264 + }, + { + "auxiliary_loss_clip": 0.01156943, + "auxiliary_loss_mlp": 0.01006889, + "balance_loss_clip": 1.11246049, + "balance_loss_mlp": 1.00432646, + "epoch": 0.7398767473320307, + "flos": 67284305617680.0, + "grad_norm": 0.887276434490361, + "language_loss": 0.57780623, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59944463, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02563477, + "step": 12306, + "time_per_iteration": 3.2660741806030273 + }, + { + "auxiliary_loss_clip": 0.0134429, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.23341084, + "balance_loss_mlp": 1.01878965, + "epoch": 0.7399368705846986, + "flos": 22023897080760.0, + "grad_norm": 1.8217809629225061, + "language_loss": 0.8199082, + "learning_rate": 6.683702505728355e-07, + "loss": 0.84367132, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13226318, + "step": 12307, + "time_per_iteration": 2.7839715480804443 + }, + { + "auxiliary_loss_clip": 0.01330495, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.22672558, + "balance_loss_mlp": 1.0153935, + "epoch": 0.7399969938373666, + "flos": 14177992659120.0, + "grad_norm": 1.6461242093938637, + "language_loss": 0.70072782, + "learning_rate": 6.680796918475893e-07, + "loss": 0.72431421, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.1272583, + "step": 12308, + "time_per_iteration": 2.7385611534118652 + }, + { + "auxiliary_loss_clip": 0.0132871, + "auxiliary_loss_mlp": 0.01024818, + "balance_loss_clip": 1.22423661, + "balance_loss_mlp": 1.01238418, + "epoch": 0.7400571170900345, + "flos": 25306975971360.0, + "grad_norm": 3.8642973458468597, + "language_loss": 0.81770515, + "learning_rate": 6.67789183628896e-07, + "loss": 0.84124041, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12432861, + "step": 12309, + "time_per_iteration": 2.809615135192871 + }, + { + "auxiliary_loss_clip": 0.01347612, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.23562396, + "balance_loss_mlp": 1.02129722, + "epoch": 0.7401172403427025, + "flos": 22716556101360.0, + "grad_norm": 1.7699405208876515, + "language_loss": 0.73127019, + "learning_rate": 6.674987259277692e-07, + "loss": 0.75509638, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13696289, + "step": 12310, + "time_per_iteration": 2.7418789863586426 + }, + { + "auxiliary_loss_clip": 0.01341458, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.23178983, + "balance_loss_mlp": 1.02249622, + "epoch": 0.7401773635953706, + "flos": 18070720436520.0, + "grad_norm": 2.3903995605295805, + "language_loss": 0.88477522, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90855289, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13818359, + "step": 12311, + "time_per_iteration": 2.7030653953552246 + }, + { + "auxiliary_loss_clip": 0.0133966, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.23065627, + "balance_loss_mlp": 1.01605248, + "epoch": 0.7402374868480385, + "flos": 22717449485280.0, + "grad_norm": 1.521612003438103, + "language_loss": 0.8050046, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82869029, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12854004, + "step": 12312, + "time_per_iteration": 2.721869468688965 + }, + { + "auxiliary_loss_clip": 0.01337535, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.23116589, + "balance_loss_mlp": 1.01886177, + "epoch": 0.7402976101007065, + "flos": 22861825497000.0, + "grad_norm": 1.6469055133381594, + "language_loss": 0.78513622, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80882001, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11981201, + "step": 12313, + "time_per_iteration": 2.751940965652466 + }, + { + "auxiliary_loss_clip": 0.01344229, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.23234093, + "balance_loss_mlp": 1.01889277, + "epoch": 0.7403577333533744, + "flos": 12349191450960.0, + "grad_norm": 1.9555066532654268, + "language_loss": 0.78869963, + "learning_rate": 6.663374005191937e-07, + "loss": 0.81246793, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13708496, + "step": 12314, + "time_per_iteration": 2.7024986743927 + }, + { + "auxiliary_loss_clip": 0.01158105, + "auxiliary_loss_mlp": 0.0100203, + "balance_loss_clip": 1.11334872, + "balance_loss_mlp": 0.99949038, + "epoch": 0.7404178566060424, + "flos": 60342040584720.0, + "grad_norm": 0.8287574731174732, + "language_loss": 0.55252659, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57412803, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02539062, + "step": 12315, + "time_per_iteration": 3.2344319820404053 + }, + { + "auxiliary_loss_clip": 0.01329638, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.22550654, + "balance_loss_mlp": 1.01653934, + "epoch": 0.7404779798587103, + "flos": 32021002083240.0, + "grad_norm": 1.716113110288232, + "language_loss": 0.79628325, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81987655, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.13140869, + "step": 12316, + "time_per_iteration": 2.8322715759277344 + }, + { + "auxiliary_loss_clip": 0.01337909, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.22923315, + "balance_loss_mlp": 1.01576161, + "epoch": 0.7405381031113784, + "flos": 12892723281720.0, + "grad_norm": 2.134499986502278, + "language_loss": 0.75101811, + "learning_rate": 6.654669374367275e-07, + "loss": 0.77468288, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12805176, + "step": 12317, + "time_per_iteration": 2.8176794052124023 + }, + { + "auxiliary_loss_clip": 0.01326797, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.22403133, + "balance_loss_mlp": 1.01537633, + "epoch": 0.7405982263640463, + "flos": 20233859875200.0, + "grad_norm": 1.532232654444345, + "language_loss": 0.81426466, + "learning_rate": 6.651768842724917e-07, + "loss": 0.8378098, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.12341309, + "step": 12318, + "time_per_iteration": 2.816617727279663 + }, + { + "auxiliary_loss_clip": 0.01347156, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.23592496, + "balance_loss_mlp": 1.0172143, + "epoch": 0.7406583496167143, + "flos": 17571800212200.0, + "grad_norm": 1.7171505116590027, + "language_loss": 0.76860905, + "learning_rate": 6.648868817248827e-07, + "loss": 0.79238325, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13049316, + "step": 12319, + "time_per_iteration": 2.747783899307251 + }, + { + "auxiliary_loss_clip": 0.01337559, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.23055148, + "balance_loss_mlp": 1.01984239, + "epoch": 0.7407184728693822, + "flos": 18300258825120.0, + "grad_norm": 2.2215578923897823, + "language_loss": 0.64055252, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66424507, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.11846924, + "step": 12320, + "time_per_iteration": 5.835261106491089 + }, + { + "auxiliary_loss_clip": 0.01347097, + "auxiliary_loss_mlp": 0.01037119, + "balance_loss_clip": 1.23520136, + "balance_loss_mlp": 1.0239346, + "epoch": 0.7407785961220502, + "flos": 16694904751560.0, + "grad_norm": 2.726956921924486, + "language_loss": 0.83579093, + "learning_rate": 6.643070285235288e-07, + "loss": 0.85963315, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13183594, + "step": 12321, + "time_per_iteration": 4.154232025146484 + }, + { + "auxiliary_loss_clip": 0.01349802, + "auxiliary_loss_mlp": 0.01049046, + "balance_loss_clip": 1.23567808, + "balance_loss_mlp": 1.03385901, + "epoch": 0.7408387193747181, + "flos": 22093100372160.0, + "grad_norm": 1.7227108137332392, + "language_loss": 0.72343886, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74742734, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.15197754, + "step": 12322, + "time_per_iteration": 2.826122760772705 + }, + { + "auxiliary_loss_clip": 0.01339446, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.23098493, + "balance_loss_mlp": 1.02029788, + "epoch": 0.7408988426273861, + "flos": 24241214717640.0, + "grad_norm": 1.7568265158228524, + "language_loss": 0.64312673, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66685319, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12915039, + "step": 12323, + "time_per_iteration": 2.7836992740631104 + }, + { + "auxiliary_loss_clip": 0.0134414, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.23283815, + "balance_loss_mlp": 1.02284992, + "epoch": 0.7409589658800542, + "flos": 29028989892600.0, + "grad_norm": 1.317743781777527, + "language_loss": 0.76041383, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78422475, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14099121, + "step": 12324, + "time_per_iteration": 2.8975651264190674 + }, + { + "auxiliary_loss_clip": 0.01337202, + "auxiliary_loss_mlp": 0.01024375, + "balance_loss_clip": 1.2287066, + "balance_loss_mlp": 1.01151872, + "epoch": 0.7410190891327221, + "flos": 19355705555400.0, + "grad_norm": 1.7482866837369067, + "language_loss": 0.7517488, + "learning_rate": 6.63147930004073e-07, + "loss": 0.77536452, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12860107, + "step": 12325, + "time_per_iteration": 2.73818039894104 + }, + { + "auxiliary_loss_clip": 0.01351894, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.23747909, + "balance_loss_mlp": 1.02085066, + "epoch": 0.7410792123853901, + "flos": 22752842994000.0, + "grad_norm": 2.4294833727071454, + "language_loss": 0.68898517, + "learning_rate": 6.628582820806545e-07, + "loss": 0.71285331, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14074707, + "step": 12326, + "time_per_iteration": 2.8052310943603516 + }, + { + "auxiliary_loss_clip": 0.01332393, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.2236079, + "balance_loss_mlp": 1.01536775, + "epoch": 0.741139335638058, + "flos": 25377681772080.0, + "grad_norm": 1.6367886106157465, + "language_loss": 0.89601278, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91962564, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13525391, + "step": 12327, + "time_per_iteration": 2.7893333435058594 + }, + { + "auxiliary_loss_clip": 0.01340628, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.2327342, + "balance_loss_mlp": 1.01981449, + "epoch": 0.741199458890726, + "flos": 18589944840840.0, + "grad_norm": 1.6359162039417212, + "language_loss": 0.85549307, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87923563, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13818359, + "step": 12328, + "time_per_iteration": 2.701796293258667 + }, + { + "auxiliary_loss_clip": 0.01333762, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.2270602, + "balance_loss_mlp": 1.01725984, + "epoch": 0.7412595821433939, + "flos": 22132148633280.0, + "grad_norm": 1.9034434207022104, + "language_loss": 0.67238766, + "learning_rate": 6.619896425816103e-07, + "loss": 0.696033, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13531494, + "step": 12329, + "time_per_iteration": 4.436125755310059 + }, + { + "auxiliary_loss_clip": 0.01351001, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.23760712, + "balance_loss_mlp": 1.0176264, + "epoch": 0.741319705396062, + "flos": 29175802405920.0, + "grad_norm": 1.711088124967739, + "language_loss": 0.67090619, + "learning_rate": 6.617001975422647e-07, + "loss": 0.6947186, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12628174, + "step": 12330, + "time_per_iteration": 2.823594808578491 + }, + { + "auxiliary_loss_clip": 0.01348542, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.23521733, + "balance_loss_mlp": 1.01467872, + "epoch": 0.7413798286487299, + "flos": 20672348213880.0, + "grad_norm": 2.1328507566145607, + "language_loss": 0.85350311, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87728214, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14660645, + "step": 12331, + "time_per_iteration": 2.840721368789673 + }, + { + "auxiliary_loss_clip": 0.01344013, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.23357034, + "balance_loss_mlp": 1.01836324, + "epoch": 0.7414399519013979, + "flos": 16403228926200.0, + "grad_norm": 1.869983066853501, + "language_loss": 0.69981444, + "learning_rate": 6.611214597199364e-07, + "loss": 0.72357404, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13592529, + "step": 12332, + "time_per_iteration": 2.738396644592285 + }, + { + "auxiliary_loss_clip": 0.01338423, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.2290535, + "balance_loss_mlp": 1.01976061, + "epoch": 0.7415000751540658, + "flos": 25635832073280.0, + "grad_norm": 1.8698715583819912, + "language_loss": 0.63415611, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65788209, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.14416504, + "step": 12333, + "time_per_iteration": 2.755321502685547 + }, + { + "auxiliary_loss_clip": 0.01330935, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.22691774, + "balance_loss_mlp": 1.02114534, + "epoch": 0.7415601984067338, + "flos": 24505253231040.0, + "grad_norm": 1.7106300540797497, + "language_loss": 0.71126807, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73491079, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.12188721, + "step": 12334, + "time_per_iteration": 2.789217472076416 + }, + { + "auxiliary_loss_clip": 0.01337845, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.23020089, + "balance_loss_mlp": 1.01448536, + "epoch": 0.7416203216594017, + "flos": 20892140596080.0, + "grad_norm": 1.6928299820387875, + "language_loss": 0.82820892, + "learning_rate": 6.602537337919257e-07, + "loss": 0.85186219, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12988281, + "step": 12335, + "time_per_iteration": 2.7462680339813232 + }, + { + "auxiliary_loss_clip": 0.01344253, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.23350668, + "balance_loss_mlp": 1.02171135, + "epoch": 0.7416804449120697, + "flos": 15627397338360.0, + "grad_norm": 2.2994598331167135, + "language_loss": 0.74981284, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77361333, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14086914, + "step": 12336, + "time_per_iteration": 2.7225561141967773 + }, + { + "auxiliary_loss_clip": 0.01345794, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.23450899, + "balance_loss_mlp": 1.01979113, + "epoch": 0.7417405681647377, + "flos": 17123850125640.0, + "grad_norm": 1.7829562136371915, + "language_loss": 0.73329306, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75708699, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13812256, + "step": 12337, + "time_per_iteration": 2.719712972640991 + }, + { + "auxiliary_loss_clip": 0.01334421, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.2289536, + "balance_loss_mlp": 1.02084589, + "epoch": 0.7418006914174057, + "flos": 18885031768440.0, + "grad_norm": 1.6282434338998093, + "language_loss": 0.76968145, + "learning_rate": 6.593864650937186e-07, + "loss": 0.79336298, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12884521, + "step": 12338, + "time_per_iteration": 2.7861011028289795 + }, + { + "auxiliary_loss_clip": 0.01335458, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.22892845, + "balance_loss_mlp": 1.01853418, + "epoch": 0.7418608146700737, + "flos": 21585855434040.0, + "grad_norm": 1.6687196163113933, + "language_loss": 0.73211992, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75577879, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.11889648, + "step": 12339, + "time_per_iteration": 2.723714828491211 + }, + { + "auxiliary_loss_clip": 0.01342162, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.23335838, + "balance_loss_mlp": 1.01689768, + "epoch": 0.7419209379227416, + "flos": 22344834552480.0, + "grad_norm": 2.27078405383174, + "language_loss": 0.79653466, + "learning_rate": 6.588085401243077e-07, + "loss": 0.82025617, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13085938, + "step": 12340, + "time_per_iteration": 2.7832815647125244 + }, + { + "auxiliary_loss_clip": 0.01338943, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.23003173, + "balance_loss_mlp": 1.02084017, + "epoch": 0.7419810611754096, + "flos": 16766463327840.0, + "grad_norm": 1.3596651778492128, + "language_loss": 0.756437, + "learning_rate": 6.585196539212958e-07, + "loss": 0.780168, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13311768, + "step": 12341, + "time_per_iteration": 2.7529244422912598 + }, + { + "auxiliary_loss_clip": 0.01318733, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.21788633, + "balance_loss_mlp": 1.01757562, + "epoch": 0.7420411844280775, + "flos": 26218493381880.0, + "grad_norm": 1.3570210530953222, + "language_loss": 0.80337954, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82687068, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.12805176, + "step": 12342, + "time_per_iteration": 2.747671127319336 + }, + { + "auxiliary_loss_clip": 0.01342254, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.23363268, + "balance_loss_mlp": 1.01354933, + "epoch": 0.7421013076807456, + "flos": 68539507827480.0, + "grad_norm": 1.610612962961952, + "language_loss": 0.77675247, + "learning_rate": 6.57942034133433e-07, + "loss": 0.80044198, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.13146973, + "step": 12343, + "time_per_iteration": 3.1464672088623047 + }, + { + "auxiliary_loss_clip": 0.01338672, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.23057997, + "balance_loss_mlp": 1.0185616, + "epoch": 0.7421614309334135, + "flos": 24430649027760.0, + "grad_norm": 1.494227723127187, + "language_loss": 0.67579067, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69949251, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12945557, + "step": 12344, + "time_per_iteration": 2.9125282764434814 + }, + { + "auxiliary_loss_clip": 0.01345032, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.23474431, + "balance_loss_mlp": 1.01890492, + "epoch": 0.7422215541860815, + "flos": 12314691326160.0, + "grad_norm": 2.6006594673925347, + "language_loss": 0.81462491, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83840626, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.14178467, + "step": 12345, + "time_per_iteration": 2.789386749267578 + }, + { + "auxiliary_loss_clip": 0.0133957, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.23027861, + "balance_loss_mlp": 1.0194993, + "epoch": 0.7422816774387494, + "flos": 19650264574320.0, + "grad_norm": 2.1223282007448847, + "language_loss": 0.7115556, + "learning_rate": 6.570759861612988e-07, + "loss": 0.7352795, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13317871, + "step": 12346, + "time_per_iteration": 2.701353073120117 + }, + { + "auxiliary_loss_clip": 0.01341108, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.23217833, + "balance_loss_mlp": 1.01844573, + "epoch": 0.7423418006914174, + "flos": 32022504592560.0, + "grad_norm": 1.4948504713890873, + "language_loss": 0.73356879, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75729811, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13360596, + "step": 12347, + "time_per_iteration": 2.82094144821167 + }, + { + "auxiliary_loss_clip": 0.01347744, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.23551726, + "balance_loss_mlp": 1.01576078, + "epoch": 0.7424019239440853, + "flos": 18921684136320.0, + "grad_norm": 1.8822328962683599, + "language_loss": 0.81516534, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83893234, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13195801, + "step": 12348, + "time_per_iteration": 2.7294788360595703 + }, + { + "auxiliary_loss_clip": 0.01330726, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.22369218, + "balance_loss_mlp": 1.01806223, + "epoch": 0.7424620471967533, + "flos": 35882640838080.0, + "grad_norm": 1.6046108742462097, + "language_loss": 0.7217347, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74535412, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.13153076, + "step": 12349, + "time_per_iteration": 2.8710224628448486 + }, + { + "auxiliary_loss_clip": 0.01351125, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.23735642, + "balance_loss_mlp": 1.02154374, + "epoch": 0.7425221704494213, + "flos": 27022368365280.0, + "grad_norm": 1.7966804590868906, + "language_loss": 0.79140484, + "learning_rate": 6.559219685162165e-07, + "loss": 0.81527686, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.1451416, + "step": 12350, + "time_per_iteration": 2.8179638385772705 + }, + { + "auxiliary_loss_clip": 0.01338935, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.23034167, + "balance_loss_mlp": 1.02082419, + "epoch": 0.7425822937020893, + "flos": 34173745781760.0, + "grad_norm": 1.767470611691108, + "language_loss": 0.75423825, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77796125, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12536621, + "step": 12351, + "time_per_iteration": 2.846450090408325 + }, + { + "auxiliary_loss_clip": 0.01335528, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.22774577, + "balance_loss_mlp": 1.0121609, + "epoch": 0.7426424169547573, + "flos": 21287966529600.0, + "grad_norm": 2.116845470461932, + "language_loss": 0.81915653, + "learning_rate": 6.553452654553611e-07, + "loss": 0.84275544, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12207031, + "step": 12352, + "time_per_iteration": 2.788241147994995 + }, + { + "auxiliary_loss_clip": 0.01347157, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.23702109, + "balance_loss_mlp": 1.02171969, + "epoch": 0.7427025402074252, + "flos": 22452639413040.0, + "grad_norm": 2.5627013916992185, + "language_loss": 0.72130108, + "learning_rate": 6.550569904036307e-07, + "loss": 0.74511653, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.12664795, + "step": 12353, + "time_per_iteration": 2.7317099571228027 + }, + { + "auxiliary_loss_clip": 0.01339874, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.23210907, + "balance_loss_mlp": 1.02549767, + "epoch": 0.7427626634600932, + "flos": 22529233425960.0, + "grad_norm": 1.5475134081356614, + "language_loss": 0.72290635, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74668723, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12719727, + "step": 12354, + "time_per_iteration": 2.770169734954834 + }, + { + "auxiliary_loss_clip": 0.01158665, + "auxiliary_loss_mlp": 0.01007347, + "balance_loss_clip": 1.11401415, + "balance_loss_mlp": 1.00466526, + "epoch": 0.7428227867127611, + "flos": 67223369450520.0, + "grad_norm": 0.6949824784206666, + "language_loss": 0.59596884, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61762905, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02685547, + "step": 12355, + "time_per_iteration": 3.3736412525177 + }, + { + "auxiliary_loss_clip": 0.01341298, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.23213851, + "balance_loss_mlp": 1.01442635, + "epoch": 0.7428829099654292, + "flos": 14725422892440.0, + "grad_norm": 1.7948382409340535, + "language_loss": 0.6750825, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69876885, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12921143, + "step": 12356, + "time_per_iteration": 2.8166558742523193 + }, + { + "auxiliary_loss_clip": 0.01342314, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.23151207, + "balance_loss_mlp": 1.024472, + "epoch": 0.7429430332180971, + "flos": 48654280325520.0, + "grad_norm": 1.5436687112645404, + "language_loss": 0.72316366, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74696875, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 1.10888672, + "router_z_loss_mlp": 0.13720703, + "step": 12357, + "time_per_iteration": 3.022408962249756 + }, + { + "auxiliary_loss_clip": 0.01331369, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.22790396, + "balance_loss_mlp": 1.01519835, + "epoch": 0.7430031564707651, + "flos": 16768493745840.0, + "grad_norm": 1.757654905911189, + "language_loss": 0.65168768, + "learning_rate": 6.53616380369143e-07, + "loss": 0.6752727, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.11932373, + "step": 12358, + "time_per_iteration": 2.687391757965088 + }, + { + "auxiliary_loss_clip": 0.01346232, + "auxiliary_loss_mlp": 0.01028283, + "balance_loss_clip": 1.2341218, + "balance_loss_mlp": 1.01419818, + "epoch": 0.743063279723433, + "flos": 23874853472280.0, + "grad_norm": 1.9491052470927102, + "language_loss": 0.8091768, + "learning_rate": 6.533284114835591e-07, + "loss": 0.83292198, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14080811, + "step": 12359, + "time_per_iteration": 5.601720333099365 + }, + { + "auxiliary_loss_clip": 0.0134245, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.23344898, + "balance_loss_mlp": 1.02233613, + "epoch": 0.743123402976101, + "flos": 14395714014960.0, + "grad_norm": 1.94238324101073, + "language_loss": 0.685332, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70911038, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13049316, + "step": 12360, + "time_per_iteration": 2.715078830718994 + }, + { + "auxiliary_loss_clip": 0.01334475, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.22674584, + "balance_loss_mlp": 1.02002454, + "epoch": 0.7431835262287689, + "flos": 27460125753480.0, + "grad_norm": 1.6152280832789705, + "language_loss": 0.72894287, + "learning_rate": 6.527526269210715e-07, + "loss": 0.75261861, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13067627, + "step": 12361, + "time_per_iteration": 2.8048954010009766 + }, + { + "auxiliary_loss_clip": 0.01342367, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.23238707, + "balance_loss_mlp": 1.0204165, + "epoch": 0.743243649481437, + "flos": 20964430122840.0, + "grad_norm": 1.7314859792297752, + "language_loss": 0.56031156, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58407557, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1361084, + "step": 12362, + "time_per_iteration": 2.77911376953125 + }, + { + "auxiliary_loss_clip": 0.01336165, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.22767901, + "balance_loss_mlp": 1.01861763, + "epoch": 0.7433037727341049, + "flos": 22788236502720.0, + "grad_norm": 1.6604369601601017, + "language_loss": 0.77816647, + "learning_rate": 6.521770467096039e-07, + "loss": 0.80184305, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.12866211, + "step": 12363, + "time_per_iteration": 2.808310031890869 + }, + { + "auxiliary_loss_clip": 0.01341385, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.23306465, + "balance_loss_mlp": 1.01963365, + "epoch": 0.7433638959867729, + "flos": 22201351924680.0, + "grad_norm": 1.8870714627295384, + "language_loss": 0.78377593, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80750811, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12188721, + "step": 12364, + "time_per_iteration": 2.742248773574829 + }, + { + "auxiliary_loss_clip": 0.0133827, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.22984695, + "balance_loss_mlp": 1.02168298, + "epoch": 0.7434240192394409, + "flos": 23302263036960.0, + "grad_norm": 2.098835167823599, + "language_loss": 0.78644013, + "learning_rate": 6.516016709364604e-07, + "loss": 0.8101629, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12322998, + "step": 12365, + "time_per_iteration": 2.8081703186035156 + }, + { + "auxiliary_loss_clip": 0.01346945, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.23484254, + "balance_loss_mlp": 1.01998556, + "epoch": 0.7434841424921088, + "flos": 54018016429680.0, + "grad_norm": 1.609842261293518, + "language_loss": 0.77265042, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79645967, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14007568, + "step": 12366, + "time_per_iteration": 3.0021679401397705 + }, + { + "auxiliary_loss_clip": 0.01336293, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.23180652, + "balance_loss_mlp": 1.01572037, + "epoch": 0.7435442657447768, + "flos": 21438961704000.0, + "grad_norm": 1.4330375863001095, + "language_loss": 0.71678567, + "learning_rate": 6.510264996889141e-07, + "loss": 0.74041784, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11199951, + "step": 12367, + "time_per_iteration": 2.799917459487915 + }, + { + "auxiliary_loss_clip": 0.01353095, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.24089551, + "balance_loss_mlp": 1.01973128, + "epoch": 0.7436043889974447, + "flos": 24504887755800.0, + "grad_norm": 1.5954109610744873, + "language_loss": 0.74717623, + "learning_rate": 6.507389907895038e-07, + "loss": 0.77103043, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.12585449, + "step": 12368, + "time_per_iteration": 4.35630989074707 + }, + { + "auxiliary_loss_clip": 0.01332603, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.22690082, + "balance_loss_mlp": 1.01895499, + "epoch": 0.7436645122501128, + "flos": 40705890738480.0, + "grad_norm": 1.7147378030449998, + "language_loss": 0.69255894, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71619654, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12200928, + "step": 12369, + "time_per_iteration": 2.8839898109436035 + }, + { + "auxiliary_loss_clip": 0.01343566, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.23557091, + "balance_loss_mlp": 1.01750708, + "epoch": 0.7437246355027807, + "flos": 18912506646960.0, + "grad_norm": 1.7285103715180612, + "language_loss": 0.7564339, + "learning_rate": 6.501641264939233e-07, + "loss": 0.78017724, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.13256836, + "step": 12370, + "time_per_iteration": 2.717524528503418 + }, + { + "auxiliary_loss_clip": 0.01333982, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.22821832, + "balance_loss_mlp": 1.01923156, + "epoch": 0.7437847587554487, + "flos": 21548878199280.0, + "grad_norm": 1.4620088741851722, + "language_loss": 0.78785765, + "learning_rate": 6.498767711195503e-07, + "loss": 0.81152654, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13684082, + "step": 12371, + "time_per_iteration": 2.7204794883728027 + }, + { + "auxiliary_loss_clip": 0.01338369, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.22963512, + "balance_loss_mlp": 1.01395309, + "epoch": 0.7438448820081166, + "flos": 27788291513280.0, + "grad_norm": 1.609929273149642, + "language_loss": 0.70172578, + "learning_rate": 6.495894669419857e-07, + "loss": 0.72537625, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.12713623, + "step": 12372, + "time_per_iteration": 2.8176186084747314 + }, + { + "auxiliary_loss_clip": 0.01340436, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.2330128, + "balance_loss_mlp": 1.01765847, + "epoch": 0.7439050052607846, + "flos": 17972377323840.0, + "grad_norm": 1.9081805327872903, + "language_loss": 0.7543087, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77801663, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12713623, + "step": 12373, + "time_per_iteration": 2.764843225479126 + }, + { + "auxiliary_loss_clip": 0.01346033, + "auxiliary_loss_mlp": 0.01033889, + "balance_loss_clip": 1.23514819, + "balance_loss_mlp": 1.0193274, + "epoch": 0.7439651285134525, + "flos": 22962889369800.0, + "grad_norm": 1.6230935527964858, + "language_loss": 0.77309251, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79689169, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14538574, + "step": 12374, + "time_per_iteration": 2.7753658294677734 + }, + { + "auxiliary_loss_clip": 0.0134033, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.2310524, + "balance_loss_mlp": 1.02061319, + "epoch": 0.7440252517661206, + "flos": 18811402165800.0, + "grad_norm": 2.042481050895185, + "language_loss": 0.76877707, + "learning_rate": 6.487278616990774e-07, + "loss": 0.79251707, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13061523, + "step": 12375, + "time_per_iteration": 2.9339916706085205 + }, + { + "auxiliary_loss_clip": 0.01338626, + "auxiliary_loss_mlp": 0.0102932, + "balance_loss_clip": 1.23270798, + "balance_loss_mlp": 1.01739907, + "epoch": 0.7440853750187885, + "flos": 20271121368480.0, + "grad_norm": 1.8230233562219984, + "language_loss": 0.77344799, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79712749, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1192627, + "step": 12376, + "time_per_iteration": 2.7781171798706055 + }, + { + "auxiliary_loss_clip": 0.01340682, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.23102629, + "balance_loss_mlp": 1.01763654, + "epoch": 0.7441454982714565, + "flos": 25342694346960.0, + "grad_norm": 1.731954535905765, + "language_loss": 0.79346091, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81717372, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12976074, + "step": 12377, + "time_per_iteration": 2.9348108768463135 + }, + { + "auxiliary_loss_clip": 0.01344057, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.23323047, + "balance_loss_mlp": 1.02171826, + "epoch": 0.7442056215241245, + "flos": 64490708830680.0, + "grad_norm": 1.9012542714501697, + "language_loss": 0.6754365, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69923079, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13659668, + "step": 12378, + "time_per_iteration": 3.1151323318481445 + }, + { + "auxiliary_loss_clip": 0.01344805, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.23441291, + "balance_loss_mlp": 1.02009654, + "epoch": 0.7442657447767924, + "flos": 31291690694760.0, + "grad_norm": 1.7649048959639118, + "language_loss": 0.71711987, + "learning_rate": 6.475797721245648e-07, + "loss": 0.7409128, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.14385986, + "step": 12379, + "time_per_iteration": 2.9577488899230957 + }, + { + "auxiliary_loss_clip": 0.01338803, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.23077548, + "balance_loss_mlp": 1.02021289, + "epoch": 0.7443258680294604, + "flos": 20812054264200.0, + "grad_norm": 2.8032723012289686, + "language_loss": 0.65693349, + "learning_rate": 6.472928779135085e-07, + "loss": 0.68065178, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12811279, + "step": 12380, + "time_per_iteration": 2.843193292617798 + }, + { + "auxiliary_loss_clip": 0.01339277, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.23028517, + "balance_loss_mlp": 1.01984167, + "epoch": 0.7443859912821283, + "flos": 22204884852000.0, + "grad_norm": 1.8953944804764602, + "language_loss": 0.79024166, + "learning_rate": 6.470060349972411e-07, + "loss": 0.81396186, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12908936, + "step": 12381, + "time_per_iteration": 2.736602783203125 + }, + { + "auxiliary_loss_clip": 0.01344126, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.23343503, + "balance_loss_mlp": 1.02107763, + "epoch": 0.7444461145347964, + "flos": 22022922480120.0, + "grad_norm": 2.1203852919574233, + "language_loss": 0.72570372, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74949396, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13812256, + "step": 12382, + "time_per_iteration": 2.7871460914611816 + }, + { + "auxiliary_loss_clip": 0.01159326, + "auxiliary_loss_mlp": 0.01001875, + "balance_loss_clip": 1.11553121, + "balance_loss_mlp": 0.99939549, + "epoch": 0.7445062377874643, + "flos": 70577211731400.0, + "grad_norm": 0.6509302181786929, + "language_loss": 0.54729283, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56890488, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02478027, + "step": 12383, + "time_per_iteration": 3.400500774383545 + }, + { + "auxiliary_loss_clip": 0.01341162, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.23170102, + "balance_loss_mlp": 1.01817083, + "epoch": 0.7445663610401323, + "flos": 22169613168360.0, + "grad_norm": 1.906767403768508, + "language_loss": 0.75910389, + "learning_rate": 6.461458141259395e-07, + "loss": 0.7828266, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12939453, + "step": 12384, + "time_per_iteration": 2.7876124382019043 + }, + { + "auxiliary_loss_clip": 0.01333801, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.2264396, + "balance_loss_mlp": 1.01768565, + "epoch": 0.7446264842928002, + "flos": 24175584961920.0, + "grad_norm": 1.886571042678178, + "language_loss": 0.79464614, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81829262, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13165283, + "step": 12385, + "time_per_iteration": 2.7745964527130127 + }, + { + "auxiliary_loss_clip": 0.01347147, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.23601186, + "balance_loss_mlp": 1.01828837, + "epoch": 0.7446866075454682, + "flos": 24139988411400.0, + "grad_norm": 1.5865929557567962, + "language_loss": 0.82006323, + "learning_rate": 6.455725902183813e-07, + "loss": 0.84386706, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14941406, + "step": 12386, + "time_per_iteration": 2.7989611625671387 + }, + { + "auxiliary_loss_clip": 0.01334165, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.22802806, + "balance_loss_mlp": 1.01905048, + "epoch": 0.7447467307981361, + "flos": 23553063225000.0, + "grad_norm": 1.6882582496454321, + "language_loss": 0.71853757, + "learning_rate": 6.452860552992037e-07, + "loss": 0.74220026, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.1305542, + "step": 12387, + "time_per_iteration": 2.796516180038452 + }, + { + "auxiliary_loss_clip": 0.01334569, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.22684491, + "balance_loss_mlp": 1.01742733, + "epoch": 0.7448068540508042, + "flos": 19571883793560.0, + "grad_norm": 2.774443641927864, + "language_loss": 0.70809925, + "learning_rate": 6.449995717509138e-07, + "loss": 0.73174375, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12451172, + "step": 12388, + "time_per_iteration": 2.706017017364502 + }, + { + "auxiliary_loss_clip": 0.0133744, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.22937918, + "balance_loss_mlp": 1.0194068, + "epoch": 0.7448669773034721, + "flos": 21845345811120.0, + "grad_norm": 1.4873839598257272, + "language_loss": 0.85154176, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87523466, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12451172, + "step": 12389, + "time_per_iteration": 2.8028178215026855 + }, + { + "auxiliary_loss_clip": 0.01342195, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.23326588, + "balance_loss_mlp": 1.01955795, + "epoch": 0.7449271005561401, + "flos": 25160853800160.0, + "grad_norm": 1.8581428461994067, + "language_loss": 0.79809749, + "learning_rate": 6.444267588104526e-07, + "loss": 0.821841, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12597656, + "step": 12390, + "time_per_iteration": 2.9218859672546387 + }, + { + "auxiliary_loss_clip": 0.01344845, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.23487222, + "balance_loss_mlp": 1.01568103, + "epoch": 0.7449872238088081, + "flos": 22278473846280.0, + "grad_norm": 1.679651094681696, + "language_loss": 0.84896934, + "learning_rate": 6.441404294400014e-07, + "loss": 0.87271565, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.14099121, + "step": 12391, + "time_per_iteration": 2.8971035480499268 + }, + { + "auxiliary_loss_clip": 0.01336931, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.2289207, + "balance_loss_mlp": 1.01668274, + "epoch": 0.745047347061476, + "flos": 20599693211880.0, + "grad_norm": 1.9536072371935451, + "language_loss": 0.73930037, + "learning_rate": 6.438541514838811e-07, + "loss": 0.76296091, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12445068, + "step": 12392, + "time_per_iteration": 2.7050771713256836 + }, + { + "auxiliary_loss_clip": 0.01333878, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.22803557, + "balance_loss_mlp": 1.02288222, + "epoch": 0.745107470314144, + "flos": 22132676541960.0, + "grad_norm": 1.7552474818317316, + "language_loss": 0.77279437, + "learning_rate": 6.435679249529487e-07, + "loss": 0.79649323, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13122559, + "step": 12393, + "time_per_iteration": 2.800489664077759 + }, + { + "auxiliary_loss_clip": 0.01338269, + "auxiliary_loss_mlp": 0.01040571, + "balance_loss_clip": 1.2302053, + "balance_loss_mlp": 1.02650464, + "epoch": 0.745167593566812, + "flos": 22241577828240.0, + "grad_norm": 1.858382221043764, + "language_loss": 0.73140806, + "learning_rate": 6.432817498580552e-07, + "loss": 0.75519645, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.14068604, + "step": 12394, + "time_per_iteration": 2.7471044063568115 + }, + { + "auxiliary_loss_clip": 0.01335929, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.22857082, + "balance_loss_mlp": 1.01672637, + "epoch": 0.74522771681948, + "flos": 20670926921280.0, + "grad_norm": 1.6875372909817865, + "language_loss": 0.81756622, + "learning_rate": 6.429956262100535e-07, + "loss": 0.84122413, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.13153076, + "step": 12395, + "time_per_iteration": 2.7359366416931152 + }, + { + "auxiliary_loss_clip": 0.0134611, + "auxiliary_loss_mlp": 0.01036875, + "balance_loss_clip": 1.23476434, + "balance_loss_mlp": 1.02330899, + "epoch": 0.7452878400721479, + "flos": 21112136020080.0, + "grad_norm": 2.458811320650946, + "language_loss": 0.71709114, + "learning_rate": 6.427095540197937e-07, + "loss": 0.74092102, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13586426, + "step": 12396, + "time_per_iteration": 2.814913034439087 + }, + { + "auxiliary_loss_clip": 0.0134793, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.2370944, + "balance_loss_mlp": 1.02304101, + "epoch": 0.7453479633248159, + "flos": 26693755913520.0, + "grad_norm": 1.778615501602622, + "language_loss": 0.68531126, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70915544, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13439941, + "step": 12397, + "time_per_iteration": 4.288271427154541 + }, + { + "auxiliary_loss_clip": 0.01335786, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.22837329, + "balance_loss_mlp": 1.02661276, + "epoch": 0.7454080865774838, + "flos": 17020065492720.0, + "grad_norm": 1.6619149147078867, + "language_loss": 0.77288663, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7966423, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1317749, + "step": 12398, + "time_per_iteration": 4.1817169189453125 + }, + { + "auxiliary_loss_clip": 0.01332785, + "auxiliary_loss_mlp": 0.01027051, + "balance_loss_clip": 1.22686493, + "balance_loss_mlp": 1.01422405, + "epoch": 0.7454682098301518, + "flos": 21328801558560.0, + "grad_norm": 1.5740179202281082, + "language_loss": 0.78088742, + "learning_rate": 6.418516463039363e-07, + "loss": 0.8044858, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12835693, + "step": 12399, + "time_per_iteration": 2.820131301879883 + }, + { + "auxiliary_loss_clip": 0.0132608, + "auxiliary_loss_mlp": 0.01039535, + "balance_loss_clip": 1.22317147, + "balance_loss_mlp": 1.02827549, + "epoch": 0.7455283330828197, + "flos": 17862866912160.0, + "grad_norm": 1.7692753248570958, + "language_loss": 0.73926848, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76292461, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.11260986, + "step": 12400, + "time_per_iteration": 2.76729679107666 + }, + { + "auxiliary_loss_clip": 0.0133559, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.22897089, + "balance_loss_mlp": 1.01461554, + "epoch": 0.7455884563354878, + "flos": 30780466137360.0, + "grad_norm": 2.822772912776041, + "language_loss": 0.82365346, + "learning_rate": 6.412799653142327e-07, + "loss": 0.8472752, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11968994, + "step": 12401, + "time_per_iteration": 2.8490681648254395 + }, + { + "auxiliary_loss_clip": 0.01332056, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.22483325, + "balance_loss_mlp": 1.02383232, + "epoch": 0.7456485795881557, + "flos": 23190803424000.0, + "grad_norm": 1.9465709345797393, + "language_loss": 0.65071583, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67439467, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11987305, + "step": 12402, + "time_per_iteration": 2.810316801071167 + }, + { + "auxiliary_loss_clip": 0.01332057, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.22610712, + "balance_loss_mlp": 1.02110732, + "epoch": 0.7457087028408237, + "flos": 38734256636280.0, + "grad_norm": 1.4844391715539467, + "language_loss": 0.7343998, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75805038, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11907959, + "step": 12403, + "time_per_iteration": 2.943251132965088 + }, + { + "auxiliary_loss_clip": 0.01155019, + "auxiliary_loss_mlp": 0.01000953, + "balance_loss_clip": 1.11036539, + "balance_loss_mlp": 0.9984501, + "epoch": 0.7457688260934917, + "flos": 56056028219280.0, + "grad_norm": 0.8280265036303631, + "language_loss": 0.58878267, + "learning_rate": 6.404228302777621e-07, + "loss": 0.61034238, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02502441, + "step": 12404, + "time_per_iteration": 3.153407335281372 + }, + { + "auxiliary_loss_clip": 0.01338147, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.23030019, + "balance_loss_mlp": 1.02489483, + "epoch": 0.7458289493461596, + "flos": 20120410452600.0, + "grad_norm": 1.4574919074081574, + "language_loss": 0.77793926, + "learning_rate": 6.401372216950995e-07, + "loss": 0.80169529, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12561035, + "step": 12405, + "time_per_iteration": 2.713651180267334 + }, + { + "auxiliary_loss_clip": 0.01330847, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.22573292, + "balance_loss_mlp": 1.0219264, + "epoch": 0.7458890725988276, + "flos": 20197857241080.0, + "grad_norm": 1.6422975491917873, + "language_loss": 0.69080663, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71445906, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12481689, + "step": 12406, + "time_per_iteration": 4.271705389022827 + }, + { + "auxiliary_loss_clip": 0.01347868, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.23455358, + "balance_loss_mlp": 1.02374601, + "epoch": 0.7459491958514956, + "flos": 17023111119720.0, + "grad_norm": 6.897313687513913, + "language_loss": 0.65387475, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67773461, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14367676, + "step": 12407, + "time_per_iteration": 2.7589521408081055 + }, + { + "auxiliary_loss_clip": 0.01342177, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.23130608, + "balance_loss_mlp": 1.01888943, + "epoch": 0.7460093191041636, + "flos": 25083691270200.0, + "grad_norm": 1.6650686385910964, + "language_loss": 0.72147804, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74522209, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13336182, + "step": 12408, + "time_per_iteration": 2.750859498977661 + }, + { + "auxiliary_loss_clip": 0.01342243, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.22981548, + "balance_loss_mlp": 1.02186453, + "epoch": 0.7460694423568315, + "flos": 21913615110240.0, + "grad_norm": 1.859359185226074, + "language_loss": 0.72961009, + "learning_rate": 6.38995303134053e-07, + "loss": 0.75339437, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14324951, + "step": 12409, + "time_per_iteration": 2.7716753482818604 + }, + { + "auxiliary_loss_clip": 0.01329032, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.2244581, + "balance_loss_mlp": 1.02374434, + "epoch": 0.7461295656094995, + "flos": 21220793656200.0, + "grad_norm": 1.5600645833546953, + "language_loss": 0.66086125, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68450648, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11743164, + "step": 12410, + "time_per_iteration": 2.7591042518615723 + }, + { + "auxiliary_loss_clip": 0.01332675, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.22723317, + "balance_loss_mlp": 1.0154655, + "epoch": 0.7461896888621674, + "flos": 22352062840560.0, + "grad_norm": 2.1422316791343934, + "language_loss": 0.84292316, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86653483, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13024902, + "step": 12411, + "time_per_iteration": 2.806495189666748 + }, + { + "auxiliary_loss_clip": 0.01337039, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.22754991, + "balance_loss_mlp": 1.01460838, + "epoch": 0.7462498121148354, + "flos": 25488329217840.0, + "grad_norm": 1.4054595935765426, + "language_loss": 0.78261155, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80625767, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.1295166, + "step": 12412, + "time_per_iteration": 2.81105637550354 + }, + { + "auxiliary_loss_clip": 0.01341523, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.23237062, + "balance_loss_mlp": 1.02337766, + "epoch": 0.7463099353675033, + "flos": 33954481308240.0, + "grad_norm": 1.9287321969554643, + "language_loss": 0.62591267, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64968771, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12591553, + "step": 12413, + "time_per_iteration": 2.8386754989624023 + }, + { + "auxiliary_loss_clip": 0.011554, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 1.11113691, + "balance_loss_mlp": 0.99918205, + "epoch": 0.7463700586201714, + "flos": 62782277447520.0, + "grad_norm": 0.7266752584260459, + "language_loss": 0.54926288, + "learning_rate": 6.375690662261082e-07, + "loss": 0.57083768, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.02893066, + "step": 12414, + "time_per_iteration": 3.362870454788208 + }, + { + "auxiliary_loss_clip": 0.01341311, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.23264718, + "balance_loss_mlp": 1.02432895, + "epoch": 0.7464301818728393, + "flos": 33438546181080.0, + "grad_norm": 1.5757344620168274, + "language_loss": 0.55255723, + "learning_rate": 6.372839737918154e-07, + "loss": 0.57634729, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13360596, + "step": 12415, + "time_per_iteration": 2.858809471130371 + }, + { + "auxiliary_loss_clip": 0.01334491, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.22719276, + "balance_loss_mlp": 1.01918805, + "epoch": 0.7464903051255073, + "flos": 26875515243600.0, + "grad_norm": 1.6063345653009105, + "language_loss": 0.74944055, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77310878, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.1315918, + "step": 12416, + "time_per_iteration": 2.864255428314209 + }, + { + "auxiliary_loss_clip": 0.01335012, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.22609925, + "balance_loss_mlp": 1.02145469, + "epoch": 0.7465504283781753, + "flos": 44094297379680.0, + "grad_norm": 2.0345935026386535, + "language_loss": 0.69484544, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71854401, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13378906, + "step": 12417, + "time_per_iteration": 2.9554357528686523 + }, + { + "auxiliary_loss_clip": 0.01344702, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.23469508, + "balance_loss_mlp": 1.02059937, + "epoch": 0.7466105516308432, + "flos": 19680013521000.0, + "grad_norm": 1.7126074606767228, + "language_loss": 0.73790884, + "learning_rate": 6.364290065781392e-07, + "loss": 0.76169777, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.13604736, + "step": 12418, + "time_per_iteration": 2.7378196716308594 + }, + { + "auxiliary_loss_clip": 0.01334311, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.22628236, + "balance_loss_mlp": 1.01664567, + "epoch": 0.7466706748835112, + "flos": 20525454483840.0, + "grad_norm": 1.4933144489975845, + "language_loss": 0.69126564, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71490347, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1282959, + "step": 12419, + "time_per_iteration": 2.8253095149993896 + }, + { + "auxiliary_loss_clip": 0.01326497, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.22317266, + "balance_loss_mlp": 1.01753712, + "epoch": 0.7467307981361792, + "flos": 21695406454080.0, + "grad_norm": 1.6026416990611811, + "language_loss": 0.74797148, + "learning_rate": 6.358592869514216e-07, + "loss": 0.77153242, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.12078857, + "step": 12420, + "time_per_iteration": 2.852088451385498 + }, + { + "auxiliary_loss_clip": 0.01340943, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.23101223, + "balance_loss_mlp": 1.02034092, + "epoch": 0.7467909213888472, + "flos": 19578502956240.0, + "grad_norm": 2.062820598692936, + "language_loss": 0.67407703, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69782329, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13348389, + "step": 12421, + "time_per_iteration": 2.8697431087493896 + }, + { + "auxiliary_loss_clip": 0.01346015, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.23421121, + "balance_loss_mlp": 1.01990473, + "epoch": 0.7468510446415151, + "flos": 23700403647000.0, + "grad_norm": 1.699045381463432, + "language_loss": 0.72561949, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74941468, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13623047, + "step": 12422, + "time_per_iteration": 2.838170051574707 + }, + { + "auxiliary_loss_clip": 0.01337146, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.2316277, + "balance_loss_mlp": 1.01542664, + "epoch": 0.7469111678941831, + "flos": 29321924577120.0, + "grad_norm": 2.0293869891013308, + "language_loss": 0.75153363, + "learning_rate": 6.350050955009796e-07, + "loss": 0.77518708, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12780762, + "step": 12423, + "time_per_iteration": 2.8542141914367676 + }, + { + "auxiliary_loss_clip": 0.01325609, + "auxiliary_loss_mlp": 0.01026688, + "balance_loss_clip": 1.21961176, + "balance_loss_mlp": 1.01461852, + "epoch": 0.746971291146851, + "flos": 21803658006600.0, + "grad_norm": 1.2100224731346263, + "language_loss": 0.67867529, + "learning_rate": 6.347204685245929e-07, + "loss": 0.70219827, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12060547, + "step": 12424, + "time_per_iteration": 2.7763311862945557 + }, + { + "auxiliary_loss_clip": 0.01343607, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.23267972, + "balance_loss_mlp": 1.02493811, + "epoch": 0.747031414399519, + "flos": 36251722843560.0, + "grad_norm": 1.8050431227537673, + "language_loss": 0.74999535, + "learning_rate": 6.344358933197418e-07, + "loss": 0.77381384, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13293457, + "step": 12425, + "time_per_iteration": 2.8904552459716797 + }, + { + "auxiliary_loss_clip": 0.01342113, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.23323584, + "balance_loss_mlp": 1.01762342, + "epoch": 0.7470915376521869, + "flos": 19979770410000.0, + "grad_norm": 1.9673669373385492, + "language_loss": 0.70083988, + "learning_rate": 6.341513698972194e-07, + "loss": 0.72456944, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13208008, + "step": 12426, + "time_per_iteration": 2.768502712249756 + }, + { + "auxiliary_loss_clip": 0.01331471, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.22555637, + "balance_loss_mlp": 1.02125335, + "epoch": 0.747151660904855, + "flos": 20089158996600.0, + "grad_norm": 1.4352595701368724, + "language_loss": 0.65415221, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67781055, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13116455, + "step": 12427, + "time_per_iteration": 2.8175265789031982 + }, + { + "auxiliary_loss_clip": 0.01337272, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.22824478, + "balance_loss_mlp": 1.01423383, + "epoch": 0.7472117841575229, + "flos": 16294814940240.0, + "grad_norm": 1.513917518065046, + "language_loss": 0.74930477, + "learning_rate": 6.335824784423118e-07, + "loss": 0.7729578, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13787842, + "step": 12428, + "time_per_iteration": 2.75592041015625 + }, + { + "auxiliary_loss_clip": 0.01351239, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.23774743, + "balance_loss_mlp": 1.02016008, + "epoch": 0.7472719074101909, + "flos": 21393944013960.0, + "grad_norm": 2.0409553520822494, + "language_loss": 0.58676624, + "learning_rate": 6.33298110431499e-07, + "loss": 0.61062217, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14196777, + "step": 12429, + "time_per_iteration": 2.727790117263794 + }, + { + "auxiliary_loss_clip": 0.01348001, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.23660684, + "balance_loss_mlp": 1.02178431, + "epoch": 0.7473320306628589, + "flos": 29649359386440.0, + "grad_norm": 3.224144535701303, + "language_loss": 0.60977197, + "learning_rate": 6.330137942461595e-07, + "loss": 0.633605, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.1350708, + "step": 12430, + "time_per_iteration": 2.818006753921509 + }, + { + "auxiliary_loss_clip": 0.01334132, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.22820091, + "balance_loss_mlp": 1.0177294, + "epoch": 0.7473921539155268, + "flos": 24141734570880.0, + "grad_norm": 1.4207862747584126, + "language_loss": 0.75583982, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77948809, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12976074, + "step": 12431, + "time_per_iteration": 2.779493808746338 + }, + { + "auxiliary_loss_clip": 0.01339509, + "auxiliary_loss_mlp": 0.01027635, + "balance_loss_clip": 1.23092258, + "balance_loss_mlp": 1.01509452, + "epoch": 0.7474522771681948, + "flos": 17491713880320.0, + "grad_norm": 1.7542703197276166, + "language_loss": 0.75309962, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77677107, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12548828, + "step": 12432, + "time_per_iteration": 2.7769784927368164 + }, + { + "auxiliary_loss_clip": 0.01348225, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.23572016, + "balance_loss_mlp": 1.02187109, + "epoch": 0.7475124004208628, + "flos": 16731678944520.0, + "grad_norm": 9.10944917633187, + "language_loss": 0.7032907, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72713542, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14373779, + "step": 12433, + "time_per_iteration": 2.703850269317627 + }, + { + "auxiliary_loss_clip": 0.01340519, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.23091388, + "balance_loss_mlp": 1.01536953, + "epoch": 0.7475725236735308, + "flos": 19725153036120.0, + "grad_norm": 1.7771252097542842, + "language_loss": 0.6756891, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69938445, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13647461, + "step": 12434, + "time_per_iteration": 2.7841501235961914 + }, + { + "auxiliary_loss_clip": 0.01326968, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.22496152, + "balance_loss_mlp": 1.01710689, + "epoch": 0.7476326469261987, + "flos": 26291554467480.0, + "grad_norm": 1.3907812220499967, + "language_loss": 0.79781789, + "learning_rate": 6.315929910788263e-07, + "loss": 0.82137543, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.11682129, + "step": 12435, + "time_per_iteration": 4.380160570144653 + }, + { + "auxiliary_loss_clip": 0.01340424, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_clip": 1.22963047, + "balance_loss_mlp": 1.01701522, + "epoch": 0.7476927701788667, + "flos": 31838105719080.0, + "grad_norm": 1.6440628040455625, + "language_loss": 0.68082249, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70452046, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12365723, + "step": 12436, + "time_per_iteration": 4.284092426300049 + }, + { + "auxiliary_loss_clip": 0.01348171, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.23550534, + "balance_loss_mlp": 1.01779747, + "epoch": 0.7477528934315346, + "flos": 31801778218080.0, + "grad_norm": 1.507922824149803, + "language_loss": 0.7095322, + "learning_rate": 6.31025032967396e-07, + "loss": 0.73332179, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13000488, + "step": 12437, + "time_per_iteration": 2.8331174850463867 + }, + { + "auxiliary_loss_clip": 0.01327835, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.22359443, + "balance_loss_mlp": 1.01477957, + "epoch": 0.7478130166842026, + "flos": 20376327294000.0, + "grad_norm": 1.8895186695058692, + "language_loss": 0.67319822, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69674677, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12243652, + "step": 12438, + "time_per_iteration": 2.8690242767333984 + }, + { + "auxiliary_loss_clip": 0.01338313, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.22966218, + "balance_loss_mlp": 1.0202682, + "epoch": 0.7478731399368705, + "flos": 18153324486720.0, + "grad_norm": 1.5550361917163527, + "language_loss": 0.80906707, + "learning_rate": 6.304572825026344e-07, + "loss": 0.83278072, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12792969, + "step": 12439, + "time_per_iteration": 2.708782434463501 + }, + { + "auxiliary_loss_clip": 0.01334353, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.22719932, + "balance_loss_mlp": 1.02227473, + "epoch": 0.7479332631895386, + "flos": 15272081566920.0, + "grad_norm": 2.2363100490069696, + "language_loss": 0.71217054, + "learning_rate": 6.301734851646674e-07, + "loss": 0.73586667, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12994385, + "step": 12440, + "time_per_iteration": 2.760493040084839 + }, + { + "auxiliary_loss_clip": 0.01334348, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.22856045, + "balance_loss_mlp": 1.01762271, + "epoch": 0.7479933864422065, + "flos": 21147529528800.0, + "grad_norm": 1.5896911534900262, + "language_loss": 0.74637496, + "learning_rate": 6.298897397706597e-07, + "loss": 0.77002192, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12738037, + "step": 12441, + "time_per_iteration": 2.856001138687134 + }, + { + "auxiliary_loss_clip": 0.01341929, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.23193622, + "balance_loss_mlp": 1.02409196, + "epoch": 0.7480535096948745, + "flos": 14396079490200.0, + "grad_norm": 1.803853616956744, + "language_loss": 0.82660908, + "learning_rate": 6.296060463313698e-07, + "loss": 0.85040843, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13916016, + "step": 12442, + "time_per_iteration": 2.752389430999756 + }, + { + "auxiliary_loss_clip": 0.0134661, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.23602724, + "balance_loss_mlp": 1.02024043, + "epoch": 0.7481136329475425, + "flos": 27350452908360.0, + "grad_norm": 2.073910548543583, + "language_loss": 0.63007617, + "learning_rate": 6.293224048575565e-07, + "loss": 0.65388083, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13623047, + "step": 12443, + "time_per_iteration": 2.8369503021240234 + }, + { + "auxiliary_loss_clip": 0.01336731, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.23033774, + "balance_loss_mlp": 1.01802957, + "epoch": 0.7481737562002104, + "flos": 19535515684200.0, + "grad_norm": 2.10141231500417, + "language_loss": 0.71819407, + "learning_rate": 6.29038815359975e-07, + "loss": 0.74186075, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11914062, + "step": 12444, + "time_per_iteration": 2.7484912872314453 + }, + { + "auxiliary_loss_clip": 0.01340041, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.23186541, + "balance_loss_mlp": 1.01860714, + "epoch": 0.7482338794528784, + "flos": 21764772178920.0, + "grad_norm": 1.4735136284743346, + "language_loss": 0.69201446, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71572793, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1270752, + "step": 12445, + "time_per_iteration": 4.321407318115234 + }, + { + "auxiliary_loss_clip": 0.01337098, + "auxiliary_loss_mlp": 0.01024702, + "balance_loss_clip": 1.23035288, + "balance_loss_mlp": 1.01197076, + "epoch": 0.7482940027055464, + "flos": 18701972970840.0, + "grad_norm": 1.583072866825619, + "language_loss": 0.74231648, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76593447, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.1272583, + "step": 12446, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.01341953, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.23028636, + "balance_loss_mlp": 1.01880789, + "epoch": 0.7483541259582144, + "flos": 16002367556040.0, + "grad_norm": 2.802814553256158, + "language_loss": 0.73350263, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75725085, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14050293, + "step": 12447, + "time_per_iteration": 2.752115488052368 + }, + { + "auxiliary_loss_clip": 0.01335029, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.22736847, + "balance_loss_mlp": 1.01862144, + "epoch": 0.7484142492108823, + "flos": 25561755778680.0, + "grad_norm": 2.7622290684860147, + "language_loss": 0.72686529, + "learning_rate": 6.279049773470109e-07, + "loss": 0.75052154, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.11962891, + "step": 12448, + "time_per_iteration": 2.798628568649292 + }, + { + "auxiliary_loss_clip": 0.01342628, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.23275232, + "balance_loss_mlp": 1.01835787, + "epoch": 0.7484743724635503, + "flos": 22892183569080.0, + "grad_norm": 2.0204465430738114, + "language_loss": 0.73829937, + "learning_rate": 6.276216478918543e-07, + "loss": 0.76204073, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13153076, + "step": 12449, + "time_per_iteration": 2.713783025741577 + }, + { + "auxiliary_loss_clip": 0.01350287, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.23745096, + "balance_loss_mlp": 1.01882851, + "epoch": 0.7485344957162182, + "flos": 25305635895480.0, + "grad_norm": 1.995159242626533, + "language_loss": 0.61322939, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63705981, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13928223, + "step": 12450, + "time_per_iteration": 2.910776138305664 + }, + { + "auxiliary_loss_clip": 0.01330456, + "auxiliary_loss_mlp": 0.01024355, + "balance_loss_clip": 1.22650981, + "balance_loss_mlp": 1.01235628, + "epoch": 0.7485946189688862, + "flos": 27058777083000.0, + "grad_norm": 1.9568758958120414, + "language_loss": 0.70854735, + "learning_rate": 6.270551451144577e-07, + "loss": 0.73209548, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11999512, + "step": 12451, + "time_per_iteration": 2.7534101009368896 + }, + { + "auxiliary_loss_clip": 0.01354196, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.2401402, + "balance_loss_mlp": 1.02393556, + "epoch": 0.7486547422215541, + "flos": 26912451870000.0, + "grad_norm": 1.9512030803406903, + "language_loss": 0.80528075, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82919526, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13317871, + "step": 12452, + "time_per_iteration": 2.786830425262451 + }, + { + "auxiliary_loss_clip": 0.01348359, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.23591149, + "balance_loss_mlp": 1.0207839, + "epoch": 0.7487148654742222, + "flos": 22351413106800.0, + "grad_norm": 2.4455207209978536, + "language_loss": 0.71958858, + "learning_rate": 6.264888505858843e-07, + "loss": 0.74341196, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13183594, + "step": 12453, + "time_per_iteration": 2.8131086826324463 + }, + { + "auxiliary_loss_clip": 0.01343994, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.23529243, + "balance_loss_mlp": 1.02610433, + "epoch": 0.7487749887268901, + "flos": 23043950302320.0, + "grad_norm": 1.592852076887016, + "language_loss": 0.74535632, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76918662, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12957764, + "step": 12454, + "time_per_iteration": 2.751401901245117 + }, + { + "auxiliary_loss_clip": 0.01155705, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.11113441, + "balance_loss_mlp": 0.99838489, + "epoch": 0.7488351119795581, + "flos": 71541120572280.0, + "grad_norm": 0.7312840074143363, + "language_loss": 0.59445477, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61602437, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02868652, + "step": 12455, + "time_per_iteration": 3.385336399078369 + }, + { + "auxiliary_loss_clip": 0.0133775, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.23209107, + "balance_loss_mlp": 1.0190618, + "epoch": 0.748895235232226, + "flos": 17199997446600.0, + "grad_norm": 1.7495832592712175, + "language_loss": 0.80189788, + "learning_rate": 6.256397994474592e-07, + "loss": 0.82559454, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12860107, + "step": 12456, + "time_per_iteration": 2.696884870529175 + }, + { + "auxiliary_loss_clip": 0.01155189, + "auxiliary_loss_mlp": 0.01002397, + "balance_loss_clip": 1.11077356, + "balance_loss_mlp": 1.00013161, + "epoch": 0.748955358484894, + "flos": 58994105861880.0, + "grad_norm": 0.824678320693109, + "language_loss": 0.61445481, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63603061, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02270508, + "step": 12457, + "time_per_iteration": 3.1378328800201416 + }, + { + "auxiliary_loss_clip": 0.01344982, + "auxiliary_loss_mlp": 0.01036847, + "balance_loss_clip": 1.23499179, + "balance_loss_mlp": 1.02424049, + "epoch": 0.749015481737562, + "flos": 11363638354200.0, + "grad_norm": 1.9510675745957582, + "language_loss": 0.68228269, + "learning_rate": 6.250740259166711e-07, + "loss": 0.70610094, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1260376, + "step": 12458, + "time_per_iteration": 2.7316832542419434 + }, + { + "auxiliary_loss_clip": 0.01336312, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.22965705, + "balance_loss_mlp": 1.0192138, + "epoch": 0.74907560499023, + "flos": 21111567503040.0, + "grad_norm": 1.7738919996012965, + "language_loss": 0.80027926, + "learning_rate": 6.247912173519106e-07, + "loss": 0.82396185, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12731934, + "step": 12459, + "time_per_iteration": 2.7630929946899414 + }, + { + "auxiliary_loss_clip": 0.01338365, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.23164713, + "balance_loss_mlp": 1.02039909, + "epoch": 0.749135728242898, + "flos": 22272463809000.0, + "grad_norm": 1.5174093444064354, + "language_loss": 0.80907643, + "learning_rate": 6.245084609352043e-07, + "loss": 0.83279479, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13067627, + "step": 12460, + "time_per_iteration": 2.745577573776245 + }, + { + "auxiliary_loss_clip": 0.01340787, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.23367167, + "balance_loss_mlp": 1.0176369, + "epoch": 0.7491958514955659, + "flos": 24062419797840.0, + "grad_norm": 1.7093407636347449, + "language_loss": 0.86202812, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88574994, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13763428, + "step": 12461, + "time_per_iteration": 2.7708020210266113 + }, + { + "auxiliary_loss_clip": 0.01331251, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.2269876, + "balance_loss_mlp": 1.02009082, + "epoch": 0.7492559747482339, + "flos": 24496644258720.0, + "grad_norm": 1.8919998559608633, + "language_loss": 0.69596601, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71960336, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.1239624, + "step": 12462, + "time_per_iteration": 2.926708698272705 + }, + { + "auxiliary_loss_clip": 0.0133865, + "auxiliary_loss_mlp": 0.01031327, + "balance_loss_clip": 1.23043597, + "balance_loss_mlp": 1.017928, + "epoch": 0.7493160980009018, + "flos": 27751070628360.0, + "grad_norm": 2.078368320642214, + "language_loss": 0.71390027, + "learning_rate": 6.236605046806267e-07, + "loss": 0.73760003, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13409424, + "step": 12463, + "time_per_iteration": 2.976126194000244 + }, + { + "auxiliary_loss_clip": 0.01340159, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.23332095, + "balance_loss_mlp": 1.02294683, + "epoch": 0.7493762212535698, + "flos": 30232223736840.0, + "grad_norm": 4.019107128510597, + "language_loss": 0.77583814, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79959035, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12127686, + "step": 12464, + "time_per_iteration": 2.8167061805725098 + }, + { + "auxiliary_loss_clip": 0.01339981, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.23204124, + "balance_loss_mlp": 1.02010047, + "epoch": 0.7494363445062378, + "flos": 21949495919280.0, + "grad_norm": 1.6874274429857092, + "language_loss": 0.78438014, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80810314, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12225342, + "step": 12465, + "time_per_iteration": 2.708364725112915 + }, + { + "auxiliary_loss_clip": 0.01355513, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.24050534, + "balance_loss_mlp": 1.02534175, + "epoch": 0.7494964677589058, + "flos": 12493567462680.0, + "grad_norm": 3.4112597960719895, + "language_loss": 0.74701208, + "learning_rate": 6.22813018144422e-07, + "loss": 0.77096713, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14672852, + "step": 12466, + "time_per_iteration": 2.791551113128662 + }, + { + "auxiliary_loss_clip": 0.01347707, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.23757064, + "balance_loss_mlp": 1.01913118, + "epoch": 0.7495565910115737, + "flos": 21658104352440.0, + "grad_norm": 1.7575879156010474, + "language_loss": 0.66397226, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68776608, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12542725, + "step": 12467, + "time_per_iteration": 2.7404749393463135 + }, + { + "auxiliary_loss_clip": 0.01350329, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.23975658, + "balance_loss_mlp": 1.0222311, + "epoch": 0.7496167142642417, + "flos": 15272812517400.0, + "grad_norm": 2.2596014668883653, + "language_loss": 0.7671544, + "learning_rate": 6.222482882177735e-07, + "loss": 0.79101074, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.1307373, + "step": 12468, + "time_per_iteration": 2.6853487491607666 + }, + { + "auxiliary_loss_clip": 0.01342439, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.23550737, + "balance_loss_mlp": 1.02011538, + "epoch": 0.7496768375169096, + "flos": 22060102756680.0, + "grad_norm": 1.8148889043363599, + "language_loss": 0.69584477, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71960461, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13421631, + "step": 12469, + "time_per_iteration": 2.743682861328125 + }, + { + "auxiliary_loss_clip": 0.0134578, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.23595166, + "balance_loss_mlp": 1.02435827, + "epoch": 0.7497369607695776, + "flos": 19061024711400.0, + "grad_norm": 1.8459994167762983, + "language_loss": 0.69380885, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71764016, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13006592, + "step": 12470, + "time_per_iteration": 2.6666388511657715 + }, + { + "auxiliary_loss_clip": 0.01354027, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.23937225, + "balance_loss_mlp": 1.01669967, + "epoch": 0.7497970840222457, + "flos": 21622589018640.0, + "grad_norm": 2.0304769763571433, + "language_loss": 0.75319141, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77703881, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.14013672, + "step": 12471, + "time_per_iteration": 2.7940175533294678 + }, + { + "auxiliary_loss_clip": 0.01344648, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.23491871, + "balance_loss_mlp": 1.01870513, + "epoch": 0.7498572072749136, + "flos": 13739585537160.0, + "grad_norm": 2.1060086062625363, + "language_loss": 0.76720619, + "learning_rate": 6.211194553838929e-07, + "loss": 0.79097396, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.13439941, + "step": 12472, + "time_per_iteration": 2.703831195831299 + }, + { + "auxiliary_loss_clip": 0.01342141, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.23525167, + "balance_loss_mlp": 1.02373385, + "epoch": 0.7499173305275816, + "flos": 22971741992280.0, + "grad_norm": 1.4446713755973095, + "language_loss": 0.84474081, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86852598, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12640381, + "step": 12473, + "time_per_iteration": 2.743934392929077 + }, + { + "auxiliary_loss_clip": 0.01351423, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.2392025, + "balance_loss_mlp": 1.02040315, + "epoch": 0.7499774537802495, + "flos": 22744721322000.0, + "grad_norm": 1.9823047835944745, + "language_loss": 0.74014759, + "learning_rate": 6.205553526478829e-07, + "loss": 0.76400602, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14007568, + "step": 12474, + "time_per_iteration": 4.263984441757202 + }, + { + "auxiliary_loss_clip": 0.01353886, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.23973227, + "balance_loss_mlp": 1.02212977, + "epoch": 0.7500375770329175, + "flos": 18301192817400.0, + "grad_norm": 1.639041842105484, + "language_loss": 0.74473047, + "learning_rate": 6.202733797375492e-07, + "loss": 0.76863253, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14178467, + "step": 12475, + "time_per_iteration": 4.332024574279785 + }, + { + "auxiliary_loss_clip": 0.01357797, + "auxiliary_loss_mlp": 0.01037161, + "balance_loss_clip": 1.24310243, + "balance_loss_mlp": 1.02350521, + "epoch": 0.7500977002855854, + "flos": 19174433525640.0, + "grad_norm": 2.0054006752841422, + "language_loss": 0.80013794, + "learning_rate": 6.199914591465878e-07, + "loss": 0.8240875, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13671875, + "step": 12476, + "time_per_iteration": 2.7302589416503906 + }, + { + "auxiliary_loss_clip": 0.01341239, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.23175979, + "balance_loss_mlp": 1.020895, + "epoch": 0.7501578235382534, + "flos": 22169004042960.0, + "grad_norm": 1.7234419586172907, + "language_loss": 0.78098714, + "learning_rate": 6.19709590885688e-07, + "loss": 0.80473447, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1260376, + "step": 12477, + "time_per_iteration": 2.7858293056488037 + }, + { + "auxiliary_loss_clip": 0.01155564, + "auxiliary_loss_mlp": 0.01007335, + "balance_loss_clip": 1.11070096, + "balance_loss_mlp": 1.00475967, + "epoch": 0.7502179467909214, + "flos": 64477528094880.0, + "grad_norm": 0.8135050630832806, + "language_loss": 0.54490745, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56653649, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02575684, + "step": 12478, + "time_per_iteration": 3.2091946601867676 + }, + { + "auxiliary_loss_clip": 0.01338866, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.23241568, + "balance_loss_mlp": 1.02237868, + "epoch": 0.7502780700435894, + "flos": 20482223561640.0, + "grad_norm": 1.701702593822188, + "language_loss": 0.80394924, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82768977, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12811279, + "step": 12479, + "time_per_iteration": 2.8413589000701904 + }, + { + "auxiliary_loss_clip": 0.01356484, + "auxiliary_loss_mlp": 0.01037177, + "balance_loss_clip": 1.24334216, + "balance_loss_mlp": 1.02303922, + "epoch": 0.7503381932962573, + "flos": 20449672638120.0, + "grad_norm": 1.9245928692909549, + "language_loss": 0.62891895, + "learning_rate": 6.188643001902369e-07, + "loss": 0.65285552, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.14147949, + "step": 12480, + "time_per_iteration": 2.7219669818878174 + }, + { + "auxiliary_loss_clip": 0.01334118, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.22971344, + "balance_loss_mlp": 1.02599859, + "epoch": 0.7503983165489253, + "flos": 22386969048960.0, + "grad_norm": 1.5627709098450984, + "language_loss": 0.77937132, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80309689, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.12438965, + "step": 12481, + "time_per_iteration": 2.7533445358276367 + }, + { + "auxiliary_loss_clip": 0.0134312, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.23288786, + "balance_loss_mlp": 1.01731217, + "epoch": 0.7504584398015932, + "flos": 24904815133680.0, + "grad_norm": 1.7131533404107833, + "language_loss": 0.71769977, + "learning_rate": 6.183010349061501e-07, + "loss": 0.74143791, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13397217, + "step": 12482, + "time_per_iteration": 2.839839458465576 + }, + { + "auxiliary_loss_clip": 0.01343666, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.23427868, + "balance_loss_mlp": 1.02056527, + "epoch": 0.7505185630542612, + "flos": 25890814922400.0, + "grad_norm": 1.6405857073023598, + "language_loss": 0.69903183, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72280937, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13525391, + "step": 12483, + "time_per_iteration": 2.8781423568725586 + }, + { + "auxiliary_loss_clip": 0.01343712, + "auxiliary_loss_mlp": 0.01022783, + "balance_loss_clip": 1.23487091, + "balance_loss_mlp": 1.01069474, + "epoch": 0.7505786863069293, + "flos": 23148425277360.0, + "grad_norm": 1.6025360371514779, + "language_loss": 0.74512416, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76878911, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12084961, + "step": 12484, + "time_per_iteration": 4.230836629867554 + }, + { + "auxiliary_loss_clip": 0.01340725, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.23250449, + "balance_loss_mlp": 1.01668572, + "epoch": 0.7506388095595972, + "flos": 16987880044440.0, + "grad_norm": 1.9948113672898538, + "language_loss": 0.84940374, + "learning_rate": 6.174565299629295e-07, + "loss": 0.87311643, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13842773, + "step": 12485, + "time_per_iteration": 2.7296884059906006 + }, + { + "auxiliary_loss_clip": 0.01342948, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.23512936, + "balance_loss_mlp": 1.01844776, + "epoch": 0.7506989328122652, + "flos": 22349951205840.0, + "grad_norm": 1.4777451855875194, + "language_loss": 0.78401637, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80775702, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12670898, + "step": 12486, + "time_per_iteration": 2.766251564025879 + }, + { + "auxiliary_loss_clip": 0.01345243, + "auxiliary_loss_mlp": 0.01031799, + "balance_loss_clip": 1.23453808, + "balance_loss_mlp": 1.01850128, + "epoch": 0.7507590560649331, + "flos": 25781669985960.0, + "grad_norm": 2.048778858228935, + "language_loss": 0.72692716, + "learning_rate": 6.168937887805932e-07, + "loss": 0.75069761, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13317871, + "step": 12487, + "time_per_iteration": 2.77524471282959 + }, + { + "auxiliary_loss_clip": 0.01343543, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.23394203, + "balance_loss_mlp": 1.01802874, + "epoch": 0.7508191793176011, + "flos": 24284729898360.0, + "grad_norm": 4.274685202144162, + "language_loss": 0.6749391, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69868261, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12780762, + "step": 12488, + "time_per_iteration": 2.7193074226379395 + }, + { + "auxiliary_loss_clip": 0.01341616, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.23323238, + "balance_loss_mlp": 1.02058446, + "epoch": 0.750879302570269, + "flos": 19904110389360.0, + "grad_norm": 1.7036586543585144, + "language_loss": 0.77231437, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79606068, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12414551, + "step": 12489, + "time_per_iteration": 2.7240540981292725 + }, + { + "auxiliary_loss_clip": 0.01336421, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.23053408, + "balance_loss_mlp": 1.01851213, + "epoch": 0.750939425822937, + "flos": 29211723823320.0, + "grad_norm": 1.825055363192299, + "language_loss": 0.75190592, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77557224, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.11694336, + "step": 12490, + "time_per_iteration": 2.771146535873413 + }, + { + "auxiliary_loss_clip": 0.01338301, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.23225653, + "balance_loss_mlp": 1.02037287, + "epoch": 0.750999549075605, + "flos": 21147285878640.0, + "grad_norm": 1.6570014576731846, + "language_loss": 0.78449881, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80821699, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13140869, + "step": 12491, + "time_per_iteration": 2.7545247077941895 + }, + { + "auxiliary_loss_clip": 0.01337133, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.23164785, + "balance_loss_mlp": 1.0231874, + "epoch": 0.751059672328273, + "flos": 23552697749760.0, + "grad_norm": 2.078078764294559, + "language_loss": 0.76382089, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78754365, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11962891, + "step": 12492, + "time_per_iteration": 2.7339491844177246 + }, + { + "auxiliary_loss_clip": 0.01338059, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.23060441, + "balance_loss_mlp": 1.01976323, + "epoch": 0.7511197955809409, + "flos": 18994460963400.0, + "grad_norm": 3.502381569746931, + "language_loss": 0.71327704, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73697615, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12084961, + "step": 12493, + "time_per_iteration": 2.743316650390625 + }, + { + "auxiliary_loss_clip": 0.01342632, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.23286009, + "balance_loss_mlp": 1.01466465, + "epoch": 0.7511799188336089, + "flos": 22051087700760.0, + "grad_norm": 1.5817889137161718, + "language_loss": 0.80844796, + "learning_rate": 6.149258472993395e-07, + "loss": 0.83215284, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13189697, + "step": 12494, + "time_per_iteration": 2.778874158859253 + }, + { + "auxiliary_loss_clip": 0.01342756, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.23402822, + "balance_loss_mlp": 1.01702738, + "epoch": 0.7512400420862768, + "flos": 16470848491560.0, + "grad_norm": 2.66402856253748, + "language_loss": 0.79127908, + "learning_rate": 6.146449228053634e-07, + "loss": 0.81501281, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1361084, + "step": 12495, + "time_per_iteration": 2.880305767059326 + }, + { + "auxiliary_loss_clip": 0.01338485, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.23094726, + "balance_loss_mlp": 1.02100945, + "epoch": 0.7513001653389448, + "flos": 20453124348720.0, + "grad_norm": 1.9129162106065059, + "language_loss": 0.71664178, + "learning_rate": 6.143640508441898e-07, + "loss": 0.7403658, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12878418, + "step": 12496, + "time_per_iteration": 2.757453441619873 + }, + { + "auxiliary_loss_clip": 0.01336375, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.228953, + "balance_loss_mlp": 1.01531816, + "epoch": 0.7513602885916129, + "flos": 23482032557400.0, + "grad_norm": 1.57269859077729, + "language_loss": 0.78470564, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80835086, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.1282959, + "step": 12497, + "time_per_iteration": 2.745549201965332 + }, + { + "auxiliary_loss_clip": 0.01344093, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.2348007, + "balance_loss_mlp": 1.01760578, + "epoch": 0.7514204118442808, + "flos": 26802900849960.0, + "grad_norm": 1.4997131983782241, + "language_loss": 0.76950175, + "learning_rate": 6.13802464562855e-07, + "loss": 0.79325211, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13330078, + "step": 12498, + "time_per_iteration": 2.9030914306640625 + }, + { + "auxiliary_loss_clip": 0.01337098, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.23198414, + "balance_loss_mlp": 1.02124906, + "epoch": 0.7514805350969488, + "flos": 19870503648480.0, + "grad_norm": 1.7552243577610251, + "language_loss": 0.74588287, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76958501, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11877441, + "step": 12499, + "time_per_iteration": 2.7180888652801514 + }, + { + "auxiliary_loss_clip": 0.01334628, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.22851646, + "balance_loss_mlp": 1.01657009, + "epoch": 0.7515406583496167, + "flos": 24576933632400.0, + "grad_norm": 1.8685883910505432, + "language_loss": 0.79419553, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81782484, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11749268, + "step": 12500, + "time_per_iteration": 2.776897668838501 + }, + { + "auxiliary_loss_clip": 0.01361664, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.24601507, + "balance_loss_mlp": 1.01976454, + "epoch": 0.7516007816022847, + "flos": 20125039805640.0, + "grad_norm": 1.8754207897077981, + "language_loss": 0.7375983, + "learning_rate": 6.129604794030794e-07, + "loss": 0.76156443, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.15197754, + "step": 12501, + "time_per_iteration": 2.733964681625366 + }, + { + "auxiliary_loss_clip": 0.01337908, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.22996306, + "balance_loss_mlp": 1.01366484, + "epoch": 0.7516609048549526, + "flos": 22789901445480.0, + "grad_norm": 1.6598805223185236, + "language_loss": 0.78767443, + "learning_rate": 6.126799228623207e-07, + "loss": 0.81131786, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12774658, + "step": 12502, + "time_per_iteration": 2.773252248764038 + }, + { + "auxiliary_loss_clip": 0.01347058, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.2373184, + "balance_loss_mlp": 1.01765084, + "epoch": 0.7517210281076206, + "flos": 10637575634520.0, + "grad_norm": 1.9708447143655985, + "language_loss": 0.70931351, + "learning_rate": 6.123994189288786e-07, + "loss": 0.73308885, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.12841797, + "step": 12503, + "time_per_iteration": 2.7260704040527344 + }, + { + "auxiliary_loss_clip": 0.01155367, + "auxiliary_loss_mlp": 0.01002489, + "balance_loss_clip": 1.11100304, + "balance_loss_mlp": 0.99969989, + "epoch": 0.7517811513602886, + "flos": 66067305539400.0, + "grad_norm": 1.8718842125316388, + "language_loss": 0.64015025, + "learning_rate": 6.121189676133903e-07, + "loss": 0.6617288, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.0279541, + "step": 12504, + "time_per_iteration": 3.2104222774505615 + }, + { + "auxiliary_loss_clip": 0.01338108, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.23300481, + "balance_loss_mlp": 1.0161438, + "epoch": 0.7518412746129566, + "flos": 37275349600800.0, + "grad_norm": 1.4085162293239295, + "language_loss": 0.68972713, + "learning_rate": 6.118385689264896e-07, + "loss": 0.71339142, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12176514, + "step": 12505, + "time_per_iteration": 2.9695546627044678 + }, + { + "auxiliary_loss_clip": 0.01156813, + "auxiliary_loss_mlp": 0.0100651, + "balance_loss_clip": 1.11274099, + "balance_loss_mlp": 1.00398302, + "epoch": 0.7519013978656245, + "flos": 60534520521840.0, + "grad_norm": 0.6910863195923512, + "language_loss": 0.55173653, + "learning_rate": 6.11558222878809e-07, + "loss": 0.5733698, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02526855, + "step": 12506, + "time_per_iteration": 3.3234267234802246 + }, + { + "auxiliary_loss_clip": 0.01348374, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.23885214, + "balance_loss_mlp": 1.0202992, + "epoch": 0.7519615211182925, + "flos": 18811523990880.0, + "grad_norm": 1.837555221614796, + "language_loss": 0.78590953, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80972487, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12866211, + "step": 12507, + "time_per_iteration": 2.843752861022949 + }, + { + "auxiliary_loss_clip": 0.01337866, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.23323226, + "balance_loss_mlp": 1.02188993, + "epoch": 0.7520216443709604, + "flos": 14579422546320.0, + "grad_norm": 1.6349949398139918, + "language_loss": 0.71275544, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73647666, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1237793, + "step": 12508, + "time_per_iteration": 2.715881586074829 + }, + { + "auxiliary_loss_clip": 0.01338025, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.23103416, + "balance_loss_mlp": 1.01676452, + "epoch": 0.7520817676236284, + "flos": 17061144171840.0, + "grad_norm": 1.592155915061036, + "language_loss": 0.72269183, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74636972, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12994385, + "step": 12509, + "time_per_iteration": 2.728193521499634 + }, + { + "auxiliary_loss_clip": 0.01353394, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.24102187, + "balance_loss_mlp": 1.02481484, + "epoch": 0.7521418908762965, + "flos": 25671956532480.0, + "grad_norm": 1.6570430312092457, + "language_loss": 0.62409234, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64801323, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13891602, + "step": 12510, + "time_per_iteration": 2.8935136795043945 + }, + { + "auxiliary_loss_clip": 0.01341729, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.2366097, + "balance_loss_mlp": 1.02442324, + "epoch": 0.7522020141289644, + "flos": 20891693904120.0, + "grad_norm": 1.5798401786653211, + "language_loss": 0.82213378, + "learning_rate": 6.10157282600722e-07, + "loss": 0.84592521, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.13000488, + "step": 12511, + "time_per_iteration": 2.7199270725250244 + }, + { + "auxiliary_loss_clip": 0.0135053, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.23955142, + "balance_loss_mlp": 1.02247453, + "epoch": 0.7522621373816324, + "flos": 12644075336760.0, + "grad_norm": 1.7424532632586496, + "language_loss": 0.76367509, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78754187, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13653564, + "step": 12512, + "time_per_iteration": 4.379276275634766 + }, + { + "auxiliary_loss_clip": 0.0133022, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.22743535, + "balance_loss_mlp": 1.01792526, + "epoch": 0.7523222606343003, + "flos": 25631202720240.0, + "grad_norm": 1.7501999223212752, + "language_loss": 0.8272208, + "learning_rate": 6.095972753359537e-07, + "loss": 0.85081851, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.11633301, + "step": 12513, + "time_per_iteration": 4.251084089279175 + }, + { + "auxiliary_loss_clip": 0.01348888, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.23864365, + "balance_loss_mlp": 1.01934123, + "epoch": 0.7523823838869683, + "flos": 20453774082480.0, + "grad_norm": 2.138028852982628, + "language_loss": 0.75576949, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77958667, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1348877, + "step": 12514, + "time_per_iteration": 4.236233234405518 + }, + { + "auxiliary_loss_clip": 0.01336184, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.23242104, + "balance_loss_mlp": 1.01460433, + "epoch": 0.7524425071396362, + "flos": 14724732550320.0, + "grad_norm": 1.842162970048643, + "language_loss": 0.69398689, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71761358, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.11883545, + "step": 12515, + "time_per_iteration": 2.775423526763916 + }, + { + "auxiliary_loss_clip": 0.01341688, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.23361707, + "balance_loss_mlp": 1.01865923, + "epoch": 0.7525026303923043, + "flos": 30598260115320.0, + "grad_norm": 1.6231825791199397, + "language_loss": 0.70235825, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72608387, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12219238, + "step": 12516, + "time_per_iteration": 2.8330488204956055 + }, + { + "auxiliary_loss_clip": 0.01335277, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.23080564, + "balance_loss_mlp": 1.01602948, + "epoch": 0.7525627536449722, + "flos": 24797335140000.0, + "grad_norm": 1.5290053644473143, + "language_loss": 0.89620817, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91984737, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.1262207, + "step": 12517, + "time_per_iteration": 2.859903573989868 + }, + { + "auxiliary_loss_clip": 0.01349471, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.23899245, + "balance_loss_mlp": 1.02200687, + "epoch": 0.7526228768976402, + "flos": 20785026077640.0, + "grad_norm": 1.5147119813645789, + "language_loss": 0.74289417, + "learning_rate": 6.081981800334437e-07, + "loss": 0.7667383, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12915039, + "step": 12518, + "time_per_iteration": 2.9058282375335693 + }, + { + "auxiliary_loss_clip": 0.0115528, + "auxiliary_loss_mlp": 0.01004413, + "balance_loss_clip": 1.11210966, + "balance_loss_mlp": 1.00218379, + "epoch": 0.7526830001503081, + "flos": 66573900743760.0, + "grad_norm": 0.7186088495637936, + "language_loss": 0.55760574, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57920277, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02233887, + "step": 12519, + "time_per_iteration": 3.293010950088501 + }, + { + "auxiliary_loss_clip": 0.01339969, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.23271215, + "balance_loss_mlp": 1.02367544, + "epoch": 0.7527431234029761, + "flos": 23483250808200.0, + "grad_norm": 1.447911744954423, + "language_loss": 0.77553988, + "learning_rate": 6.07638911279029e-07, + "loss": 0.7992928, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.11651611, + "step": 12520, + "time_per_iteration": 2.7766788005828857 + }, + { + "auxiliary_loss_clip": 0.01336158, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.22930074, + "balance_loss_mlp": 1.02288532, + "epoch": 0.752803246655644, + "flos": 22054011502680.0, + "grad_norm": 2.019322469767161, + "language_loss": 0.74318343, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76689488, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12109375, + "step": 12521, + "time_per_iteration": 2.9273123741149902 + }, + { + "auxiliary_loss_clip": 0.01351276, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.23856962, + "balance_loss_mlp": 1.01964462, + "epoch": 0.752863369908312, + "flos": 30159893601720.0, + "grad_norm": 1.8030088709007996, + "language_loss": 0.67600834, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69985402, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13659668, + "step": 12522, + "time_per_iteration": 4.310238599777222 + }, + { + "auxiliary_loss_clip": 0.01346877, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.23582315, + "balance_loss_mlp": 1.02769041, + "epoch": 0.7529234931609801, + "flos": 24572507321160.0, + "grad_norm": 1.8656975968389247, + "language_loss": 0.78556108, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80943871, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13171387, + "step": 12523, + "time_per_iteration": 2.7813780307769775 + }, + { + "auxiliary_loss_clip": 0.0133623, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.2305212, + "balance_loss_mlp": 1.01732922, + "epoch": 0.752983616413648, + "flos": 23117620513320.0, + "grad_norm": 2.1356861552440387, + "language_loss": 0.80748081, + "learning_rate": 6.065210074366571e-07, + "loss": 0.83113807, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.1217041, + "step": 12524, + "time_per_iteration": 2.763003349304199 + }, + { + "auxiliary_loss_clip": 0.01338381, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.2319417, + "balance_loss_mlp": 1.02151346, + "epoch": 0.753043739666316, + "flos": 24322397475240.0, + "grad_norm": 1.4944025341663745, + "language_loss": 0.74169874, + "learning_rate": 6.062416635517326e-07, + "loss": 0.76541871, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12091064, + "step": 12525, + "time_per_iteration": 2.8178353309631348 + }, + { + "auxiliary_loss_clip": 0.01334379, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.22871852, + "balance_loss_mlp": 1.01813304, + "epoch": 0.7531038629189839, + "flos": 24248646047520.0, + "grad_norm": 1.8749547584415025, + "language_loss": 0.72252274, + "learning_rate": 6.059623725182641e-07, + "loss": 0.746171, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12322998, + "step": 12526, + "time_per_iteration": 2.7486164569854736 + }, + { + "auxiliary_loss_clip": 0.01336039, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.22920895, + "balance_loss_mlp": 1.01968336, + "epoch": 0.7531639861716519, + "flos": 30194718593400.0, + "grad_norm": 1.8572891784076178, + "language_loss": 0.72398001, + "learning_rate": 6.056831343468414e-07, + "loss": 0.7476536, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.11651611, + "step": 12527, + "time_per_iteration": 2.7781765460968018 + }, + { + "auxiliary_loss_clip": 0.01337896, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.2316587, + "balance_loss_mlp": 1.01731789, + "epoch": 0.7532241094243198, + "flos": 18227766256560.0, + "grad_norm": 1.7722749255202566, + "language_loss": 0.81420785, + "learning_rate": 6.054039490480539e-07, + "loss": 0.8378793, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.11932373, + "step": 12528, + "time_per_iteration": 2.730236291885376 + }, + { + "auxiliary_loss_clip": 0.01344992, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.23617196, + "balance_loss_mlp": 1.02174318, + "epoch": 0.7532842326769879, + "flos": 20885237174880.0, + "grad_norm": 2.33002768965798, + "language_loss": 0.85135043, + "learning_rate": 6.051248166324892e-07, + "loss": 0.8751511, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13323975, + "step": 12529, + "time_per_iteration": 2.78728985786438 + }, + { + "auxiliary_loss_clip": 0.0135309, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.24047494, + "balance_loss_mlp": 1.01744103, + "epoch": 0.7533443559296558, + "flos": 18083430853200.0, + "grad_norm": 2.190964014961122, + "language_loss": 0.7405051, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76434553, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13513184, + "step": 12530, + "time_per_iteration": 2.7661526203155518 + }, + { + "auxiliary_loss_clip": 0.01152616, + "auxiliary_loss_mlp": 0.01000111, + "balance_loss_clip": 1.10976577, + "balance_loss_mlp": 0.99746478, + "epoch": 0.7534044791823238, + "flos": 50267976094080.0, + "grad_norm": 0.8300583405629662, + "language_loss": 0.63626343, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65779072, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02648926, + "step": 12531, + "time_per_iteration": 3.117335557937622 + }, + { + "auxiliary_loss_clip": 0.01345427, + "auxiliary_loss_mlp": 0.01024144, + "balance_loss_clip": 1.23550653, + "balance_loss_mlp": 1.01124001, + "epoch": 0.7534646024349917, + "flos": 20855203969680.0, + "grad_norm": 1.8829782522010097, + "language_loss": 0.69869733, + "learning_rate": 6.042877367909633e-07, + "loss": 0.7223931, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12902832, + "step": 12532, + "time_per_iteration": 2.7897696495056152 + }, + { + "auxiliary_loss_clip": 0.01329276, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.22584021, + "balance_loss_mlp": 1.02088416, + "epoch": 0.7535247256876597, + "flos": 23076460617480.0, + "grad_norm": 1.558575048747154, + "language_loss": 0.7758975, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79951698, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.11791992, + "step": 12533, + "time_per_iteration": 2.9011154174804688 + }, + { + "auxiliary_loss_clip": 0.01152574, + "auxiliary_loss_mlp": 0.01007149, + "balance_loss_clip": 1.10945415, + "balance_loss_mlp": 1.00452673, + "epoch": 0.7535848489403276, + "flos": 58639399215840.0, + "grad_norm": 0.7847447216983509, + "language_loss": 0.57402647, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59562373, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02624512, + "step": 12534, + "time_per_iteration": 3.292818069458008 + }, + { + "auxiliary_loss_clip": 0.0133752, + "auxiliary_loss_mlp": 0.01024761, + "balance_loss_clip": 1.23037434, + "balance_loss_mlp": 1.01161861, + "epoch": 0.7536449721929956, + "flos": 26583473943000.0, + "grad_norm": 1.5361057027062224, + "language_loss": 0.71373695, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73735976, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13134766, + "step": 12535, + "time_per_iteration": 2.8355202674865723 + }, + { + "auxiliary_loss_clip": 0.01345218, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.23448396, + "balance_loss_mlp": 1.01609993, + "epoch": 0.7537050954456637, + "flos": 25741119215520.0, + "grad_norm": 1.6479538819371642, + "language_loss": 0.8113054, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83506209, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14349365, + "step": 12536, + "time_per_iteration": 2.8765485286712646 + }, + { + "auxiliary_loss_clip": 0.0133914, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.23337495, + "balance_loss_mlp": 1.01700878, + "epoch": 0.7537652186983316, + "flos": 30230233927200.0, + "grad_norm": 1.8387654301345229, + "language_loss": 0.74503207, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76871586, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12237549, + "step": 12537, + "time_per_iteration": 2.8088643550872803 + }, + { + "auxiliary_loss_clip": 0.01341398, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.23222566, + "balance_loss_mlp": 1.01809204, + "epoch": 0.7538253419509996, + "flos": 12645131154120.0, + "grad_norm": 1.5765163206749937, + "language_loss": 0.74491334, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76863503, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.12701416, + "step": 12538, + "time_per_iteration": 2.7800135612487793 + }, + { + "auxiliary_loss_clip": 0.01344476, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.23464561, + "balance_loss_mlp": 1.01752067, + "epoch": 0.7538854652036675, + "flos": 23191290724320.0, + "grad_norm": 1.4019347157677862, + "language_loss": 0.67630315, + "learning_rate": 6.023364033816956e-07, + "loss": 0.70005119, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12811279, + "step": 12539, + "time_per_iteration": 2.7969253063201904 + }, + { + "auxiliary_loss_clip": 0.01331599, + "auxiliary_loss_mlp": 0.01027929, + "balance_loss_clip": 1.22624707, + "balance_loss_mlp": 1.01485229, + "epoch": 0.7539455884563355, + "flos": 23191574982840.0, + "grad_norm": 1.6824961406821834, + "language_loss": 0.74860907, + "learning_rate": 6.020578533797229e-07, + "loss": 0.77220434, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.13085938, + "step": 12540, + "time_per_iteration": 2.776029348373413 + }, + { + "auxiliary_loss_clip": 0.01344612, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.23427916, + "balance_loss_mlp": 1.01473117, + "epoch": 0.7540057117090034, + "flos": 13184155456920.0, + "grad_norm": 2.5536315513401155, + "language_loss": 0.73333824, + "learning_rate": 6.017793563878566e-07, + "loss": 0.75706255, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13085938, + "step": 12541, + "time_per_iteration": 2.8290653228759766 + }, + { + "auxiliary_loss_clip": 0.01340314, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.23182464, + "balance_loss_mlp": 1.01850498, + "epoch": 0.7540658349616715, + "flos": 45486275191920.0, + "grad_norm": 1.6237780272927804, + "language_loss": 0.72470558, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74842697, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13323975, + "step": 12542, + "time_per_iteration": 2.980957508087158 + }, + { + "auxiliary_loss_clip": 0.01335082, + "auxiliary_loss_mlp": 0.0102673, + "balance_loss_clip": 1.22890472, + "balance_loss_mlp": 1.01351523, + "epoch": 0.7541259582143394, + "flos": 19934671503240.0, + "grad_norm": 1.7257143367105858, + "language_loss": 0.84485281, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86847097, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13220215, + "step": 12543, + "time_per_iteration": 2.8051297664642334 + }, + { + "auxiliary_loss_clip": 0.0134097, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.2336663, + "balance_loss_mlp": 1.01708579, + "epoch": 0.7541860814670074, + "flos": 27203924653560.0, + "grad_norm": 2.012323376199903, + "language_loss": 0.74033958, + "learning_rate": 6.009441835784927e-07, + "loss": 0.76406205, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.14178467, + "step": 12544, + "time_per_iteration": 2.759269952774048 + }, + { + "auxiliary_loss_clip": 0.01336575, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.22930312, + "balance_loss_mlp": 1.02011573, + "epoch": 0.7542462047196753, + "flos": 21329085817080.0, + "grad_norm": 1.7321529434257368, + "language_loss": 0.68808699, + "learning_rate": 6.006658987326383e-07, + "loss": 0.71177948, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12548828, + "step": 12545, + "time_per_iteration": 2.900244951248169 + }, + { + "auxiliary_loss_clip": 0.01341565, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.23264718, + "balance_loss_mlp": 1.01478696, + "epoch": 0.7543063279723433, + "flos": 11943822552840.0, + "grad_norm": 1.7935911301783807, + "language_loss": 0.68766677, + "learning_rate": 6.003876669496728e-07, + "loss": 0.71135902, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12860107, + "step": 12546, + "time_per_iteration": 2.6802191734313965 + }, + { + "auxiliary_loss_clip": 0.01345134, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.2364583, + "balance_loss_mlp": 1.01729488, + "epoch": 0.7543664512250112, + "flos": 22825091912400.0, + "grad_norm": 2.4571659158081345, + "language_loss": 0.73658746, + "learning_rate": 6.00109488240147e-07, + "loss": 0.76035303, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.14123535, + "step": 12547, + "time_per_iteration": 2.8254892826080322 + }, + { + "auxiliary_loss_clip": 0.01341195, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.23264313, + "balance_loss_mlp": 1.01676762, + "epoch": 0.7544265744776792, + "flos": 20928874180680.0, + "grad_norm": 1.730429176488198, + "language_loss": 0.68097234, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70468557, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13366699, + "step": 12548, + "time_per_iteration": 2.7173867225646973 + }, + { + "auxiliary_loss_clip": 0.0134581, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.23518682, + "balance_loss_mlp": 1.01919293, + "epoch": 0.7544866977303473, + "flos": 15199751431800.0, + "grad_norm": 1.855601508357324, + "language_loss": 0.87181687, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89559853, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13153076, + "step": 12549, + "time_per_iteration": 2.746781349182129 + }, + { + "auxiliary_loss_clip": 0.01325459, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.22204065, + "balance_loss_mlp": 1.0183953, + "epoch": 0.7545468209830152, + "flos": 27088688463120.0, + "grad_norm": 2.145172992308191, + "language_loss": 0.77814341, + "learning_rate": 5.992752706576865e-07, + "loss": 0.80170625, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12445068, + "step": 12550, + "time_per_iteration": 2.8070011138916016 + }, + { + "auxiliary_loss_clip": 0.01342805, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.23364401, + "balance_loss_mlp": 1.01328206, + "epoch": 0.7546069442356832, + "flos": 26877586269960.0, + "grad_norm": 1.426169394557089, + "language_loss": 0.69625187, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71993935, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12670898, + "step": 12551, + "time_per_iteration": 4.21239161491394 + }, + { + "auxiliary_loss_clip": 0.01344287, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.23786628, + "balance_loss_mlp": 1.0180006, + "epoch": 0.7546670674883511, + "flos": 15747506532000.0, + "grad_norm": 1.9880097218973016, + "language_loss": 0.86303329, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88679087, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13464355, + "step": 12552, + "time_per_iteration": 4.164396286010742 + }, + { + "auxiliary_loss_clip": 0.01344517, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.23513103, + "balance_loss_mlp": 1.02179122, + "epoch": 0.7547271907410191, + "flos": 23483047766400.0, + "grad_norm": 1.7760529888202758, + "language_loss": 0.78371739, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80751169, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13122559, + "step": 12553, + "time_per_iteration": 2.8219242095947266 + }, + { + "auxiliary_loss_clip": 0.01345074, + "auxiliary_loss_mlp": 0.01040153, + "balance_loss_clip": 1.23668504, + "balance_loss_mlp": 1.0267539, + "epoch": 0.754787313993687, + "flos": 31729123216080.0, + "grad_norm": 2.0555877634772197, + "language_loss": 0.63203275, + "learning_rate": 5.981637242156135e-07, + "loss": 0.65588498, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1340332, + "step": 12554, + "time_per_iteration": 2.8561530113220215 + }, + { + "auxiliary_loss_clip": 0.01338805, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.23049641, + "balance_loss_mlp": 1.01751077, + "epoch": 0.7548474372463551, + "flos": 27568783389600.0, + "grad_norm": 1.6578212061027444, + "language_loss": 0.7369048, + "learning_rate": 5.978859704731864e-07, + "loss": 0.76059651, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12854004, + "step": 12555, + "time_per_iteration": 2.889979600906372 + }, + { + "auxiliary_loss_clip": 0.01351666, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.24173689, + "balance_loss_mlp": 1.01496255, + "epoch": 0.754907560499023, + "flos": 19323398282040.0, + "grad_norm": 1.8018502977809752, + "language_loss": 0.79117513, + "learning_rate": 5.976082698990645e-07, + "loss": 0.81497538, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13415527, + "step": 12556, + "time_per_iteration": 2.9703686237335205 + }, + { + "auxiliary_loss_clip": 0.01153981, + "auxiliary_loss_mlp": 0.01010783, + "balance_loss_clip": 1.11181211, + "balance_loss_mlp": 1.00746942, + "epoch": 0.754967683751691, + "flos": 69761154240000.0, + "grad_norm": 0.8739326622473306, + "language_loss": 0.50507444, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52672213, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.03320312, + "step": 12557, + "time_per_iteration": 3.275033473968506 + }, + { + "auxiliary_loss_clip": 0.01350862, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.24040604, + "balance_loss_mlp": 1.01639986, + "epoch": 0.7550278070043589, + "flos": 24426791233560.0, + "grad_norm": 1.5891663294863014, + "language_loss": 0.72027051, + "learning_rate": 5.970530282978525e-07, + "loss": 0.74408406, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14074707, + "step": 12558, + "time_per_iteration": 2.874448776245117 + }, + { + "auxiliary_loss_clip": 0.01342791, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.23351753, + "balance_loss_mlp": 1.0198164, + "epoch": 0.7550879302570269, + "flos": 32641006101840.0, + "grad_norm": 1.703457600081404, + "language_loss": 0.80335873, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82711726, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13238525, + "step": 12559, + "time_per_iteration": 2.8188745975494385 + }, + { + "auxiliary_loss_clip": 0.01350663, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.23936939, + "balance_loss_mlp": 1.01843369, + "epoch": 0.7551480535096948, + "flos": 21800165687640.0, + "grad_norm": 1.6617889754864192, + "language_loss": 0.79040658, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81423724, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13952637, + "step": 12560, + "time_per_iteration": 2.859640598297119 + }, + { + "auxiliary_loss_clip": 0.01343627, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.23694539, + "balance_loss_mlp": 1.01788664, + "epoch": 0.7552081767623628, + "flos": 18519726340440.0, + "grad_norm": 1.6351006207520984, + "language_loss": 0.70757306, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73131704, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12878418, + "step": 12561, + "time_per_iteration": 4.2890801429748535 + }, + { + "auxiliary_loss_clip": 0.0134306, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.23410249, + "balance_loss_mlp": 1.01803195, + "epoch": 0.7552683000150308, + "flos": 27640382574240.0, + "grad_norm": 2.380968806958428, + "language_loss": 0.75778967, + "learning_rate": 5.959431835782889e-07, + "loss": 0.78152812, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12744141, + "step": 12562, + "time_per_iteration": 2.7795517444610596 + }, + { + "auxiliary_loss_clip": 0.01342591, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.23432708, + "balance_loss_mlp": 1.02045393, + "epoch": 0.7553284232676988, + "flos": 20307773736360.0, + "grad_norm": 1.9312420839772002, + "language_loss": 0.76401854, + "learning_rate": 5.956658554770371e-07, + "loss": 0.78778946, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.14038086, + "step": 12563, + "time_per_iteration": 2.6809797286987305 + }, + { + "auxiliary_loss_clip": 0.01361644, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.24607611, + "balance_loss_mlp": 1.01762581, + "epoch": 0.7553885465203668, + "flos": 33262025329440.0, + "grad_norm": 2.605717931553126, + "language_loss": 0.67824745, + "learning_rate": 5.953885806282768e-07, + "loss": 0.70219553, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.15539551, + "step": 12564, + "time_per_iteration": 2.8404948711395264 + }, + { + "auxiliary_loss_clip": 0.0134696, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.23591173, + "balance_loss_mlp": 1.02068734, + "epoch": 0.7554486697730347, + "flos": 21621248942760.0, + "grad_norm": 2.0081793608462073, + "language_loss": 0.6861282, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70993638, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13165283, + "step": 12565, + "time_per_iteration": 2.7841055393218994 + }, + { + "auxiliary_loss_clip": 0.0135767, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.24288476, + "balance_loss_mlp": 1.01596856, + "epoch": 0.7555087930257027, + "flos": 27638311547880.0, + "grad_norm": 1.675872017110553, + "language_loss": 0.7540549, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77792919, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.13806152, + "step": 12566, + "time_per_iteration": 2.8136143684387207 + }, + { + "auxiliary_loss_clip": 0.01358204, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.24533331, + "balance_loss_mlp": 1.02198672, + "epoch": 0.7555689162783706, + "flos": 23626733436000.0, + "grad_norm": 1.8484497476289443, + "language_loss": 0.74144894, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76539516, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 1.12744141, + "router_z_loss_mlp": 0.14422607, + "step": 12567, + "time_per_iteration": 2.8261783123016357 + }, + { + "auxiliary_loss_clip": 0.01345106, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.23555613, + "balance_loss_mlp": 1.01575994, + "epoch": 0.7556290395310387, + "flos": 24868447024320.0, + "grad_norm": 1.6603451430629683, + "language_loss": 0.62774539, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65148395, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12994385, + "step": 12568, + "time_per_iteration": 2.8562891483306885 + }, + { + "auxiliary_loss_clip": 0.01344718, + "auxiliary_loss_mlp": 0.01029508, + "balance_loss_clip": 1.237764, + "balance_loss_mlp": 1.01650238, + "epoch": 0.7556891627837066, + "flos": 43551577716120.0, + "grad_norm": 1.9778003016497556, + "language_loss": 0.66348982, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68723208, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12994385, + "step": 12569, + "time_per_iteration": 2.8850669860839844 + }, + { + "auxiliary_loss_clip": 0.0134942, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.23620629, + "balance_loss_mlp": 1.0233345, + "epoch": 0.7557492860363746, + "flos": 26656535028600.0, + "grad_norm": 1.6061334359675579, + "language_loss": 0.67590255, + "learning_rate": 5.93726050426697e-07, + "loss": 0.6997757, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14575195, + "step": 12570, + "time_per_iteration": 2.7791595458984375 + }, + { + "auxiliary_loss_clip": 0.01344557, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.23399258, + "balance_loss_mlp": 1.02091408, + "epoch": 0.7558094092890425, + "flos": 55192232277720.0, + "grad_norm": 1.9448796434141453, + "language_loss": 0.72128522, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74508035, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14031982, + "step": 12571, + "time_per_iteration": 3.0023484230041504 + }, + { + "auxiliary_loss_clip": 0.01353229, + "auxiliary_loss_mlp": 0.01031973, + "balance_loss_clip": 1.24038374, + "balance_loss_mlp": 1.01819253, + "epoch": 0.7558695325417105, + "flos": 23993663198400.0, + "grad_norm": 2.309807532917283, + "language_loss": 0.73838359, + "learning_rate": 5.931723001891811e-07, + "loss": 0.76223564, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13775635, + "step": 12572, + "time_per_iteration": 2.8227293491363525 + }, + { + "auxiliary_loss_clip": 0.0134879, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.23733473, + "balance_loss_mlp": 1.01992643, + "epoch": 0.7559296557943784, + "flos": 14615384572080.0, + "grad_norm": 2.0091241691984627, + "language_loss": 0.76494968, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78876698, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13012695, + "step": 12573, + "time_per_iteration": 2.7164463996887207 + }, + { + "auxiliary_loss_clip": 0.01347073, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.23505735, + "balance_loss_mlp": 1.02408588, + "epoch": 0.7559897790470465, + "flos": 18554957415720.0, + "grad_norm": 1.7408001768892947, + "language_loss": 0.69115371, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71500134, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13604736, + "step": 12574, + "time_per_iteration": 2.746626615524292 + }, + { + "auxiliary_loss_clip": 0.01338736, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.23089337, + "balance_loss_mlp": 1.02292645, + "epoch": 0.7560499022997144, + "flos": 17972417932200.0, + "grad_norm": 2.076473042975508, + "language_loss": 0.72122633, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74497819, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1350708, + "step": 12575, + "time_per_iteration": 2.830278158187866 + }, + { + "auxiliary_loss_clip": 0.01341241, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.23348558, + "balance_loss_mlp": 1.02417994, + "epoch": 0.7561100255523824, + "flos": 15741496494720.0, + "grad_norm": 3.035849975278058, + "language_loss": 0.72354829, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74732459, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12213135, + "step": 12576, + "time_per_iteration": 2.7757015228271484 + }, + { + "auxiliary_loss_clip": 0.01340321, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.23326945, + "balance_loss_mlp": 1.02067745, + "epoch": 0.7561701488050504, + "flos": 15892369844040.0, + "grad_norm": 1.8649359950460884, + "language_loss": 0.67541075, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69915569, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1348877, + "step": 12577, + "time_per_iteration": 2.8358733654022217 + }, + { + "auxiliary_loss_clip": 0.01334084, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.2269839, + "balance_loss_mlp": 1.02167487, + "epoch": 0.7562302720577183, + "flos": 20343979412280.0, + "grad_norm": 1.6403392124793836, + "language_loss": 0.78220606, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80589163, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12805176, + "step": 12578, + "time_per_iteration": 2.8121564388275146 + }, + { + "auxiliary_loss_clip": 0.01346479, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.23499334, + "balance_loss_mlp": 1.01936626, + "epoch": 0.7562903953103863, + "flos": 20816927267400.0, + "grad_norm": 1.5102694834764214, + "language_loss": 0.75755358, + "learning_rate": 5.912358553407641e-07, + "loss": 0.78134608, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13415527, + "step": 12579, + "time_per_iteration": 2.8434391021728516 + }, + { + "auxiliary_loss_clip": 0.01353678, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.2387681, + "balance_loss_mlp": 1.01631713, + "epoch": 0.7563505185630542, + "flos": 37604083877640.0, + "grad_norm": 1.789612847631569, + "language_loss": 0.6317662, + "learning_rate": 5.90959433960437e-07, + "loss": 0.65561306, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.14691162, + "step": 12580, + "time_per_iteration": 2.9559326171875 + }, + { + "auxiliary_loss_clip": 0.01342688, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.23431253, + "balance_loss_mlp": 1.02111912, + "epoch": 0.7564106418157223, + "flos": 20235971509920.0, + "grad_norm": 1.6298140121207705, + "language_loss": 0.7536782, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77744573, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12945557, + "step": 12581, + "time_per_iteration": 2.7989065647125244 + }, + { + "auxiliary_loss_clip": 0.01352127, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.23992801, + "balance_loss_mlp": 1.02405381, + "epoch": 0.7564707650683902, + "flos": 24760398513600.0, + "grad_norm": 1.5548744878656167, + "language_loss": 0.62730241, + "learning_rate": 5.904067515031412e-07, + "loss": 0.65120327, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13903809, + "step": 12582, + "time_per_iteration": 2.8284130096435547 + }, + { + "auxiliary_loss_clip": 0.01157451, + "auxiliary_loss_mlp": 0.010076, + "balance_loss_clip": 1.11421394, + "balance_loss_mlp": 1.00510836, + "epoch": 0.7565308883210582, + "flos": 48541700659680.0, + "grad_norm": 1.0569996916913798, + "language_loss": 0.60708207, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62873256, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02490234, + "step": 12583, + "time_per_iteration": 3.0044164657592773 + }, + { + "auxiliary_loss_clip": 0.01347068, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.23783934, + "balance_loss_mlp": 1.02434778, + "epoch": 0.7565910115737261, + "flos": 12498927766200.0, + "grad_norm": 1.8948443097338785, + "language_loss": 0.79572201, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81957459, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.1383667, + "step": 12584, + "time_per_iteration": 2.7309460639953613 + }, + { + "auxiliary_loss_clip": 0.01342745, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.23731673, + "balance_loss_mlp": 1.01970577, + "epoch": 0.7566511348263941, + "flos": 21177237867120.0, + "grad_norm": 1.823476052026917, + "language_loss": 0.77755845, + "learning_rate": 5.895781287327612e-07, + "loss": 0.80131245, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12957764, + "step": 12585, + "time_per_iteration": 2.7644577026367188 + }, + { + "auxiliary_loss_clip": 0.0135327, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.24169278, + "balance_loss_mlp": 1.02107787, + "epoch": 0.756711258079062, + "flos": 21758883966720.0, + "grad_norm": 1.7023628771509023, + "language_loss": 0.83132237, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85520935, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14355469, + "step": 12586, + "time_per_iteration": 2.7630598545074463 + }, + { + "auxiliary_loss_clip": 0.01347333, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.23551011, + "balance_loss_mlp": 1.01779175, + "epoch": 0.75677138133173, + "flos": 22388065474680.0, + "grad_norm": 2.120018629935825, + "language_loss": 0.83629447, + "learning_rate": 5.890259809517459e-07, + "loss": 0.86007392, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 1.11767578, + "router_z_loss_mlp": 0.12817383, + "step": 12587, + "time_per_iteration": 2.7637434005737305 + }, + { + "auxiliary_loss_clip": 0.01341451, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.23223233, + "balance_loss_mlp": 1.01488137, + "epoch": 0.756831504584398, + "flos": 22713875949600.0, + "grad_norm": 1.8081581671500986, + "language_loss": 0.70927668, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73297346, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13330078, + "step": 12588, + "time_per_iteration": 2.79299259185791 + }, + { + "auxiliary_loss_clip": 0.01350536, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.23810291, + "balance_loss_mlp": 1.02044082, + "epoch": 0.756891627837066, + "flos": 24103823343840.0, + "grad_norm": 1.770860978132, + "language_loss": 0.69362605, + "learning_rate": 5.884740471878327e-07, + "loss": 0.71747839, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14257812, + "step": 12589, + "time_per_iteration": 2.791330099105835 + }, + { + "auxiliary_loss_clip": 0.01341744, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.23264778, + "balance_loss_mlp": 1.02089155, + "epoch": 0.756951751089734, + "flos": 19752912173160.0, + "grad_norm": 1.5902507360941733, + "language_loss": 0.92459798, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94836009, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.13586426, + "step": 12590, + "time_per_iteration": 4.312599182128906 + }, + { + "auxiliary_loss_clip": 0.01334797, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.22891033, + "balance_loss_mlp": 1.01926792, + "epoch": 0.7570118743424019, + "flos": 35085994142760.0, + "grad_norm": 2.313936426637115, + "language_loss": 0.65410137, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67777479, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13269043, + "step": 12591, + "time_per_iteration": 2.8982014656066895 + }, + { + "auxiliary_loss_clip": 0.01345418, + "auxiliary_loss_mlp": 0.01027239, + "balance_loss_clip": 1.23836243, + "balance_loss_mlp": 1.01488888, + "epoch": 0.7570719975950699, + "flos": 25600235522760.0, + "grad_norm": 1.4850391733524526, + "language_loss": 0.73858148, + "learning_rate": 5.876465480071528e-07, + "loss": 0.76230806, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12353516, + "step": 12592, + "time_per_iteration": 2.901019811630249 + }, + { + "auxiliary_loss_clip": 0.01349934, + "auxiliary_loss_mlp": 0.01032541, + "balance_loss_clip": 1.23883903, + "balance_loss_mlp": 1.01946354, + "epoch": 0.7571321208477378, + "flos": 10819781656560.0, + "grad_norm": 2.2925737489871882, + "language_loss": 0.72275066, + "learning_rate": 5.873708220461522e-07, + "loss": 0.74657542, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13085938, + "step": 12593, + "time_per_iteration": 2.736401319503784 + }, + { + "auxiliary_loss_clip": 0.01347158, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.23629904, + "balance_loss_mlp": 1.02059484, + "epoch": 0.7571922441004059, + "flos": 18264743491320.0, + "grad_norm": 2.425379730584866, + "language_loss": 0.66383171, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68764436, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13513184, + "step": 12594, + "time_per_iteration": 2.769394636154175 + }, + { + "auxiliary_loss_clip": 0.01353181, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.24078989, + "balance_loss_mlp": 1.01987815, + "epoch": 0.7572523673530738, + "flos": 22895310412800.0, + "grad_norm": 1.5476112200358416, + "language_loss": 0.80869502, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83256125, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13586426, + "step": 12595, + "time_per_iteration": 2.7278292179107666 + }, + { + "auxiliary_loss_clip": 0.01343602, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.23544717, + "balance_loss_mlp": 1.02256441, + "epoch": 0.7573124906057418, + "flos": 21001285532520.0, + "grad_norm": 2.1097791600894094, + "language_loss": 0.72087789, + "learning_rate": 5.865439656071993e-07, + "loss": 0.74467397, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.13446045, + "step": 12596, + "time_per_iteration": 2.7957351207733154 + }, + { + "auxiliary_loss_clip": 0.01338236, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.2320447, + "balance_loss_mlp": 1.02296972, + "epoch": 0.7573726138584097, + "flos": 20891165995440.0, + "grad_norm": 1.5780188955188432, + "language_loss": 0.80383486, + "learning_rate": 5.862684539770706e-07, + "loss": 0.8275733, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12646484, + "step": 12597, + "time_per_iteration": 2.7366065979003906 + }, + { + "auxiliary_loss_clip": 0.01353356, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.24086666, + "balance_loss_mlp": 1.01854503, + "epoch": 0.7574327371110777, + "flos": 24535286436240.0, + "grad_norm": 1.591862611931134, + "language_loss": 0.83356237, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85741842, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 1.12451172, + "router_z_loss_mlp": 0.13690186, + "step": 12598, + "time_per_iteration": 2.7931747436523438 + }, + { + "auxiliary_loss_clip": 0.01342488, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.23539639, + "balance_loss_mlp": 1.0196532, + "epoch": 0.7574928603637456, + "flos": 23369354693640.0, + "grad_norm": 1.5493928931794088, + "language_loss": 0.6260519, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64980006, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12677002, + "step": 12599, + "time_per_iteration": 4.37720513343811 + }, + { + "auxiliary_loss_clip": 0.01352658, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.23914671, + "balance_loss_mlp": 1.02160549, + "epoch": 0.7575529836164137, + "flos": 13520199238560.0, + "grad_norm": 2.521013041138698, + "language_loss": 0.63395667, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65784699, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14770508, + "step": 12600, + "time_per_iteration": 2.76521635055542 + }, + { + "auxiliary_loss_clip": 0.01333886, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.22773504, + "balance_loss_mlp": 1.01554275, + "epoch": 0.7576131068690816, + "flos": 19651401608400.0, + "grad_norm": 1.6339254585476048, + "language_loss": 0.66308433, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68670774, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12927246, + "step": 12601, + "time_per_iteration": 3.0239310264587402 + }, + { + "auxiliary_loss_clip": 0.01339773, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.23293781, + "balance_loss_mlp": 1.01718688, + "epoch": 0.7576732301217496, + "flos": 20053156362480.0, + "grad_norm": 1.4918029863988835, + "language_loss": 0.68060839, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70430475, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12677002, + "step": 12602, + "time_per_iteration": 2.768915891647339 + }, + { + "auxiliary_loss_clip": 0.01343478, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.23458648, + "balance_loss_mlp": 1.02353776, + "epoch": 0.7577333533744176, + "flos": 15381267111720.0, + "grad_norm": 2.202684790209899, + "language_loss": 0.66990793, + "learning_rate": 5.846165103474967e-07, + "loss": 0.6937148, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13665771, + "step": 12603, + "time_per_iteration": 2.863158702850342 + }, + { + "auxiliary_loss_clip": 0.01335328, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.22951007, + "balance_loss_mlp": 1.02156115, + "epoch": 0.7577934766270855, + "flos": 17899397454960.0, + "grad_norm": 2.1953404678744093, + "language_loss": 0.62165904, + "learning_rate": 5.843413741985439e-07, + "loss": 0.64534664, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11871338, + "step": 12604, + "time_per_iteration": 2.7598586082458496 + }, + { + "auxiliary_loss_clip": 0.01340242, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.23279262, + "balance_loss_mlp": 1.02191246, + "epoch": 0.7578535998797535, + "flos": 21618446965920.0, + "grad_norm": 2.2792820162764706, + "language_loss": 0.80136204, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82511991, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.13623047, + "step": 12605, + "time_per_iteration": 2.745621681213379 + }, + { + "auxiliary_loss_clip": 0.01349194, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.23770797, + "balance_loss_mlp": 1.01762068, + "epoch": 0.7579137231324214, + "flos": 18482952147480.0, + "grad_norm": 2.926345373751668, + "language_loss": 0.80228972, + "learning_rate": 5.837912629568198e-07, + "loss": 0.82609618, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.1383667, + "step": 12606, + "time_per_iteration": 2.7303526401519775 + }, + { + "auxiliary_loss_clip": 0.01329393, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.22553682, + "balance_loss_mlp": 1.01636696, + "epoch": 0.7579738463850895, + "flos": 23259844281960.0, + "grad_norm": 1.3319373415845284, + "language_loss": 0.73144305, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75501502, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.11444092, + "step": 12607, + "time_per_iteration": 2.7444145679473877 + }, + { + "auxiliary_loss_clip": 0.01353558, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.24026203, + "balance_loss_mlp": 1.01897717, + "epoch": 0.7580339696377574, + "flos": 14031017712360.0, + "grad_norm": 2.0972970523136985, + "language_loss": 0.75021231, + "learning_rate": 5.83241366526202e-07, + "loss": 0.77408135, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14373779, + "step": 12608, + "time_per_iteration": 2.751647710800171 + }, + { + "auxiliary_loss_clip": 0.01336093, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.22821379, + "balance_loss_mlp": 1.01925504, + "epoch": 0.7580940928904254, + "flos": 25088280014880.0, + "grad_norm": 1.564697859118916, + "language_loss": 0.71656436, + "learning_rate": 5.829664988911245e-07, + "loss": 0.74025309, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13525391, + "step": 12609, + "time_per_iteration": 2.809995651245117 + }, + { + "auxiliary_loss_clip": 0.0133959, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.23028851, + "balance_loss_mlp": 1.01737666, + "epoch": 0.7581542161430933, + "flos": 23841003081240.0, + "grad_norm": 1.5756796034151503, + "language_loss": 0.81458223, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83829802, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.14624023, + "step": 12610, + "time_per_iteration": 2.745180606842041 + }, + { + "auxiliary_loss_clip": 0.01348673, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.23720694, + "balance_loss_mlp": 1.02055264, + "epoch": 0.7582143393957613, + "flos": 22242146345280.0, + "grad_norm": 1.6042074936759763, + "language_loss": 0.70546615, + "learning_rate": 5.824169248335488e-07, + "loss": 0.7292937, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13531494, + "step": 12611, + "time_per_iteration": 2.7622103691101074 + }, + { + "auxiliary_loss_clip": 0.01343429, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.2345103, + "balance_loss_mlp": 1.01764035, + "epoch": 0.7582744626484292, + "flos": 21111445677960.0, + "grad_norm": 1.5087629550643002, + "language_loss": 0.711586, + "learning_rate": 5.821422184318893e-07, + "loss": 0.7353363, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13946533, + "step": 12612, + "time_per_iteration": 2.7707293033599854 + }, + { + "auxiliary_loss_clip": 0.01350396, + "auxiliary_loss_mlp": 0.0104044, + "balance_loss_clip": 1.23776031, + "balance_loss_mlp": 1.02663541, + "epoch": 0.7583345859010973, + "flos": 24609728206080.0, + "grad_norm": 1.3447670624254844, + "language_loss": 0.60249984, + "learning_rate": 5.818675657955397e-07, + "loss": 0.62640822, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13787842, + "step": 12613, + "time_per_iteration": 2.891815423965454 + }, + { + "auxiliary_loss_clip": 0.01344795, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.2358036, + "balance_loss_mlp": 1.02665484, + "epoch": 0.7583947091537652, + "flos": 33553335679560.0, + "grad_norm": 1.5109813035423725, + "language_loss": 0.605474, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62931556, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12701416, + "step": 12614, + "time_per_iteration": 2.835862398147583 + }, + { + "auxiliary_loss_clip": 0.01352714, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.24073005, + "balance_loss_mlp": 1.01831901, + "epoch": 0.7584548324064332, + "flos": 20125892581200.0, + "grad_norm": 2.008831077068236, + "language_loss": 0.73300934, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75685531, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13580322, + "step": 12615, + "time_per_iteration": 2.9133472442626953 + }, + { + "auxiliary_loss_clip": 0.01154486, + "auxiliary_loss_mlp": 0.01012446, + "balance_loss_clip": 1.11247802, + "balance_loss_mlp": 1.00987148, + "epoch": 0.7585149556591012, + "flos": 70419353744160.0, + "grad_norm": 0.8399026676871237, + "language_loss": 0.67775857, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69942784, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.02575684, + "step": 12616, + "time_per_iteration": 3.270591974258423 + }, + { + "auxiliary_loss_clip": 0.01350706, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.23939085, + "balance_loss_mlp": 1.01753795, + "epoch": 0.7585750789117691, + "flos": 16148124252000.0, + "grad_norm": 1.9168763009391732, + "language_loss": 0.85052347, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87434375, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13775635, + "step": 12617, + "time_per_iteration": 2.7471508979797363 + }, + { + "auxiliary_loss_clip": 0.01343816, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.2330476, + "balance_loss_mlp": 1.02046871, + "epoch": 0.7586352021644371, + "flos": 17497602092520.0, + "grad_norm": 3.282419902353128, + "language_loss": 0.75844258, + "learning_rate": 5.804951094578757e-07, + "loss": 0.78221112, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12573242, + "step": 12618, + "time_per_iteration": 2.6939687728881836 + }, + { + "auxiliary_loss_clip": 0.013518, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.23768294, + "balance_loss_mlp": 1.02229822, + "epoch": 0.758695325417105, + "flos": 17279718303240.0, + "grad_norm": 2.2074477830983965, + "language_loss": 0.77287942, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79675788, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13739014, + "step": 12619, + "time_per_iteration": 2.744880437850952 + }, + { + "auxiliary_loss_clip": 0.01336237, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.22866344, + "balance_loss_mlp": 1.01807439, + "epoch": 0.7587554486697731, + "flos": 29501491055760.0, + "grad_norm": 1.625598638100646, + "language_loss": 0.82581818, + "learning_rate": 5.79946503644337e-07, + "loss": 0.8494904, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12908936, + "step": 12620, + "time_per_iteration": 2.9384050369262695 + }, + { + "auxiliary_loss_clip": 0.01351749, + "auxiliary_loss_mlp": 0.0103767, + "balance_loss_clip": 1.23835838, + "balance_loss_mlp": 1.02350235, + "epoch": 0.758815571922441, + "flos": 16104284204400.0, + "grad_norm": 1.9663378675256267, + "language_loss": 0.82753325, + "learning_rate": 5.796722815052242e-07, + "loss": 0.85142744, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14178467, + "step": 12621, + "time_per_iteration": 2.747647285461426 + }, + { + "auxiliary_loss_clip": 0.01342474, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.23407805, + "balance_loss_mlp": 1.02087808, + "epoch": 0.758875695175109, + "flos": 16148043035280.0, + "grad_norm": 1.8974611457748853, + "language_loss": 0.73372048, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75748527, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13110352, + "step": 12622, + "time_per_iteration": 2.7294862270355225 + }, + { + "auxiliary_loss_clip": 0.01154798, + "auxiliary_loss_mlp": 0.01003112, + "balance_loss_clip": 1.11220956, + "balance_loss_mlp": 1.00032222, + "epoch": 0.7589358184277769, + "flos": 68477631022080.0, + "grad_norm": 0.9638843622951646, + "language_loss": 0.60873568, + "learning_rate": 5.791239988143024e-07, + "loss": 0.63031477, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0279541, + "step": 12623, + "time_per_iteration": 3.2471976280212402 + }, + { + "auxiliary_loss_clip": 0.01334828, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.22898269, + "balance_loss_mlp": 1.02231967, + "epoch": 0.7589959416804449, + "flos": 20052100545120.0, + "grad_norm": 1.7393354569066004, + "language_loss": 0.68289363, + "learning_rate": 5.788499382832847e-07, + "loss": 0.7065897, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12457275, + "step": 12624, + "time_per_iteration": 2.7614786624908447 + }, + { + "auxiliary_loss_clip": 0.01337677, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.23058796, + "balance_loss_mlp": 1.01867795, + "epoch": 0.7590560649331128, + "flos": 18776617782480.0, + "grad_norm": 1.9211725713461192, + "language_loss": 0.76017034, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78386581, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13201904, + "step": 12625, + "time_per_iteration": 2.7007052898406982 + }, + { + "auxiliary_loss_clip": 0.01339024, + "auxiliary_loss_mlp": 0.01039672, + "balance_loss_clip": 1.23400187, + "balance_loss_mlp": 1.02632666, + "epoch": 0.7591161881857809, + "flos": 29831321758320.0, + "grad_norm": 2.307973708972921, + "language_loss": 0.63388115, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65766811, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.13342285, + "step": 12626, + "time_per_iteration": 2.791672706604004 + }, + { + "auxiliary_loss_clip": 0.01347136, + "auxiliary_loss_mlp": 0.01039593, + "balance_loss_clip": 1.23704028, + "balance_loss_mlp": 1.02502573, + "epoch": 0.7591763114384488, + "flos": 20307530086200.0, + "grad_norm": 2.0496540358222286, + "language_loss": 0.74744606, + "learning_rate": 5.780280800727084e-07, + "loss": 0.77131331, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.14562988, + "step": 12627, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.01348915, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.23813999, + "balance_loss_mlp": 1.01738763, + "epoch": 0.7592364346911168, + "flos": 20818510993440.0, + "grad_norm": 2.0648070438603603, + "language_loss": 0.69327366, + "learning_rate": 5.777542351646356e-07, + "loss": 0.71706563, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12921143, + "step": 12628, + "time_per_iteration": 2.713076114654541 + }, + { + "auxiliary_loss_clip": 0.01357557, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.24311352, + "balance_loss_mlp": 1.01790142, + "epoch": 0.7592965579437848, + "flos": 21256471423440.0, + "grad_norm": 1.7020472039783263, + "language_loss": 0.63308334, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65697783, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.13989258, + "step": 12629, + "time_per_iteration": 4.2513511180877686 + }, + { + "auxiliary_loss_clip": 0.01336501, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.23107517, + "balance_loss_mlp": 1.01979661, + "epoch": 0.7593566811964527, + "flos": 26219224332360.0, + "grad_norm": 1.4515629197262012, + "language_loss": 0.78116941, + "learning_rate": 5.772067071539786e-07, + "loss": 0.80485988, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12738037, + "step": 12630, + "time_per_iteration": 4.420422554016113 + }, + { + "auxiliary_loss_clip": 0.01152531, + "auxiliary_loss_mlp": 0.01007316, + "balance_loss_clip": 1.11030734, + "balance_loss_mlp": 1.00452662, + "epoch": 0.7594168044491207, + "flos": 71253546191280.0, + "grad_norm": 0.8111298747322341, + "language_loss": 0.61542058, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63701916, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.0279541, + "step": 12631, + "time_per_iteration": 3.3075203895568848 + }, + { + "auxiliary_loss_clip": 0.01353079, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.23913169, + "balance_loss_mlp": 1.02265728, + "epoch": 0.7594769277017887, + "flos": 26618786235000.0, + "grad_norm": 1.7135082888063724, + "language_loss": 0.74101019, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76491225, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14483643, + "step": 12632, + "time_per_iteration": 2.7814035415649414 + }, + { + "auxiliary_loss_clip": 0.01344068, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.23500454, + "balance_loss_mlp": 1.01958597, + "epoch": 0.7595370509544567, + "flos": 17599518740880.0, + "grad_norm": 1.9058076850391268, + "language_loss": 0.7501691, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77393389, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12835693, + "step": 12633, + "time_per_iteration": 2.7673211097717285 + }, + { + "auxiliary_loss_clip": 0.01342661, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.23445725, + "balance_loss_mlp": 1.01910305, + "epoch": 0.7595971742071246, + "flos": 18007324140600.0, + "grad_norm": 1.9453810498246071, + "language_loss": 0.7379787, + "learning_rate": 5.76112298645246e-07, + "loss": 0.76171708, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1206665, + "step": 12634, + "time_per_iteration": 2.7927017211914062 + }, + { + "auxiliary_loss_clip": 0.01341451, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.23295259, + "balance_loss_mlp": 1.01844716, + "epoch": 0.7596572974597926, + "flos": 28846255961880.0, + "grad_norm": 1.5826592875089762, + "language_loss": 0.65184152, + "learning_rate": 5.758388314770408e-07, + "loss": 0.6755715, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13098145, + "step": 12635, + "time_per_iteration": 2.845322370529175 + }, + { + "auxiliary_loss_clip": 0.01351131, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.2401942, + "balance_loss_mlp": 1.01936877, + "epoch": 0.7597174207124605, + "flos": 14286893945400.0, + "grad_norm": 1.6617964845207613, + "language_loss": 0.69053388, + "learning_rate": 5.7556541831317e-07, + "loss": 0.71437949, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14068604, + "step": 12636, + "time_per_iteration": 2.691762685775757 + }, + { + "auxiliary_loss_clip": 0.01347772, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.23681474, + "balance_loss_mlp": 1.02580869, + "epoch": 0.7597775439651285, + "flos": 21693903944760.0, + "grad_norm": 1.986023450668637, + "language_loss": 0.81409538, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83797348, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14227295, + "step": 12637, + "time_per_iteration": 2.9339516162872314 + }, + { + "auxiliary_loss_clip": 0.01348185, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.23833299, + "balance_loss_mlp": 1.02061093, + "epoch": 0.7598376672177964, + "flos": 36108158999040.0, + "grad_norm": 1.786832967637136, + "language_loss": 0.66873175, + "learning_rate": 5.750187540399017e-07, + "loss": 0.69255292, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13317871, + "step": 12638, + "time_per_iteration": 4.390104532241821 + }, + { + "auxiliary_loss_clip": 0.01345262, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.23554075, + "balance_loss_mlp": 1.02760267, + "epoch": 0.7598977904704645, + "flos": 18336870584640.0, + "grad_norm": 1.9263636489319904, + "language_loss": 0.65545285, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67932695, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.14538574, + "step": 12639, + "time_per_iteration": 2.745774269104004 + }, + { + "auxiliary_loss_clip": 0.01341453, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.23376632, + "balance_loss_mlp": 1.01742005, + "epoch": 0.7599579137231324, + "flos": 20197369940760.0, + "grad_norm": 1.896024632323656, + "language_loss": 0.70425963, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72797799, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12957764, + "step": 12640, + "time_per_iteration": 2.8736352920532227 + }, + { + "auxiliary_loss_clip": 0.01356667, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.2439301, + "balance_loss_mlp": 1.02171707, + "epoch": 0.7600180369758004, + "flos": 24030234349560.0, + "grad_norm": 1.7125415652863956, + "language_loss": 0.6701712, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69409436, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13934326, + "step": 12641, + "time_per_iteration": 2.7961840629577637 + }, + { + "auxiliary_loss_clip": 0.01349778, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.23854578, + "balance_loss_mlp": 1.02572441, + "epoch": 0.7600781602284684, + "flos": 18994217313240.0, + "grad_norm": 2.1972871290539877, + "language_loss": 0.67138225, + "learning_rate": 5.73926074001422e-07, + "loss": 0.69527513, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13781738, + "step": 12642, + "time_per_iteration": 2.7357075214385986 + }, + { + "auxiliary_loss_clip": 0.01333722, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.22815096, + "balance_loss_mlp": 1.01892471, + "epoch": 0.7601382834811363, + "flos": 26073102161160.0, + "grad_norm": 1.799006329571447, + "language_loss": 0.76067162, + "learning_rate": 5.736530391580765e-07, + "loss": 0.78432786, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12994385, + "step": 12643, + "time_per_iteration": 2.754103422164917 + }, + { + "auxiliary_loss_clip": 0.0134784, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.23713374, + "balance_loss_mlp": 1.02248979, + "epoch": 0.7601984067338043, + "flos": 18848988525960.0, + "grad_norm": 1.6623220629418, + "language_loss": 0.7903415, + "learning_rate": 5.733800584019508e-07, + "loss": 0.81418717, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14227295, + "step": 12644, + "time_per_iteration": 2.6915197372436523 + }, + { + "auxiliary_loss_clip": 0.01348009, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.23841667, + "balance_loss_mlp": 1.01738477, + "epoch": 0.7602585299864723, + "flos": 24652025136000.0, + "grad_norm": 1.4754932495230049, + "language_loss": 0.80632198, + "learning_rate": 5.731071317433957e-07, + "loss": 0.83010519, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12927246, + "step": 12645, + "time_per_iteration": 2.7693090438842773 + }, + { + "auxiliary_loss_clip": 0.01349914, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.2387681, + "balance_loss_mlp": 1.02063787, + "epoch": 0.7603186532391403, + "flos": 23847419202120.0, + "grad_norm": 1.4438753275100082, + "language_loss": 0.73310816, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75695115, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13751221, + "step": 12646, + "time_per_iteration": 2.821026086807251 + }, + { + "auxiliary_loss_clip": 0.01338536, + "auxiliary_loss_mlp": 0.01037811, + "balance_loss_clip": 1.23079896, + "balance_loss_mlp": 1.0249598, + "epoch": 0.7603787764918082, + "flos": 22204966068720.0, + "grad_norm": 1.9367610178848365, + "language_loss": 0.67857647, + "learning_rate": 5.725614407603949e-07, + "loss": 0.70233989, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12854004, + "step": 12647, + "time_per_iteration": 2.9081485271453857 + }, + { + "auxiliary_loss_clip": 0.01154129, + "auxiliary_loss_mlp": 0.01014125, + "balance_loss_clip": 1.11204576, + "balance_loss_mlp": 1.01151395, + "epoch": 0.7604388997444762, + "flos": 54100678069440.0, + "grad_norm": 0.6897081901097123, + "language_loss": 0.49034092, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51202351, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02612305, + "step": 12648, + "time_per_iteration": 3.2230730056762695 + }, + { + "auxiliary_loss_clip": 0.01337482, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.23233938, + "balance_loss_mlp": 1.02284598, + "epoch": 0.7604990229971441, + "flos": 19686592075320.0, + "grad_norm": 1.6274920001535929, + "language_loss": 0.76758128, + "learning_rate": 5.720159662918451e-07, + "loss": 0.79130965, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12506104, + "step": 12649, + "time_per_iteration": 2.798908233642578 + }, + { + "auxiliary_loss_clip": 0.01340493, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.23296869, + "balance_loss_mlp": 1.02327061, + "epoch": 0.7605591462498121, + "flos": 25233468193800.0, + "grad_norm": 5.226871788145335, + "language_loss": 0.68846595, + "learning_rate": 5.717433102763462e-07, + "loss": 0.71222734, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12384033, + "step": 12650, + "time_per_iteration": 2.847550392150879 + }, + { + "auxiliary_loss_clip": 0.01153392, + "auxiliary_loss_mlp": 0.01014875, + "balance_loss_clip": 1.11117327, + "balance_loss_mlp": 1.01260972, + "epoch": 0.76061926950248, + "flos": 66799175254560.0, + "grad_norm": 0.7522920093129176, + "language_loss": 0.62721056, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64889324, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02270508, + "step": 12651, + "time_per_iteration": 3.253903865814209 + }, + { + "auxiliary_loss_clip": 0.01337295, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.23072577, + "balance_loss_mlp": 1.02114451, + "epoch": 0.7606793927551481, + "flos": 25343912597760.0, + "grad_norm": 1.765085413693314, + "language_loss": 0.71651995, + "learning_rate": 5.711981607345951e-07, + "loss": 0.74022853, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.1239624, + "step": 12652, + "time_per_iteration": 2.844390392303467 + }, + { + "auxiliary_loss_clip": 0.01345151, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.23534119, + "balance_loss_mlp": 1.02310085, + "epoch": 0.760739516007816, + "flos": 18228253556880.0, + "grad_norm": 1.8562740428265903, + "language_loss": 0.80536795, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82918715, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13665771, + "step": 12653, + "time_per_iteration": 2.705113172531128 + }, + { + "auxiliary_loss_clip": 0.01355484, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.24419594, + "balance_loss_mlp": 1.02357686, + "epoch": 0.760799639260484, + "flos": 22563489900600.0, + "grad_norm": 2.3976502674097726, + "language_loss": 0.80380398, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82772326, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12860107, + "step": 12654, + "time_per_iteration": 2.772454023361206 + }, + { + "auxiliary_loss_clip": 0.01345682, + "auxiliary_loss_mlp": 0.01037604, + "balance_loss_clip": 1.23379672, + "balance_loss_mlp": 1.02337074, + "epoch": 0.760859762513152, + "flos": 22314395263680.0, + "grad_norm": 1.9879224379453657, + "language_loss": 0.79970509, + "learning_rate": 5.703808428001136e-07, + "loss": 0.82353795, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14239502, + "step": 12655, + "time_per_iteration": 2.709193706512451 + }, + { + "auxiliary_loss_clip": 0.01335134, + "auxiliary_loss_mlp": 0.01026117, + "balance_loss_clip": 1.22938728, + "balance_loss_mlp": 1.01519179, + "epoch": 0.7609198857658199, + "flos": 24869381016600.0, + "grad_norm": 1.722062505396202, + "language_loss": 0.68664438, + "learning_rate": 5.701085118974505e-07, + "loss": 0.71025693, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.10925293, + "step": 12656, + "time_per_iteration": 2.7754406929016113 + }, + { + "auxiliary_loss_clip": 0.01354107, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.24003863, + "balance_loss_mlp": 1.01699877, + "epoch": 0.760980009018488, + "flos": 16841189356200.0, + "grad_norm": 2.179878612810283, + "language_loss": 0.73380029, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75764883, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13739014, + "step": 12657, + "time_per_iteration": 2.899460792541504 + }, + { + "auxiliary_loss_clip": 0.01152883, + "auxiliary_loss_mlp": 0.01014046, + "balance_loss_clip": 1.11054134, + "balance_loss_mlp": 1.01162624, + "epoch": 0.7610401322711559, + "flos": 61244339897520.0, + "grad_norm": 0.8642369095660116, + "language_loss": 0.64992636, + "learning_rate": 5.695640127673347e-07, + "loss": 0.67159569, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02416992, + "step": 12658, + "time_per_iteration": 3.2212073802948 + }, + { + "auxiliary_loss_clip": 0.01343556, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.23892188, + "balance_loss_mlp": 1.0285573, + "epoch": 0.7611002555238239, + "flos": 19644782445720.0, + "grad_norm": 1.5615882562439152, + "language_loss": 0.79901606, + "learning_rate": 5.692918445605293e-07, + "loss": 0.82287723, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.13977051, + "step": 12659, + "time_per_iteration": 2.7473809719085693 + }, + { + "auxiliary_loss_clip": 0.0134356, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.23502707, + "balance_loss_mlp": 1.01295507, + "epoch": 0.7611603787764918, + "flos": 26878154787000.0, + "grad_norm": 1.4453249143197828, + "language_loss": 0.69086969, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71456748, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13250732, + "step": 12660, + "time_per_iteration": 2.8932700157165527 + }, + { + "auxiliary_loss_clip": 0.01344493, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.23502111, + "balance_loss_mlp": 1.02320242, + "epoch": 0.7612205020291598, + "flos": 27349965608040.0, + "grad_norm": 1.8485857616109667, + "language_loss": 0.71047366, + "learning_rate": 5.687476709150281e-07, + "loss": 0.7342757, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.125, + "step": 12661, + "time_per_iteration": 2.8689627647399902 + }, + { + "auxiliary_loss_clip": 0.01345888, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.23703194, + "balance_loss_mlp": 1.02012002, + "epoch": 0.7612806252818277, + "flos": 29320584501240.0, + "grad_norm": 1.4517471972432803, + "language_loss": 0.83827275, + "learning_rate": 5.68475665496966e-07, + "loss": 0.86206156, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12872314, + "step": 12662, + "time_per_iteration": 2.8460726737976074 + }, + { + "auxiliary_loss_clip": 0.01342559, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.23297691, + "balance_loss_mlp": 1.02848506, + "epoch": 0.7613407485344957, + "flos": 19030747856040.0, + "grad_norm": 1.6237231257430687, + "language_loss": 0.69292122, + "learning_rate": 5.682037143624505e-07, + "loss": 0.7167632, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13165283, + "step": 12663, + "time_per_iteration": 2.81532883644104 + }, + { + "auxiliary_loss_clip": 0.01337938, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.23255396, + "balance_loss_mlp": 1.01419568, + "epoch": 0.7614008717871636, + "flos": 23260981316040.0, + "grad_norm": 1.7365026376998747, + "language_loss": 0.70217669, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72581875, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12078857, + "step": 12664, + "time_per_iteration": 2.8445029258728027 + }, + { + "auxiliary_loss_clip": 0.01351977, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.23963523, + "balance_loss_mlp": 1.02685332, + "epoch": 0.7614609950398317, + "flos": 21584921441760.0, + "grad_norm": 1.706375692430749, + "language_loss": 0.7976613, + "learning_rate": 5.676599749853066e-07, + "loss": 0.82158601, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13647461, + "step": 12665, + "time_per_iteration": 2.7296011447906494 + }, + { + "auxiliary_loss_clip": 0.01335357, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.23045814, + "balance_loss_mlp": 1.02191579, + "epoch": 0.7615211182924996, + "flos": 29283850916640.0, + "grad_norm": 2.481210737404305, + "language_loss": 0.88389951, + "learning_rate": 5.673881867632959e-07, + "loss": 0.9075973, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.125, + "step": 12666, + "time_per_iteration": 2.8103396892547607 + }, + { + "auxiliary_loss_clip": 0.01345397, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.2359432, + "balance_loss_mlp": 1.02003717, + "epoch": 0.7615812415451676, + "flos": 13265053956000.0, + "grad_norm": 2.0301282300296797, + "language_loss": 0.83463764, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85843241, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.14025879, + "step": 12667, + "time_per_iteration": 4.294055223464966 + }, + { + "auxiliary_loss_clip": 0.0133968, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.23382306, + "balance_loss_mlp": 1.02407694, + "epoch": 0.7616413647978356, + "flos": 18588929631840.0, + "grad_norm": 1.5169482414528823, + "language_loss": 0.78521013, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80896533, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11761475, + "step": 12668, + "time_per_iteration": 2.6958580017089844 + }, + { + "auxiliary_loss_clip": 0.01341861, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.23355722, + "balance_loss_mlp": 1.01830041, + "epoch": 0.7617014880505035, + "flos": 18520944591240.0, + "grad_norm": 2.09130355252593, + "language_loss": 0.64375484, + "learning_rate": 5.6657314808718e-07, + "loss": 0.6674819, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12554932, + "step": 12669, + "time_per_iteration": 2.7618346214294434 + }, + { + "auxiliary_loss_clip": 0.01351047, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.23993897, + "balance_loss_mlp": 1.02256155, + "epoch": 0.7617616113031715, + "flos": 24978647778120.0, + "grad_norm": 1.962845398274641, + "language_loss": 0.66460222, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68847978, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14160156, + "step": 12670, + "time_per_iteration": 2.73043155670166 + }, + { + "auxiliary_loss_clip": 0.0134489, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.23400283, + "balance_loss_mlp": 1.02370584, + "epoch": 0.7618217345558395, + "flos": 23300313835680.0, + "grad_norm": 1.4907788100068224, + "language_loss": 0.73254776, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75636369, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12994385, + "step": 12671, + "time_per_iteration": 2.8480610847473145 + }, + { + "auxiliary_loss_clip": 0.0134293, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.2355901, + "balance_loss_mlp": 1.01805484, + "epoch": 0.7618818578085075, + "flos": 25488288609480.0, + "grad_norm": 1.5287735159354967, + "language_loss": 0.73409092, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75781983, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.11907959, + "step": 12672, + "time_per_iteration": 2.814452886581421 + }, + { + "auxiliary_loss_clip": 0.01153419, + "auxiliary_loss_mlp": 0.01006551, + "balance_loss_clip": 1.11114788, + "balance_loss_mlp": 1.00452447, + "epoch": 0.7619419810611754, + "flos": 61164984516120.0, + "grad_norm": 0.8366099880682099, + "language_loss": 0.56713724, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58873695, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02026367, + "step": 12673, + "time_per_iteration": 3.234314203262329 + }, + { + "auxiliary_loss_clip": 0.01344108, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.23408365, + "balance_loss_mlp": 1.01962566, + "epoch": 0.7620021043138434, + "flos": 23263945726320.0, + "grad_norm": 1.7551563784082982, + "language_loss": 0.74751335, + "learning_rate": 5.652158375447102e-07, + "loss": 0.77128446, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13360596, + "step": 12674, + "time_per_iteration": 2.798830270767212 + }, + { + "auxiliary_loss_clip": 0.01340679, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.23451281, + "balance_loss_mlp": 1.01435614, + "epoch": 0.7620622275665113, + "flos": 25087955148000.0, + "grad_norm": 3.2689696251581126, + "language_loss": 0.73024184, + "learning_rate": 5.649445386165286e-07, + "loss": 0.7539168, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12457275, + "step": 12675, + "time_per_iteration": 2.7878494262695312 + }, + { + "auxiliary_loss_clip": 0.01339772, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.23404014, + "balance_loss_mlp": 1.01928067, + "epoch": 0.7621223508191793, + "flos": 20159580538800.0, + "grad_norm": 2.0079115648266996, + "language_loss": 0.72730571, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75101769, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12145996, + "step": 12676, + "time_per_iteration": 4.276289939880371 + }, + { + "auxiliary_loss_clip": 0.0135836, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.24326253, + "balance_loss_mlp": 1.02331936, + "epoch": 0.7621824740718472, + "flos": 18004197296880.0, + "grad_norm": 2.542624667504337, + "language_loss": 0.54503411, + "learning_rate": 5.644021040227927e-07, + "loss": 0.56898773, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.13665771, + "step": 12677, + "time_per_iteration": 2.7250540256500244 + }, + { + "auxiliary_loss_clip": 0.01345455, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.23716331, + "balance_loss_mlp": 1.01884365, + "epoch": 0.7622425973245153, + "flos": 21730678137720.0, + "grad_norm": 1.842931965913443, + "language_loss": 0.7919848, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81575823, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13043213, + "step": 12678, + "time_per_iteration": 2.756502866744995 + }, + { + "auxiliary_loss_clip": 0.01344368, + "auxiliary_loss_mlp": 0.01031078, + "balance_loss_clip": 1.23384941, + "balance_loss_mlp": 1.01792324, + "epoch": 0.7623027205771832, + "flos": 19723244443200.0, + "grad_norm": 1.7935854297594342, + "language_loss": 0.77504843, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79880285, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1315918, + "step": 12679, + "time_per_iteration": 2.772998571395874 + }, + { + "auxiliary_loss_clip": 0.01340598, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.23128951, + "balance_loss_mlp": 1.01631057, + "epoch": 0.7623628438298512, + "flos": 23994272323800.0, + "grad_norm": 1.3011457314926835, + "language_loss": 0.80453098, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82823324, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13323975, + "step": 12680, + "time_per_iteration": 2.7985832691192627 + }, + { + "auxiliary_loss_clip": 0.01342664, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.23487926, + "balance_loss_mlp": 1.01798594, + "epoch": 0.7624229670825191, + "flos": 22350763373040.0, + "grad_norm": 2.0558873678369465, + "language_loss": 0.62968075, + "learning_rate": 5.633178881737493e-07, + "loss": 0.65343022, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.14294434, + "step": 12681, + "time_per_iteration": 2.83590030670166 + }, + { + "auxiliary_loss_clip": 0.01340747, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.23484004, + "balance_loss_mlp": 1.02107334, + "epoch": 0.7624830903351871, + "flos": 22717043401680.0, + "grad_norm": 2.095491481763462, + "language_loss": 0.76786172, + "learning_rate": 5.63046970383622e-07, + "loss": 0.7916044, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12451172, + "step": 12682, + "time_per_iteration": 2.837223768234253 + }, + { + "auxiliary_loss_clip": 0.01337516, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.23159909, + "balance_loss_mlp": 1.02340388, + "epoch": 0.7625432135878552, + "flos": 25599301530480.0, + "grad_norm": 1.6567310672700735, + "language_loss": 0.68710256, + "learning_rate": 5.627761070828974e-07, + "loss": 0.71083099, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.11932373, + "step": 12683, + "time_per_iteration": 2.801236152648926 + }, + { + "auxiliary_loss_clip": 0.0134237, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.23406267, + "balance_loss_mlp": 1.02211726, + "epoch": 0.7626033368405231, + "flos": 23992932247920.0, + "grad_norm": 2.070854178015478, + "language_loss": 0.8322053, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85597277, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12243652, + "step": 12684, + "time_per_iteration": 2.7200095653533936 + }, + { + "auxiliary_loss_clip": 0.01344333, + "auxiliary_loss_mlp": 0.01038582, + "balance_loss_clip": 1.2352469, + "balance_loss_mlp": 1.02393079, + "epoch": 0.7626634600931911, + "flos": 12602265707160.0, + "grad_norm": 1.7716662934163732, + "language_loss": 0.83203489, + "learning_rate": 5.622345439907396e-07, + "loss": 0.85586405, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.14642334, + "step": 12685, + "time_per_iteration": 2.688620090484619 + }, + { + "auxiliary_loss_clip": 0.01344679, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.23593855, + "balance_loss_mlp": 1.01511681, + "epoch": 0.762723583345859, + "flos": 26328166227000.0, + "grad_norm": 1.7290864905290249, + "language_loss": 0.77305126, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79677773, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.12854004, + "step": 12686, + "time_per_iteration": 2.7514946460723877 + }, + { + "auxiliary_loss_clip": 0.01352245, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.24004054, + "balance_loss_mlp": 1.02402854, + "epoch": 0.762783706598527, + "flos": 21911950167480.0, + "grad_norm": 1.6468081546661189, + "language_loss": 0.72389036, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74779475, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14147949, + "step": 12687, + "time_per_iteration": 2.7057576179504395 + }, + { + "auxiliary_loss_clip": 0.01341261, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.23403192, + "balance_loss_mlp": 1.01798391, + "epoch": 0.7628438298511949, + "flos": 15343843185000.0, + "grad_norm": 1.8052437426616783, + "language_loss": 0.64891529, + "learning_rate": 5.614226082797369e-07, + "loss": 0.67264491, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13708496, + "step": 12688, + "time_per_iteration": 2.7479066848754883 + }, + { + "auxiliary_loss_clip": 0.01340115, + "auxiliary_loss_mlp": 0.0102486, + "balance_loss_clip": 1.23365259, + "balance_loss_mlp": 1.01240897, + "epoch": 0.7629039531038629, + "flos": 13010274148680.0, + "grad_norm": 2.027985473353839, + "language_loss": 0.7084887, + "learning_rate": 5.611520721310515e-07, + "loss": 0.7321384, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12457275, + "step": 12689, + "time_per_iteration": 2.7027831077575684 + }, + { + "auxiliary_loss_clip": 0.01359306, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.24552655, + "balance_loss_mlp": 1.02841163, + "epoch": 0.7629640763565309, + "flos": 26176561927200.0, + "grad_norm": 1.7575046807447032, + "language_loss": 0.70185596, + "learning_rate": 5.608815905436238e-07, + "loss": 0.72586656, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13366699, + "step": 12690, + "time_per_iteration": 2.783231496810913 + }, + { + "auxiliary_loss_clip": 0.01344102, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.23575258, + "balance_loss_mlp": 1.01691794, + "epoch": 0.7630241996091989, + "flos": 36800614977840.0, + "grad_norm": 1.453134232698131, + "language_loss": 0.69676453, + "learning_rate": 5.606111635277109e-07, + "loss": 0.72049689, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12207031, + "step": 12691, + "time_per_iteration": 2.8290417194366455 + }, + { + "auxiliary_loss_clip": 0.01342792, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.23561811, + "balance_loss_mlp": 1.02245045, + "epoch": 0.7630843228618668, + "flos": 21840188549400.0, + "grad_norm": 1.5810886290324746, + "language_loss": 0.81962407, + "learning_rate": 5.603407910935662e-07, + "loss": 0.84339643, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.11993408, + "step": 12692, + "time_per_iteration": 2.865648031234741 + }, + { + "auxiliary_loss_clip": 0.01350347, + "auxiliary_loss_mlp": 0.01028147, + "balance_loss_clip": 1.24059689, + "balance_loss_mlp": 1.01590466, + "epoch": 0.7631444461145348, + "flos": 12644318986920.0, + "grad_norm": 2.1094408717317177, + "language_loss": 0.7719177, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79570264, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12255859, + "step": 12693, + "time_per_iteration": 2.8536458015441895 + }, + { + "auxiliary_loss_clip": 0.01348016, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.23792458, + "balance_loss_mlp": 1.01960421, + "epoch": 0.7632045693672027, + "flos": 16840986314400.0, + "grad_norm": 1.9647569631439286, + "language_loss": 0.73653954, + "learning_rate": 5.598002100115933e-07, + "loss": 0.76035488, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13922119, + "step": 12694, + "time_per_iteration": 2.8785550594329834 + }, + { + "auxiliary_loss_clip": 0.01342954, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.23549759, + "balance_loss_mlp": 1.01511717, + "epoch": 0.7632646926198707, + "flos": 22022353963080.0, + "grad_norm": 1.7373520187660063, + "language_loss": 0.70942402, + "learning_rate": 5.595300013842625e-07, + "loss": 0.73313165, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12683105, + "step": 12695, + "time_per_iteration": 2.802757740020752 + }, + { + "auxiliary_loss_clip": 0.01346701, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.23763466, + "balance_loss_mlp": 1.01672816, + "epoch": 0.7633248158725388, + "flos": 23119447889520.0, + "grad_norm": 1.4184120905530924, + "language_loss": 0.72404397, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74780571, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12750244, + "step": 12696, + "time_per_iteration": 2.805201530456543 + }, + { + "auxiliary_loss_clip": 0.01345805, + "auxiliary_loss_mlp": 0.01034124, + "balance_loss_clip": 1.23643827, + "balance_loss_mlp": 1.02071309, + "epoch": 0.7633849391252067, + "flos": 10893492475920.0, + "grad_norm": 2.1467305030823707, + "language_loss": 0.71326518, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73706448, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13415527, + "step": 12697, + "time_per_iteration": 2.73004412651062 + }, + { + "auxiliary_loss_clip": 0.01344623, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.23839808, + "balance_loss_mlp": 1.01845777, + "epoch": 0.7634450623778747, + "flos": 20998849030920.0, + "grad_norm": 1.9292011880345221, + "language_loss": 0.67595834, + "learning_rate": 5.587197032798461e-07, + "loss": 0.69971782, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.128479, + "step": 12698, + "time_per_iteration": 2.7057228088378906 + }, + { + "auxiliary_loss_clip": 0.01344436, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.23470879, + "balance_loss_mlp": 1.01532602, + "epoch": 0.7635051856305426, + "flos": 18886980969720.0, + "grad_norm": 1.569295790737838, + "language_loss": 0.72585189, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74957842, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12896729, + "step": 12699, + "time_per_iteration": 2.7943878173828125 + }, + { + "auxiliary_loss_clip": 0.01336108, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.23003852, + "balance_loss_mlp": 1.01916301, + "epoch": 0.7635653088832106, + "flos": 34793627975280.0, + "grad_norm": 3.2940612124855715, + "language_loss": 0.73312867, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75680423, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12286377, + "step": 12700, + "time_per_iteration": 2.8739919662475586 + }, + { + "auxiliary_loss_clip": 0.01344372, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.23516774, + "balance_loss_mlp": 1.01971889, + "epoch": 0.7636254321358785, + "flos": 23182762968720.0, + "grad_norm": 2.060026249562134, + "language_loss": 0.69416773, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71793193, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12322998, + "step": 12701, + "time_per_iteration": 2.8118667602539062 + }, + { + "auxiliary_loss_clip": 0.01344532, + "auxiliary_loss_mlp": 0.01030864, + "balance_loss_clip": 1.23567653, + "balance_loss_mlp": 1.01794171, + "epoch": 0.7636855553885465, + "flos": 21330304067880.0, + "grad_norm": 1.9994855893269285, + "language_loss": 0.65090489, + "learning_rate": 5.576400710039508e-07, + "loss": 0.67465889, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.12921143, + "step": 12702, + "time_per_iteration": 2.734332323074341 + }, + { + "auxiliary_loss_clip": 0.01346467, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.23751235, + "balance_loss_mlp": 1.01679993, + "epoch": 0.7637456786412145, + "flos": 28664009331480.0, + "grad_norm": 2.0479063000829334, + "language_loss": 0.65557528, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67933863, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13067627, + "step": 12703, + "time_per_iteration": 2.99226713180542 + }, + { + "auxiliary_loss_clip": 0.0134233, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.23432183, + "balance_loss_mlp": 1.01481748, + "epoch": 0.7638058018938825, + "flos": 21912193817640.0, + "grad_norm": 1.9044956375438047, + "language_loss": 0.83871984, + "learning_rate": 5.571005829916668e-07, + "loss": 0.8624121, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12078857, + "step": 12704, + "time_per_iteration": 2.8689441680908203 + }, + { + "auxiliary_loss_clip": 0.01341987, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.23320818, + "balance_loss_mlp": 1.02336788, + "epoch": 0.7638659251465504, + "flos": 29650049728560.0, + "grad_norm": 3.4641605153016872, + "language_loss": 0.68296856, + "learning_rate": 5.568309210527469e-07, + "loss": 0.7067467, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12469482, + "step": 12705, + "time_per_iteration": 4.481739521026611 + }, + { + "auxiliary_loss_clip": 0.01341744, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.23546982, + "balance_loss_mlp": 1.01573992, + "epoch": 0.7639260483992184, + "flos": 26146853588880.0, + "grad_norm": 1.8675207113580738, + "language_loss": 0.74178708, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76549101, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12908936, + "step": 12706, + "time_per_iteration": 5.683200120925903 + }, + { + "auxiliary_loss_clip": 0.01343047, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.23563051, + "balance_loss_mlp": 1.01822615, + "epoch": 0.7639861716518863, + "flos": 20161489131720.0, + "grad_norm": 1.7571822048191474, + "language_loss": 0.78553987, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80928361, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13098145, + "step": 12707, + "time_per_iteration": 2.6850883960723877 + }, + { + "auxiliary_loss_clip": 0.01343159, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.23432302, + "balance_loss_mlp": 1.01636648, + "epoch": 0.7640462949045543, + "flos": 18587102255640.0, + "grad_norm": 1.646136269487543, + "language_loss": 0.80136764, + "learning_rate": 5.560222636275751e-07, + "loss": 0.8250922, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12921143, + "step": 12708, + "time_per_iteration": 2.709233522415161 + }, + { + "auxiliary_loss_clip": 0.01153538, + "auxiliary_loss_mlp": 0.01001339, + "balance_loss_clip": 1.11113882, + "balance_loss_mlp": 0.99883616, + "epoch": 0.7641064181572224, + "flos": 68338371663720.0, + "grad_norm": 0.814704523058653, + "language_loss": 0.56400263, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58555138, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02502441, + "step": 12709, + "time_per_iteration": 3.238651752471924 + }, + { + "auxiliary_loss_clip": 0.01350284, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.23931408, + "balance_loss_mlp": 1.0182507, + "epoch": 0.7641665414098903, + "flos": 17973676791360.0, + "grad_norm": 1.910067876301749, + "language_loss": 0.63394088, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65777183, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14544678, + "step": 12710, + "time_per_iteration": 2.715298652648926 + }, + { + "auxiliary_loss_clip": 0.01352402, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.24159884, + "balance_loss_mlp": 1.02216935, + "epoch": 0.7642266646625583, + "flos": 21257121157200.0, + "grad_norm": 2.003531628596165, + "language_loss": 0.64787954, + "learning_rate": 5.552140990044154e-07, + "loss": 0.67176884, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14361572, + "step": 12711, + "time_per_iteration": 2.7460756301879883 + }, + { + "auxiliary_loss_clip": 0.01342958, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.23455071, + "balance_loss_mlp": 1.01569009, + "epoch": 0.7642867879152262, + "flos": 22753005427440.0, + "grad_norm": 1.4025549430662139, + "language_loss": 0.73037457, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75408852, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12750244, + "step": 12712, + "time_per_iteration": 2.819401979446411 + }, + { + "auxiliary_loss_clip": 0.01339486, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.23387969, + "balance_loss_mlp": 1.02269292, + "epoch": 0.7643469111678942, + "flos": 23337494112240.0, + "grad_norm": 1.4631333862070983, + "language_loss": 0.80647409, + "learning_rate": 5.546755965040804e-07, + "loss": 0.83021498, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11895752, + "step": 12713, + "time_per_iteration": 2.8115222454071045 + }, + { + "auxiliary_loss_clip": 0.01345546, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.23507929, + "balance_loss_mlp": 1.01551497, + "epoch": 0.7644070344205621, + "flos": 19860838858800.0, + "grad_norm": 1.9639612927896402, + "language_loss": 0.83572745, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85948002, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.14196777, + "step": 12714, + "time_per_iteration": 4.292583227157593 + }, + { + "auxiliary_loss_clip": 0.01348719, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.23885739, + "balance_loss_mlp": 1.02227354, + "epoch": 0.7644671576732301, + "flos": 22095699307200.0, + "grad_norm": 1.468363101971893, + "language_loss": 0.73085684, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75469589, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12896729, + "step": 12715, + "time_per_iteration": 2.8008251190185547 + }, + { + "auxiliary_loss_clip": 0.01349377, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.24044418, + "balance_loss_mlp": 1.01452243, + "epoch": 0.7645272809258981, + "flos": 25486623666720.0, + "grad_norm": 1.7023481918251193, + "language_loss": 0.63096821, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65473711, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12988281, + "step": 12716, + "time_per_iteration": 2.9142251014709473 + }, + { + "auxiliary_loss_clip": 0.01351625, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.23969746, + "balance_loss_mlp": 1.01776159, + "epoch": 0.7645874041785661, + "flos": 21546685347840.0, + "grad_norm": 1.5588601124551502, + "language_loss": 0.80024779, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82408059, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13891602, + "step": 12717, + "time_per_iteration": 2.7848973274230957 + }, + { + "auxiliary_loss_clip": 0.01335081, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.22917938, + "balance_loss_mlp": 1.02086985, + "epoch": 0.764647527431234, + "flos": 20635736454360.0, + "grad_norm": 2.285932134855618, + "language_loss": 0.66927105, + "learning_rate": 5.53330299551638e-07, + "loss": 0.69296014, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12957764, + "step": 12718, + "time_per_iteration": 2.826611280441284 + }, + { + "auxiliary_loss_clip": 0.01340666, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.23428786, + "balance_loss_mlp": 1.01732242, + "epoch": 0.764707650683902, + "flos": 21439083529080.0, + "grad_norm": 1.914008710674955, + "language_loss": 0.77530229, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79900068, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11846924, + "step": 12719, + "time_per_iteration": 2.8098857402801514 + }, + { + "auxiliary_loss_clip": 0.01343155, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.23353958, + "balance_loss_mlp": 1.01568437, + "epoch": 0.7647677739365699, + "flos": 22716393667920.0, + "grad_norm": 1.8220493036326801, + "language_loss": 0.70227253, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72599912, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13830566, + "step": 12720, + "time_per_iteration": 2.7682058811187744 + }, + { + "auxiliary_loss_clip": 0.01345272, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.23675537, + "balance_loss_mlp": 1.0209676, + "epoch": 0.7648278971892379, + "flos": 21329045208720.0, + "grad_norm": 1.5599528161680656, + "language_loss": 0.73986948, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76366138, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12957764, + "step": 12721, + "time_per_iteration": 2.7103140354156494 + }, + { + "auxiliary_loss_clip": 0.01340763, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.23169601, + "balance_loss_mlp": 1.01435363, + "epoch": 0.764888020441906, + "flos": 20672185780440.0, + "grad_norm": 2.0968829030841993, + "language_loss": 0.73715866, + "learning_rate": 5.522550493699163e-07, + "loss": 0.76083851, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12878418, + "step": 12722, + "time_per_iteration": 2.7341856956481934 + }, + { + "auxiliary_loss_clip": 0.01337884, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.23053777, + "balance_loss_mlp": 1.02214444, + "epoch": 0.7649481436945739, + "flos": 25087995756360.0, + "grad_norm": 1.8776284436866981, + "language_loss": 0.74539167, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76912051, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12835693, + "step": 12723, + "time_per_iteration": 2.7590694427490234 + }, + { + "auxiliary_loss_clip": 0.01351289, + "auxiliary_loss_mlp": 0.01026465, + "balance_loss_clip": 1.23975348, + "balance_loss_mlp": 1.01384711, + "epoch": 0.7650082669472419, + "flos": 24906439468080.0, + "grad_norm": 1.7609749524322418, + "language_loss": 0.73288679, + "learning_rate": 5.517177536300881e-07, + "loss": 0.7566644, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12628174, + "step": 12724, + "time_per_iteration": 2.8681881427764893 + }, + { + "auxiliary_loss_clip": 0.01336884, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.23168421, + "balance_loss_mlp": 1.01490855, + "epoch": 0.7650683901999098, + "flos": 14651468422920.0, + "grad_norm": 1.7805984156083068, + "language_loss": 0.84246051, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86610305, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.12469482, + "step": 12725, + "time_per_iteration": 2.6919970512390137 + }, + { + "auxiliary_loss_clip": 0.01341901, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.23439777, + "balance_loss_mlp": 1.01778555, + "epoch": 0.7651285134525778, + "flos": 26357143614840.0, + "grad_norm": 1.7034101584713184, + "language_loss": 0.77589607, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79962766, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.13482666, + "step": 12726, + "time_per_iteration": 2.748363971710205 + }, + { + "auxiliary_loss_clip": 0.01342493, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.23423862, + "balance_loss_mlp": 1.0210501, + "epoch": 0.7651886367052457, + "flos": 26651946283920.0, + "grad_norm": 1.818418366546722, + "language_loss": 0.70629323, + "learning_rate": 5.509122219383615e-07, + "loss": 0.73005748, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12896729, + "step": 12727, + "time_per_iteration": 2.809962272644043 + }, + { + "auxiliary_loss_clip": 0.01334411, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.22985363, + "balance_loss_mlp": 1.01660323, + "epoch": 0.7652487599579137, + "flos": 25708405858560.0, + "grad_norm": 1.7628919572187272, + "language_loss": 0.79742664, + "learning_rate": 5.506438212599864e-07, + "loss": 0.82106686, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.13012695, + "step": 12728, + "time_per_iteration": 2.8454959392547607 + }, + { + "auxiliary_loss_clip": 0.01347654, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.23742199, + "balance_loss_mlp": 1.01919556, + "epoch": 0.7653088832105817, + "flos": 28591557371280.0, + "grad_norm": 4.143437424782501, + "language_loss": 0.5614562, + "learning_rate": 5.503754755413424e-07, + "loss": 0.58526123, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13659668, + "step": 12729, + "time_per_iteration": 2.7818357944488525 + }, + { + "auxiliary_loss_clip": 0.01341335, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.23440766, + "balance_loss_mlp": 1.01730561, + "epoch": 0.7653690064632497, + "flos": 23371872411960.0, + "grad_norm": 1.6338570065624867, + "language_loss": 0.77916253, + "learning_rate": 5.501071847926055e-07, + "loss": 0.802881, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13220215, + "step": 12730, + "time_per_iteration": 2.7430734634399414 + }, + { + "auxiliary_loss_clip": 0.01353871, + "auxiliary_loss_mlp": 0.01044765, + "balance_loss_clip": 1.24367046, + "balance_loss_mlp": 1.03123522, + "epoch": 0.7654291297159176, + "flos": 15777539737200.0, + "grad_norm": 1.6085639577727482, + "language_loss": 0.69261217, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71659851, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13525391, + "step": 12731, + "time_per_iteration": 2.748967170715332 + }, + { + "auxiliary_loss_clip": 0.01343448, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.23437929, + "balance_loss_mlp": 1.01674032, + "epoch": 0.7654892529685856, + "flos": 18037438562520.0, + "grad_norm": 2.1504138769367764, + "language_loss": 0.70494938, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72868258, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13134766, + "step": 12732, + "time_per_iteration": 2.701852321624756 + }, + { + "auxiliary_loss_clip": 0.01349572, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.23903179, + "balance_loss_mlp": 1.01678979, + "epoch": 0.7655493762212535, + "flos": 27241876488960.0, + "grad_norm": 1.4667692155043617, + "language_loss": 0.78517902, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80898023, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13769531, + "step": 12733, + "time_per_iteration": 2.8320436477661133 + }, + { + "auxiliary_loss_clip": 0.01339533, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.23313451, + "balance_loss_mlp": 1.01780689, + "epoch": 0.7656094994739215, + "flos": 20778447523320.0, + "grad_norm": 1.6648712529698935, + "language_loss": 0.77976477, + "learning_rate": 5.490345717001726e-07, + "loss": 0.80346888, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.1307373, + "step": 12734, + "time_per_iteration": 2.787874698638916 + }, + { + "auxiliary_loss_clip": 0.01347828, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.2360121, + "balance_loss_mlp": 1.01820087, + "epoch": 0.7656696227265896, + "flos": 23044437602640.0, + "grad_norm": 2.2266977476178043, + "language_loss": 0.73307502, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75687778, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14227295, + "step": 12735, + "time_per_iteration": 2.907667636871338 + }, + { + "auxiliary_loss_clip": 0.01342928, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.23377693, + "balance_loss_mlp": 1.01681149, + "epoch": 0.7657297459792575, + "flos": 27531237637800.0, + "grad_norm": 3.3989390434143014, + "language_loss": 0.72926664, + "learning_rate": 5.484985952378145e-07, + "loss": 0.75299215, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12799072, + "step": 12736, + "time_per_iteration": 2.911857843399048 + }, + { + "auxiliary_loss_clip": 0.01353594, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.24154305, + "balance_loss_mlp": 1.0202018, + "epoch": 0.7657898692319255, + "flos": 17132865181560.0, + "grad_norm": 1.9283535648751025, + "language_loss": 0.78236973, + "learning_rate": 5.482306895631728e-07, + "loss": 0.80625212, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14428711, + "step": 12737, + "time_per_iteration": 2.7631237506866455 + }, + { + "auxiliary_loss_clip": 0.0133924, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.23094928, + "balance_loss_mlp": 1.01803291, + "epoch": 0.7658499924845934, + "flos": 21470131943280.0, + "grad_norm": 1.7632865853575748, + "language_loss": 0.76916224, + "learning_rate": 5.479628389397699e-07, + "loss": 0.79286683, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.1317749, + "step": 12738, + "time_per_iteration": 2.921065092086792 + }, + { + "auxiliary_loss_clip": 0.01352125, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.24038875, + "balance_loss_mlp": 1.01441193, + "epoch": 0.7659101157372614, + "flos": 29502628089840.0, + "grad_norm": 1.7189627096266722, + "language_loss": 0.6267947, + "learning_rate": 5.476950433777603e-07, + "loss": 0.65059745, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 1.11767578, + "router_z_loss_mlp": 0.13726807, + "step": 12739, + "time_per_iteration": 2.806175470352173 + }, + { + "auxiliary_loss_clip": 0.01345921, + "auxiliary_loss_mlp": 0.01032335, + "balance_loss_clip": 1.23761475, + "balance_loss_mlp": 1.018399, + "epoch": 0.7659702389899293, + "flos": 18556459925040.0, + "grad_norm": 1.7068092419447332, + "language_loss": 0.79196143, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81574392, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.1395874, + "step": 12740, + "time_per_iteration": 2.9770073890686035 + }, + { + "auxiliary_loss_clip": 0.0134067, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.23278451, + "balance_loss_mlp": 1.01767397, + "epoch": 0.7660303622425974, + "flos": 23554322084160.0, + "grad_norm": 1.7063788720652298, + "language_loss": 0.6626308, + "learning_rate": 5.471596174785429e-07, + "loss": 0.68634635, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13208008, + "step": 12741, + "time_per_iteration": 2.7745115756988525 + }, + { + "auxiliary_loss_clip": 0.0134118, + "auxiliary_loss_mlp": 0.01027598, + "balance_loss_clip": 1.23425603, + "balance_loss_mlp": 1.01433587, + "epoch": 0.7660904854952653, + "flos": 18921440486160.0, + "grad_norm": 1.5027900994945118, + "language_loss": 0.75883913, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78252697, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.13262939, + "step": 12742, + "time_per_iteration": 2.726992130279541 + }, + { + "auxiliary_loss_clip": 0.01335286, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.23090661, + "balance_loss_mlp": 1.0180372, + "epoch": 0.7661506087479333, + "flos": 23152567330080.0, + "grad_norm": 1.3284564397984588, + "language_loss": 0.76818115, + "learning_rate": 5.46624411946736e-07, + "loss": 0.79183483, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12042236, + "step": 12743, + "time_per_iteration": 4.211795330047607 + }, + { + "auxiliary_loss_clip": 0.01345254, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.23695803, + "balance_loss_mlp": 1.01508045, + "epoch": 0.7662107320006012, + "flos": 17570175877800.0, + "grad_norm": 1.822276115773764, + "language_loss": 0.74863172, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77236545, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.1305542, + "step": 12744, + "time_per_iteration": 4.166890859603882 + }, + { + "auxiliary_loss_clip": 0.01344671, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.23578918, + "balance_loss_mlp": 1.01449275, + "epoch": 0.7662708552532692, + "flos": 22307329409040.0, + "grad_norm": 1.9235350228828678, + "language_loss": 0.71466327, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73839033, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13531494, + "step": 12745, + "time_per_iteration": 2.7360501289367676 + }, + { + "auxiliary_loss_clip": 0.01338829, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.23037958, + "balance_loss_mlp": 1.01986015, + "epoch": 0.7663309785059371, + "flos": 15746288281200.0, + "grad_norm": 2.0781075916620124, + "language_loss": 0.77548951, + "learning_rate": 5.458220170154896e-07, + "loss": 0.79921412, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13757324, + "step": 12746, + "time_per_iteration": 2.823748826980591 + }, + { + "auxiliary_loss_clip": 0.01157455, + "auxiliary_loss_mlp": 0.01000154, + "balance_loss_clip": 1.11456752, + "balance_loss_mlp": 0.9981395, + "epoch": 0.7663911017586051, + "flos": 62179856848800.0, + "grad_norm": 0.6629629734240104, + "language_loss": 0.56765902, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58923513, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.0201416, + "step": 12747, + "time_per_iteration": 3.2951979637145996 + }, + { + "auxiliary_loss_clip": 0.01338776, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.23247278, + "balance_loss_mlp": 1.02286649, + "epoch": 0.7664512250112732, + "flos": 26511752933280.0, + "grad_norm": 1.3926717528886985, + "language_loss": 0.72359025, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74731588, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.10931396, + "step": 12748, + "time_per_iteration": 2.7761287689208984 + }, + { + "auxiliary_loss_clip": 0.01343192, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.23456514, + "balance_loss_mlp": 1.0155437, + "epoch": 0.7665113482639411, + "flos": 16253411394240.0, + "grad_norm": 1.703783581857414, + "language_loss": 0.69714546, + "learning_rate": 5.450201183674052e-07, + "loss": 0.72086322, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13043213, + "step": 12749, + "time_per_iteration": 2.69913649559021 + }, + { + "auxiliary_loss_clip": 0.01343231, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.23363972, + "balance_loss_mlp": 1.01491654, + "epoch": 0.7665714715166091, + "flos": 27203518569960.0, + "grad_norm": 1.5612893047621392, + "language_loss": 0.73556894, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75929105, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.14050293, + "step": 12750, + "time_per_iteration": 2.740316152572632 + }, + { + "auxiliary_loss_clip": 0.01334607, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.22938919, + "balance_loss_mlp": 1.01795006, + "epoch": 0.766631594769277, + "flos": 21072559850280.0, + "grad_norm": 1.8443365969059793, + "language_loss": 0.75793707, + "learning_rate": 5.444857951167026e-07, + "loss": 0.78158313, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.1204834, + "step": 12751, + "time_per_iteration": 2.8221402168273926 + }, + { + "auxiliary_loss_clip": 0.0134034, + "auxiliary_loss_mlp": 0.01038949, + "balance_loss_clip": 1.23392105, + "balance_loss_mlp": 1.02550852, + "epoch": 0.766691718021945, + "flos": 24103376651880.0, + "grad_norm": 3.086953575514685, + "language_loss": 0.61646992, + "learning_rate": 5.442187162761537e-07, + "loss": 0.64026284, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13452148, + "step": 12752, + "time_per_iteration": 2.731476306915283 + }, + { + "auxiliary_loss_clip": 0.01349521, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.2398355, + "balance_loss_mlp": 1.01792479, + "epoch": 0.7667518412746129, + "flos": 23445908098200.0, + "grad_norm": 1.9869569019302395, + "language_loss": 0.69431007, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71812469, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.14013672, + "step": 12753, + "time_per_iteration": 4.239405870437622 + }, + { + "auxiliary_loss_clip": 0.01342097, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.23458457, + "balance_loss_mlp": 1.01841688, + "epoch": 0.766811964527281, + "flos": 18153162053280.0, + "grad_norm": 2.0903157712477247, + "language_loss": 0.62714112, + "learning_rate": 5.436847242152971e-07, + "loss": 0.65087295, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12664795, + "step": 12754, + "time_per_iteration": 2.6975693702697754 + }, + { + "auxiliary_loss_clip": 0.01339976, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.23447526, + "balance_loss_mlp": 1.01872003, + "epoch": 0.7668720877799489, + "flos": 19540713554280.0, + "grad_norm": 4.0410795627983385, + "language_loss": 0.8033784, + "learning_rate": 5.434178110152401e-07, + "loss": 0.82708752, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12213135, + "step": 12755, + "time_per_iteration": 2.683945894241333 + }, + { + "auxiliary_loss_clip": 0.01339002, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.23267198, + "balance_loss_mlp": 1.01689339, + "epoch": 0.7669322110326169, + "flos": 22679457041520.0, + "grad_norm": 1.813025811271482, + "language_loss": 0.7086488, + "learning_rate": 5.431509530489242e-07, + "loss": 0.73233438, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12683105, + "step": 12756, + "time_per_iteration": 2.7644104957580566 + }, + { + "auxiliary_loss_clip": 0.01343894, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.23577666, + "balance_loss_mlp": 1.02947676, + "epoch": 0.7669923342852848, + "flos": 26475100565400.0, + "grad_norm": 1.6119259265017973, + "language_loss": 0.70121956, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72507858, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12536621, + "step": 12757, + "time_per_iteration": 2.7550761699676514 + }, + { + "auxiliary_loss_clip": 0.01336995, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.23018897, + "balance_loss_mlp": 1.0249505, + "epoch": 0.7670524575379528, + "flos": 22861216371600.0, + "grad_norm": 2.0826440982693324, + "language_loss": 0.76368898, + "learning_rate": 5.426174028579955e-07, + "loss": 0.7874493, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.14099121, + "step": 12758, + "time_per_iteration": 2.738858699798584 + }, + { + "auxiliary_loss_clip": 0.01341782, + "auxiliary_loss_mlp": 0.01033896, + "balance_loss_clip": 1.23687148, + "balance_loss_mlp": 1.02131319, + "epoch": 0.7671125807906207, + "flos": 22456822074120.0, + "grad_norm": 1.7169961159540976, + "language_loss": 0.76324701, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78700376, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12579346, + "step": 12759, + "time_per_iteration": 2.7717392444610596 + }, + { + "auxiliary_loss_clip": 0.01348436, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.23895335, + "balance_loss_mlp": 1.01729703, + "epoch": 0.7671727040432887, + "flos": 35378360310240.0, + "grad_norm": 1.691641899144513, + "language_loss": 0.68774736, + "learning_rate": 5.420840737234425e-07, + "loss": 0.71153057, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12567139, + "step": 12760, + "time_per_iteration": 2.9004580974578857 + }, + { + "auxiliary_loss_clip": 0.01345027, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.23718667, + "balance_loss_mlp": 1.01933253, + "epoch": 0.7672328272959568, + "flos": 22500459079920.0, + "grad_norm": 1.3553699400501473, + "language_loss": 0.79237008, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81614476, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13128662, + "step": 12761, + "time_per_iteration": 2.780900716781616 + }, + { + "auxiliary_loss_clip": 0.01336473, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.2314136, + "balance_loss_mlp": 1.01717567, + "epoch": 0.7672929505486247, + "flos": 22820097084120.0, + "grad_norm": 1.6917522394471776, + "language_loss": 0.66841042, + "learning_rate": 5.415509657261589e-07, + "loss": 0.69207221, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12524414, + "step": 12762, + "time_per_iteration": 2.790428876876831 + }, + { + "auxiliary_loss_clip": 0.01343527, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.23367548, + "balance_loss_mlp": 1.01772404, + "epoch": 0.7673530738012927, + "flos": 20343613937040.0, + "grad_norm": 1.6100122796803567, + "language_loss": 0.74767607, + "learning_rate": 5.412844946792639e-07, + "loss": 0.77142894, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.14031982, + "step": 12763, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.01341624, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_clip": 1.23562193, + "balance_loss_mlp": 1.02392983, + "epoch": 0.7674131970539606, + "flos": 34940318663520.0, + "grad_norm": 1.5294702982722979, + "language_loss": 0.70917737, + "learning_rate": 5.410180789470067e-07, + "loss": 0.73296177, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12890625, + "step": 12764, + "time_per_iteration": 2.814962863922119 + }, + { + "auxiliary_loss_clip": 0.0133907, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.2328968, + "balance_loss_mlp": 1.01754236, + "epoch": 0.7674733203066286, + "flos": 28334625320880.0, + "grad_norm": 1.62327717190694, + "language_loss": 0.69686162, + "learning_rate": 5.40751718539491e-07, + "loss": 0.72055066, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12310791, + "step": 12765, + "time_per_iteration": 2.7622923851013184 + }, + { + "auxiliary_loss_clip": 0.0133797, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.23213601, + "balance_loss_mlp": 1.01871967, + "epoch": 0.7675334435592965, + "flos": 16294083989760.0, + "grad_norm": 1.6559987987297429, + "language_loss": 0.60856932, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63225067, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11437988, + "step": 12766, + "time_per_iteration": 2.6841328144073486 + }, + { + "auxiliary_loss_clip": 0.01156615, + "auxiliary_loss_mlp": 0.01014131, + "balance_loss_clip": 1.11380744, + "balance_loss_mlp": 1.01177061, + "epoch": 0.7675935668119646, + "flos": 64843053545880.0, + "grad_norm": 0.9218230905804393, + "language_loss": 0.60848254, + "learning_rate": 5.402191637390803e-07, + "loss": 0.63019001, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02355957, + "step": 12767, + "time_per_iteration": 3.3588297367095947 + }, + { + "auxiliary_loss_clip": 0.01338466, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.23291385, + "balance_loss_mlp": 1.01601839, + "epoch": 0.7676536900646325, + "flos": 22680675292320.0, + "grad_norm": 1.6851108881608479, + "language_loss": 0.69998157, + "learning_rate": 5.399529693663801e-07, + "loss": 0.72364211, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11566162, + "step": 12768, + "time_per_iteration": 2.7516725063323975 + }, + { + "auxiliary_loss_clip": 0.01353729, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.24256444, + "balance_loss_mlp": 1.02574527, + "epoch": 0.7677138133173005, + "flos": 26944921576800.0, + "grad_norm": 1.651271364602172, + "language_loss": 0.71018863, + "learning_rate": 5.3968683035881e-07, + "loss": 0.73411161, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12835693, + "step": 12769, + "time_per_iteration": 2.7445931434631348 + }, + { + "auxiliary_loss_clip": 0.01348298, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.23806453, + "balance_loss_mlp": 1.02049434, + "epoch": 0.7677739365699684, + "flos": 23804066454840.0, + "grad_norm": 1.8672061659690855, + "language_loss": 0.80861962, + "learning_rate": 5.394207467264611e-07, + "loss": 0.83243644, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12890625, + "step": 12770, + "time_per_iteration": 2.7906129360198975 + }, + { + "auxiliary_loss_clip": 0.01338368, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.234056, + "balance_loss_mlp": 1.02023602, + "epoch": 0.7678340598226364, + "flos": 34461685638000.0, + "grad_norm": 1.6934929511489645, + "language_loss": 0.7871207, + "learning_rate": 5.391547184794245e-07, + "loss": 0.8108294, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.12255859, + "step": 12771, + "time_per_iteration": 2.8400216102600098 + }, + { + "auxiliary_loss_clip": 0.01342225, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.23437119, + "balance_loss_mlp": 1.02025807, + "epoch": 0.7678941830753043, + "flos": 23847013118520.0, + "grad_norm": 1.3231687912052226, + "language_loss": 0.68373841, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70749241, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12915039, + "step": 12772, + "time_per_iteration": 2.816847085952759 + }, + { + "auxiliary_loss_clip": 0.01328459, + "auxiliary_loss_mlp": 0.01023007, + "balance_loss_clip": 1.22574377, + "balance_loss_mlp": 1.01150966, + "epoch": 0.7679543063279723, + "flos": 25416445774680.0, + "grad_norm": 1.3938198348179625, + "language_loss": 0.73256403, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75607872, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.11480713, + "step": 12773, + "time_per_iteration": 2.761707305908203 + }, + { + "auxiliary_loss_clip": 0.01335856, + "auxiliary_loss_mlp": 0.01028677, + "balance_loss_clip": 1.23163795, + "balance_loss_mlp": 1.0173279, + "epoch": 0.7680144295806404, + "flos": 27967898600280.0, + "grad_norm": 1.6285470996842275, + "language_loss": 0.81404126, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83768654, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11346436, + "step": 12774, + "time_per_iteration": 2.897858142852783 + }, + { + "auxiliary_loss_clip": 0.01336431, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.23158407, + "balance_loss_mlp": 1.01905727, + "epoch": 0.7680745528333083, + "flos": 20417730840000.0, + "grad_norm": 1.777608496911148, + "language_loss": 0.70310044, + "learning_rate": 5.380911595461177e-07, + "loss": 0.7267791, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.12371826, + "step": 12775, + "time_per_iteration": 2.7299978733062744 + }, + { + "auxiliary_loss_clip": 0.01156059, + "auxiliary_loss_mlp": 0.01003385, + "balance_loss_clip": 1.11273241, + "balance_loss_mlp": 1.00111985, + "epoch": 0.7681346760859763, + "flos": 68418295562160.0, + "grad_norm": 0.7002370071387365, + "language_loss": 0.5689404, + "learning_rate": 5.378254083769147e-07, + "loss": 0.59053487, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02270508, + "step": 12776, + "time_per_iteration": 3.3091177940368652 + }, + { + "auxiliary_loss_clip": 0.01339753, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.23395348, + "balance_loss_mlp": 1.01949561, + "epoch": 0.7681947993386442, + "flos": 21256430815080.0, + "grad_norm": 1.7672990619156934, + "language_loss": 0.74190533, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76561552, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.11767578, + "step": 12777, + "time_per_iteration": 2.841900587081909 + }, + { + "auxiliary_loss_clip": 0.0133999, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.23386395, + "balance_loss_mlp": 1.02318656, + "epoch": 0.7682549225913122, + "flos": 21402715419720.0, + "grad_norm": 1.92548145163259, + "language_loss": 0.70070231, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72445273, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.11877441, + "step": 12778, + "time_per_iteration": 2.7268786430358887 + }, + { + "auxiliary_loss_clip": 0.01341376, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.23521757, + "balance_loss_mlp": 1.01345503, + "epoch": 0.7683150458439801, + "flos": 23044072127400.0, + "grad_norm": 2.919440240160351, + "language_loss": 0.70871401, + "learning_rate": 5.37028487584446e-07, + "loss": 0.73238385, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12164307, + "step": 12779, + "time_per_iteration": 2.7681634426116943 + }, + { + "auxiliary_loss_clip": 0.01347408, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.23954535, + "balance_loss_mlp": 1.01869822, + "epoch": 0.7683751690966482, + "flos": 67343299229520.0, + "grad_norm": 1.6142989837789135, + "language_loss": 0.58821917, + "learning_rate": 5.367629582589133e-07, + "loss": 0.61200798, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12768555, + "step": 12780, + "time_per_iteration": 3.148128032684326 + }, + { + "auxiliary_loss_clip": 0.01347415, + "auxiliary_loss_mlp": 0.01036033, + "balance_loss_clip": 1.23706627, + "balance_loss_mlp": 1.0224669, + "epoch": 0.7684352923493161, + "flos": 21803942265120.0, + "grad_norm": 1.6717392822815613, + "language_loss": 0.6857053, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70953977, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13574219, + "step": 12781, + "time_per_iteration": 2.772061824798584 + }, + { + "auxiliary_loss_clip": 0.01343207, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.23619878, + "balance_loss_mlp": 1.01783895, + "epoch": 0.7684954156019841, + "flos": 25853025520440.0, + "grad_norm": 1.4773295511826563, + "language_loss": 0.79678816, + "learning_rate": 5.362320660762016e-07, + "loss": 0.82052326, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12457275, + "step": 12782, + "time_per_iteration": 4.257019996643066 + }, + { + "auxiliary_loss_clip": 0.01345199, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.2373054, + "balance_loss_mlp": 1.02081251, + "epoch": 0.768555538854652, + "flos": 25452895100760.0, + "grad_norm": 1.5154531115071623, + "language_loss": 0.66960782, + "learning_rate": 5.35966703239153e-07, + "loss": 0.69339949, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13140869, + "step": 12783, + "time_per_iteration": 4.324413776397705 + }, + { + "auxiliary_loss_clip": 0.01341778, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.23492289, + "balance_loss_mlp": 1.02034831, + "epoch": 0.76861566210732, + "flos": 19651157958240.0, + "grad_norm": 1.8333261337938607, + "language_loss": 0.69532973, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71908009, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12927246, + "step": 12784, + "time_per_iteration": 4.297109365463257 + }, + { + "auxiliary_loss_clip": 0.01340286, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.23308539, + "balance_loss_mlp": 1.01637065, + "epoch": 0.7686757853599879, + "flos": 22424149325520.0, + "grad_norm": 1.7399945153736451, + "language_loss": 0.80790907, + "learning_rate": 5.354361441239843e-07, + "loss": 0.83158958, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.11407471, + "step": 12785, + "time_per_iteration": 2.7057712078094482 + }, + { + "auxiliary_loss_clip": 0.01344952, + "auxiliary_loss_mlp": 0.01032515, + "balance_loss_clip": 1.23711038, + "balance_loss_mlp": 1.01909184, + "epoch": 0.768735908612656, + "flos": 47782379693160.0, + "grad_norm": 1.5270804303796852, + "language_loss": 0.77474523, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79851997, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13433838, + "step": 12786, + "time_per_iteration": 3.0603578090667725 + }, + { + "auxiliary_loss_clip": 0.01337247, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.23047674, + "balance_loss_mlp": 1.02331448, + "epoch": 0.7687960318653239, + "flos": 30269282188320.0, + "grad_norm": 2.0861282091305178, + "language_loss": 0.58866256, + "learning_rate": 5.349058071544468e-07, + "loss": 0.61239004, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12194824, + "step": 12787, + "time_per_iteration": 2.7622084617614746 + }, + { + "auxiliary_loss_clip": 0.01339539, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.23391438, + "balance_loss_mlp": 1.01664114, + "epoch": 0.7688561551179919, + "flos": 19578218697720.0, + "grad_norm": 1.6070551403570705, + "language_loss": 0.76286936, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78655756, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12634277, + "step": 12788, + "time_per_iteration": 2.7315216064453125 + }, + { + "auxiliary_loss_clip": 0.01344017, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.23566258, + "balance_loss_mlp": 1.01893067, + "epoch": 0.7689162783706599, + "flos": 22788926844840.0, + "grad_norm": 1.6823477961869038, + "language_loss": 0.670187, + "learning_rate": 5.343756924109821e-07, + "loss": 0.69394577, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12927246, + "step": 12789, + "time_per_iteration": 2.7166974544525146 + }, + { + "auxiliary_loss_clip": 0.01344474, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.2358036, + "balance_loss_mlp": 1.02060008, + "epoch": 0.7689764016233278, + "flos": 34210885449960.0, + "grad_norm": 4.417120985341808, + "language_loss": 0.69024509, + "learning_rate": 5.341107183991553e-07, + "loss": 0.71403092, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13513184, + "step": 12790, + "time_per_iteration": 3.074838876724243 + }, + { + "auxiliary_loss_clip": 0.01336693, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.23031998, + "balance_loss_mlp": 1.01626849, + "epoch": 0.7690365248759958, + "flos": 17278906136040.0, + "grad_norm": 1.5405908346056714, + "language_loss": 0.68991745, + "learning_rate": 5.338457999739969e-07, + "loss": 0.71357763, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13049316, + "step": 12791, + "time_per_iteration": 2.9688258171081543 + }, + { + "auxiliary_loss_clip": 0.01338394, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.23283327, + "balance_loss_mlp": 1.02035558, + "epoch": 0.7690966481286637, + "flos": 18227725648200.0, + "grad_norm": 1.7571036509972136, + "language_loss": 0.7992177, + "learning_rate": 5.335809371455526e-07, + "loss": 0.82292753, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12249756, + "step": 12792, + "time_per_iteration": 4.200439453125 + }, + { + "auxiliary_loss_clip": 0.0135061, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.2390132, + "balance_loss_mlp": 1.01647115, + "epoch": 0.7691567713813318, + "flos": 21541812344640.0, + "grad_norm": 2.087592079810805, + "language_loss": 0.73060989, + "learning_rate": 5.333161299238673e-07, + "loss": 0.75441468, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.1338501, + "step": 12793, + "time_per_iteration": 2.7294931411743164 + }, + { + "auxiliary_loss_clip": 0.01347983, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.23907852, + "balance_loss_mlp": 1.01925731, + "epoch": 0.7692168946339997, + "flos": 39386689753320.0, + "grad_norm": 1.5652922378753484, + "language_loss": 0.63262248, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65641749, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12261963, + "step": 12794, + "time_per_iteration": 2.912990093231201 + }, + { + "auxiliary_loss_clip": 0.01347965, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.23963761, + "balance_loss_mlp": 1.01855683, + "epoch": 0.7692770178866677, + "flos": 25015462579440.0, + "grad_norm": 1.4451149732939161, + "language_loss": 0.76764154, + "learning_rate": 5.327866823409319e-07, + "loss": 0.79143107, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12438965, + "step": 12795, + "time_per_iteration": 2.848114252090454 + }, + { + "auxiliary_loss_clip": 0.01343306, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.23542297, + "balance_loss_mlp": 1.01584244, + "epoch": 0.7693371411393356, + "flos": 24721512685920.0, + "grad_norm": 1.4882569170110012, + "language_loss": 0.71751523, + "learning_rate": 5.325220419997601e-07, + "loss": 0.7412383, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13171387, + "step": 12796, + "time_per_iteration": 2.8105807304382324 + }, + { + "auxiliary_loss_clip": 0.01343108, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.23621249, + "balance_loss_mlp": 1.01850402, + "epoch": 0.7693972643920036, + "flos": 15929103428640.0, + "grad_norm": 1.8363927342197253, + "language_loss": 0.65064478, + "learning_rate": 5.32257457305499e-07, + "loss": 0.67438519, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12414551, + "step": 12797, + "time_per_iteration": 2.782003879547119 + }, + { + "auxiliary_loss_clip": 0.01344676, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.23624253, + "balance_loss_mlp": 1.02093267, + "epoch": 0.7694573876446715, + "flos": 25410760604280.0, + "grad_norm": 1.8848784004722097, + "language_loss": 0.91850257, + "learning_rate": 5.319929282681823e-07, + "loss": 0.94229388, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.13543701, + "step": 12798, + "time_per_iteration": 2.7752127647399902 + }, + { + "auxiliary_loss_clip": 0.0134317, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.23532188, + "balance_loss_mlp": 1.01483083, + "epoch": 0.7695175108973396, + "flos": 16658577250560.0, + "grad_norm": 1.749933235826421, + "language_loss": 0.82655811, + "learning_rate": 5.317284548978418e-07, + "loss": 0.85026085, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12268066, + "step": 12799, + "time_per_iteration": 2.6571476459503174 + }, + { + "auxiliary_loss_clip": 0.01352989, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.24423218, + "balance_loss_mlp": 1.01757574, + "epoch": 0.7695776341500075, + "flos": 13630562425800.0, + "grad_norm": 1.9504897366842866, + "language_loss": 0.77653378, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80036998, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1305542, + "step": 12800, + "time_per_iteration": 2.7254538536071777 + }, + { + "auxiliary_loss_clip": 0.01355876, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.24198532, + "balance_loss_mlp": 1.01832008, + "epoch": 0.7696377574026755, + "flos": 24281643663000.0, + "grad_norm": 1.5416244360586568, + "language_loss": 0.83835709, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86223692, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13781738, + "step": 12801, + "time_per_iteration": 2.767754316329956 + }, + { + "auxiliary_loss_clip": 0.01347451, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.23917699, + "balance_loss_mlp": 1.02002978, + "epoch": 0.7696978806553435, + "flos": 20928062013480.0, + "grad_norm": 1.8225996458058242, + "language_loss": 0.72024453, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74405193, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1328125, + "step": 12802, + "time_per_iteration": 2.726792097091675 + }, + { + "auxiliary_loss_clip": 0.01336162, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.23069835, + "balance_loss_mlp": 1.01985919, + "epoch": 0.7697580039080114, + "flos": 22934764757520.0, + "grad_norm": 1.9423734474090004, + "language_loss": 0.76372272, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78741336, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13049316, + "step": 12803, + "time_per_iteration": 2.8065333366394043 + }, + { + "auxiliary_loss_clip": 0.01159646, + "auxiliary_loss_mlp": 0.00999879, + "balance_loss_clip": 1.11717212, + "balance_loss_mlp": 0.99745941, + "epoch": 0.7698181271606794, + "flos": 68733344821680.0, + "grad_norm": 0.7399673066161623, + "language_loss": 0.55873036, + "learning_rate": 5.304069234017001e-07, + "loss": 0.5803256, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02416992, + "step": 12804, + "time_per_iteration": 3.2646849155426025 + }, + { + "auxiliary_loss_clip": 0.01159699, + "auxiliary_loss_mlp": 0.01008935, + "balance_loss_clip": 1.1171186, + "balance_loss_mlp": 1.00614536, + "epoch": 0.7698782504133473, + "flos": 67425960869280.0, + "grad_norm": 0.7500489040233266, + "language_loss": 0.5406096, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56229591, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0279541, + "step": 12805, + "time_per_iteration": 3.323507785797119 + }, + { + "auxiliary_loss_clip": 0.01342988, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.23618305, + "balance_loss_mlp": 1.01887918, + "epoch": 0.7699383736660154, + "flos": 22493555658720.0, + "grad_norm": 1.9429726767166313, + "language_loss": 0.73265588, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75640535, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13092041, + "step": 12806, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.01346578, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.23864067, + "balance_loss_mlp": 1.02408826, + "epoch": 0.7699984969186833, + "flos": 21544248846240.0, + "grad_norm": 1.8272632584289357, + "language_loss": 0.75300694, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77684343, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12988281, + "step": 12807, + "time_per_iteration": 2.8076233863830566 + }, + { + "auxiliary_loss_clip": 0.01354524, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.24263716, + "balance_loss_mlp": 1.02015924, + "epoch": 0.7700586201713513, + "flos": 21723246807840.0, + "grad_norm": 1.8694221483862483, + "language_loss": 0.80502725, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82891917, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.1451416, + "step": 12808, + "time_per_iteration": 2.8403732776641846 + }, + { + "auxiliary_loss_clip": 0.01351731, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.24081683, + "balance_loss_mlp": 1.01859653, + "epoch": 0.7701187434240192, + "flos": 27861921115920.0, + "grad_norm": 3.3811897975310323, + "language_loss": 0.79070663, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81453973, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12988281, + "step": 12809, + "time_per_iteration": 2.8015294075012207 + }, + { + "auxiliary_loss_clip": 0.0133422, + "auxiliary_loss_mlp": 0.01023966, + "balance_loss_clip": 1.22920728, + "balance_loss_mlp": 1.01164603, + "epoch": 0.7701788666766872, + "flos": 28627194530160.0, + "grad_norm": 1.4992853051170905, + "language_loss": 0.70347667, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72705847, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12329102, + "step": 12810, + "time_per_iteration": 2.8177573680877686 + }, + { + "auxiliary_loss_clip": 0.01347283, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.23671842, + "balance_loss_mlp": 1.02056038, + "epoch": 0.7702389899293551, + "flos": 14250525836040.0, + "grad_norm": 4.124225880835189, + "language_loss": 0.79247713, + "learning_rate": 5.285591201262079e-07, + "loss": 0.81630176, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14630127, + "step": 12811, + "time_per_iteration": 2.6776316165924072 + }, + { + "auxiliary_loss_clip": 0.01161483, + "auxiliary_loss_mlp": 0.00997788, + "balance_loss_clip": 1.11900091, + "balance_loss_mlp": 0.99533236, + "epoch": 0.7702991131820232, + "flos": 70589499083280.0, + "grad_norm": 0.8035493406568934, + "language_loss": 0.56728864, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58888137, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02453613, + "step": 12812, + "time_per_iteration": 3.306445598602295 + }, + { + "auxiliary_loss_clip": 0.0134664, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.23665261, + "balance_loss_mlp": 1.02051818, + "epoch": 0.7703592364346911, + "flos": 25485039940680.0, + "grad_norm": 1.5251330831969987, + "language_loss": 0.71750677, + "learning_rate": 5.280316783577836e-07, + "loss": 0.7413097, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13128662, + "step": 12813, + "time_per_iteration": 2.7647392749786377 + }, + { + "auxiliary_loss_clip": 0.01347558, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_clip": 1.23821378, + "balance_loss_mlp": 1.01466966, + "epoch": 0.7704193596873591, + "flos": 19285568271720.0, + "grad_norm": 1.6604427763371934, + "language_loss": 0.66675413, + "learning_rate": 5.27768041194351e-07, + "loss": 0.6905092, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.1328125, + "step": 12814, + "time_per_iteration": 2.746286392211914 + }, + { + "auxiliary_loss_clip": 0.01341353, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.2341429, + "balance_loss_mlp": 1.01879835, + "epoch": 0.7704794829400271, + "flos": 23663588845680.0, + "grad_norm": 1.8772085584654683, + "language_loss": 0.65758651, + "learning_rate": 5.275044598581018e-07, + "loss": 0.68131417, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12609863, + "step": 12815, + "time_per_iteration": 2.789292097091675 + }, + { + "auxiliary_loss_clip": 0.0134246, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.23378205, + "balance_loss_mlp": 1.01886463, + "epoch": 0.770539606192695, + "flos": 18993973663080.0, + "grad_norm": 2.102234992483836, + "language_loss": 0.65209186, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67583668, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13165283, + "step": 12816, + "time_per_iteration": 2.7154147624969482 + }, + { + "auxiliary_loss_clip": 0.01349067, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.23938107, + "balance_loss_mlp": 1.01935863, + "epoch": 0.770599729445363, + "flos": 11832850240200.0, + "grad_norm": 3.8300842996983047, + "language_loss": 0.71673036, + "learning_rate": 5.26977464707133e-07, + "loss": 0.74053752, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1229248, + "step": 12817, + "time_per_iteration": 2.742715358734131 + }, + { + "auxiliary_loss_clip": 0.01347848, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.23886943, + "balance_loss_mlp": 1.01891232, + "epoch": 0.770659852698031, + "flos": 17827229753280.0, + "grad_norm": 1.8836374144551298, + "language_loss": 0.61178803, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63558495, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12927246, + "step": 12818, + "time_per_iteration": 2.9509992599487305 + }, + { + "auxiliary_loss_clip": 0.0134068, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.2366184, + "balance_loss_mlp": 1.0167613, + "epoch": 0.770719975950699, + "flos": 21877043959080.0, + "grad_norm": 1.6094276659955946, + "language_loss": 0.67640173, + "learning_rate": 5.264506929848093e-07, + "loss": 0.70009381, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.11779785, + "step": 12819, + "time_per_iteration": 2.755572557449341 + }, + { + "auxiliary_loss_clip": 0.01349691, + "auxiliary_loss_mlp": 0.0103506, + "balance_loss_clip": 1.2405529, + "balance_loss_mlp": 1.02214932, + "epoch": 0.7707800992033669, + "flos": 21330101026080.0, + "grad_norm": 1.643208334928731, + "language_loss": 0.57409096, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59793842, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.12908936, + "step": 12820, + "time_per_iteration": 2.73688006401062 + }, + { + "auxiliary_loss_clip": 0.01343068, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.23422837, + "balance_loss_mlp": 1.01785994, + "epoch": 0.7708402224560349, + "flos": 28184726572200.0, + "grad_norm": 1.5613961913527006, + "language_loss": 0.80982977, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83356851, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1295166, + "step": 12821, + "time_per_iteration": 5.721849203109741 + }, + { + "auxiliary_loss_clip": 0.01344248, + "auxiliary_loss_mlp": 0.01034385, + "balance_loss_clip": 1.23675382, + "balance_loss_mlp": 1.0212121, + "epoch": 0.7709003457087028, + "flos": 15381226503360.0, + "grad_norm": 2.5096637620448083, + "language_loss": 0.68811506, + "learning_rate": 5.256609545048114e-07, + "loss": 0.71190143, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13183594, + "step": 12822, + "time_per_iteration": 4.112893342971802 + }, + { + "auxiliary_loss_clip": 0.01339691, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.23393369, + "balance_loss_mlp": 1.02436602, + "epoch": 0.7709604689613708, + "flos": 30627196894800.0, + "grad_norm": 1.5340945900598464, + "language_loss": 0.72447836, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74824172, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12286377, + "step": 12823, + "time_per_iteration": 2.9603211879730225 + }, + { + "auxiliary_loss_clip": 0.01353028, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.24051344, + "balance_loss_mlp": 1.0240221, + "epoch": 0.7710205922140387, + "flos": 20305905751800.0, + "grad_norm": 2.3620334637711013, + "language_loss": 0.76672542, + "learning_rate": 5.251347417035969e-07, + "loss": 0.79064393, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14788818, + "step": 12824, + "time_per_iteration": 2.672053575515747 + }, + { + "auxiliary_loss_clip": 0.01349137, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.24176455, + "balance_loss_mlp": 1.01628315, + "epoch": 0.7710807154667068, + "flos": 19648721456640.0, + "grad_norm": 1.663332104319107, + "language_loss": 0.73020327, + "learning_rate": 5.248717191885592e-07, + "loss": 0.75398761, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13018799, + "step": 12825, + "time_per_iteration": 2.710801124572754 + }, + { + "auxiliary_loss_clip": 0.01337964, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.23407638, + "balance_loss_mlp": 1.0256325, + "epoch": 0.7711408387193747, + "flos": 20010534565680.0, + "grad_norm": 1.9007799933054035, + "language_loss": 0.74080569, + "learning_rate": 5.246087526105343e-07, + "loss": 0.76455426, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11260986, + "step": 12826, + "time_per_iteration": 2.741839647293091 + }, + { + "auxiliary_loss_clip": 0.01345747, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.23522878, + "balance_loss_mlp": 1.01648378, + "epoch": 0.7712009619720427, + "flos": 24976333101600.0, + "grad_norm": 1.5741862664235784, + "language_loss": 0.81687832, + "learning_rate": 5.243458419794933e-07, + "loss": 0.84064031, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13970947, + "step": 12827, + "time_per_iteration": 2.8451783657073975 + }, + { + "auxiliary_loss_clip": 0.01156209, + "auxiliary_loss_mlp": 0.01000887, + "balance_loss_clip": 1.1138761, + "balance_loss_mlp": 0.99844283, + "epoch": 0.7712610852247107, + "flos": 63265296175920.0, + "grad_norm": 0.8825062350034076, + "language_loss": 0.5521822, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57375318, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02441406, + "step": 12828, + "time_per_iteration": 3.536884307861328 + }, + { + "auxiliary_loss_clip": 0.0133485, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.22969651, + "balance_loss_mlp": 1.0209856, + "epoch": 0.7713212084773786, + "flos": 18702541487880.0, + "grad_norm": 1.7678884815121745, + "language_loss": 0.6988517, + "learning_rate": 5.23820188598238e-07, + "loss": 0.72252703, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11694336, + "step": 12829, + "time_per_iteration": 2.702026605606079 + }, + { + "auxiliary_loss_clip": 0.01348967, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.2383914, + "balance_loss_mlp": 1.02519453, + "epoch": 0.7713813317300466, + "flos": 14177424142080.0, + "grad_norm": 2.582400108285294, + "language_loss": 0.80012476, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82400382, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.1373291, + "step": 12830, + "time_per_iteration": 4.277104139328003 + }, + { + "auxiliary_loss_clip": 0.01350725, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.2391876, + "balance_loss_mlp": 1.02440739, + "epoch": 0.7714414549827145, + "flos": 25709664717720.0, + "grad_norm": 1.5088443142829442, + "language_loss": 0.77853847, + "learning_rate": 5.232947591245269e-07, + "loss": 0.8024286, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13891602, + "step": 12831, + "time_per_iteration": 2.902784824371338 + }, + { + "auxiliary_loss_clip": 0.01342764, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.23466086, + "balance_loss_mlp": 1.01342976, + "epoch": 0.7715015782353826, + "flos": 30561485922360.0, + "grad_norm": 1.5539737832174636, + "language_loss": 0.60927832, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63296568, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12542725, + "step": 12832, + "time_per_iteration": 2.852203607559204 + }, + { + "auxiliary_loss_clip": 0.01346996, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.23715854, + "balance_loss_mlp": 1.02160811, + "epoch": 0.7715617014880505, + "flos": 20234103525360.0, + "grad_norm": 1.5016842675859041, + "language_loss": 0.79604352, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81986076, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13116455, + "step": 12833, + "time_per_iteration": 2.816749334335327 + }, + { + "auxiliary_loss_clip": 0.01157241, + "auxiliary_loss_mlp": 0.01002663, + "balance_loss_clip": 1.11449862, + "balance_loss_mlp": 1.00020754, + "epoch": 0.7716218247407185, + "flos": 63677470297320.0, + "grad_norm": 0.847800073233694, + "language_loss": 0.55471867, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57631773, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02453613, + "step": 12834, + "time_per_iteration": 3.2673568725585938 + }, + { + "auxiliary_loss_clip": 0.0134783, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.23873234, + "balance_loss_mlp": 1.01728404, + "epoch": 0.7716819479933864, + "flos": 19796549178960.0, + "grad_norm": 2.057328686038464, + "language_loss": 0.72867417, + "learning_rate": 5.222445722184903e-07, + "loss": 0.7524544, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12915039, + "step": 12835, + "time_per_iteration": 2.744597911834717 + }, + { + "auxiliary_loss_clip": 0.0134344, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.23429203, + "balance_loss_mlp": 1.02672303, + "epoch": 0.7717420712460544, + "flos": 18446787079920.0, + "grad_norm": 2.589952028727005, + "language_loss": 0.70363629, + "learning_rate": 5.219821655586814e-07, + "loss": 0.72747076, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13269043, + "step": 12836, + "time_per_iteration": 2.779425859451294 + }, + { + "auxiliary_loss_clip": 0.01335837, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.23061299, + "balance_loss_mlp": 1.01850986, + "epoch": 0.7718021944987223, + "flos": 35196316721640.0, + "grad_norm": 1.6324669212381469, + "language_loss": 0.59915578, + "learning_rate": 5.217198149454575e-07, + "loss": 0.62282288, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12359619, + "step": 12837, + "time_per_iteration": 2.8683834075927734 + }, + { + "auxiliary_loss_clip": 0.01156517, + "auxiliary_loss_mlp": 0.01003458, + "balance_loss_clip": 1.11363351, + "balance_loss_mlp": 1.00088358, + "epoch": 0.7718623177513904, + "flos": 67939824970080.0, + "grad_norm": 0.8766203016453947, + "language_loss": 0.55836231, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57996207, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02575684, + "step": 12838, + "time_per_iteration": 3.1787002086639404 + }, + { + "auxiliary_loss_clip": 0.01341214, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.2348237, + "balance_loss_mlp": 1.01909161, + "epoch": 0.7719224410040583, + "flos": 18585153054360.0, + "grad_norm": 4.000538102267801, + "language_loss": 0.69702268, + "learning_rate": 5.211952818985538e-07, + "loss": 0.7207557, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.13000488, + "step": 12839, + "time_per_iteration": 2.7311692237854004 + }, + { + "auxiliary_loss_clip": 0.01336895, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.23110461, + "balance_loss_mlp": 1.01732206, + "epoch": 0.7719825642567263, + "flos": 23081008753800.0, + "grad_norm": 1.8325697048501088, + "language_loss": 0.80356616, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82723063, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12225342, + "step": 12840, + "time_per_iteration": 2.757448673248291 + }, + { + "auxiliary_loss_clip": 0.0134516, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.23732352, + "balance_loss_mlp": 1.01638174, + "epoch": 0.7720426875093943, + "flos": 20343898195560.0, + "grad_norm": 1.6921108644667504, + "language_loss": 0.80029756, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82403749, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12469482, + "step": 12841, + "time_per_iteration": 2.7296059131622314 + }, + { + "auxiliary_loss_clip": 0.01344175, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.23535419, + "balance_loss_mlp": 1.01821136, + "epoch": 0.7721028107620622, + "flos": 23886305029800.0, + "grad_norm": 1.482302929482822, + "language_loss": 0.76812255, + "learning_rate": 5.204089029262208e-07, + "loss": 0.79187322, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12689209, + "step": 12842, + "time_per_iteration": 2.7357146739959717 + }, + { + "auxiliary_loss_clip": 0.01349599, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.23957872, + "balance_loss_mlp": 1.01947141, + "epoch": 0.7721629340147302, + "flos": 26657265979080.0, + "grad_norm": 1.4595337908467445, + "language_loss": 0.68848372, + "learning_rate": 5.201468888013445e-07, + "loss": 0.71230984, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13543701, + "step": 12843, + "time_per_iteration": 2.8031647205352783 + }, + { + "auxiliary_loss_clip": 0.01351901, + "auxiliary_loss_mlp": 0.01029014, + "balance_loss_clip": 1.24029183, + "balance_loss_mlp": 1.01619315, + "epoch": 0.7722230572673981, + "flos": 21183857029800.0, + "grad_norm": 1.8563880317439498, + "language_loss": 0.74704921, + "learning_rate": 5.198849307926465e-07, + "loss": 0.77085841, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12817383, + "step": 12844, + "time_per_iteration": 2.7679712772369385 + }, + { + "auxiliary_loss_clip": 0.01340257, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.23406363, + "balance_loss_mlp": 1.01709163, + "epoch": 0.7722831805200662, + "flos": 27970456926960.0, + "grad_norm": 1.3718814405743187, + "language_loss": 0.71859944, + "learning_rate": 5.196230289100596e-07, + "loss": 0.74229795, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12512207, + "step": 12845, + "time_per_iteration": 2.819138765335083 + }, + { + "auxiliary_loss_clip": 0.01337261, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.23218083, + "balance_loss_mlp": 1.02155995, + "epoch": 0.7723433037727341, + "flos": 33882191781480.0, + "grad_norm": 1.722910578830386, + "language_loss": 0.64379519, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66750544, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12213135, + "step": 12846, + "time_per_iteration": 2.8249309062957764 + }, + { + "auxiliary_loss_clip": 0.01153728, + "auxiliary_loss_mlp": 0.0100745, + "balance_loss_clip": 1.11169362, + "balance_loss_mlp": 1.00498223, + "epoch": 0.7724034270254021, + "flos": 62863825680360.0, + "grad_norm": 0.7860889160966305, + "language_loss": 0.61746359, + "learning_rate": 5.19099393562945e-07, + "loss": 0.6390754, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.0246582, + "step": 12847, + "time_per_iteration": 3.1882503032684326 + }, + { + "auxiliary_loss_clip": 0.01339901, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.23130322, + "balance_loss_mlp": 1.01861131, + "epoch": 0.77246355027807, + "flos": 23300801136000.0, + "grad_norm": 1.82057030239489, + "language_loss": 0.78938776, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81309718, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12432861, + "step": 12848, + "time_per_iteration": 2.795104742050171 + }, + { + "auxiliary_loss_clip": 0.01347991, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.23698056, + "balance_loss_mlp": 1.01771998, + "epoch": 0.772523673530738, + "flos": 20126217448080.0, + "grad_norm": 1.9621635493382872, + "language_loss": 0.72846127, + "learning_rate": 5.185759828394261e-07, + "loss": 0.75224799, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12988281, + "step": 12849, + "time_per_iteration": 2.7772135734558105 + }, + { + "auxiliary_loss_clip": 0.01342317, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.23469687, + "balance_loss_mlp": 1.01855636, + "epoch": 0.7725837967834059, + "flos": 17824387168080.0, + "grad_norm": 1.9952745616521204, + "language_loss": 0.78601265, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80974704, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12567139, + "step": 12850, + "time_per_iteration": 2.828488349914551 + }, + { + "auxiliary_loss_clip": 0.01346754, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.23679185, + "balance_loss_mlp": 1.01874828, + "epoch": 0.772643920036074, + "flos": 27204696212400.0, + "grad_norm": 1.5197525032014016, + "language_loss": 0.80068862, + "learning_rate": 5.180527968188935e-07, + "loss": 0.82447863, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13500977, + "step": 12851, + "time_per_iteration": 2.7932238578796387 + }, + { + "auxiliary_loss_clip": 0.01342978, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.23679829, + "balance_loss_mlp": 1.01464224, + "epoch": 0.7727040432887419, + "flos": 21584474749800.0, + "grad_norm": 1.3887349996542266, + "language_loss": 0.73380494, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75751281, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13146973, + "step": 12852, + "time_per_iteration": 2.831817388534546 + }, + { + "auxiliary_loss_clip": 0.01336774, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.23023498, + "balance_loss_mlp": 1.02183747, + "epoch": 0.7727641665414099, + "flos": 22241740261680.0, + "grad_norm": 2.117575057763283, + "language_loss": 0.82666349, + "learning_rate": 5.17529835580704e-07, + "loss": 0.85037863, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12902832, + "step": 12853, + "time_per_iteration": 2.745661973953247 + }, + { + "auxiliary_loss_clip": 0.01153975, + "auxiliary_loss_mlp": 0.01004047, + "balance_loss_clip": 1.11187971, + "balance_loss_mlp": 1.00165081, + "epoch": 0.7728242897940779, + "flos": 54848141394840.0, + "grad_norm": 0.8303680395022478, + "language_loss": 0.54628289, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56786311, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02392578, + "step": 12854, + "time_per_iteration": 3.3130195140838623 + }, + { + "auxiliary_loss_clip": 0.01351266, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.24009919, + "balance_loss_mlp": 1.01914573, + "epoch": 0.7728844130467458, + "flos": 34468995142800.0, + "grad_norm": 1.4671606864765323, + "language_loss": 0.72130865, + "learning_rate": 5.170070992041826e-07, + "loss": 0.7451551, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.14233398, + "step": 12855, + "time_per_iteration": 2.910848379135132 + }, + { + "auxiliary_loss_clip": 0.01341766, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.23441768, + "balance_loss_mlp": 1.01521599, + "epoch": 0.7729445362994138, + "flos": 18920993794200.0, + "grad_norm": 1.6607990683397391, + "language_loss": 0.67976999, + "learning_rate": 5.167458153638254e-07, + "loss": 0.70347536, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13555908, + "step": 12856, + "time_per_iteration": 2.8224756717681885 + }, + { + "auxiliary_loss_clip": 0.01342269, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.23398471, + "balance_loss_mlp": 1.01720762, + "epoch": 0.7730046595520818, + "flos": 22205128502160.0, + "grad_norm": 1.7326080513331716, + "language_loss": 0.79562747, + "learning_rate": 5.164845877686162e-07, + "loss": 0.81934869, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1262207, + "step": 12857, + "time_per_iteration": 2.788282632827759 + }, + { + "auxiliary_loss_clip": 0.01339026, + "auxiliary_loss_mlp": 0.01027878, + "balance_loss_clip": 1.2344209, + "balance_loss_mlp": 1.01469314, + "epoch": 0.7730647828047498, + "flos": 13556201872680.0, + "grad_norm": 1.8998126405043585, + "language_loss": 0.78806949, + "learning_rate": 5.162234164284591e-07, + "loss": 0.81173849, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1317749, + "step": 12858, + "time_per_iteration": 2.685750961303711 + }, + { + "auxiliary_loss_clip": 0.01346186, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.23675382, + "balance_loss_mlp": 1.01459479, + "epoch": 0.7731249060574177, + "flos": 21980422508400.0, + "grad_norm": 1.8647776314846432, + "language_loss": 0.77580404, + "learning_rate": 5.159623013532591e-07, + "loss": 0.79954553, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13378906, + "step": 12859, + "time_per_iteration": 4.2786359786987305 + }, + { + "auxiliary_loss_clip": 0.01336238, + "auxiliary_loss_mlp": 0.01029866, + "balance_loss_clip": 1.23274255, + "balance_loss_mlp": 1.01860666, + "epoch": 0.7731850293100857, + "flos": 22607005081320.0, + "grad_norm": 1.3852725299924447, + "language_loss": 0.68226063, + "learning_rate": 5.157012425529186e-07, + "loss": 0.70592171, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.11260986, + "step": 12860, + "time_per_iteration": 5.766093730926514 + }, + { + "auxiliary_loss_clip": 0.01353055, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.23927927, + "balance_loss_mlp": 1.0247705, + "epoch": 0.7732451525627536, + "flos": 14102210813400.0, + "grad_norm": 2.2030770918183675, + "language_loss": 0.75099182, + "learning_rate": 5.154402400373343e-07, + "loss": 0.77491057, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.14031982, + "step": 12861, + "time_per_iteration": 2.7127487659454346 + }, + { + "auxiliary_loss_clip": 0.01349352, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.23854232, + "balance_loss_mlp": 1.01635909, + "epoch": 0.7733052758154216, + "flos": 21474964338120.0, + "grad_norm": 2.024749240715568, + "language_loss": 0.75332791, + "learning_rate": 5.15179293816405e-07, + "loss": 0.77711475, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 1.10791016, + "router_z_loss_mlp": 0.12976074, + "step": 12862, + "time_per_iteration": 2.7594242095947266 + }, + { + "auxiliary_loss_clip": 0.01332677, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.22780502, + "balance_loss_mlp": 1.02104902, + "epoch": 0.7733653990680895, + "flos": 21398857625520.0, + "grad_norm": 1.7651995205166477, + "language_loss": 0.83069754, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85435641, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.121521, + "step": 12863, + "time_per_iteration": 2.7687876224517822 + }, + { + "auxiliary_loss_clip": 0.01337509, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.23038161, + "balance_loss_mlp": 1.01666164, + "epoch": 0.7734255223207576, + "flos": 17680579673400.0, + "grad_norm": 1.7047362434925077, + "language_loss": 0.73428082, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75794721, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12475586, + "step": 12864, + "time_per_iteration": 2.6790177822113037 + }, + { + "auxiliary_loss_clip": 0.01340198, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.23252583, + "balance_loss_mlp": 1.01908255, + "epoch": 0.7734856455734255, + "flos": 25236635645880.0, + "grad_norm": 1.5548483868941634, + "language_loss": 0.82565928, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84936821, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1161499, + "step": 12865, + "time_per_iteration": 2.7774429321289062 + }, + { + "auxiliary_loss_clip": 0.01355337, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.24285769, + "balance_loss_mlp": 1.01727486, + "epoch": 0.7735457688260935, + "flos": 23437380342600.0, + "grad_norm": 2.2469434083980295, + "language_loss": 0.72317064, + "learning_rate": 5.141360720771077e-07, + "loss": 0.7470367, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14001465, + "step": 12866, + "time_per_iteration": 2.7651219367980957 + }, + { + "auxiliary_loss_clip": 0.0134217, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.23333573, + "balance_loss_mlp": 1.02196121, + "epoch": 0.7736058920787615, + "flos": 18733265035200.0, + "grad_norm": 2.513373161985356, + "language_loss": 0.64851183, + "learning_rate": 5.138754074778371e-07, + "loss": 0.6722824, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12939453, + "step": 12867, + "time_per_iteration": 2.7689716815948486 + }, + { + "auxiliary_loss_clip": 0.01337965, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.2317183, + "balance_loss_mlp": 1.02509654, + "epoch": 0.7736660153314294, + "flos": 22898477864880.0, + "grad_norm": 1.6416190770507775, + "language_loss": 0.71013278, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73388851, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12506104, + "step": 12868, + "time_per_iteration": 2.832267999649048 + }, + { + "auxiliary_loss_clip": 0.01346609, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.23723876, + "balance_loss_mlp": 1.01911712, + "epoch": 0.7737261385840974, + "flos": 13803915825360.0, + "grad_norm": 2.3606401468383447, + "language_loss": 0.78038985, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80417794, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13067627, + "step": 12869, + "time_per_iteration": 4.392698049545288 + }, + { + "auxiliary_loss_clip": 0.01334999, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.2297225, + "balance_loss_mlp": 1.01509798, + "epoch": 0.7737862618367654, + "flos": 28736623725120.0, + "grad_norm": 1.5134499813676634, + "language_loss": 0.73777771, + "learning_rate": 5.130937518435124e-07, + "loss": 0.7614063, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12780762, + "step": 12870, + "time_per_iteration": 2.925947666168213 + }, + { + "auxiliary_loss_clip": 0.0134547, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.23693752, + "balance_loss_mlp": 1.02127159, + "epoch": 0.7738463850894334, + "flos": 17022908077920.0, + "grad_norm": 1.9079030056181308, + "language_loss": 0.76467341, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78846413, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12329102, + "step": 12871, + "time_per_iteration": 2.8395133018493652 + }, + { + "auxiliary_loss_clip": 0.01339148, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.23175573, + "balance_loss_mlp": 1.01917291, + "epoch": 0.7739065083421013, + "flos": 20708878756680.0, + "grad_norm": 1.5475204234220659, + "language_loss": 0.68934548, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71304876, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.11993408, + "step": 12872, + "time_per_iteration": 2.7402265071868896 + }, + { + "auxiliary_loss_clip": 0.01338693, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.2304914, + "balance_loss_mlp": 1.02198422, + "epoch": 0.7739666315947693, + "flos": 20701203776640.0, + "grad_norm": 1.9071275973877588, + "language_loss": 0.85542023, + "learning_rate": 5.123126036618804e-07, + "loss": 0.8791672, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.14019775, + "step": 12873, + "time_per_iteration": 2.7984750270843506 + }, + { + "auxiliary_loss_clip": 0.01340811, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.2333293, + "balance_loss_mlp": 1.02157235, + "epoch": 0.7740267548474372, + "flos": 29576785601160.0, + "grad_norm": 2.526421538268437, + "language_loss": 0.66016692, + "learning_rate": 5.120523337480174e-07, + "loss": 0.68391997, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12915039, + "step": 12874, + "time_per_iteration": 2.7992970943450928 + }, + { + "auxiliary_loss_clip": 0.01340592, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.23317361, + "balance_loss_mlp": 1.02206182, + "epoch": 0.7740868781001052, + "flos": 23664319796160.0, + "grad_norm": 1.6920898233776682, + "language_loss": 0.62634671, + "learning_rate": 5.117921202572785e-07, + "loss": 0.65010691, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13354492, + "step": 12875, + "time_per_iteration": 2.7867093086242676 + }, + { + "auxiliary_loss_clip": 0.01348142, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.23795629, + "balance_loss_mlp": 1.0244416, + "epoch": 0.7741470013527731, + "flos": 24722690328360.0, + "grad_norm": 1.74426766454088, + "language_loss": 0.6534512, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67732143, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14434814, + "step": 12876, + "time_per_iteration": 2.808277130126953 + }, + { + "auxiliary_loss_clip": 0.01335435, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.22986555, + "balance_loss_mlp": 1.02446783, + "epoch": 0.7742071246054412, + "flos": 21876678483840.0, + "grad_norm": 1.9458321353060997, + "language_loss": 0.71086824, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73459411, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12701416, + "step": 12877, + "time_per_iteration": 2.704578161239624 + }, + { + "auxiliary_loss_clip": 0.01351673, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.23891151, + "balance_loss_mlp": 1.02124095, + "epoch": 0.7742672478581091, + "flos": 22679375824800.0, + "grad_norm": 1.6890194100638993, + "language_loss": 0.82901621, + "learning_rate": 5.110118184224736e-07, + "loss": 0.85288101, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13574219, + "step": 12878, + "time_per_iteration": 2.872300386428833 + }, + { + "auxiliary_loss_clip": 0.01342973, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.23408997, + "balance_loss_mlp": 1.02138317, + "epoch": 0.7743273711107771, + "flos": 18845211948480.0, + "grad_norm": 1.6149584752793755, + "language_loss": 0.73462158, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75840104, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.13574219, + "step": 12879, + "time_per_iteration": 2.858067750930786 + }, + { + "auxiliary_loss_clip": 0.01333241, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.22890484, + "balance_loss_mlp": 1.01787329, + "epoch": 0.7743874943634451, + "flos": 28735039999080.0, + "grad_norm": 1.73183934067801, + "language_loss": 0.7951895, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81882775, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12695312, + "step": 12880, + "time_per_iteration": 2.9045979976654053 + }, + { + "auxiliary_loss_clip": 0.01341997, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.23628855, + "balance_loss_mlp": 1.02659202, + "epoch": 0.774447617616113, + "flos": 21914873969400.0, + "grad_norm": 1.5292205035373214, + "language_loss": 0.70606691, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72988087, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12817383, + "step": 12881, + "time_per_iteration": 2.737046003341675 + }, + { + "auxiliary_loss_clip": 0.01353337, + "auxiliary_loss_mlp": 0.01039424, + "balance_loss_clip": 1.24120438, + "balance_loss_mlp": 1.02493978, + "epoch": 0.774507740868781, + "flos": 19505441870640.0, + "grad_norm": 1.8230505850170267, + "language_loss": 0.84854031, + "learning_rate": 5.099722064981832e-07, + "loss": 0.87246788, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14489746, + "step": 12882, + "time_per_iteration": 2.7520787715911865 + }, + { + "auxiliary_loss_clip": 0.01157114, + "auxiliary_loss_mlp": 0.0100351, + "balance_loss_clip": 1.11453617, + "balance_loss_mlp": 1.00094736, + "epoch": 0.774567864121449, + "flos": 59442705682200.0, + "grad_norm": 0.7703583497666172, + "language_loss": 0.60489333, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62649953, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02563477, + "step": 12883, + "time_per_iteration": 3.3130767345428467 + }, + { + "auxiliary_loss_clip": 0.01343586, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.23435009, + "balance_loss_mlp": 1.01811016, + "epoch": 0.774627987374117, + "flos": 13229335580400.0, + "grad_norm": 4.766730869990235, + "language_loss": 0.73267502, + "learning_rate": 5.094527395086416e-07, + "loss": 0.75643009, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13800049, + "step": 12884, + "time_per_iteration": 2.6738169193267822 + }, + { + "auxiliary_loss_clip": 0.01339529, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.23458612, + "balance_loss_mlp": 1.02098274, + "epoch": 0.7746881106267849, + "flos": 21398573367000.0, + "grad_norm": 1.562071321048131, + "language_loss": 0.81259531, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83632386, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12341309, + "step": 12885, + "time_per_iteration": 2.792163848876953 + }, + { + "auxiliary_loss_clip": 0.01334055, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.22930455, + "balance_loss_mlp": 1.02143204, + "epoch": 0.7747482338794529, + "flos": 25634816864280.0, + "grad_norm": 1.624967277443438, + "language_loss": 0.64300555, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66668105, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12060547, + "step": 12886, + "time_per_iteration": 2.7488651275634766 + }, + { + "auxiliary_loss_clip": 0.01342458, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.23271322, + "balance_loss_mlp": 1.02118897, + "epoch": 0.7748083571321208, + "flos": 11550717379440.0, + "grad_norm": 1.7366288737465572, + "language_loss": 0.69753724, + "learning_rate": 5.086739629616987e-07, + "loss": 0.7212916, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.11785889, + "step": 12887, + "time_per_iteration": 2.7435410022735596 + }, + { + "auxiliary_loss_clip": 0.01334061, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.22878098, + "balance_loss_mlp": 1.01973033, + "epoch": 0.7748684803847888, + "flos": 19067115965400.0, + "grad_norm": 1.6192738886544895, + "language_loss": 0.70739853, + "learning_rate": 5.084144838687275e-07, + "loss": 0.73105514, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11871338, + "step": 12888, + "time_per_iteration": 2.755659341812134 + }, + { + "auxiliary_loss_clip": 0.01348041, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.23841798, + "balance_loss_mlp": 1.01970768, + "epoch": 0.7749286036374567, + "flos": 22278270804480.0, + "grad_norm": 1.9284262574673086, + "language_loss": 0.81726015, + "learning_rate": 5.081550613368279e-07, + "loss": 0.84106761, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12988281, + "step": 12889, + "time_per_iteration": 2.8211357593536377 + }, + { + "auxiliary_loss_clip": 0.01343966, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.23603702, + "balance_loss_mlp": 1.02003479, + "epoch": 0.7749887268901248, + "flos": 20197166898960.0, + "grad_norm": 1.8723693664032437, + "language_loss": 0.79758096, + "learning_rate": 5.07895695375838e-07, + "loss": 0.82134819, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1272583, + "step": 12890, + "time_per_iteration": 2.814769983291626 + }, + { + "auxiliary_loss_clip": 0.01344472, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.23643446, + "balance_loss_mlp": 1.01559341, + "epoch": 0.7750488501427927, + "flos": 20342152036080.0, + "grad_norm": 1.5956943858452215, + "language_loss": 0.66709393, + "learning_rate": 5.076363859955932e-07, + "loss": 0.69082463, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13018799, + "step": 12891, + "time_per_iteration": 2.7945773601531982 + }, + { + "auxiliary_loss_clip": 0.01344668, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.2357775, + "balance_loss_mlp": 1.01710999, + "epoch": 0.7751089733954607, + "flos": 28369653354360.0, + "grad_norm": 1.4469492451710675, + "language_loss": 0.78836155, + "learning_rate": 5.073771332059257e-07, + "loss": 0.8121025, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12310791, + "step": 12892, + "time_per_iteration": 2.8796675205230713 + }, + { + "auxiliary_loss_clip": 0.01348595, + "auxiliary_loss_mlp": 0.01032165, + "balance_loss_clip": 1.23827362, + "balance_loss_mlp": 1.01893258, + "epoch": 0.7751690966481286, + "flos": 16947897791040.0, + "grad_norm": 1.7934164417995144, + "language_loss": 0.67382514, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69763273, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 1.10400391, + "router_z_loss_mlp": 0.13238525, + "step": 12893, + "time_per_iteration": 2.7369587421417236 + }, + { + "auxiliary_loss_clip": 0.01153625, + "auxiliary_loss_mlp": 0.01003659, + "balance_loss_clip": 1.11104965, + "balance_loss_mlp": 1.00113189, + "epoch": 0.7752292199007966, + "flos": 65685327056640.0, + "grad_norm": 0.822631365440791, + "language_loss": 0.58615637, + "learning_rate": 5.068587974376468e-07, + "loss": 0.6077292, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02526855, + "step": 12894, + "time_per_iteration": 3.405984401702881 + }, + { + "auxiliary_loss_clip": 0.01346003, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.23710549, + "balance_loss_mlp": 1.01863623, + "epoch": 0.7752893431534646, + "flos": 20599530778440.0, + "grad_norm": 2.055440678339632, + "language_loss": 0.78329992, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80707777, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.13153076, + "step": 12895, + "time_per_iteration": 2.7534165382385254 + }, + { + "auxiliary_loss_clip": 0.01341799, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.23512352, + "balance_loss_mlp": 1.01967931, + "epoch": 0.7753494664061326, + "flos": 20490223408560.0, + "grad_norm": 1.8853214961603004, + "language_loss": 0.67749929, + "learning_rate": 5.063406881496209e-07, + "loss": 0.70124578, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13171387, + "step": 12896, + "time_per_iteration": 2.754491090774536 + }, + { + "auxiliary_loss_clip": 0.01343042, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.23548579, + "balance_loss_mlp": 1.02144945, + "epoch": 0.7754095896588006, + "flos": 20270674676520.0, + "grad_norm": 1.7602180385127812, + "language_loss": 0.6898995, + "learning_rate": 5.060817184602629e-07, + "loss": 0.713664, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.11968994, + "step": 12897, + "time_per_iteration": 4.249463319778442 + }, + { + "auxiliary_loss_clip": 0.01344116, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.23521805, + "balance_loss_mlp": 1.02415442, + "epoch": 0.7754697129114685, + "flos": 23336316469800.0, + "grad_norm": 1.7755133024669023, + "language_loss": 0.74728954, + "learning_rate": 5.058228054204364e-07, + "loss": 0.7711097, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13751221, + "step": 12898, + "time_per_iteration": 4.242105484008789 + }, + { + "auxiliary_loss_clip": 0.01345777, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.23654354, + "balance_loss_mlp": 1.02157199, + "epoch": 0.7755298361641365, + "flos": 17352089046720.0, + "grad_norm": 1.7897912666119817, + "language_loss": 0.7000351, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72384298, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13439941, + "step": 12899, + "time_per_iteration": 4.1438610553741455 + }, + { + "auxiliary_loss_clip": 0.01343714, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.23614836, + "balance_loss_mlp": 1.02000761, + "epoch": 0.7755899594168044, + "flos": 19650589441200.0, + "grad_norm": 1.8454786794529963, + "language_loss": 0.75813842, + "learning_rate": 5.053051493286453e-07, + "loss": 0.78190839, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13275146, + "step": 12900, + "time_per_iteration": 2.827080249786377 + }, + { + "auxiliary_loss_clip": 0.01337847, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.23223841, + "balance_loss_mlp": 1.02287948, + "epoch": 0.7756500826694724, + "flos": 27419818633200.0, + "grad_norm": 1.7918490686182789, + "language_loss": 0.77573586, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79946029, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.1171875, + "step": 12901, + "time_per_iteration": 2.8579020500183105 + }, + { + "auxiliary_loss_clip": 0.01343397, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.23658729, + "balance_loss_mlp": 1.01831627, + "epoch": 0.7757102059221404, + "flos": 28736380074960.0, + "grad_norm": 1.3723940137684643, + "language_loss": 0.7733556, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79710716, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13439941, + "step": 12902, + "time_per_iteration": 2.8570308685302734 + }, + { + "auxiliary_loss_clip": 0.01341965, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.23395693, + "balance_loss_mlp": 1.02073812, + "epoch": 0.7757703291748084, + "flos": 22491240982200.0, + "grad_norm": 1.7716941493047689, + "language_loss": 0.73381025, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75755966, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12243652, + "step": 12903, + "time_per_iteration": 2.8056516647338867 + }, + { + "auxiliary_loss_clip": 0.01342661, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.23635221, + "balance_loss_mlp": 1.01600313, + "epoch": 0.7758304524274763, + "flos": 21434657217840.0, + "grad_norm": 1.9060513050816634, + "language_loss": 0.75985748, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78357518, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.13110352, + "step": 12904, + "time_per_iteration": 2.736053705215454 + }, + { + "auxiliary_loss_clip": 0.01331914, + "auxiliary_loss_mlp": 0.01022349, + "balance_loss_clip": 1.22818089, + "balance_loss_mlp": 1.01063681, + "epoch": 0.7758905756801443, + "flos": 23664482229600.0, + "grad_norm": 2.1563030146684117, + "language_loss": 0.69172955, + "learning_rate": 5.040120011529576e-07, + "loss": 0.71527219, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.11724854, + "step": 12905, + "time_per_iteration": 2.8288557529449463 + }, + { + "auxiliary_loss_clip": 0.01333571, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.2298727, + "balance_loss_mlp": 1.01511645, + "epoch": 0.7759506989328122, + "flos": 28371277688760.0, + "grad_norm": 1.7810300654180489, + "language_loss": 0.67124033, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69485056, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.12341309, + "step": 12906, + "time_per_iteration": 2.772127628326416 + }, + { + "auxiliary_loss_clip": 0.01342974, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.23532212, + "balance_loss_mlp": 1.01872134, + "epoch": 0.7760108221854802, + "flos": 14906897964000.0, + "grad_norm": 2.212262420935627, + "language_loss": 0.81757939, + "learning_rate": 5.034951389101498e-07, + "loss": 0.84132063, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12426758, + "step": 12907, + "time_per_iteration": 2.7411882877349854 + }, + { + "auxiliary_loss_clip": 0.01332505, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.22881579, + "balance_loss_mlp": 1.01624453, + "epoch": 0.7760709454381483, + "flos": 14796494168400.0, + "grad_norm": 2.270027028224859, + "language_loss": 0.67501098, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69861668, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.1182251, + "step": 12908, + "time_per_iteration": 4.296247243881226 + }, + { + "auxiliary_loss_clip": 0.0134678, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.23709917, + "balance_loss_mlp": 1.02346766, + "epoch": 0.7761310686908162, + "flos": 17383746586320.0, + "grad_norm": 1.7894827567362068, + "language_loss": 0.70336688, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72720206, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13262939, + "step": 12909, + "time_per_iteration": 2.7208685874938965 + }, + { + "auxiliary_loss_clip": 0.01336809, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.23132849, + "balance_loss_mlp": 1.0255816, + "epoch": 0.7761911919434842, + "flos": 25562040037200.0, + "grad_norm": 1.7276067973894456, + "language_loss": 0.68009156, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70384097, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.12554932, + "step": 12910, + "time_per_iteration": 2.869523763656616 + }, + { + "auxiliary_loss_clip": 0.01346975, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.23832583, + "balance_loss_mlp": 1.02209926, + "epoch": 0.7762513151961521, + "flos": 23184265478040.0, + "grad_norm": 2.101675165796662, + "language_loss": 0.72155404, + "learning_rate": 5.024620954742646e-07, + "loss": 0.74537152, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12683105, + "step": 12911, + "time_per_iteration": 2.7570154666900635 + }, + { + "auxiliary_loss_clip": 0.01342633, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.23440719, + "balance_loss_mlp": 1.01802135, + "epoch": 0.7763114384488201, + "flos": 21694675503600.0, + "grad_norm": 3.169684715436833, + "language_loss": 0.63381433, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65755498, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13427734, + "step": 12912, + "time_per_iteration": 2.771723747253418 + }, + { + "auxiliary_loss_clip": 0.01155983, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_clip": 1.11323547, + "balance_loss_mlp": 0.99992031, + "epoch": 0.776371561701488, + "flos": 69043375625760.0, + "grad_norm": 0.7663719576529369, + "language_loss": 0.53300017, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55458248, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02319336, + "step": 12913, + "time_per_iteration": 3.3016841411590576 + }, + { + "auxiliary_loss_clip": 0.01342038, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.23390269, + "balance_loss_mlp": 1.02324009, + "epoch": 0.776431684954156, + "flos": 22899168207000.0, + "grad_norm": 1.9119615867789117, + "language_loss": 0.62974942, + "learning_rate": 5.016879091243338e-07, + "loss": 0.65353334, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.13128662, + "step": 12914, + "time_per_iteration": 2.787381410598755 + }, + { + "auxiliary_loss_clip": 0.01343116, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.23620021, + "balance_loss_mlp": 1.01849699, + "epoch": 0.776491808206824, + "flos": 20265761064960.0, + "grad_norm": 1.7320271957236566, + "language_loss": 0.82276654, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84651637, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13397217, + "step": 12915, + "time_per_iteration": 3.042463779449463 + }, + { + "auxiliary_loss_clip": 0.01348654, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.23790932, + "balance_loss_mlp": 1.02109158, + "epoch": 0.776551931459492, + "flos": 26764502322600.0, + "grad_norm": 1.6898981274870257, + "language_loss": 0.75087261, + "learning_rate": 5.011720689554603e-07, + "loss": 0.77470958, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13952637, + "step": 12916, + "time_per_iteration": 2.918513536453247 + }, + { + "auxiliary_loss_clip": 0.0134431, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.23584771, + "balance_loss_mlp": 1.0167079, + "epoch": 0.7766120547121599, + "flos": 52674061326120.0, + "grad_norm": 1.3977434141757406, + "language_loss": 0.65669322, + "learning_rate": 5.009142341196919e-07, + "loss": 0.68043405, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13061523, + "step": 12917, + "time_per_iteration": 3.0398662090301514 + }, + { + "auxiliary_loss_clip": 0.01346911, + "auxiliary_loss_mlp": 0.01041206, + "balance_loss_clip": 1.23924649, + "balance_loss_mlp": 1.02859378, + "epoch": 0.7766721779648279, + "flos": 25161787792440.0, + "grad_norm": 1.659400240946079, + "language_loss": 0.64570737, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66958857, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1260376, + "step": 12918, + "time_per_iteration": 2.8299765586853027 + }, + { + "auxiliary_loss_clip": 0.01337682, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.23120308, + "balance_loss_mlp": 1.02506948, + "epoch": 0.7767323012174958, + "flos": 23764977585360.0, + "grad_norm": 1.9441655916829574, + "language_loss": 0.73627996, + "learning_rate": 5.003987349943777e-07, + "loss": 0.76003546, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12811279, + "step": 12919, + "time_per_iteration": 2.716707706451416 + }, + { + "auxiliary_loss_clip": 0.01346949, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.23800659, + "balance_loss_mlp": 1.01897883, + "epoch": 0.7767924244701638, + "flos": 22091191779240.0, + "grad_norm": 1.6768493508287583, + "language_loss": 0.79136634, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81515777, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13232422, + "step": 12920, + "time_per_iteration": 2.855813503265381 + }, + { + "auxiliary_loss_clip": 0.01345733, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.2364974, + "balance_loss_mlp": 1.02411389, + "epoch": 0.7768525477228319, + "flos": 21986757412560.0, + "grad_norm": 1.8713892744517204, + "language_loss": 0.70949507, + "learning_rate": 4.998834633291829e-07, + "loss": 0.73332226, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12866211, + "step": 12921, + "time_per_iteration": 2.738151788711548 + }, + { + "auxiliary_loss_clip": 0.01349306, + "auxiliary_loss_mlp": 0.01032034, + "balance_loss_clip": 1.23953533, + "balance_loss_mlp": 1.01853919, + "epoch": 0.7769126709754998, + "flos": 21799069261920.0, + "grad_norm": 2.635652824941624, + "language_loss": 0.76294422, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78675759, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13500977, + "step": 12922, + "time_per_iteration": 2.9147706031799316 + }, + { + "auxiliary_loss_clip": 0.0134186, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_clip": 1.23632658, + "balance_loss_mlp": 1.02906632, + "epoch": 0.7769727942281678, + "flos": 20052872103960.0, + "grad_norm": 1.7460210204677054, + "language_loss": 0.80720562, + "learning_rate": 4.993684192022625e-07, + "loss": 0.83104479, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12969971, + "step": 12923, + "time_per_iteration": 2.7042794227600098 + }, + { + "auxiliary_loss_clip": 0.01338742, + "auxiliary_loss_mlp": 0.01035818, + "balance_loss_clip": 1.23121202, + "balance_loss_mlp": 1.0232954, + "epoch": 0.7770329174808357, + "flos": 21691467443160.0, + "grad_norm": 1.86317087161423, + "language_loss": 0.92461181, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94835734, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12524414, + "step": 12924, + "time_per_iteration": 2.748833656311035 + }, + { + "auxiliary_loss_clip": 0.01342197, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.23384917, + "balance_loss_mlp": 1.01826799, + "epoch": 0.7770930407335037, + "flos": 25854893505000.0, + "grad_norm": 9.7046112153528, + "language_loss": 0.66099465, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68472815, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12884521, + "step": 12925, + "time_per_iteration": 2.7351467609405518 + }, + { + "auxiliary_loss_clip": 0.0134938, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.2394228, + "balance_loss_mlp": 1.02208507, + "epoch": 0.7771531639861716, + "flos": 24352593113880.0, + "grad_norm": 1.7004354700439532, + "language_loss": 0.72135794, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74520439, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13195801, + "step": 12926, + "time_per_iteration": 2.757089138031006 + }, + { + "auxiliary_loss_clip": 0.0134803, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.23798192, + "balance_loss_mlp": 1.01392555, + "epoch": 0.7772132872388396, + "flos": 25635507206400.0, + "grad_norm": 1.7261600597177902, + "language_loss": 0.66319078, + "learning_rate": 4.983390138757027e-07, + "loss": 0.68694353, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13323975, + "step": 12927, + "time_per_iteration": 2.8027842044830322 + }, + { + "auxiliary_loss_clip": 0.01344083, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.23530459, + "balance_loss_mlp": 1.02347994, + "epoch": 0.7772734104915076, + "flos": 26072980336080.0, + "grad_norm": 1.7700502269735179, + "language_loss": 0.72404557, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74785501, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.1338501, + "step": 12928, + "time_per_iteration": 2.919151782989502 + }, + { + "auxiliary_loss_clip": 0.01337705, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.23241258, + "balance_loss_mlp": 1.0200032, + "epoch": 0.7773335337441756, + "flos": 22929607495800.0, + "grad_norm": 1.5521944199865336, + "language_loss": 0.74645245, + "learning_rate": 4.978246528322036e-07, + "loss": 0.77015913, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12939453, + "step": 12929, + "time_per_iteration": 2.8211400508880615 + }, + { + "auxiliary_loss_clip": 0.01348373, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.23923242, + "balance_loss_mlp": 1.02473152, + "epoch": 0.7773936569968435, + "flos": 20781614975400.0, + "grad_norm": 2.0659569977152508, + "language_loss": 0.77637595, + "learning_rate": 4.975675577495377e-07, + "loss": 0.8002454, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1385498, + "step": 12930, + "time_per_iteration": 2.7715892791748047 + }, + { + "auxiliary_loss_clip": 0.01345395, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.23766387, + "balance_loss_mlp": 1.01915085, + "epoch": 0.7774537802495115, + "flos": 20376652160880.0, + "grad_norm": 1.7577061796334565, + "language_loss": 0.79539412, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81916714, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12762451, + "step": 12931, + "time_per_iteration": 2.8354554176330566 + }, + { + "auxiliary_loss_clip": 0.01153394, + "auxiliary_loss_mlp": 0.01004951, + "balance_loss_clip": 1.11107063, + "balance_loss_mlp": 1.00247121, + "epoch": 0.7775139035021794, + "flos": 53926837369560.0, + "grad_norm": 0.8433575211602059, + "language_loss": 0.59832358, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61990702, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02478027, + "step": 12932, + "time_per_iteration": 3.217799425125122 + }, + { + "auxiliary_loss_clip": 0.01346136, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.23606801, + "balance_loss_mlp": 1.02051556, + "epoch": 0.7775740267548474, + "flos": 28848692463480.0, + "grad_norm": 1.572975791206597, + "language_loss": 0.76401389, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78780276, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.12249756, + "step": 12933, + "time_per_iteration": 2.831437826156616 + }, + { + "auxiliary_loss_clip": 0.01345319, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.23730958, + "balance_loss_mlp": 1.02069354, + "epoch": 0.7776341500075155, + "flos": 21877612476120.0, + "grad_norm": 1.885693838073945, + "language_loss": 0.73468876, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75848359, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13464355, + "step": 12934, + "time_per_iteration": 2.7800097465515137 + }, + { + "auxiliary_loss_clip": 0.01346087, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.23752975, + "balance_loss_mlp": 1.01665509, + "epoch": 0.7776942732601834, + "flos": 20234672042400.0, + "grad_norm": 1.8006431596751598, + "language_loss": 0.70632732, + "learning_rate": 4.962829371169475e-07, + "loss": 0.73009044, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13568115, + "step": 12935, + "time_per_iteration": 2.753981351852417 + }, + { + "auxiliary_loss_clip": 0.01346779, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.23760164, + "balance_loss_mlp": 1.02437687, + "epoch": 0.7777543965128514, + "flos": 22236298741440.0, + "grad_norm": 2.109101645714692, + "language_loss": 0.83936179, + "learning_rate": 4.960261840147746e-07, + "loss": 0.86320084, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.12738037, + "step": 12936, + "time_per_iteration": 4.253128528594971 + }, + { + "auxiliary_loss_clip": 0.01352346, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.2407763, + "balance_loss_mlp": 1.01782608, + "epoch": 0.7778145197655193, + "flos": 14506077202200.0, + "grad_norm": 1.8325007521340881, + "language_loss": 0.67699516, + "learning_rate": 4.957694879434397e-07, + "loss": 0.70081705, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12023926, + "step": 12937, + "time_per_iteration": 5.580273628234863 + }, + { + "auxiliary_loss_clip": 0.0134382, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.23454714, + "balance_loss_mlp": 1.01769662, + "epoch": 0.7778746430181873, + "flos": 21145011810480.0, + "grad_norm": 1.4586971810320244, + "language_loss": 0.87250292, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89624882, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.1307373, + "step": 12938, + "time_per_iteration": 2.853848457336426 + }, + { + "auxiliary_loss_clip": 0.0134181, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.23423469, + "balance_loss_mlp": 1.01939178, + "epoch": 0.7779347662708552, + "flos": 20271202585200.0, + "grad_norm": 2.419760039248465, + "language_loss": 0.85559446, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87933886, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13244629, + "step": 12939, + "time_per_iteration": 2.8141930103302 + }, + { + "auxiliary_loss_clip": 0.01337018, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.23240542, + "balance_loss_mlp": 1.01979256, + "epoch": 0.7779948895235232, + "flos": 19213806653640.0, + "grad_norm": 1.8393797345881069, + "language_loss": 0.69269747, + "learning_rate": 4.949997420117915e-07, + "loss": 0.71638936, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1237793, + "step": 12940, + "time_per_iteration": 2.7956793308258057 + }, + { + "auxiliary_loss_clip": 0.01344415, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.23544598, + "balance_loss_mlp": 1.01742101, + "epoch": 0.7780550127761912, + "flos": 23919992987400.0, + "grad_norm": 1.5278843946725673, + "language_loss": 0.77746105, + "learning_rate": 4.947432741611255e-07, + "loss": 0.80120546, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12597656, + "step": 12941, + "time_per_iteration": 2.7826383113861084 + }, + { + "auxiliary_loss_clip": 0.01351138, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.23900354, + "balance_loss_mlp": 1.01944566, + "epoch": 0.7781151360288592, + "flos": 32422350753720.0, + "grad_norm": 2.1745463438273362, + "language_loss": 0.7321552, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75599813, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 1.12060547, + "router_z_loss_mlp": 0.137146, + "step": 12942, + "time_per_iteration": 2.7789227962493896 + }, + { + "auxiliary_loss_clip": 0.01340709, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.23566747, + "balance_loss_mlp": 1.01785898, + "epoch": 0.7781752592815271, + "flos": 22351534931880.0, + "grad_norm": 2.0638862361788872, + "language_loss": 0.67563188, + "learning_rate": 4.942305097079751e-07, + "loss": 0.69934517, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12756348, + "step": 12943, + "time_per_iteration": 2.7430005073547363 + }, + { + "auxiliary_loss_clip": 0.01154498, + "auxiliary_loss_mlp": 0.01001309, + "balance_loss_clip": 1.11186934, + "balance_loss_mlp": 0.99882931, + "epoch": 0.7782353825341951, + "flos": 70474953234960.0, + "grad_norm": 0.7780544267336897, + "language_loss": 0.58566737, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60722554, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02478027, + "step": 12944, + "time_per_iteration": 3.4057157039642334 + }, + { + "auxiliary_loss_clip": 0.01347821, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.23790812, + "balance_loss_mlp": 1.01949716, + "epoch": 0.778295505786863, + "flos": 19067278398840.0, + "grad_norm": 2.178060796530352, + "language_loss": 0.6808266, + "learning_rate": 4.937179736505428e-07, + "loss": 0.70464289, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.14318848, + "step": 12945, + "time_per_iteration": 2.7374470233917236 + }, + { + "auxiliary_loss_clip": 0.01348602, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.23936605, + "balance_loss_mlp": 1.02109122, + "epoch": 0.778355629039531, + "flos": 21005143326720.0, + "grad_norm": 2.1984823939465366, + "language_loss": 0.69670922, + "learning_rate": 4.93461791294516e-07, + "loss": 0.72054195, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13574219, + "step": 12946, + "time_per_iteration": 4.267124891281128 + }, + { + "auxiliary_loss_clip": 0.01349414, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.23987532, + "balance_loss_mlp": 1.02082229, + "epoch": 0.7784157522921991, + "flos": 21403162111680.0, + "grad_norm": 1.8071608964120185, + "language_loss": 0.65935647, + "learning_rate": 4.932056660665689e-07, + "loss": 0.68319201, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13317871, + "step": 12947, + "time_per_iteration": 2.724546432495117 + }, + { + "auxiliary_loss_clip": 0.01341743, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.23468065, + "balance_loss_mlp": 1.01899815, + "epoch": 0.778475875544867, + "flos": 20818795251960.0, + "grad_norm": 2.315975936978106, + "language_loss": 0.65462977, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67837107, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13391113, + "step": 12948, + "time_per_iteration": 2.737622022628784 + }, + { + "auxiliary_loss_clip": 0.01341464, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.23421288, + "balance_loss_mlp": 1.01889729, + "epoch": 0.778535998797535, + "flos": 14359386513960.0, + "grad_norm": 1.8190925594870708, + "language_loss": 0.7579776, + "learning_rate": 4.926935870337625e-07, + "loss": 0.78171301, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13171387, + "step": 12949, + "time_per_iteration": 2.815617322921753 + }, + { + "auxiliary_loss_clip": 0.01352405, + "auxiliary_loss_mlp": 0.0103122, + "balance_loss_clip": 1.24209547, + "balance_loss_mlp": 1.01695049, + "epoch": 0.7785961220502029, + "flos": 19214415779040.0, + "grad_norm": 1.3786776014853417, + "language_loss": 0.68887377, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71271002, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.14263916, + "step": 12950, + "time_per_iteration": 2.8064348697662354 + }, + { + "auxiliary_loss_clip": 0.01346875, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.23730564, + "balance_loss_mlp": 1.01674473, + "epoch": 0.7786562453028709, + "flos": 25744246059240.0, + "grad_norm": 1.7565700228236156, + "language_loss": 0.71993905, + "learning_rate": 4.921817366297938e-07, + "loss": 0.7437042, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12902832, + "step": 12951, + "time_per_iteration": 2.8393912315368652 + }, + { + "auxiliary_loss_clip": 0.01342568, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.2365818, + "balance_loss_mlp": 1.01823807, + "epoch": 0.7787163685555388, + "flos": 25745017618080.0, + "grad_norm": 1.7569298095430874, + "language_loss": 0.65631914, + "learning_rate": 4.919258971878877e-07, + "loss": 0.68006122, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13397217, + "step": 12952, + "time_per_iteration": 2.8375678062438965 + }, + { + "auxiliary_loss_clip": 0.01326997, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.22531557, + "balance_loss_mlp": 1.01882768, + "epoch": 0.7787764918082068, + "flos": 22752924210720.0, + "grad_norm": 1.4704153749337332, + "language_loss": 0.81206822, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83564848, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.12200928, + "step": 12953, + "time_per_iteration": 2.7554140090942383 + }, + { + "auxiliary_loss_clip": 0.01351313, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.23967147, + "balance_loss_mlp": 1.02079928, + "epoch": 0.7788366150608748, + "flos": 15194512953360.0, + "grad_norm": 1.8834862266063026, + "language_loss": 0.76333457, + "learning_rate": 4.91414389872737e-07, + "loss": 0.78719258, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13696289, + "step": 12954, + "time_per_iteration": 2.7561755180358887 + }, + { + "auxiliary_loss_clip": 0.01353243, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.24193013, + "balance_loss_mlp": 1.01496339, + "epoch": 0.7788967383135428, + "flos": 21214580577120.0, + "grad_norm": 1.5643893099832107, + "language_loss": 0.7292906, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7531004, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12774658, + "step": 12955, + "time_per_iteration": 2.7982451915740967 + }, + { + "auxiliary_loss_clip": 0.01343688, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.2350173, + "balance_loss_mlp": 1.02046299, + "epoch": 0.7789568615662107, + "flos": 21687162957000.0, + "grad_norm": 1.3811292926622944, + "language_loss": 0.68994129, + "learning_rate": 4.909031113804551e-07, + "loss": 0.71371734, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13452148, + "step": 12956, + "time_per_iteration": 2.7245702743530273 + }, + { + "auxiliary_loss_clip": 0.01348391, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.23952258, + "balance_loss_mlp": 1.02300704, + "epoch": 0.7790169848188787, + "flos": 26366118062400.0, + "grad_norm": 1.5864489278464629, + "language_loss": 0.76075149, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78459382, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12835693, + "step": 12957, + "time_per_iteration": 2.801703691482544 + }, + { + "auxiliary_loss_clip": 0.01342034, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.23429441, + "balance_loss_mlp": 1.0152458, + "epoch": 0.7790771080715466, + "flos": 25521164399880.0, + "grad_norm": 1.5007960234311901, + "language_loss": 0.77695352, + "learning_rate": 4.903920617885917e-07, + "loss": 0.80065536, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12902832, + "step": 12958, + "time_per_iteration": 2.7507801055908203 + }, + { + "auxiliary_loss_clip": 0.01346059, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.23801553, + "balance_loss_mlp": 1.02222323, + "epoch": 0.7791372313242146, + "flos": 16038492015240.0, + "grad_norm": 1.9060551235299141, + "language_loss": 0.71617937, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73999304, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13092041, + "step": 12959, + "time_per_iteration": 2.7521140575408936 + }, + { + "auxiliary_loss_clip": 0.01339286, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.23206568, + "balance_loss_mlp": 1.02295327, + "epoch": 0.7791973545768827, + "flos": 23847541027200.0, + "grad_norm": 1.5700740902262753, + "language_loss": 0.78109741, + "learning_rate": 4.898812411746632e-07, + "loss": 0.80484802, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.12811279, + "step": 12960, + "time_per_iteration": 2.851179838180542 + }, + { + "auxiliary_loss_clip": 0.01343724, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.23507893, + "balance_loss_mlp": 1.02388871, + "epoch": 0.7792574778295506, + "flos": 24173473327200.0, + "grad_norm": 2.011404202223644, + "language_loss": 0.75408894, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77789867, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13366699, + "step": 12961, + "time_per_iteration": 2.9360411167144775 + }, + { + "auxiliary_loss_clip": 0.01327867, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.22609699, + "balance_loss_mlp": 1.02097607, + "epoch": 0.7793176010822186, + "flos": 21469197951000.0, + "grad_norm": 1.8699312253143139, + "language_loss": 0.73654944, + "learning_rate": 4.893706496161511e-07, + "loss": 0.76015776, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.11993408, + "step": 12962, + "time_per_iteration": 2.849588632583618 + }, + { + "auxiliary_loss_clip": 0.01336854, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.23053384, + "balance_loss_mlp": 1.0146569, + "epoch": 0.7793777243348865, + "flos": 20671454829960.0, + "grad_norm": 1.8846732111607594, + "language_loss": 0.69990349, + "learning_rate": 4.891154397568795e-07, + "loss": 0.7235446, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12615967, + "step": 12963, + "time_per_iteration": 2.7641687393188477 + }, + { + "auxiliary_loss_clip": 0.01336115, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.23150456, + "balance_loss_mlp": 1.01928878, + "epoch": 0.7794378475875545, + "flos": 27131513301720.0, + "grad_norm": 1.7923899787785267, + "language_loss": 0.63929188, + "learning_rate": 4.888602871905019e-07, + "loss": 0.66297436, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.12854004, + "step": 12964, + "time_per_iteration": 2.8491008281707764 + }, + { + "auxiliary_loss_clip": 0.01347593, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.23792279, + "balance_loss_mlp": 1.02109885, + "epoch": 0.7794979708402224, + "flos": 28079967338640.0, + "grad_norm": 1.8615367603680928, + "language_loss": 0.77067482, + "learning_rate": 4.88605191926694e-07, + "loss": 0.79448831, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12658691, + "step": 12965, + "time_per_iteration": 2.7933995723724365 + }, + { + "auxiliary_loss_clip": 0.01326597, + "auxiliary_loss_mlp": 0.01033531, + "balance_loss_clip": 1.22503603, + "balance_loss_mlp": 1.02179527, + "epoch": 0.7795580940928905, + "flos": 26875149768360.0, + "grad_norm": 1.6075785068919992, + "language_loss": 0.73076862, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75436985, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.11724854, + "step": 12966, + "time_per_iteration": 2.863722801208496 + }, + { + "auxiliary_loss_clip": 0.01331876, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.22875953, + "balance_loss_mlp": 1.01706457, + "epoch": 0.7796182173455584, + "flos": 23839825438800.0, + "grad_norm": 1.5601624541950834, + "language_loss": 0.74598503, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76958609, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.1116333, + "step": 12967, + "time_per_iteration": 2.863830327987671 + }, + { + "auxiliary_loss_clip": 0.01345911, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.23714709, + "balance_loss_mlp": 1.01688802, + "epoch": 0.7796783405982264, + "flos": 19797036479280.0, + "grad_norm": 2.2161643199488608, + "language_loss": 0.72666705, + "learning_rate": 4.878402500474073e-07, + "loss": 0.75042319, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12811279, + "step": 12968, + "time_per_iteration": 2.850428819656372 + }, + { + "auxiliary_loss_clip": 0.01339995, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.23350096, + "balance_loss_mlp": 1.0189352, + "epoch": 0.7797384638508943, + "flos": 15454409414040.0, + "grad_norm": 1.8515204248692234, + "language_loss": 0.61137366, + "learning_rate": 4.875853840905874e-07, + "loss": 0.63508981, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12677002, + "step": 12969, + "time_per_iteration": 2.7080042362213135 + }, + { + "auxiliary_loss_clip": 0.01326319, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.22367465, + "balance_loss_mlp": 1.02054155, + "epoch": 0.7797985871035623, + "flos": 20927615321520.0, + "grad_norm": 1.833896775133097, + "language_loss": 0.7023524, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72594136, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.12042236, + "step": 12970, + "time_per_iteration": 2.727813959121704 + }, + { + "auxiliary_loss_clip": 0.01340298, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.23249698, + "balance_loss_mlp": 1.01949346, + "epoch": 0.7798587103562302, + "flos": 36943813347120.0, + "grad_norm": 1.6209807293345204, + "language_loss": 0.72383428, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74757355, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.14135742, + "step": 12971, + "time_per_iteration": 2.9761059284210205 + }, + { + "auxiliary_loss_clip": 0.01351529, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.23876011, + "balance_loss_mlp": 1.0245806, + "epoch": 0.7799188336088982, + "flos": 22424433584040.0, + "grad_norm": 1.7332588961094242, + "language_loss": 0.74367344, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76757073, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.1362915, + "step": 12972, + "time_per_iteration": 2.799330472946167 + }, + { + "auxiliary_loss_clip": 0.0134227, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.23407722, + "balance_loss_mlp": 1.01387262, + "epoch": 0.7799789568615663, + "flos": 18885194201880.0, + "grad_norm": 1.8347736866892403, + "language_loss": 0.71610457, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73979169, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12561035, + "step": 12973, + "time_per_iteration": 2.9844818115234375 + }, + { + "auxiliary_loss_clip": 0.0133651, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.23091388, + "balance_loss_mlp": 1.01690209, + "epoch": 0.7800390801142342, + "flos": 20266573232160.0, + "grad_norm": 2.0068908246261588, + "language_loss": 0.77689523, + "learning_rate": 4.863119147634089e-07, + "loss": 0.80054617, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.11676025, + "step": 12974, + "time_per_iteration": 2.742220163345337 + }, + { + "auxiliary_loss_clip": 0.01335919, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.22969604, + "balance_loss_mlp": 1.01589322, + "epoch": 0.7800992033669022, + "flos": 16694620493040.0, + "grad_norm": 2.004329763255529, + "language_loss": 0.69221419, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71586084, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12860107, + "step": 12975, + "time_per_iteration": 4.342424392700195 + }, + { + "auxiliary_loss_clip": 0.01335081, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.22955871, + "balance_loss_mlp": 1.01875174, + "epoch": 0.7801593266195701, + "flos": 18589457540520.0, + "grad_norm": 2.10084080266897, + "language_loss": 0.82007635, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84373772, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12316895, + "step": 12976, + "time_per_iteration": 4.125143527984619 + }, + { + "auxiliary_loss_clip": 0.01346384, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.23681307, + "balance_loss_mlp": 1.01684248, + "epoch": 0.7802194498722381, + "flos": 25490603286000.0, + "grad_norm": 1.7723262465001768, + "language_loss": 0.66591603, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68968832, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13983154, + "step": 12977, + "time_per_iteration": 2.7839865684509277 + }, + { + "auxiliary_loss_clip": 0.01337417, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.23086679, + "balance_loss_mlp": 1.01855898, + "epoch": 0.780279573124906, + "flos": 31182667583400.0, + "grad_norm": 1.4337892506836498, + "language_loss": 0.7504915, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77417898, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12762451, + "step": 12978, + "time_per_iteration": 2.788727283477783 + }, + { + "auxiliary_loss_clip": 0.01348493, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.23745394, + "balance_loss_mlp": 1.01917791, + "epoch": 0.780339696377574, + "flos": 26949835188360.0, + "grad_norm": 1.7308115235863533, + "language_loss": 0.61915606, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64296675, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13409424, + "step": 12979, + "time_per_iteration": 2.8080811500549316 + }, + { + "auxiliary_loss_clip": 0.01341285, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.23403573, + "balance_loss_mlp": 1.01695657, + "epoch": 0.780399819630242, + "flos": 27962294646600.0, + "grad_norm": 2.1785134696156616, + "language_loss": 0.76898026, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79269105, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12841797, + "step": 12980, + "time_per_iteration": 2.77487850189209 + }, + { + "auxiliary_loss_clip": 0.01343834, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.23442912, + "balance_loss_mlp": 1.02523589, + "epoch": 0.78045994288291, + "flos": 22491240982200.0, + "grad_norm": 1.9435465557376477, + "language_loss": 0.77965426, + "learning_rate": 4.845314687419046e-07, + "loss": 0.8034724, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12750244, + "step": 12981, + "time_per_iteration": 2.773238182067871 + }, + { + "auxiliary_loss_clip": 0.01340793, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.23310184, + "balance_loss_mlp": 1.01655221, + "epoch": 0.7805200661355779, + "flos": 20855853703440.0, + "grad_norm": 1.7298032965033843, + "language_loss": 0.73004889, + "learning_rate": 4.842773491000067e-07, + "loss": 0.75374413, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12176514, + "step": 12982, + "time_per_iteration": 2.7366559505462646 + }, + { + "auxiliary_loss_clip": 0.0133885, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.23102283, + "balance_loss_mlp": 1.01561987, + "epoch": 0.7805801893882459, + "flos": 25671469232160.0, + "grad_norm": 1.5402319586955004, + "language_loss": 0.73698097, + "learning_rate": 4.840232869344636e-07, + "loss": 0.76064825, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12255859, + "step": 12983, + "time_per_iteration": 2.778655767440796 + }, + { + "auxiliary_loss_clip": 0.01339442, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.23239136, + "balance_loss_mlp": 1.01622391, + "epoch": 0.7806403126409138, + "flos": 11331534122640.0, + "grad_norm": 1.657145089517986, + "language_loss": 0.74840474, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77209365, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13232422, + "step": 12984, + "time_per_iteration": 2.821444034576416 + }, + { + "auxiliary_loss_clip": 0.01338381, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.23041916, + "balance_loss_mlp": 1.01953244, + "epoch": 0.7807004358935818, + "flos": 19578137481000.0, + "grad_norm": 1.7708997121876224, + "language_loss": 0.8142035, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83790767, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.125, + "step": 12985, + "time_per_iteration": 4.305377244949341 + }, + { + "auxiliary_loss_clip": 0.01336057, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.22932231, + "balance_loss_mlp": 1.02194691, + "epoch": 0.7807605591462499, + "flos": 19140583134600.0, + "grad_norm": 1.387135375345823, + "language_loss": 0.77403772, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79774421, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12646484, + "step": 12986, + "time_per_iteration": 2.78889536857605 + }, + { + "auxiliary_loss_clip": 0.01343512, + "auxiliary_loss_mlp": 0.01035869, + "balance_loss_clip": 1.23443902, + "balance_loss_mlp": 1.02261341, + "epoch": 0.7808206823989178, + "flos": 32380419299040.0, + "grad_norm": 1.750746460328804, + "language_loss": 0.74345374, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76724756, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13262939, + "step": 12987, + "time_per_iteration": 2.882932186126709 + }, + { + "auxiliary_loss_clip": 0.01154415, + "auxiliary_loss_mlp": 0.01002876, + "balance_loss_clip": 1.11219287, + "balance_loss_mlp": 1.000265, + "epoch": 0.7808808056515858, + "flos": 55066065792480.0, + "grad_norm": 0.7431867282192526, + "language_loss": 0.55133861, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57291156, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02612305, + "step": 12988, + "time_per_iteration": 3.277914524078369 + }, + { + "auxiliary_loss_clip": 0.01330737, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.22607625, + "balance_loss_mlp": 1.02306366, + "epoch": 0.7809409289042537, + "flos": 12863299201920.0, + "grad_norm": 2.488180243426611, + "language_loss": 0.80238283, + "learning_rate": 4.82500121484009e-07, + "loss": 0.82604909, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.12817383, + "step": 12989, + "time_per_iteration": 2.739067554473877 + }, + { + "auxiliary_loss_clip": 0.01335213, + "auxiliary_loss_mlp": 0.01029377, + "balance_loss_clip": 1.22957933, + "balance_loss_mlp": 1.0162226, + "epoch": 0.7810010521569217, + "flos": 21691995351840.0, + "grad_norm": 2.178136642603575, + "language_loss": 0.70977116, + "learning_rate": 4.822464619225806e-07, + "loss": 0.73341703, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.13153076, + "step": 12990, + "time_per_iteration": 2.743583917617798 + }, + { + "auxiliary_loss_clip": 0.0134076, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.2326231, + "balance_loss_mlp": 1.01941347, + "epoch": 0.7810611754095896, + "flos": 16760575115640.0, + "grad_norm": 2.0011483425156236, + "language_loss": 0.77756929, + "learning_rate": 4.819928599145184e-07, + "loss": 0.80131102, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.14001465, + "step": 12991, + "time_per_iteration": 2.76601505279541 + }, + { + "auxiliary_loss_clip": 0.01338808, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.22974718, + "balance_loss_mlp": 1.02684188, + "epoch": 0.7811212986622577, + "flos": 43515290823480.0, + "grad_norm": 1.4883773514967087, + "language_loss": 0.65970045, + "learning_rate": 4.817393154694398e-07, + "loss": 0.6834963, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.1394043, + "step": 12992, + "time_per_iteration": 2.9334566593170166 + }, + { + "auxiliary_loss_clip": 0.01342885, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.23279607, + "balance_loss_mlp": 1.02268064, + "epoch": 0.7811814219149256, + "flos": 21762132635520.0, + "grad_norm": 1.9786676950650388, + "language_loss": 0.62179482, + "learning_rate": 4.814858285969578e-07, + "loss": 0.64557934, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12884521, + "step": 12993, + "time_per_iteration": 2.8610727787017822 + }, + { + "auxiliary_loss_clip": 0.01338159, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.2321198, + "balance_loss_mlp": 1.01984, + "epoch": 0.7812415451675936, + "flos": 24066927325800.0, + "grad_norm": 2.9559776187214735, + "language_loss": 0.68666589, + "learning_rate": 4.812323993066862e-07, + "loss": 0.71037912, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13336182, + "step": 12994, + "time_per_iteration": 2.8627312183380127 + }, + { + "auxiliary_loss_clip": 0.01336122, + "auxiliary_loss_mlp": 0.01026129, + "balance_loss_clip": 1.22914791, + "balance_loss_mlp": 1.0137428, + "epoch": 0.7813016684202615, + "flos": 18994420355040.0, + "grad_norm": 1.8009851076041368, + "language_loss": 0.69131774, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71494019, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12384033, + "step": 12995, + "time_per_iteration": 2.745750904083252 + }, + { + "auxiliary_loss_clip": 0.01327729, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.22468054, + "balance_loss_mlp": 1.01343024, + "epoch": 0.7813617916729295, + "flos": 25265734858800.0, + "grad_norm": 1.6954177347346253, + "language_loss": 0.75219572, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77573097, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.12347412, + "step": 12996, + "time_per_iteration": 2.77443528175354 + }, + { + "auxiliary_loss_clip": 0.01347053, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.2355113, + "balance_loss_mlp": 1.01572835, + "epoch": 0.7814219149255974, + "flos": 17970346905840.0, + "grad_norm": 5.771155469472484, + "language_loss": 0.68419236, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70797145, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.15142822, + "step": 12997, + "time_per_iteration": 2.7109158039093018 + }, + { + "auxiliary_loss_clip": 0.0134725, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.23447239, + "balance_loss_mlp": 1.01698995, + "epoch": 0.7814820381782654, + "flos": 25781791811040.0, + "grad_norm": 2.5074727822750034, + "language_loss": 0.82473934, + "learning_rate": 4.802192581598614e-07, + "loss": 0.8485201, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13824463, + "step": 12998, + "time_per_iteration": 2.7897725105285645 + }, + { + "auxiliary_loss_clip": 0.01340915, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.23221302, + "balance_loss_mlp": 1.020751, + "epoch": 0.7815421614309335, + "flos": 20524155016320.0, + "grad_norm": 2.72303739052581, + "language_loss": 0.74751544, + "learning_rate": 4.799661169247453e-07, + "loss": 0.77127206, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.14007568, + "step": 12999, + "time_per_iteration": 2.7748525142669678 + }, + { + "auxiliary_loss_clip": 0.01347234, + "auxiliary_loss_mlp": 0.01041749, + "balance_loss_clip": 1.23661351, + "balance_loss_mlp": 1.02732491, + "epoch": 0.7816022846836014, + "flos": 21292595882640.0, + "grad_norm": 1.6042306438319205, + "language_loss": 0.84725267, + "learning_rate": 4.797130333294652e-07, + "loss": 0.87114251, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14416504, + "step": 13000, + "time_per_iteration": 2.856477975845337 + }, + { + "auxiliary_loss_clip": 0.01336887, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.22751272, + "balance_loss_mlp": 1.01721275, + "epoch": 0.7816624079362694, + "flos": 19213116311520.0, + "grad_norm": 1.891825298843562, + "language_loss": 0.65970182, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68337435, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13146973, + "step": 13001, + "time_per_iteration": 2.7331864833831787 + }, + { + "auxiliary_loss_clip": 0.01342898, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.23289645, + "balance_loss_mlp": 1.01824999, + "epoch": 0.7817225311889373, + "flos": 26110363654440.0, + "grad_norm": 1.481825184150606, + "language_loss": 0.66915083, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69289017, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12805176, + "step": 13002, + "time_per_iteration": 2.883943557739258 + }, + { + "auxiliary_loss_clip": 0.01348251, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.23922193, + "balance_loss_mlp": 1.02064967, + "epoch": 0.7817826544416053, + "flos": 21255821689680.0, + "grad_norm": 2.382658203936943, + "language_loss": 0.73575675, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75958443, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13867188, + "step": 13003, + "time_per_iteration": 2.796884298324585 + }, + { + "auxiliary_loss_clip": 0.0133344, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.2253902, + "balance_loss_mlp": 1.02174902, + "epoch": 0.7818427776942732, + "flos": 19936214620920.0, + "grad_norm": 1.4977670386979036, + "language_loss": 0.6239602, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64764923, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13696289, + "step": 13004, + "time_per_iteration": 2.7672786712646484 + }, + { + "auxiliary_loss_clip": 0.01324471, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.22129977, + "balance_loss_mlp": 1.0190599, + "epoch": 0.7819029009469413, + "flos": 11367414931680.0, + "grad_norm": 2.709985409464096, + "language_loss": 0.82772505, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85127926, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.11883545, + "step": 13005, + "time_per_iteration": 2.969090700149536 + }, + { + "auxiliary_loss_clip": 0.0134, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.2318393, + "balance_loss_mlp": 1.01673567, + "epoch": 0.7819630241996092, + "flos": 24284567464920.0, + "grad_norm": 1.7443820305388165, + "language_loss": 0.72849977, + "learning_rate": 4.781957427316432e-07, + "loss": 0.75220191, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13470459, + "step": 13006, + "time_per_iteration": 2.8034605979919434 + }, + { + "auxiliary_loss_clip": 0.01347323, + "auxiliary_loss_mlp": 0.0103084, + "balance_loss_clip": 1.23637462, + "balance_loss_mlp": 1.01680887, + "epoch": 0.7820231474522772, + "flos": 22713794732880.0, + "grad_norm": 1.5682718038359873, + "language_loss": 0.71840489, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74218655, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14050293, + "step": 13007, + "time_per_iteration": 2.9135522842407227 + }, + { + "auxiliary_loss_clip": 0.01343009, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.23173833, + "balance_loss_mlp": 1.01934826, + "epoch": 0.7820832707049451, + "flos": 20052222370200.0, + "grad_norm": 2.0254912353485666, + "language_loss": 0.69155586, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71531636, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13696289, + "step": 13008, + "time_per_iteration": 2.777090072631836 + }, + { + "auxiliary_loss_clip": 0.01339368, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.23143101, + "balance_loss_mlp": 1.01710498, + "epoch": 0.7821433939576131, + "flos": 27168612361560.0, + "grad_norm": 1.7283470085077197, + "language_loss": 0.69900972, + "learning_rate": 4.774378763473954e-07, + "loss": 0.72271413, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13989258, + "step": 13009, + "time_per_iteration": 2.801454782485962 + }, + { + "auxiliary_loss_clip": 0.01337733, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.22970009, + "balance_loss_mlp": 1.0172087, + "epoch": 0.782203517210281, + "flos": 22607289339840.0, + "grad_norm": 2.1863387955381457, + "language_loss": 0.81544673, + "learning_rate": 4.771853696779586e-07, + "loss": 0.8391369, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.14050293, + "step": 13010, + "time_per_iteration": 2.8139541149139404 + }, + { + "auxiliary_loss_clip": 0.0133765, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.23137474, + "balance_loss_mlp": 1.02100635, + "epoch": 0.782263640462949, + "flos": 29065845302280.0, + "grad_norm": 1.571863439904197, + "language_loss": 0.62569463, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64941067, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12939453, + "step": 13011, + "time_per_iteration": 2.9456865787506104 + }, + { + "auxiliary_loss_clip": 0.01335198, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.22981834, + "balance_loss_mlp": 1.01934171, + "epoch": 0.782323763715617, + "flos": 25304742511560.0, + "grad_norm": 1.8599726500156026, + "language_loss": 0.70068836, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72434616, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11236572, + "step": 13012, + "time_per_iteration": 2.970348834991455 + }, + { + "auxiliary_loss_clip": 0.01154337, + "auxiliary_loss_mlp": 0.01005903, + "balance_loss_clip": 1.11171615, + "balance_loss_mlp": 1.0027802, + "epoch": 0.782383886968285, + "flos": 65211688859400.0, + "grad_norm": 0.7066591238006049, + "language_loss": 0.55061138, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57221383, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.03125, + "step": 13013, + "time_per_iteration": 3.3808188438415527 + }, + { + "auxiliary_loss_clip": 0.01353369, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.24195886, + "balance_loss_mlp": 1.02220035, + "epoch": 0.782444010220953, + "flos": 18410012886960.0, + "grad_norm": 1.7804569187079338, + "language_loss": 0.6518935, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67577887, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12982178, + "step": 13014, + "time_per_iteration": 5.865183591842651 + }, + { + "auxiliary_loss_clip": 0.01156711, + "auxiliary_loss_mlp": 0.01004606, + "balance_loss_clip": 1.11374426, + "balance_loss_mlp": 1.00192344, + "epoch": 0.7825041334736209, + "flos": 63972289947600.0, + "grad_norm": 0.7179494039802553, + "language_loss": 0.58467489, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60628808, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02685547, + "step": 13015, + "time_per_iteration": 4.828173875808716 + }, + { + "auxiliary_loss_clip": 0.01341004, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.2336756, + "balance_loss_mlp": 1.02206087, + "epoch": 0.7825642567262889, + "flos": 20344223062440.0, + "grad_norm": 1.5907717675829163, + "language_loss": 0.74724352, + "learning_rate": 4.756715426472666e-07, + "loss": 0.77100718, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13299561, + "step": 13016, + "time_per_iteration": 2.721989631652832 + }, + { + "auxiliary_loss_clip": 0.01342422, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.23268116, + "balance_loss_mlp": 1.01989937, + "epoch": 0.7826243799789568, + "flos": 20267344791000.0, + "grad_norm": 1.6277802822460308, + "language_loss": 0.75364709, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77742523, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.15478516, + "step": 13017, + "time_per_iteration": 2.808328151702881 + }, + { + "auxiliary_loss_clip": 0.0134431, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.23358631, + "balance_loss_mlp": 1.01645923, + "epoch": 0.7826845032316249, + "flos": 21136687096680.0, + "grad_norm": 1.9448515008137135, + "language_loss": 0.75343478, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77718753, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14501953, + "step": 13018, + "time_per_iteration": 2.6743831634521484 + }, + { + "auxiliary_loss_clip": 0.01338635, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.23116708, + "balance_loss_mlp": 1.01620078, + "epoch": 0.7827446264842928, + "flos": 22497413452920.0, + "grad_norm": 1.3492569168817758, + "language_loss": 0.77311945, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79679632, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.128479, + "step": 13019, + "time_per_iteration": 2.7872865200042725 + }, + { + "auxiliary_loss_clip": 0.01333852, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.22658432, + "balance_loss_mlp": 1.01515365, + "epoch": 0.7828047497369608, + "flos": 28846621437120.0, + "grad_norm": 1.6834681428638656, + "language_loss": 0.67763239, + "learning_rate": 4.746634805529852e-07, + "loss": 0.70125514, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13275146, + "step": 13020, + "time_per_iteration": 2.7520346641540527 + }, + { + "auxiliary_loss_clip": 0.01347048, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.23897696, + "balance_loss_mlp": 1.0179069, + "epoch": 0.7828648729896287, + "flos": 23262849300600.0, + "grad_norm": 2.384446536376124, + "language_loss": 0.62865472, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.65243578, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13140869, + "step": 13021, + "time_per_iteration": 2.7889151573181152 + }, + { + "auxiliary_loss_clip": 0.013401, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.23350513, + "balance_loss_mlp": 1.02124953, + "epoch": 0.7829249962422967, + "flos": 25271257595760.0, + "grad_norm": 1.5525695027409887, + "language_loss": 0.69501948, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71875203, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11914062, + "step": 13022, + "time_per_iteration": 2.767293691635132 + }, + { + "auxiliary_loss_clip": 0.01153394, + "auxiliary_loss_mlp": 0.01007256, + "balance_loss_clip": 1.11000204, + "balance_loss_mlp": 1.0044899, + "epoch": 0.7829851194949646, + "flos": 70737692280840.0, + "grad_norm": 0.6415082626387401, + "language_loss": 0.56202281, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58362931, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02770996, + "step": 13023, + "time_per_iteration": 4.8976898193359375 + }, + { + "auxiliary_loss_clip": 0.013282, + "auxiliary_loss_mlp": 0.01030373, + "balance_loss_clip": 1.22459102, + "balance_loss_mlp": 1.01801682, + "epoch": 0.7830452427476327, + "flos": 25665621628320.0, + "grad_norm": 1.544539107444457, + "language_loss": 0.67231715, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69590282, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.12371826, + "step": 13024, + "time_per_iteration": 2.7835004329681396 + }, + { + "auxiliary_loss_clip": 0.0134686, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.23694193, + "balance_loss_mlp": 1.01746964, + "epoch": 0.7831053660003006, + "flos": 22789535970240.0, + "grad_norm": 2.071130995775945, + "language_loss": 0.77980506, + "learning_rate": 4.734047044272498e-07, + "loss": 0.80359244, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.144104, + "step": 13025, + "time_per_iteration": 2.7767741680145264 + }, + { + "auxiliary_loss_clip": 0.01339776, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.23300707, + "balance_loss_mlp": 1.01980925, + "epoch": 0.7831654892529686, + "flos": 25817835053520.0, + "grad_norm": 1.695866897882429, + "language_loss": 0.78182, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80554229, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12646484, + "step": 13026, + "time_per_iteration": 2.7814149856567383 + }, + { + "auxiliary_loss_clip": 0.01336466, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.22862411, + "balance_loss_mlp": 1.01571262, + "epoch": 0.7832256125056366, + "flos": 20775726763200.0, + "grad_norm": 1.9297905825186912, + "language_loss": 0.75484848, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77849752, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12738037, + "step": 13027, + "time_per_iteration": 2.777822256088257 + }, + { + "auxiliary_loss_clip": 0.01331222, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.22589004, + "balance_loss_mlp": 1.01567364, + "epoch": 0.7832857357583045, + "flos": 21511494880920.0, + "grad_norm": 1.8191923278256483, + "language_loss": 0.70900404, + "learning_rate": 4.726501333391997e-07, + "loss": 0.73260248, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12963867, + "step": 13028, + "time_per_iteration": 2.8456509113311768 + }, + { + "auxiliary_loss_clip": 0.01346329, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.23491216, + "balance_loss_mlp": 1.02735317, + "epoch": 0.7833458590109725, + "flos": 18082415644200.0, + "grad_norm": 2.1127273767656876, + "language_loss": 0.69216585, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71604735, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14465332, + "step": 13029, + "time_per_iteration": 2.750837802886963 + }, + { + "auxiliary_loss_clip": 0.01346025, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.23422551, + "balance_loss_mlp": 1.01693773, + "epoch": 0.7834059822636404, + "flos": 28294033942080.0, + "grad_norm": 1.8042382353368736, + "language_loss": 0.8127079, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83647621, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.1385498, + "step": 13030, + "time_per_iteration": 2.7961275577545166 + }, + { + "auxiliary_loss_clip": 0.01346668, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.23508072, + "balance_loss_mlp": 1.01985097, + "epoch": 0.7834661055163085, + "flos": 31692511456560.0, + "grad_norm": 1.5284243762937424, + "language_loss": 0.70435882, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72816145, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13739014, + "step": 13031, + "time_per_iteration": 2.91176438331604 + }, + { + "auxiliary_loss_clip": 0.01341557, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.23191774, + "balance_loss_mlp": 1.01843953, + "epoch": 0.7835262287689764, + "flos": 12937131846360.0, + "grad_norm": 1.8050375497296869, + "language_loss": 0.791601, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.81533158, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13061523, + "step": 13032, + "time_per_iteration": 2.7141613960266113 + }, + { + "auxiliary_loss_clip": 0.01349486, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.23867357, + "balance_loss_mlp": 1.02314997, + "epoch": 0.7835863520216444, + "flos": 16147474518240.0, + "grad_norm": 1.978047574218139, + "language_loss": 0.62941957, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65329266, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14660645, + "step": 13033, + "time_per_iteration": 2.752514600753784 + }, + { + "auxiliary_loss_clip": 0.01339189, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.23101413, + "balance_loss_mlp": 1.02383268, + "epoch": 0.7836464752743123, + "flos": 11513618319600.0, + "grad_norm": 1.4110595423772432, + "language_loss": 0.71933609, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74309731, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13104248, + "step": 13034, + "time_per_iteration": 2.884039878845215 + }, + { + "auxiliary_loss_clip": 0.01345608, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.23478758, + "balance_loss_mlp": 1.02360976, + "epoch": 0.7837065985269803, + "flos": 18228334773600.0, + "grad_norm": 1.6099936232195735, + "language_loss": 0.72293425, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.74676353, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.137146, + "step": 13035, + "time_per_iteration": 2.808776378631592 + }, + { + "auxiliary_loss_clip": 0.01347961, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.23831487, + "balance_loss_mlp": 1.02251577, + "epoch": 0.7837667217796482, + "flos": 24759870604920.0, + "grad_norm": 1.890133220549448, + "language_loss": 0.65962321, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68346167, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13391113, + "step": 13036, + "time_per_iteration": 2.844219446182251 + }, + { + "auxiliary_loss_clip": 0.0135585, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.24146819, + "balance_loss_mlp": 1.01785612, + "epoch": 0.7838268450323163, + "flos": 22388309124840.0, + "grad_norm": 1.9463286719864088, + "language_loss": 0.72516048, + "learning_rate": 4.703895486362031e-07, + "loss": 0.74903727, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13970947, + "step": 13037, + "time_per_iteration": 2.6978869438171387 + }, + { + "auxiliary_loss_clip": 0.01341395, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.23219121, + "balance_loss_mlp": 1.02012908, + "epoch": 0.7838869682849842, + "flos": 19505035787040.0, + "grad_norm": 1.9643883602580452, + "language_loss": 0.60317922, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62692928, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13482666, + "step": 13038, + "time_per_iteration": 2.7637381553649902 + }, + { + "auxiliary_loss_clip": 0.01343815, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.23541379, + "balance_loss_mlp": 1.0190866, + "epoch": 0.7839470915376522, + "flos": 32899643703360.0, + "grad_norm": 1.7064342239344272, + "language_loss": 0.68364429, + "learning_rate": 4.698878342684349e-07, + "loss": 0.7073971, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12390137, + "step": 13039, + "time_per_iteration": 2.827371597290039 + }, + { + "auxiliary_loss_clip": 0.01332368, + "auxiliary_loss_mlp": 0.01029932, + "balance_loss_clip": 1.22585225, + "balance_loss_mlp": 1.0179162, + "epoch": 0.7840072147903202, + "flos": 29681585443080.0, + "grad_norm": 1.748437743125101, + "language_loss": 0.69396096, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71758395, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12011719, + "step": 13040, + "time_per_iteration": 2.889462471008301 + }, + { + "auxiliary_loss_clip": 0.01341409, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.2309947, + "balance_loss_mlp": 1.0208205, + "epoch": 0.7840673380429881, + "flos": 18191316930480.0, + "grad_norm": 1.4907354394702028, + "language_loss": 0.67574835, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.6995151, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 1.10302734, + "router_z_loss_mlp": 0.14428711, + "step": 13041, + "time_per_iteration": 2.705665349960327 + }, + { + "auxiliary_loss_clip": 0.01149419, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.10672164, + "balance_loss_mlp": 0.99956626, + "epoch": 0.7841274612956561, + "flos": 66361377258000.0, + "grad_norm": 0.6648130029097828, + "language_loss": 0.57460082, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59611988, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0291748, + "step": 13042, + "time_per_iteration": 3.210519790649414 + }, + { + "auxiliary_loss_clip": 0.01346605, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.23620069, + "balance_loss_mlp": 1.01876378, + "epoch": 0.784187584548324, + "flos": 26653773660120.0, + "grad_norm": 1.7983811101674132, + "language_loss": 0.83976901, + "learning_rate": 4.688851018730369e-07, + "loss": 0.86355996, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13739014, + "step": 13043, + "time_per_iteration": 2.7862048149108887 + }, + { + "auxiliary_loss_clip": 0.0133582, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.23021603, + "balance_loss_mlp": 1.01897824, + "epoch": 0.7842477078009921, + "flos": 25745829785280.0, + "grad_norm": 1.5932871631013323, + "language_loss": 0.88142276, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90509403, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12341309, + "step": 13044, + "time_per_iteration": 2.7592015266418457 + }, + { + "auxiliary_loss_clip": 0.01357962, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.24415433, + "balance_loss_mlp": 1.01613235, + "epoch": 0.78430783105366, + "flos": 21985985853720.0, + "grad_norm": 1.666153680922348, + "language_loss": 0.7918672, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81574386, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13562012, + "step": 13045, + "time_per_iteration": 2.924112319946289 + }, + { + "auxiliary_loss_clip": 0.01337181, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.23091364, + "balance_loss_mlp": 1.01616693, + "epoch": 0.784367954306328, + "flos": 23847622243920.0, + "grad_norm": 1.5284162695100565, + "language_loss": 0.72477806, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74843764, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12615967, + "step": 13046, + "time_per_iteration": 2.7719528675079346 + }, + { + "auxiliary_loss_clip": 0.01334665, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.22954595, + "balance_loss_mlp": 1.01907539, + "epoch": 0.7844280775589959, + "flos": 24832038306600.0, + "grad_norm": 1.4403905002534123, + "language_loss": 0.63397336, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65764588, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.13494873, + "step": 13047, + "time_per_iteration": 2.8120529651641846 + }, + { + "auxiliary_loss_clip": 0.01330874, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.22597075, + "balance_loss_mlp": 1.01664031, + "epoch": 0.7844882008116639, + "flos": 22460801693400.0, + "grad_norm": 1.6163657551312434, + "language_loss": 0.73079979, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75440109, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12609863, + "step": 13048, + "time_per_iteration": 2.782029867172241 + }, + { + "auxiliary_loss_clip": 0.01347282, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.23611832, + "balance_loss_mlp": 1.01713872, + "epoch": 0.7845483240643318, + "flos": 26109632703960.0, + "grad_norm": 3.323659811515746, + "language_loss": 0.75249827, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.77627897, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 1.11279297, + "router_z_loss_mlp": 0.13653564, + "step": 13049, + "time_per_iteration": 2.891986846923828 + }, + { + "auxiliary_loss_clip": 0.01345919, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.23335385, + "balance_loss_mlp": 1.02316654, + "epoch": 0.7846084473169999, + "flos": 19359035440920.0, + "grad_norm": 1.7411730979102003, + "language_loss": 0.72832948, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.75216597, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14556885, + "step": 13050, + "time_per_iteration": 2.762784242630005 + }, + { + "auxiliary_loss_clip": 0.01340326, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.23278236, + "balance_loss_mlp": 1.01882863, + "epoch": 0.7846685705696678, + "flos": 23330347040880.0, + "grad_norm": 1.9058534230816566, + "language_loss": 0.73626608, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75998628, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12860107, + "step": 13051, + "time_per_iteration": 2.787200927734375 + }, + { + "auxiliary_loss_clip": 0.01341491, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.23127413, + "balance_loss_mlp": 1.02126718, + "epoch": 0.7847286938223358, + "flos": 35816442565320.0, + "grad_norm": 2.1423113666416547, + "language_loss": 0.73090863, + "learning_rate": 4.666323514209227e-07, + "loss": 0.75467211, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13586426, + "step": 13052, + "time_per_iteration": 4.503370046615601 + }, + { + "auxiliary_loss_clip": 0.0133069, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.2271775, + "balance_loss_mlp": 1.02335095, + "epoch": 0.7847888170750038, + "flos": 18482708497320.0, + "grad_norm": 2.7557481795343124, + "language_loss": 0.69475091, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71841407, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.1227417, + "step": 13053, + "time_per_iteration": 2.7251460552215576 + }, + { + "auxiliary_loss_clip": 0.01337013, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.23005068, + "balance_loss_mlp": 1.0173707, + "epoch": 0.7848489403276717, + "flos": 25124648124240.0, + "grad_norm": 1.9551733643344538, + "language_loss": 0.70615935, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72982746, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12438965, + "step": 13054, + "time_per_iteration": 5.7824790477752686 + }, + { + "auxiliary_loss_clip": 0.01345784, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.23469973, + "balance_loss_mlp": 1.01952386, + "epoch": 0.7849090635803397, + "flos": 26507692097280.0, + "grad_norm": 1.5056866932371078, + "language_loss": 0.7614876, + "learning_rate": 4.658824808801938e-07, + "loss": 0.78528309, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14239502, + "step": 13055, + "time_per_iteration": 2.7554666996002197 + }, + { + "auxiliary_loss_clip": 0.01350023, + "auxiliary_loss_mlp": 0.01037904, + "balance_loss_clip": 1.23724055, + "balance_loss_mlp": 1.02364635, + "epoch": 0.7849691868330076, + "flos": 20964592556280.0, + "grad_norm": 2.292678823304336, + "language_loss": 0.74852687, + "learning_rate": 4.656326403684283e-07, + "loss": 0.7724061, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14251709, + "step": 13056, + "time_per_iteration": 2.7854623794555664 + }, + { + "auxiliary_loss_clip": 0.01341707, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.23331344, + "balance_loss_mlp": 1.02043521, + "epoch": 0.7850293100856757, + "flos": 26073061552800.0, + "grad_norm": 1.9877777797264387, + "language_loss": 0.70151776, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72527313, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1338501, + "step": 13057, + "time_per_iteration": 2.7751920223236084 + }, + { + "auxiliary_loss_clip": 0.01350959, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.24047756, + "balance_loss_mlp": 1.0186584, + "epoch": 0.7850894333383436, + "flos": 22497007369320.0, + "grad_norm": 1.9229186613711573, + "language_loss": 0.76782513, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.79164994, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12872314, + "step": 13058, + "time_per_iteration": 2.7657272815704346 + }, + { + "auxiliary_loss_clip": 0.01343563, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.23540497, + "balance_loss_mlp": 1.01953208, + "epoch": 0.7851495565910116, + "flos": 20563568752680.0, + "grad_norm": 1.874598965542791, + "language_loss": 0.71302342, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.73678976, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13549805, + "step": 13059, + "time_per_iteration": 2.737539052963257 + }, + { + "auxiliary_loss_clip": 0.01355097, + "auxiliary_loss_mlp": 0.01036132, + "balance_loss_clip": 1.24014843, + "balance_loss_mlp": 1.02189863, + "epoch": 0.7852096798436795, + "flos": 15929144037000.0, + "grad_norm": 1.9450690538283224, + "language_loss": 0.77091432, + "learning_rate": 4.646338602497144e-07, + "loss": 0.79482663, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.14227295, + "step": 13060, + "time_per_iteration": 2.696489095687866 + }, + { + "auxiliary_loss_clip": 0.01343184, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.23395634, + "balance_loss_mlp": 1.01976871, + "epoch": 0.7852698030963475, + "flos": 19066953531960.0, + "grad_norm": 2.009088143224296, + "language_loss": 0.76784062, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79160798, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13763428, + "step": 13061, + "time_per_iteration": 4.284128189086914 + }, + { + "auxiliary_loss_clip": 0.01337513, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.22808242, + "balance_loss_mlp": 1.01907182, + "epoch": 0.7853299263490154, + "flos": 24649751067840.0, + "grad_norm": 1.8442685297371764, + "language_loss": 0.74439311, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76809847, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.1395874, + "step": 13062, + "time_per_iteration": 2.7420215606689453 + }, + { + "auxiliary_loss_clip": 0.01333631, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.22660303, + "balance_loss_mlp": 1.02114201, + "epoch": 0.7853900496016835, + "flos": 22023084913560.0, + "grad_norm": 1.368444616519216, + "language_loss": 0.68625546, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70993412, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13085938, + "step": 13063, + "time_per_iteration": 2.7632009983062744 + }, + { + "auxiliary_loss_clip": 0.01338716, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.23299623, + "balance_loss_mlp": 1.01742756, + "epoch": 0.7854501728543514, + "flos": 30233604421080.0, + "grad_norm": 2.0911561190636414, + "language_loss": 0.7295779, + "learning_rate": 4.636360116707625e-07, + "loss": 0.75326496, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12561035, + "step": 13064, + "time_per_iteration": 2.8669514656066895 + }, + { + "auxiliary_loss_clip": 0.01343028, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.23123848, + "balance_loss_mlp": 1.01521289, + "epoch": 0.7855102961070194, + "flos": 18848379400560.0, + "grad_norm": 1.5758768244379833, + "language_loss": 0.67654228, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70025945, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13476562, + "step": 13065, + "time_per_iteration": 2.764859914779663 + }, + { + "auxiliary_loss_clip": 0.01340789, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.2333312, + "balance_loss_mlp": 1.02138019, + "epoch": 0.7855704193596874, + "flos": 22315126214160.0, + "grad_norm": 1.7129553235954624, + "language_loss": 0.76561636, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.7893672, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12927246, + "step": 13066, + "time_per_iteration": 2.7633204460144043 + }, + { + "auxiliary_loss_clip": 0.01155251, + "auxiliary_loss_mlp": 0.01000252, + "balance_loss_clip": 1.11223531, + "balance_loss_mlp": 0.99736732, + "epoch": 0.7856305426123553, + "flos": 60019884862200.0, + "grad_norm": 0.705132388293483, + "language_loss": 0.53410852, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55566359, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02880859, + "step": 13067, + "time_per_iteration": 3.3035194873809814 + }, + { + "auxiliary_loss_clip": 0.01343715, + "auxiliary_loss_mlp": 0.01035324, + "balance_loss_clip": 1.23350811, + "balance_loss_mlp": 1.0215795, + "epoch": 0.7856906658650233, + "flos": 21873226773240.0, + "grad_norm": 1.9380027440216627, + "language_loss": 0.67587799, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69966835, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13745117, + "step": 13068, + "time_per_iteration": 2.7753074169158936 + }, + { + "auxiliary_loss_clip": 0.01338016, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.23159838, + "balance_loss_mlp": 1.0202347, + "epoch": 0.7857507891176913, + "flos": 23628195336960.0, + "grad_norm": 1.8353115178177377, + "language_loss": 0.68186873, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70557958, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12860107, + "step": 13069, + "time_per_iteration": 2.7520527839660645 + }, + { + "auxiliary_loss_clip": 0.01342512, + "auxiliary_loss_mlp": 0.01040403, + "balance_loss_clip": 1.23332524, + "balance_loss_mlp": 1.02717066, + "epoch": 0.7858109123703593, + "flos": 25525550102760.0, + "grad_norm": 1.942074553400774, + "language_loss": 0.7720232, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.7958523, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13238525, + "step": 13070, + "time_per_iteration": 2.8252460956573486 + }, + { + "auxiliary_loss_clip": 0.01335255, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.22881842, + "balance_loss_mlp": 1.01679611, + "epoch": 0.7858710356230272, + "flos": 17461924325280.0, + "grad_norm": 1.5648207959351745, + "language_loss": 0.66329682, + "learning_rate": 4.618920199958083e-07, + "loss": 0.6869452, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12799072, + "step": 13071, + "time_per_iteration": 2.7970902919769287 + }, + { + "auxiliary_loss_clip": 0.01350458, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.23874831, + "balance_loss_mlp": 1.02146339, + "epoch": 0.7859311588756952, + "flos": 24684900926400.0, + "grad_norm": 1.4551720026380361, + "language_loss": 0.73989773, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76375145, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13439941, + "step": 13072, + "time_per_iteration": 2.7913544178009033 + }, + { + "auxiliary_loss_clip": 0.01349758, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.23980594, + "balance_loss_mlp": 1.0165956, + "epoch": 0.7859912821283631, + "flos": 21804226523640.0, + "grad_norm": 1.710350361310439, + "language_loss": 0.71774721, + "learning_rate": 4.613942614453268e-07, + "loss": 0.74155331, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14245605, + "step": 13073, + "time_per_iteration": 2.8455538749694824 + }, + { + "auxiliary_loss_clip": 0.01339558, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.23019266, + "balance_loss_mlp": 1.01761127, + "epoch": 0.7860514053810311, + "flos": 20851833475800.0, + "grad_norm": 1.6320826653960077, + "language_loss": 0.76608813, + "learning_rate": 4.611454696814938e-07, + "loss": 0.7897954, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13555908, + "step": 13074, + "time_per_iteration": 2.7341513633728027 + }, + { + "auxiliary_loss_clip": 0.01335354, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.22993422, + "balance_loss_mlp": 1.01832676, + "epoch": 0.786111528633699, + "flos": 24320976182640.0, + "grad_norm": 1.4987990023864257, + "language_loss": 0.74901056, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77267659, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12927246, + "step": 13075, + "time_per_iteration": 2.8126003742218018 + }, + { + "auxiliary_loss_clip": 0.01343556, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.23509121, + "balance_loss_mlp": 1.01636183, + "epoch": 0.7861716518863671, + "flos": 24358603151160.0, + "grad_norm": 1.7642425133887063, + "language_loss": 0.68906778, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71279573, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12884521, + "step": 13076, + "time_per_iteration": 2.7582883834838867 + }, + { + "auxiliary_loss_clip": 0.0133935, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.23314071, + "balance_loss_mlp": 1.02188373, + "epoch": 0.786231775139035, + "flos": 14025779233920.0, + "grad_norm": 1.902774877894214, + "language_loss": 0.80152011, + "learning_rate": 4.603994445488282e-07, + "loss": 0.8252629, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.1307373, + "step": 13077, + "time_per_iteration": 2.707257032394409 + }, + { + "auxiliary_loss_clip": 0.01340437, + "auxiliary_loss_mlp": 0.01032671, + "balance_loss_clip": 1.23265028, + "balance_loss_mlp": 1.01926553, + "epoch": 0.786291898391703, + "flos": 33730140789720.0, + "grad_norm": 3.4908758278606267, + "language_loss": 0.70956171, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73329276, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1340332, + "step": 13078, + "time_per_iteration": 2.8197295665740967 + }, + { + "auxiliary_loss_clip": 0.01337569, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.23167908, + "balance_loss_mlp": 1.021662, + "epoch": 0.786352021644371, + "flos": 25816941669600.0, + "grad_norm": 1.4529957851326725, + "language_loss": 0.81530046, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83902335, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13043213, + "step": 13079, + "time_per_iteration": 2.74699330329895 + }, + { + "auxiliary_loss_clip": 0.01332228, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.22811341, + "balance_loss_mlp": 1.01598728, + "epoch": 0.7864121448970389, + "flos": 28916433853920.0, + "grad_norm": 1.4666943671478958, + "language_loss": 0.68437421, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70798612, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12988281, + "step": 13080, + "time_per_iteration": 2.8001036643981934 + }, + { + "auxiliary_loss_clip": 0.01340687, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.23176455, + "balance_loss_mlp": 1.02138543, + "epoch": 0.7864722681497069, + "flos": 19213603611840.0, + "grad_norm": 1.531864414915674, + "language_loss": 0.69912803, + "learning_rate": 4.594055617612016e-07, + "loss": 0.72288859, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13989258, + "step": 13081, + "time_per_iteration": 2.698394536972046 + }, + { + "auxiliary_loss_clip": 0.01349661, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.23958063, + "balance_loss_mlp": 1.02089334, + "epoch": 0.7865323914023749, + "flos": 21876597267120.0, + "grad_norm": 1.8302098114937069, + "language_loss": 0.68894643, + "learning_rate": 4.591572370894838e-07, + "loss": 0.71278298, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13092041, + "step": 13082, + "time_per_iteration": 2.8017261028289795 + }, + { + "auxiliary_loss_clip": 0.01342201, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.23458552, + "balance_loss_mlp": 1.01906538, + "epoch": 0.7865925146550429, + "flos": 25525834361280.0, + "grad_norm": 1.7693067847644375, + "language_loss": 0.66779178, + "learning_rate": 4.589089708466789e-07, + "loss": 0.69153708, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13256836, + "step": 13083, + "time_per_iteration": 2.7781143188476562 + }, + { + "auxiliary_loss_clip": 0.01351424, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.23891497, + "balance_loss_mlp": 1.02033281, + "epoch": 0.7866526379077108, + "flos": 19101859740360.0, + "grad_norm": 1.9338320285425792, + "language_loss": 0.74894553, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.77280498, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14190674, + "step": 13084, + "time_per_iteration": 2.765892505645752 + }, + { + "auxiliary_loss_clip": 0.01334555, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.22887588, + "balance_loss_mlp": 1.01971447, + "epoch": 0.7867127611603788, + "flos": 16177264073280.0, + "grad_norm": 1.8535397856902371, + "language_loss": 0.70778787, + "learning_rate": 4.584126136854591e-07, + "loss": 0.73145032, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11987305, + "step": 13085, + "time_per_iteration": 2.7887117862701416 + }, + { + "auxiliary_loss_clip": 0.01348738, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.23560858, + "balance_loss_mlp": 1.02035034, + "epoch": 0.7867728844130467, + "flos": 20778082048080.0, + "grad_norm": 1.7248359183507531, + "language_loss": 0.72472835, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74855715, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13800049, + "step": 13086, + "time_per_iteration": 2.7484073638916016 + }, + { + "auxiliary_loss_clip": 0.01342037, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.23288655, + "balance_loss_mlp": 1.01971745, + "epoch": 0.7868330076657147, + "flos": 21764853395640.0, + "grad_norm": 1.7266876424197881, + "language_loss": 0.74549884, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76924801, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13153076, + "step": 13087, + "time_per_iteration": 2.7924020290374756 + }, + { + "auxiliary_loss_clip": 0.01336346, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.22958589, + "balance_loss_mlp": 1.02182698, + "epoch": 0.7868931309183826, + "flos": 25705969356960.0, + "grad_norm": 1.5636860133635664, + "language_loss": 0.71467471, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.738379, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12261963, + "step": 13088, + "time_per_iteration": 2.7924246788024902 + }, + { + "auxiliary_loss_clip": 0.01150825, + "auxiliary_loss_mlp": 0.01002827, + "balance_loss_clip": 1.10841703, + "balance_loss_mlp": 0.99975151, + "epoch": 0.7869532541710507, + "flos": 64661822124480.0, + "grad_norm": 0.6733340184751848, + "language_loss": 0.55473512, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57627159, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.03063965, + "step": 13089, + "time_per_iteration": 3.325772762298584 + }, + { + "auxiliary_loss_clip": 0.01152778, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.11069727, + "balance_loss_mlp": 0.99930155, + "epoch": 0.7870133774237186, + "flos": 67470816125880.0, + "grad_norm": 0.7208936268694506, + "language_loss": 0.50054044, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52209181, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.03051758, + "step": 13090, + "time_per_iteration": 3.2856380939483643 + }, + { + "auxiliary_loss_clip": 0.01335758, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.22943711, + "balance_loss_mlp": 1.01860332, + "epoch": 0.7870735006763866, + "flos": 26073995545080.0, + "grad_norm": 1.4727506148838325, + "language_loss": 0.8369593, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.86062807, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12512207, + "step": 13091, + "time_per_iteration": 4.244602918624878 + }, + { + "auxiliary_loss_clip": 0.01152574, + "auxiliary_loss_mlp": 0.0100335, + "balance_loss_clip": 1.11017668, + "balance_loss_mlp": 1.0005604, + "epoch": 0.7871336239290546, + "flos": 70305863713200.0, + "grad_norm": 0.7219858774950728, + "language_loss": 0.6401214, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66168064, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.0279541, + "step": 13092, + "time_per_iteration": 4.667778015136719 + }, + { + "auxiliary_loss_clip": 0.01343177, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.23406887, + "balance_loss_mlp": 1.01910853, + "epoch": 0.7871937471817225, + "flos": 15782818824000.0, + "grad_norm": 2.0248587309058657, + "language_loss": 0.79565752, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81941605, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13549805, + "step": 13093, + "time_per_iteration": 4.275492429733276 + }, + { + "auxiliary_loss_clip": 0.01339269, + "auxiliary_loss_mlp": 0.01027561, + "balance_loss_clip": 1.23230243, + "balance_loss_mlp": 1.01462078, + "epoch": 0.7872538704343905, + "flos": 20490101583480.0, + "grad_norm": 1.8141421337317183, + "language_loss": 0.75927782, + "learning_rate": 4.561819011749106e-07, + "loss": 0.78294617, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12945557, + "step": 13094, + "time_per_iteration": 2.768728256225586 + }, + { + "auxiliary_loss_clip": 0.01341909, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.23156881, + "balance_loss_mlp": 1.02028584, + "epoch": 0.7873139936870585, + "flos": 25088158189800.0, + "grad_norm": 1.8465706608679573, + "language_loss": 0.79764116, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.82139075, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12768555, + "step": 13095, + "time_per_iteration": 2.874197483062744 + }, + { + "auxiliary_loss_clip": 0.01351524, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.23988175, + "balance_loss_mlp": 1.01978719, + "epoch": 0.7873741169397265, + "flos": 30889326815280.0, + "grad_norm": 1.6661718369151062, + "language_loss": 0.68142384, + "learning_rate": 4.556868310016715e-07, + "loss": 0.7052722, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13531494, + "step": 13096, + "time_per_iteration": 2.84043288230896 + }, + { + "auxiliary_loss_clip": 0.01333443, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.22797263, + "balance_loss_mlp": 1.01540351, + "epoch": 0.7874342401923944, + "flos": 46800318915360.0, + "grad_norm": 1.4323926873674118, + "language_loss": 0.70724738, + "learning_rate": 4.55439383751125e-07, + "loss": 0.73085344, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11761475, + "step": 13097, + "time_per_iteration": 2.943164587020874 + }, + { + "auxiliary_loss_clip": 0.01348857, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.23801541, + "balance_loss_mlp": 1.0232017, + "epoch": 0.7874943634450624, + "flos": 23589553159440.0, + "grad_norm": 2.6415514196228083, + "language_loss": 0.80741739, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.83127284, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13482666, + "step": 13098, + "time_per_iteration": 2.9250173568725586 + }, + { + "auxiliary_loss_clip": 0.01343038, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.23583317, + "balance_loss_mlp": 1.01505351, + "epoch": 0.7875544866977303, + "flos": 20195704998000.0, + "grad_norm": 1.533730856379353, + "language_loss": 0.7442801, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76798064, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11968994, + "step": 13099, + "time_per_iteration": 2.753477096557617 + }, + { + "auxiliary_loss_clip": 0.01339701, + "auxiliary_loss_mlp": 0.01024616, + "balance_loss_clip": 1.23267543, + "balance_loss_mlp": 1.01106238, + "epoch": 0.7876146099503983, + "flos": 22607857856880.0, + "grad_norm": 1.4586891057109752, + "language_loss": 0.78642893, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.81007206, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13562012, + "step": 13100, + "time_per_iteration": 2.7669081687927246 + }, + { + "auxiliary_loss_clip": 0.01356566, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.24170399, + "balance_loss_mlp": 1.0201081, + "epoch": 0.7876747332030662, + "flos": 10709256035880.0, + "grad_norm": 2.220248333767889, + "language_loss": 0.66088998, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68479884, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.14233398, + "step": 13101, + "time_per_iteration": 4.21028208732605 + }, + { + "auxiliary_loss_clip": 0.013428, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.23436761, + "balance_loss_mlp": 1.02172279, + "epoch": 0.7877348564557343, + "flos": 38406456351720.0, + "grad_norm": 2.2615150950032534, + "language_loss": 0.77828556, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.80205894, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12817383, + "step": 13102, + "time_per_iteration": 2.88034987449646 + }, + { + "auxiliary_loss_clip": 0.01341093, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.23175073, + "balance_loss_mlp": 1.02587664, + "epoch": 0.7877949797084022, + "flos": 18333703132560.0, + "grad_norm": 1.8774429326475228, + "language_loss": 0.82711047, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.85090351, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12341309, + "step": 13103, + "time_per_iteration": 2.7427585124969482 + }, + { + "auxiliary_loss_clip": 0.01348467, + "auxiliary_loss_mlp": 0.01034925, + "balance_loss_clip": 1.23833096, + "balance_loss_mlp": 1.0217998, + "epoch": 0.7878551029610702, + "flos": 25811581366080.0, + "grad_norm": 2.0175343695616146, + "language_loss": 0.8079778, + "learning_rate": 4.537088934794913e-07, + "loss": 0.83181167, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.13116455, + "step": 13104, + "time_per_iteration": 2.770603895187378 + }, + { + "auxiliary_loss_clip": 0.01344469, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.23521042, + "balance_loss_mlp": 1.01994932, + "epoch": 0.7879152262137382, + "flos": 22347352270800.0, + "grad_norm": 1.6448750608690104, + "language_loss": 0.74146467, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76523888, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13006592, + "step": 13105, + "time_per_iteration": 2.785640001296997 + }, + { + "auxiliary_loss_clip": 0.013462, + "auxiliary_loss_mlp": 0.01040216, + "balance_loss_clip": 1.23384166, + "balance_loss_mlp": 1.02661979, + "epoch": 0.7879753494664061, + "flos": 24790066243560.0, + "grad_norm": 1.6139096475854244, + "language_loss": 0.75924188, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.78310603, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 1.12158203, + "router_z_loss_mlp": 0.13604736, + "step": 13106, + "time_per_iteration": 3.024224281311035 + }, + { + "auxiliary_loss_clip": 0.01344961, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.23572993, + "balance_loss_mlp": 1.01963139, + "epoch": 0.7880354727190741, + "flos": 16913397666240.0, + "grad_norm": 2.196944741293496, + "language_loss": 0.73481476, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75859755, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13696289, + "step": 13107, + "time_per_iteration": 2.7169551849365234 + }, + { + "auxiliary_loss_clip": 0.01336064, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.23002338, + "balance_loss_mlp": 1.01953053, + "epoch": 0.7880955959717421, + "flos": 22234796232120.0, + "grad_norm": 1.4570181753388451, + "language_loss": 0.732656, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75633955, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12780762, + "step": 13108, + "time_per_iteration": 2.8041412830352783 + }, + { + "auxiliary_loss_clip": 0.01151111, + "auxiliary_loss_mlp": 0.01002583, + "balance_loss_clip": 1.10918677, + "balance_loss_mlp": 0.9999243, + "epoch": 0.7881557192244101, + "flos": 69197805529560.0, + "grad_norm": 1.470207376122647, + "language_loss": 0.60411334, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62565017, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.02661133, + "step": 13109, + "time_per_iteration": 3.2143609523773193 + }, + { + "auxiliary_loss_clip": 0.01335846, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.23175883, + "balance_loss_mlp": 1.01885962, + "epoch": 0.788215842477078, + "flos": 24940736551080.0, + "grad_norm": 1.5929281066112668, + "language_loss": 0.72488308, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74855888, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12878418, + "step": 13110, + "time_per_iteration": 2.8056039810180664 + }, + { + "auxiliary_loss_clip": 0.01335388, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.22910273, + "balance_loss_mlp": 1.01756907, + "epoch": 0.788275965729746, + "flos": 26112394072440.0, + "grad_norm": 1.363133507000353, + "language_loss": 0.75370407, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77735615, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12255859, + "step": 13111, + "time_per_iteration": 2.8753249645233154 + }, + { + "auxiliary_loss_clip": 0.01339879, + "auxiliary_loss_mlp": 0.010368, + "balance_loss_clip": 1.23179221, + "balance_loss_mlp": 1.02377057, + "epoch": 0.7883360889824139, + "flos": 21219737838840.0, + "grad_norm": 1.9309856637707252, + "language_loss": 0.62294555, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.6467123, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13018799, + "step": 13112, + "time_per_iteration": 2.779935598373413 + }, + { + "auxiliary_loss_clip": 0.01340998, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.23255312, + "balance_loss_mlp": 1.01789761, + "epoch": 0.7883962122350819, + "flos": 21147570137160.0, + "grad_norm": 1.7732177680218515, + "language_loss": 0.67537004, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69909537, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13641357, + "step": 13113, + "time_per_iteration": 2.853712558746338 + }, + { + "auxiliary_loss_clip": 0.01338137, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.23192418, + "balance_loss_mlp": 1.01843858, + "epoch": 0.7884563354877498, + "flos": 15306906558600.0, + "grad_norm": 3.1887427786722347, + "language_loss": 0.59402239, + "learning_rate": 4.5124174933361e-07, + "loss": 0.61771059, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12243652, + "step": 13114, + "time_per_iteration": 2.696187734603882 + }, + { + "auxiliary_loss_clip": 0.01343055, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.23353302, + "balance_loss_mlp": 1.02265382, + "epoch": 0.7885164587404179, + "flos": 24393549967920.0, + "grad_norm": 1.5936960578759263, + "language_loss": 0.6714744, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69526017, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12872314, + "step": 13115, + "time_per_iteration": 2.794529438018799 + }, + { + "auxiliary_loss_clip": 0.01341116, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.23313355, + "balance_loss_mlp": 1.01719177, + "epoch": 0.7885765819930858, + "flos": 14389135460640.0, + "grad_norm": 1.8313178046488165, + "language_loss": 0.88775176, + "learning_rate": 4.50749024954048e-07, + "loss": 0.91146326, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.128479, + "step": 13116, + "time_per_iteration": 2.7182459831237793 + }, + { + "auxiliary_loss_clip": 0.01356177, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.23965776, + "balance_loss_mlp": 1.02459586, + "epoch": 0.7886367052457538, + "flos": 18264540449520.0, + "grad_norm": 2.3681552259128993, + "language_loss": 0.73006475, + "learning_rate": 4.505027508812245e-07, + "loss": 0.75401723, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.14477539, + "step": 13117, + "time_per_iteration": 2.8038406372070312 + }, + { + "auxiliary_loss_clip": 0.01337926, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.23184705, + "balance_loss_mlp": 1.01923203, + "epoch": 0.7886968284984217, + "flos": 15309383668560.0, + "grad_norm": 1.411766560621819, + "language_loss": 0.80534321, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82903963, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.125, + "step": 13118, + "time_per_iteration": 2.7758843898773193 + }, + { + "auxiliary_loss_clip": 0.01341011, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.23411858, + "balance_loss_mlp": 1.0161047, + "epoch": 0.7887569517510897, + "flos": 21220468789320.0, + "grad_norm": 1.954175032009286, + "language_loss": 0.7320655, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75576603, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12921143, + "step": 13119, + "time_per_iteration": 2.7812018394470215 + }, + { + "auxiliary_loss_clip": 0.0134357, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.23355293, + "balance_loss_mlp": 1.01516306, + "epoch": 0.7888170750037578, + "flos": 22716434276280.0, + "grad_norm": 1.2635759848764465, + "language_loss": 0.72175348, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74547815, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13739014, + "step": 13120, + "time_per_iteration": 2.7370567321777344 + }, + { + "auxiliary_loss_clip": 0.01338701, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.23086953, + "balance_loss_mlp": 1.01902401, + "epoch": 0.7888771982564257, + "flos": 36436040500320.0, + "grad_norm": 1.4727376474536125, + "language_loss": 0.79305375, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.81675673, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12573242, + "step": 13121, + "time_per_iteration": 2.878629207611084 + }, + { + "auxiliary_loss_clip": 0.0133321, + "auxiliary_loss_mlp": 0.01027158, + "balance_loss_clip": 1.22651875, + "balance_loss_mlp": 1.01407456, + "epoch": 0.7889373215090937, + "flos": 27315993391920.0, + "grad_norm": 1.617464280270737, + "language_loss": 0.80376416, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82736784, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13085938, + "step": 13122, + "time_per_iteration": 2.831455707550049 + }, + { + "auxiliary_loss_clip": 0.01339752, + "auxiliary_loss_mlp": 0.01027718, + "balance_loss_clip": 1.22983718, + "balance_loss_mlp": 1.0147301, + "epoch": 0.7889974447617616, + "flos": 19833851280600.0, + "grad_norm": 1.8451714801864436, + "language_loss": 0.7810905, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80476522, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13006592, + "step": 13123, + "time_per_iteration": 2.724510908126831 + }, + { + "auxiliary_loss_clip": 0.01347554, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.23651779, + "balance_loss_mlp": 1.01695216, + "epoch": 0.7890575680144296, + "flos": 17275210775280.0, + "grad_norm": 1.8040744946500804, + "language_loss": 0.67154008, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69531655, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13128662, + "step": 13124, + "time_per_iteration": 2.7539334297180176 + }, + { + "auxiliary_loss_clip": 0.01350293, + "auxiliary_loss_mlp": 0.01033261, + "balance_loss_clip": 1.23802674, + "balance_loss_mlp": 1.02042794, + "epoch": 0.7891176912670975, + "flos": 27605963666160.0, + "grad_norm": 1.8328278655209356, + "language_loss": 0.72865891, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75249451, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.1282959, + "step": 13125, + "time_per_iteration": 2.768310546875 + }, + { + "auxiliary_loss_clip": 0.01344416, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.23368526, + "balance_loss_mlp": 1.01848376, + "epoch": 0.7891778145197655, + "flos": 22716921576600.0, + "grad_norm": 2.848737715268738, + "language_loss": 0.72892934, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.7526952, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13677979, + "step": 13126, + "time_per_iteration": 2.815328598022461 + }, + { + "auxiliary_loss_clip": 0.01344332, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.23356926, + "balance_loss_mlp": 1.01584029, + "epoch": 0.7892379377724335, + "flos": 17315193028680.0, + "grad_norm": 1.5755129846374607, + "language_loss": 0.76650459, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79024625, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13995361, + "step": 13127, + "time_per_iteration": 2.73028302192688 + }, + { + "auxiliary_loss_clip": 0.01334417, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.22936678, + "balance_loss_mlp": 1.01731336, + "epoch": 0.7892980610251015, + "flos": 25781223294000.0, + "grad_norm": 1.6011106410958844, + "language_loss": 0.85990173, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88354743, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12854004, + "step": 13128, + "time_per_iteration": 2.8840813636779785 + }, + { + "auxiliary_loss_clip": 0.01335017, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.22799301, + "balance_loss_mlp": 1.02259314, + "epoch": 0.7893581842777694, + "flos": 21584962050120.0, + "grad_norm": 2.072353314315976, + "language_loss": 0.69906968, + "learning_rate": 4.475520477290904e-07, + "loss": 0.72277093, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12506104, + "step": 13129, + "time_per_iteration": 2.851027488708496 + }, + { + "auxiliary_loss_clip": 0.01148522, + "auxiliary_loss_mlp": 0.01002082, + "balance_loss_clip": 1.10601676, + "balance_loss_mlp": 0.99954265, + "epoch": 0.7894183075304374, + "flos": 69033101710680.0, + "grad_norm": 0.7139763134237993, + "language_loss": 0.61632025, + "learning_rate": 4.473065382260597e-07, + "loss": 0.6378262, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02539062, + "step": 13130, + "time_per_iteration": 4.816535234451294 + }, + { + "auxiliary_loss_clip": 0.01346303, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.23595071, + "balance_loss_mlp": 1.01853931, + "epoch": 0.7894784307831053, + "flos": 24248605439160.0, + "grad_norm": 1.6935102553566808, + "language_loss": 0.73731935, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76109552, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12774658, + "step": 13131, + "time_per_iteration": 5.71938681602478 + }, + { + "auxiliary_loss_clip": 0.01365848, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.24618649, + "balance_loss_mlp": 1.01491094, + "epoch": 0.7895385540357733, + "flos": 20271405627000.0, + "grad_norm": 2.5670323035387077, + "language_loss": 0.69206071, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71602058, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.15222168, + "step": 13132, + "time_per_iteration": 2.746851921081543 + }, + { + "auxiliary_loss_clip": 0.0134385, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.23291206, + "balance_loss_mlp": 1.02482879, + "epoch": 0.7895986772884414, + "flos": 21001569791040.0, + "grad_norm": 1.792175914244658, + "language_loss": 0.62626177, + "learning_rate": 4.465703630239468e-07, + "loss": 0.65008432, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 1.11083984, + "router_z_loss_mlp": 0.13580322, + "step": 13133, + "time_per_iteration": 2.834688901901245 + }, + { + "auxiliary_loss_clip": 0.01341808, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.23003054, + "balance_loss_mlp": 1.01932645, + "epoch": 0.7896588005411093, + "flos": 18662234367600.0, + "grad_norm": 2.480563436736113, + "language_loss": 0.80041182, + "learning_rate": 4.463250890899195e-07, + "loss": 0.82416528, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14221191, + "step": 13134, + "time_per_iteration": 2.6953115463256836 + }, + { + "auxiliary_loss_clip": 0.01345431, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.23484886, + "balance_loss_mlp": 1.02005243, + "epoch": 0.7897189237937773, + "flos": 18410540795640.0, + "grad_norm": 1.7838483927387956, + "language_loss": 0.79950714, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82329828, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1362915, + "step": 13135, + "time_per_iteration": 2.7154932022094727 + }, + { + "auxiliary_loss_clip": 0.01338219, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.23100734, + "balance_loss_mlp": 1.02120829, + "epoch": 0.7897790470464452, + "flos": 23736771756360.0, + "grad_norm": 1.5388377989531388, + "language_loss": 0.72408891, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74781847, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13531494, + "step": 13136, + "time_per_iteration": 2.8119449615478516 + }, + { + "auxiliary_loss_clip": 0.01352335, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.23715162, + "balance_loss_mlp": 1.02873421, + "epoch": 0.7898391702991132, + "flos": 15922484265960.0, + "grad_norm": 1.8549290661645994, + "language_loss": 0.71209955, + "learning_rate": 4.455896208180778e-07, + "loss": 0.7360487, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.1385498, + "step": 13137, + "time_per_iteration": 2.8200724124908447 + }, + { + "auxiliary_loss_clip": 0.01337761, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.2313261, + "balance_loss_mlp": 1.02422118, + "epoch": 0.7898992935517811, + "flos": 19833891888960.0, + "grad_norm": 1.8946930770425028, + "language_loss": 0.74460113, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.76835895, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13806152, + "step": 13138, + "time_per_iteration": 4.239608287811279 + }, + { + "auxiliary_loss_clip": 0.01338848, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.23058629, + "balance_loss_mlp": 1.02319288, + "epoch": 0.7899594168044491, + "flos": 16220535603840.0, + "grad_norm": 2.4562760011646243, + "language_loss": 0.68968987, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.71344042, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13006592, + "step": 13139, + "time_per_iteration": 2.7850310802459717 + }, + { + "auxiliary_loss_clip": 0.01147891, + "auxiliary_loss_mlp": 0.01003895, + "balance_loss_clip": 1.10571194, + "balance_loss_mlp": 1.00139141, + "epoch": 0.790019540057117, + "flos": 68348564362080.0, + "grad_norm": 0.9295912913029297, + "language_loss": 0.60278475, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62430263, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02502441, + "step": 13140, + "time_per_iteration": 3.322828531265259 + }, + { + "auxiliary_loss_clip": 0.01348585, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.23927999, + "balance_loss_mlp": 1.02073836, + "epoch": 0.7900796633097851, + "flos": 30338444871360.0, + "grad_norm": 1.5136283064960676, + "language_loss": 0.76194501, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78577858, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.14038086, + "step": 13141, + "time_per_iteration": 2.9964418411254883 + }, + { + "auxiliary_loss_clip": 0.01345018, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.23426902, + "balance_loss_mlp": 1.02335608, + "epoch": 0.790139786562453, + "flos": 22131823766400.0, + "grad_norm": 2.8809789356133786, + "language_loss": 0.68512475, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70895028, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14178467, + "step": 13142, + "time_per_iteration": 2.8908021450042725 + }, + { + "auxiliary_loss_clip": 0.01148388, + "auxiliary_loss_mlp": 0.01003968, + "balance_loss_clip": 1.10635006, + "balance_loss_mlp": 1.00121439, + "epoch": 0.790199909815121, + "flos": 58221279292680.0, + "grad_norm": 0.8221884748083315, + "language_loss": 0.60057521, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62209868, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.02758789, + "step": 13143, + "time_per_iteration": 3.0192432403564453 + }, + { + "auxiliary_loss_clip": 0.01347264, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.23616087, + "balance_loss_mlp": 1.01845038, + "epoch": 0.7902600330677889, + "flos": 34540147635480.0, + "grad_norm": 1.4474807107832326, + "language_loss": 0.7448895, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.7686826, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13598633, + "step": 13144, + "time_per_iteration": 2.8845601081848145 + }, + { + "auxiliary_loss_clip": 0.01342394, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.23177493, + "balance_loss_mlp": 1.01949549, + "epoch": 0.7903201563204569, + "flos": 22351534931880.0, + "grad_norm": 1.8276585575169977, + "language_loss": 0.83319765, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85695738, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14080811, + "step": 13145, + "time_per_iteration": 2.879244089126587 + }, + { + "auxiliary_loss_clip": 0.01331651, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.22595108, + "balance_loss_mlp": 1.01818526, + "epoch": 0.790380279573125, + "flos": 22058519030640.0, + "grad_norm": 1.5078098670224693, + "language_loss": 0.73154414, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75516772, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12518311, + "step": 13146, + "time_per_iteration": 2.834336519241333 + }, + { + "auxiliary_loss_clip": 0.01343937, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.2320919, + "balance_loss_mlp": 1.01582408, + "epoch": 0.7904404028257929, + "flos": 20307408261120.0, + "grad_norm": 1.9995768780912033, + "language_loss": 0.75850081, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.78222942, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13110352, + "step": 13147, + "time_per_iteration": 2.7976233959198 + }, + { + "auxiliary_loss_clip": 0.01338462, + "auxiliary_loss_mlp": 0.01037072, + "balance_loss_clip": 1.2311157, + "balance_loss_mlp": 1.0232141, + "epoch": 0.7905005260784609, + "flos": 20013458367600.0, + "grad_norm": 1.757181469965702, + "language_loss": 0.72244275, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74619812, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.13842773, + "step": 13148, + "time_per_iteration": 2.775127649307251 + }, + { + "auxiliary_loss_clip": 0.01343014, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.23302865, + "balance_loss_mlp": 1.01915681, + "epoch": 0.7905606493311288, + "flos": 26911517877720.0, + "grad_norm": 1.6466965667365752, + "language_loss": 0.71754521, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.74130356, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13665771, + "step": 13149, + "time_per_iteration": 2.7795135974884033 + }, + { + "auxiliary_loss_clip": 0.01343812, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.23274314, + "balance_loss_mlp": 1.0184902, + "epoch": 0.7906207725837968, + "flos": 23701540681080.0, + "grad_norm": 1.8991972440169453, + "language_loss": 0.65138543, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67515564, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.14733887, + "step": 13150, + "time_per_iteration": 2.93011474609375 + }, + { + "auxiliary_loss_clip": 0.0134163, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.23247266, + "balance_loss_mlp": 1.01952243, + "epoch": 0.7906808958364647, + "flos": 20853579635280.0, + "grad_norm": 1.6512101157075805, + "language_loss": 0.70271504, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72645557, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12884521, + "step": 13151, + "time_per_iteration": 2.738612174987793 + }, + { + "auxiliary_loss_clip": 0.0135097, + "auxiliary_loss_mlp": 0.01043255, + "balance_loss_clip": 1.23909283, + "balance_loss_mlp": 1.02909255, + "epoch": 0.7907410190891327, + "flos": 40742746148160.0, + "grad_norm": 1.701280059607747, + "language_loss": 0.7023077, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72624993, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.1416626, + "step": 13152, + "time_per_iteration": 2.91650390625 + }, + { + "auxiliary_loss_clip": 0.01341728, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.23247576, + "balance_loss_mlp": 1.01652348, + "epoch": 0.7908011423418007, + "flos": 13265094564360.0, + "grad_norm": 1.865064147370942, + "language_loss": 0.7284736, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.75218451, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12854004, + "step": 13153, + "time_per_iteration": 2.741959810256958 + }, + { + "auxiliary_loss_clip": 0.0134265, + "auxiliary_loss_mlp": 0.01037712, + "balance_loss_clip": 1.23411393, + "balance_loss_mlp": 1.02455139, + "epoch": 0.7908612655944687, + "flos": 19759409510760.0, + "grad_norm": 1.5531072429554793, + "language_loss": 0.79173613, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81553978, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13153076, + "step": 13154, + "time_per_iteration": 2.7758195400238037 + }, + { + "auxiliary_loss_clip": 0.01358077, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.2419194, + "balance_loss_mlp": 1.01680303, + "epoch": 0.7909213888471366, + "flos": 21292677099360.0, + "grad_norm": 2.008226583586474, + "language_loss": 0.70524412, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72914231, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.14929199, + "step": 13155, + "time_per_iteration": 2.777174234390259 + }, + { + "auxiliary_loss_clip": 0.01347875, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.23748171, + "balance_loss_mlp": 1.02174997, + "epoch": 0.7909815120998046, + "flos": 22534065820800.0, + "grad_norm": 3.215649332492916, + "language_loss": 0.7693212, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79315728, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 1.10400391, + "router_z_loss_mlp": 0.13989258, + "step": 13156, + "time_per_iteration": 2.736686944961548 + }, + { + "auxiliary_loss_clip": 0.01346663, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.23780179, + "balance_loss_mlp": 1.02215207, + "epoch": 0.7910416353524725, + "flos": 26733819383640.0, + "grad_norm": 1.5388420066533441, + "language_loss": 0.65541744, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67924166, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13604736, + "step": 13157, + "time_per_iteration": 2.852804660797119 + }, + { + "auxiliary_loss_clip": 0.01345419, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.23413563, + "balance_loss_mlp": 1.02471447, + "epoch": 0.7911017586051405, + "flos": 24650278976520.0, + "grad_norm": 1.6322694498996966, + "language_loss": 0.74331355, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76716083, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14593506, + "step": 13158, + "time_per_iteration": 2.7907896041870117 + }, + { + "auxiliary_loss_clip": 0.01334811, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.22817945, + "balance_loss_mlp": 1.02040958, + "epoch": 0.7911618818578086, + "flos": 17569851010920.0, + "grad_norm": 2.538615279191666, + "language_loss": 0.67320287, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69688106, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12609863, + "step": 13159, + "time_per_iteration": 2.726594924926758 + }, + { + "auxiliary_loss_clip": 0.0133374, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.22661185, + "balance_loss_mlp": 1.02185512, + "epoch": 0.7912220051104765, + "flos": 16725303432000.0, + "grad_norm": 1.7749549089032248, + "language_loss": 0.67038178, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69406426, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12658691, + "step": 13160, + "time_per_iteration": 2.7233636379241943 + }, + { + "auxiliary_loss_clip": 0.01327602, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.22401595, + "balance_loss_mlp": 1.01634121, + "epoch": 0.7912821283631445, + "flos": 13702689519120.0, + "grad_norm": 1.8520263600609161, + "language_loss": 0.72847247, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75202656, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.11468506, + "step": 13161, + "time_per_iteration": 2.708738088607788 + }, + { + "auxiliary_loss_clip": 0.01337725, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.23145342, + "balance_loss_mlp": 1.01375842, + "epoch": 0.7913422516158124, + "flos": 23774317508160.0, + "grad_norm": 2.1083591465262312, + "language_loss": 0.7339552, + "learning_rate": 4.39481372557418e-07, + "loss": 0.7576021, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13214111, + "step": 13162, + "time_per_iteration": 2.8579635620117188 + }, + { + "auxiliary_loss_clip": 0.01347227, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.2354033, + "balance_loss_mlp": 1.01819205, + "epoch": 0.7914023748684804, + "flos": 19943077433760.0, + "grad_norm": 1.9093396362592099, + "language_loss": 0.72145462, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74524117, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13220215, + "step": 13163, + "time_per_iteration": 2.772956371307373 + }, + { + "auxiliary_loss_clip": 0.01340577, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.23244834, + "balance_loss_mlp": 1.01528096, + "epoch": 0.7914624981211483, + "flos": 20599693211880.0, + "grad_norm": 1.8108535986880856, + "language_loss": 0.69660449, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72030258, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1395874, + "step": 13164, + "time_per_iteration": 2.749755859375 + }, + { + "auxiliary_loss_clip": 0.01337813, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.22988987, + "balance_loss_mlp": 1.01926088, + "epoch": 0.7915226213738163, + "flos": 21804591998880.0, + "grad_norm": 1.9574568142158697, + "language_loss": 0.66886371, + "learning_rate": 4.387508652677177e-07, + "loss": 0.69256914, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13464355, + "step": 13165, + "time_per_iteration": 2.7363064289093018 + }, + { + "auxiliary_loss_clip": 0.01332214, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.22746956, + "balance_loss_mlp": 1.01540446, + "epoch": 0.7915827446264843, + "flos": 16292540872080.0, + "grad_norm": 6.593423710336049, + "language_loss": 0.7224291, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74603188, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12664795, + "step": 13166, + "time_per_iteration": 2.871825933456421 + }, + { + "auxiliary_loss_clip": 0.01332249, + "auxiliary_loss_mlp": 0.01035304, + "balance_loss_clip": 1.2244978, + "balance_loss_mlp": 1.02152967, + "epoch": 0.7916428678791523, + "flos": 25708121600040.0, + "grad_norm": 1.5829296107073259, + "language_loss": 0.776335, + "learning_rate": 4.382641564061462e-07, + "loss": 0.8000105, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13769531, + "step": 13167, + "time_per_iteration": 2.7999236583709717 + }, + { + "auxiliary_loss_clip": 0.01336208, + "auxiliary_loss_mlp": 0.01031506, + "balance_loss_clip": 1.22943115, + "balance_loss_mlp": 1.01910865, + "epoch": 0.7917029911318202, + "flos": 23883827919840.0, + "grad_norm": 1.6388864558308973, + "language_loss": 0.84459627, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86827338, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.1239624, + "step": 13168, + "time_per_iteration": 4.315226793289185 + }, + { + "auxiliary_loss_clip": 0.01339607, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.23098326, + "balance_loss_mlp": 1.01859069, + "epoch": 0.7917631143844882, + "flos": 21650794847640.0, + "grad_norm": 1.6941253134711904, + "language_loss": 0.7307415, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.75445384, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13024902, + "step": 13169, + "time_per_iteration": 2.7944741249084473 + }, + { + "auxiliary_loss_clip": 0.01347597, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.23639107, + "balance_loss_mlp": 1.02291942, + "epoch": 0.7918232376371561, + "flos": 38881475233200.0, + "grad_norm": 1.9394289584352804, + "language_loss": 0.66866475, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69250935, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13934326, + "step": 13170, + "time_per_iteration": 5.799527168273926 + }, + { + "auxiliary_loss_clip": 0.01346039, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.23723722, + "balance_loss_mlp": 1.01762962, + "epoch": 0.7918833608898241, + "flos": 20780234291160.0, + "grad_norm": 1.574491895339915, + "language_loss": 0.7121039, + "learning_rate": 4.372914494109412e-07, + "loss": 0.7358675, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12701416, + "step": 13171, + "time_per_iteration": 2.7280545234680176 + }, + { + "auxiliary_loss_clip": 0.01338398, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.23004878, + "balance_loss_mlp": 1.01731455, + "epoch": 0.7919434841424922, + "flos": 33916488864480.0, + "grad_norm": 1.7063476726416142, + "language_loss": 0.67801833, + "learning_rate": 4.370484207842553e-07, + "loss": 0.70170522, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1295166, + "step": 13172, + "time_per_iteration": 2.885631799697876 + }, + { + "auxiliary_loss_clip": 0.0133826, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.22978616, + "balance_loss_mlp": 1.01510072, + "epoch": 0.7920036073951601, + "flos": 21069026922960.0, + "grad_norm": 1.7135611763930432, + "language_loss": 0.79987848, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.82354856, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13659668, + "step": 13173, + "time_per_iteration": 2.810056447982788 + }, + { + "auxiliary_loss_clip": 0.01339952, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.23115826, + "balance_loss_mlp": 1.01935291, + "epoch": 0.7920637306478281, + "flos": 23660786868840.0, + "grad_norm": 1.8839138664715982, + "language_loss": 0.76870465, + "learning_rate": 4.365625413419365e-07, + "loss": 0.79241961, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12188721, + "step": 13174, + "time_per_iteration": 2.727105140686035 + }, + { + "auxiliary_loss_clip": 0.01339303, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.23335421, + "balance_loss_mlp": 1.02282429, + "epoch": 0.792123853900496, + "flos": 27200757201480.0, + "grad_norm": 2.221079164230076, + "language_loss": 0.71739221, + "learning_rate": 4.363196905447297e-07, + "loss": 0.74113369, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12023926, + "step": 13175, + "time_per_iteration": 2.817335605621338 + }, + { + "auxiliary_loss_clip": 0.01337828, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.22934842, + "balance_loss_mlp": 1.01797462, + "epoch": 0.792183977153164, + "flos": 19103321641320.0, + "grad_norm": 1.8889454381401307, + "language_loss": 0.59847069, + "learning_rate": 4.360768990424364e-07, + "loss": 0.62216103, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13238525, + "step": 13176, + "time_per_iteration": 2.6778059005737305 + }, + { + "auxiliary_loss_clip": 0.01339558, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.233132, + "balance_loss_mlp": 1.01989985, + "epoch": 0.7922441004058319, + "flos": 17133311873520.0, + "grad_norm": 1.7432447276317398, + "language_loss": 0.73402089, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75775218, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13665771, + "step": 13177, + "time_per_iteration": 4.2130372524261475 + }, + { + "auxiliary_loss_clip": 0.01336751, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.23070776, + "balance_loss_mlp": 1.02295852, + "epoch": 0.7923042236585, + "flos": 17826133327560.0, + "grad_norm": 2.0079980487543563, + "language_loss": 0.64312279, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66684741, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12744141, + "step": 13178, + "time_per_iteration": 2.839745044708252 + }, + { + "auxiliary_loss_clip": 0.01335923, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.22801876, + "balance_loss_mlp": 1.01737297, + "epoch": 0.7923643469111679, + "flos": 29941319470320.0, + "grad_norm": 1.429679784414081, + "language_loss": 0.68756616, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.71121621, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.11724854, + "step": 13179, + "time_per_iteration": 2.7726528644561768 + }, + { + "auxiliary_loss_clip": 0.01328292, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.22164917, + "balance_loss_mlp": 1.01997662, + "epoch": 0.7924244701638359, + "flos": 22679903733480.0, + "grad_norm": 1.8630635426109485, + "language_loss": 0.74716389, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.77077818, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13153076, + "step": 13180, + "time_per_iteration": 2.7464182376861572 + }, + { + "auxiliary_loss_clip": 0.01344719, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.23419356, + "balance_loss_mlp": 1.02080476, + "epoch": 0.7924845934165038, + "flos": 17972742799080.0, + "grad_norm": 1.8866006479009807, + "language_loss": 0.81446886, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83826864, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14434814, + "step": 13181, + "time_per_iteration": 2.855288505554199 + }, + { + "auxiliary_loss_clip": 0.01336515, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.22877705, + "balance_loss_mlp": 1.01940894, + "epoch": 0.7925447166691718, + "flos": 23482194990840.0, + "grad_norm": 1.8786342046301727, + "language_loss": 0.77736127, + "learning_rate": 4.346213957372895e-07, + "loss": 0.80105948, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13897705, + "step": 13182, + "time_per_iteration": 2.883291244506836 + }, + { + "auxiliary_loss_clip": 0.01349172, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.23561835, + "balance_loss_mlp": 1.02254939, + "epoch": 0.7926048399218397, + "flos": 20452271573160.0, + "grad_norm": 1.9640156656813645, + "language_loss": 0.74512041, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76898485, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.1472168, + "step": 13183, + "time_per_iteration": 2.7241692543029785 + }, + { + "auxiliary_loss_clip": 0.01337861, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.22991967, + "balance_loss_mlp": 1.01556671, + "epoch": 0.7926649631745077, + "flos": 37167179265000.0, + "grad_norm": 1.6054007189491981, + "language_loss": 0.68350917, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70717096, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12768555, + "step": 13184, + "time_per_iteration": 2.916349172592163 + }, + { + "auxiliary_loss_clip": 0.01346481, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.23672438, + "balance_loss_mlp": 1.01799011, + "epoch": 0.7927250864271758, + "flos": 17022948686280.0, + "grad_norm": 1.807993359868464, + "language_loss": 0.71129465, + "learning_rate": 4.338944453112907e-07, + "loss": 0.73507529, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13604736, + "step": 13185, + "time_per_iteration": 2.7631633281707764 + }, + { + "auxiliary_loss_clip": 0.01345895, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.23439038, + "balance_loss_mlp": 1.01742792, + "epoch": 0.7927852096798437, + "flos": 17753762584080.0, + "grad_norm": 1.9590509196447017, + "language_loss": 0.65350509, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.6772759, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13757324, + "step": 13186, + "time_per_iteration": 2.805846691131592 + }, + { + "auxiliary_loss_clip": 0.01333358, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.22603798, + "balance_loss_mlp": 1.01845074, + "epoch": 0.7928453329325117, + "flos": 23843277149400.0, + "grad_norm": 2.250076305189251, + "language_loss": 0.7726801, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79632425, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1260376, + "step": 13187, + "time_per_iteration": 2.7909393310546875 + }, + { + "auxiliary_loss_clip": 0.01337959, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.23070228, + "balance_loss_mlp": 1.02252007, + "epoch": 0.7929054561851796, + "flos": 17459122348440.0, + "grad_norm": 2.049525230687138, + "language_loss": 0.72639358, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.75012869, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13037109, + "step": 13188, + "time_per_iteration": 2.678314208984375 + }, + { + "auxiliary_loss_clip": 0.01344658, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.23370039, + "balance_loss_mlp": 1.02214193, + "epoch": 0.7929655794378476, + "flos": 21986351328960.0, + "grad_norm": 2.693967209525575, + "language_loss": 0.63035274, + "learning_rate": 4.329260095357725e-07, + "loss": 0.65416193, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14117432, + "step": 13189, + "time_per_iteration": 2.8054234981536865 + }, + { + "auxiliary_loss_clip": 0.01342211, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.23334551, + "balance_loss_mlp": 1.02074099, + "epoch": 0.7930257026905155, + "flos": 17278297010640.0, + "grad_norm": 1.8719923193154826, + "language_loss": 0.72541142, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74917161, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.13061523, + "step": 13190, + "time_per_iteration": 2.7730183601379395 + }, + { + "auxiliary_loss_clip": 0.01336043, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.23102427, + "balance_loss_mlp": 1.01851082, + "epoch": 0.7930858259431836, + "flos": 27305354001600.0, + "grad_norm": 1.640034284299937, + "language_loss": 0.73321939, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75688326, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.1184082, + "step": 13191, + "time_per_iteration": 2.76373553276062 + }, + { + "auxiliary_loss_clip": 0.01340332, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.23181915, + "balance_loss_mlp": 1.02298188, + "epoch": 0.7931459491958515, + "flos": 19868351405400.0, + "grad_norm": 1.6831471855363653, + "language_loss": 0.69599378, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71976548, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1385498, + "step": 13192, + "time_per_iteration": 2.714418649673462 + }, + { + "auxiliary_loss_clip": 0.0134548, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.236094, + "balance_loss_mlp": 1.01885939, + "epoch": 0.7932060724485195, + "flos": 23152120638120.0, + "grad_norm": 1.5611868115297833, + "language_loss": 0.75508898, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77885973, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1272583, + "step": 13193, + "time_per_iteration": 2.715632915496826 + }, + { + "auxiliary_loss_clip": 0.01342949, + "auxiliary_loss_mlp": 0.01038982, + "balance_loss_clip": 1.23399174, + "balance_loss_mlp": 1.02439713, + "epoch": 0.7932661957011874, + "flos": 29940913386720.0, + "grad_norm": 1.4277950009908404, + "language_loss": 0.72489101, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74871033, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.14599609, + "step": 13194, + "time_per_iteration": 2.787461519241333 + }, + { + "auxiliary_loss_clip": 0.01351521, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.23811591, + "balance_loss_mlp": 1.01938272, + "epoch": 0.7933263189538554, + "flos": 22563489900600.0, + "grad_norm": 1.8920110819923799, + "language_loss": 0.70376122, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72761005, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13964844, + "step": 13195, + "time_per_iteration": 2.768508195877075 + }, + { + "auxiliary_loss_clip": 0.01340722, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.23238719, + "balance_loss_mlp": 1.02107716, + "epoch": 0.7933864422065233, + "flos": 25483943514960.0, + "grad_norm": 1.4976375355472318, + "language_loss": 0.77462053, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79836971, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13122559, + "step": 13196, + "time_per_iteration": 2.8862407207489014 + }, + { + "auxiliary_loss_clip": 0.01346217, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.23574436, + "balance_loss_mlp": 1.02723873, + "epoch": 0.7934465654591913, + "flos": 33589663180560.0, + "grad_norm": 1.4615581996131655, + "language_loss": 0.68827438, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71214318, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13421631, + "step": 13197, + "time_per_iteration": 2.815046787261963 + }, + { + "auxiliary_loss_clip": 0.01341631, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.23329186, + "balance_loss_mlp": 1.01979685, + "epoch": 0.7935066887118594, + "flos": 31439031116760.0, + "grad_norm": 1.5606637590334287, + "language_loss": 0.65162301, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67536914, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13171387, + "step": 13198, + "time_per_iteration": 2.8531107902526855 + }, + { + "auxiliary_loss_clip": 0.01346309, + "auxiliary_loss_mlp": 0.01034188, + "balance_loss_clip": 1.23471665, + "balance_loss_mlp": 1.01972222, + "epoch": 0.7935668119645273, + "flos": 14686537064760.0, + "grad_norm": 2.10251860136883, + "language_loss": 0.7322886, + "learning_rate": 4.30509081032864e-07, + "loss": 0.75609356, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14459229, + "step": 13199, + "time_per_iteration": 2.6888673305511475 + }, + { + "auxiliary_loss_clip": 0.0134475, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.23546684, + "balance_loss_mlp": 1.01753473, + "epoch": 0.7936269352171953, + "flos": 18008786041560.0, + "grad_norm": 2.1304174135669065, + "language_loss": 0.80947995, + "learning_rate": 4.302677153653349e-07, + "loss": 0.83322775, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12506104, + "step": 13200, + "time_per_iteration": 2.827812910079956 + }, + { + "auxiliary_loss_clip": 0.01334282, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.23042858, + "balance_loss_mlp": 1.01628113, + "epoch": 0.7936870584698632, + "flos": 18884909943360.0, + "grad_norm": 1.6040267369095944, + "language_loss": 0.77628899, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79992175, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.12713623, + "step": 13201, + "time_per_iteration": 2.7046256065368652 + }, + { + "auxiliary_loss_clip": 0.01334635, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.22743559, + "balance_loss_mlp": 1.02260756, + "epoch": 0.7937471817225312, + "flos": 23372156670480.0, + "grad_norm": 1.5035488479504895, + "language_loss": 0.67478013, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69848359, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13122559, + "step": 13202, + "time_per_iteration": 2.7857329845428467 + }, + { + "auxiliary_loss_clip": 0.01344096, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.23426533, + "balance_loss_mlp": 1.01996326, + "epoch": 0.7938073049751991, + "flos": 22679578866600.0, + "grad_norm": 2.093850116001342, + "language_loss": 0.74812496, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.771909, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14349365, + "step": 13203, + "time_per_iteration": 2.7961010932922363 + }, + { + "auxiliary_loss_clip": 0.01343282, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.23332787, + "balance_loss_mlp": 1.02074814, + "epoch": 0.7938674282278672, + "flos": 22855896676440.0, + "grad_norm": 2.2989097209767793, + "language_loss": 0.66672146, + "learning_rate": 4.293028480307643e-07, + "loss": 0.69048733, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12554932, + "step": 13204, + "time_per_iteration": 2.8372273445129395 + }, + { + "auxiliary_loss_clip": 0.01334833, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.22737277, + "balance_loss_mlp": 1.02042747, + "epoch": 0.7939275514805351, + "flos": 27017739012240.0, + "grad_norm": 1.3394403123776621, + "language_loss": 0.79626298, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81994772, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.13226318, + "step": 13205, + "time_per_iteration": 2.796049118041992 + }, + { + "auxiliary_loss_clip": 0.01333975, + "auxiliary_loss_mlp": 0.01029286, + "balance_loss_clip": 1.22830343, + "balance_loss_mlp": 1.01614881, + "epoch": 0.7939876747332031, + "flos": 21147895004040.0, + "grad_norm": 2.0015610498505025, + "language_loss": 0.77610731, + "learning_rate": 4.28820771692858e-07, + "loss": 0.7997399, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13140869, + "step": 13206, + "time_per_iteration": 2.709529161453247 + }, + { + "auxiliary_loss_clip": 0.01348439, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.23671389, + "balance_loss_mlp": 1.01831532, + "epoch": 0.794047797985871, + "flos": 23293572847920.0, + "grad_norm": 4.8189227649527115, + "language_loss": 0.79001594, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81381744, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13391113, + "step": 13207, + "time_per_iteration": 2.9075605869293213 + }, + { + "auxiliary_loss_clip": 0.01341253, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.23309159, + "balance_loss_mlp": 1.02332914, + "epoch": 0.794107921238539, + "flos": 24613464175200.0, + "grad_norm": 1.6974087628058694, + "language_loss": 0.83698142, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.8607614, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13427734, + "step": 13208, + "time_per_iteration": 5.665689468383789 + }, + { + "auxiliary_loss_clip": 0.01152479, + "auxiliary_loss_mlp": 0.0100089, + "balance_loss_clip": 1.10894096, + "balance_loss_mlp": 0.99818385, + "epoch": 0.7941680444912069, + "flos": 64110046796640.0, + "grad_norm": 0.7267288957066805, + "language_loss": 0.58442318, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60595685, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.02709961, + "step": 13209, + "time_per_iteration": 4.868506193161011 + }, + { + "auxiliary_loss_clip": 0.01348369, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.23538423, + "balance_loss_mlp": 1.02103686, + "epoch": 0.794228167743875, + "flos": 24394159093320.0, + "grad_norm": 2.706921847608261, + "language_loss": 0.63641942, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.66025627, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14294434, + "step": 13210, + "time_per_iteration": 2.7932145595550537 + }, + { + "auxiliary_loss_clip": 0.0134131, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.23291779, + "balance_loss_mlp": 1.0227071, + "epoch": 0.794288290996543, + "flos": 28518455677320.0, + "grad_norm": 2.101512886812407, + "language_loss": 0.69118583, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71495813, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13214111, + "step": 13211, + "time_per_iteration": 2.77661395072937 + }, + { + "auxiliary_loss_clip": 0.01348031, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.2355516, + "balance_loss_mlp": 1.02657163, + "epoch": 0.7943484142492109, + "flos": 25927751548800.0, + "grad_norm": 1.6321315914434682, + "language_loss": 0.7242496, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74814069, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.1449585, + "step": 13212, + "time_per_iteration": 2.771155595779419 + }, + { + "auxiliary_loss_clip": 0.01335738, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.23082685, + "balance_loss_mlp": 1.0175519, + "epoch": 0.7944085375018789, + "flos": 23920642721160.0, + "grad_norm": 1.611348518199626, + "language_loss": 0.80595094, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82960761, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12384033, + "step": 13213, + "time_per_iteration": 2.723644256591797 + }, + { + "auxiliary_loss_clip": 0.01353361, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.24078977, + "balance_loss_mlp": 1.01959944, + "epoch": 0.7944686607545468, + "flos": 20234793867480.0, + "grad_norm": 2.153688345628309, + "language_loss": 0.67803639, + "learning_rate": 4.268948502428327e-07, + "loss": 0.7019074, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14141846, + "step": 13214, + "time_per_iteration": 2.7511236667633057 + }, + { + "auxiliary_loss_clip": 0.01334217, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.22820091, + "balance_loss_mlp": 1.02066183, + "epoch": 0.7945287840072148, + "flos": 21986026462080.0, + "grad_norm": 1.7611841104207846, + "language_loss": 0.72497666, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.7486521, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12664795, + "step": 13215, + "time_per_iteration": 2.7094857692718506 + }, + { + "auxiliary_loss_clip": 0.01339731, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.2333169, + "balance_loss_mlp": 1.02218688, + "epoch": 0.7945889072598827, + "flos": 26403582597480.0, + "grad_norm": 1.5563486381837446, + "language_loss": 0.78884542, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.81259513, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13049316, + "step": 13216, + "time_per_iteration": 4.218875408172607 + }, + { + "auxiliary_loss_clip": 0.01343836, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.23434079, + "balance_loss_mlp": 1.02062535, + "epoch": 0.7946490305125508, + "flos": 25815560985360.0, + "grad_norm": 1.5578189864453487, + "language_loss": 0.73883957, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76261491, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13079834, + "step": 13217, + "time_per_iteration": 2.847357749938965 + }, + { + "auxiliary_loss_clip": 0.01336185, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.23044133, + "balance_loss_mlp": 1.02070117, + "epoch": 0.7947091537652187, + "flos": 15965227887840.0, + "grad_norm": 1.714755208423164, + "language_loss": 0.73711419, + "learning_rate": 4.259333208810907e-07, + "loss": 0.760813, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12994385, + "step": 13218, + "time_per_iteration": 2.7019407749176025 + }, + { + "auxiliary_loss_clip": 0.01343795, + "auxiliary_loss_mlp": 0.01033017, + "balance_loss_clip": 1.23245251, + "balance_loss_mlp": 1.01901019, + "epoch": 0.7947692770178867, + "flos": 18592462559160.0, + "grad_norm": 1.8428439291580978, + "language_loss": 0.83480495, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85857308, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13989258, + "step": 13219, + "time_per_iteration": 2.7653415203094482 + }, + { + "auxiliary_loss_clip": 0.01356004, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.24327683, + "balance_loss_mlp": 1.01914799, + "epoch": 0.7948294002705546, + "flos": 20446220927520.0, + "grad_norm": 1.9400325671017977, + "language_loss": 0.759022, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.78292048, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14691162, + "step": 13220, + "time_per_iteration": 2.783214807510376 + }, + { + "auxiliary_loss_clip": 0.0134408, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.23202205, + "balance_loss_mlp": 1.02082002, + "epoch": 0.7948895235232226, + "flos": 38188410129000.0, + "grad_norm": 5.45898696229778, + "language_loss": 0.72810215, + "learning_rate": 4.252128005599176e-07, + "loss": 0.75188267, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13146973, + "step": 13221, + "time_per_iteration": 2.890930652618408 + }, + { + "auxiliary_loss_clip": 0.01337982, + "auxiliary_loss_mlp": 0.01030949, + "balance_loss_clip": 1.23129654, + "balance_loss_mlp": 1.0179615, + "epoch": 0.7949496467758905, + "flos": 15564041650800.0, + "grad_norm": 1.8173846007507788, + "language_loss": 0.74550301, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76919234, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.13000488, + "step": 13222, + "time_per_iteration": 2.73525333404541 + }, + { + "auxiliary_loss_clip": 0.01152419, + "auxiliary_loss_mlp": 0.01000945, + "balance_loss_clip": 1.10919595, + "balance_loss_mlp": 0.99815571, + "epoch": 0.7950097700285585, + "flos": 70911736022520.0, + "grad_norm": 0.7670700263343653, + "language_loss": 0.67077523, + "learning_rate": 4.247327522443993e-07, + "loss": 0.6923089, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.0279541, + "step": 13223, + "time_per_iteration": 3.1659085750579834 + }, + { + "auxiliary_loss_clip": 0.01341637, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.23272216, + "balance_loss_mlp": 1.01889193, + "epoch": 0.7950698932812266, + "flos": 23956970222160.0, + "grad_norm": 1.7069829753978183, + "language_loss": 0.71228004, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73601937, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13409424, + "step": 13224, + "time_per_iteration": 2.82784104347229 + }, + { + "auxiliary_loss_clip": 0.01151718, + "auxiliary_loss_mlp": 0.01002694, + "balance_loss_clip": 1.10866368, + "balance_loss_mlp": 0.99999982, + "epoch": 0.7951300165338945, + "flos": 60296251335840.0, + "grad_norm": 0.6833848779089504, + "language_loss": 0.5503341, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57187819, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02697754, + "step": 13225, + "time_per_iteration": 3.284771680831909 + }, + { + "auxiliary_loss_clip": 0.0133128, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.22630596, + "balance_loss_mlp": 1.01504982, + "epoch": 0.7951901397865625, + "flos": 22823995486680.0, + "grad_norm": 1.8648078859069608, + "language_loss": 0.65328276, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.6768719, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12579346, + "step": 13226, + "time_per_iteration": 3.06657075881958 + }, + { + "auxiliary_loss_clip": 0.01339696, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.23124492, + "balance_loss_mlp": 1.02369952, + "epoch": 0.7952502630392304, + "flos": 35701977933720.0, + "grad_norm": 2.4918648448248226, + "language_loss": 0.70149326, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72525907, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13171387, + "step": 13227, + "time_per_iteration": 2.980404853820801 + }, + { + "auxiliary_loss_clip": 0.01338102, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.2322377, + "balance_loss_mlp": 1.019292, + "epoch": 0.7953103862918984, + "flos": 25635385381320.0, + "grad_norm": 1.7138274480843154, + "language_loss": 0.70049095, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.72418821, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12322998, + "step": 13228, + "time_per_iteration": 2.890923261642456 + }, + { + "auxiliary_loss_clip": 0.0134326, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.23350322, + "balance_loss_mlp": 1.02483165, + "epoch": 0.7953705095445663, + "flos": 40559403092040.0, + "grad_norm": 1.318591726873926, + "language_loss": 0.70824659, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73206306, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13562012, + "step": 13229, + "time_per_iteration": 2.9786651134490967 + }, + { + "auxiliary_loss_clip": 0.01353541, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.24074733, + "balance_loss_mlp": 1.02259314, + "epoch": 0.7954306327972344, + "flos": 27642494208960.0, + "grad_norm": 1.804921672425972, + "language_loss": 0.72146058, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74535561, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 1.12841797, + "router_z_loss_mlp": 0.13378906, + "step": 13230, + "time_per_iteration": 2.9662580490112305 + }, + { + "auxiliary_loss_clip": 0.01153099, + "auxiliary_loss_mlp": 0.01003541, + "balance_loss_clip": 1.10997438, + "balance_loss_mlp": 1.00076389, + "epoch": 0.7954907560499023, + "flos": 59520802204440.0, + "grad_norm": 0.8970397097039712, + "language_loss": 0.63585275, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65741915, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02783203, + "step": 13231, + "time_per_iteration": 3.2983474731445312 + }, + { + "auxiliary_loss_clip": 0.01336978, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.22969127, + "balance_loss_mlp": 1.01786113, + "epoch": 0.7955508793025703, + "flos": 20125364672520.0, + "grad_norm": 1.6302133423691065, + "language_loss": 0.69739223, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72107178, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13110352, + "step": 13232, + "time_per_iteration": 2.747380495071411 + }, + { + "auxiliary_loss_clip": 0.01338809, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.2306577, + "balance_loss_mlp": 1.01748168, + "epoch": 0.7956110025552382, + "flos": 26511021982800.0, + "grad_norm": 1.5191455067403181, + "language_loss": 0.78226829, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80597097, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13989258, + "step": 13233, + "time_per_iteration": 2.7850501537323 + }, + { + "auxiliary_loss_clip": 0.01341594, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.23215079, + "balance_loss_mlp": 1.01969361, + "epoch": 0.7956711258079062, + "flos": 22570677580320.0, + "grad_norm": 1.9813210285131047, + "language_loss": 0.78834569, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81209028, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.1315918, + "step": 13234, + "time_per_iteration": 2.7818984985351562 + }, + { + "auxiliary_loss_clip": 0.01344192, + "auxiliary_loss_mlp": 0.01032998, + "balance_loss_clip": 1.23507154, + "balance_loss_mlp": 1.02055871, + "epoch": 0.7957312490605741, + "flos": 17382812594040.0, + "grad_norm": 1.6209444947853644, + "language_loss": 0.69795001, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72172195, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12438965, + "step": 13235, + "time_per_iteration": 2.707310914993286 + }, + { + "auxiliary_loss_clip": 0.01342463, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.2327168, + "balance_loss_mlp": 1.01951659, + "epoch": 0.7957913723132422, + "flos": 22496479460640.0, + "grad_norm": 1.4412628787645907, + "language_loss": 0.67858803, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70234501, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13720703, + "step": 13236, + "time_per_iteration": 2.744706630706787 + }, + { + "auxiliary_loss_clip": 0.01335808, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.22906935, + "balance_loss_mlp": 1.01719284, + "epoch": 0.7958514955659101, + "flos": 22643251365600.0, + "grad_norm": 1.60609932359835, + "language_loss": 0.75174022, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77539772, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12750244, + "step": 13237, + "time_per_iteration": 2.762180805206299 + }, + { + "auxiliary_loss_clip": 0.01338142, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.22960556, + "balance_loss_mlp": 1.02190113, + "epoch": 0.7959116188185781, + "flos": 20709122406840.0, + "grad_norm": 1.7955893090685842, + "language_loss": 0.71525979, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73900282, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.14251709, + "step": 13238, + "time_per_iteration": 2.8489420413970947 + }, + { + "auxiliary_loss_clip": 0.01343073, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.2335192, + "balance_loss_mlp": 1.01732063, + "epoch": 0.7959717420712461, + "flos": 19029204738360.0, + "grad_norm": 1.7496563543332426, + "language_loss": 0.73892891, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.76266229, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.1295166, + "step": 13239, + "time_per_iteration": 2.8343615531921387 + }, + { + "auxiliary_loss_clip": 0.0134615, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.2350558, + "balance_loss_mlp": 1.01780057, + "epoch": 0.796031865323914, + "flos": 26361894792960.0, + "grad_norm": 1.6699959134018922, + "language_loss": 0.69513768, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71890765, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13031006, + "step": 13240, + "time_per_iteration": 2.8400535583496094 + }, + { + "auxiliary_loss_clip": 0.01152364, + "auxiliary_loss_mlp": 0.01005503, + "balance_loss_clip": 1.1095736, + "balance_loss_mlp": 1.0027138, + "epoch": 0.796091988576582, + "flos": 62083242914400.0, + "grad_norm": 0.889084403290044, + "language_loss": 0.5874567, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60903537, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.0279541, + "step": 13241, + "time_per_iteration": 3.09786057472229 + }, + { + "auxiliary_loss_clip": 0.01341944, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.23383808, + "balance_loss_mlp": 1.02165723, + "epoch": 0.7961521118292499, + "flos": 39027394362600.0, + "grad_norm": 1.8020698731655507, + "language_loss": 0.64845169, + "learning_rate": 4.201842205128772e-07, + "loss": 0.67221212, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12457275, + "step": 13242, + "time_per_iteration": 2.860323905944824 + }, + { + "auxiliary_loss_clip": 0.01339652, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.22999275, + "balance_loss_mlp": 1.02156973, + "epoch": 0.796212235081918, + "flos": 21767939631000.0, + "grad_norm": 1.7908523643388021, + "language_loss": 0.76184499, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78558928, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13208008, + "step": 13243, + "time_per_iteration": 2.772313356399536 + }, + { + "auxiliary_loss_clip": 0.01342541, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.23316884, + "balance_loss_mlp": 1.01888561, + "epoch": 0.7962723583345859, + "flos": 21183857029800.0, + "grad_norm": 2.0381816231240486, + "language_loss": 0.7975812, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.82133222, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13684082, + "step": 13244, + "time_per_iteration": 2.766425132751465 + }, + { + "auxiliary_loss_clip": 0.01346349, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.23419642, + "balance_loss_mlp": 1.01864147, + "epoch": 0.7963324815872539, + "flos": 17133068223360.0, + "grad_norm": 2.015538475540641, + "language_loss": 0.68524241, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70903623, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.14385986, + "step": 13245, + "time_per_iteration": 2.7543187141418457 + }, + { + "auxiliary_loss_clip": 0.01342291, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.2331295, + "balance_loss_mlp": 1.02380431, + "epoch": 0.7963926048399218, + "flos": 21402471769560.0, + "grad_norm": 1.4558898751744747, + "language_loss": 0.79309833, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81689012, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13085938, + "step": 13246, + "time_per_iteration": 4.236014127731323 + }, + { + "auxiliary_loss_clip": 0.01345312, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.23363519, + "balance_loss_mlp": 1.01756716, + "epoch": 0.7964527280925898, + "flos": 25188653545560.0, + "grad_norm": 1.785488445843698, + "language_loss": 0.66494411, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68870771, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13476562, + "step": 13247, + "time_per_iteration": 4.190492391586304 + }, + { + "auxiliary_loss_clip": 0.01333868, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.22828937, + "balance_loss_mlp": 1.01787734, + "epoch": 0.7965128513452577, + "flos": 27021556198080.0, + "grad_norm": 1.9180482435953201, + "language_loss": 0.72166884, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.74530941, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12304688, + "step": 13248, + "time_per_iteration": 4.3198559284210205 + }, + { + "auxiliary_loss_clip": 0.01351411, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.23925614, + "balance_loss_mlp": 1.01857662, + "epoch": 0.7965729745979258, + "flos": 24424151690160.0, + "grad_norm": 2.4727350366252896, + "language_loss": 0.76302624, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78685975, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13336182, + "step": 13249, + "time_per_iteration": 2.812274932861328 + }, + { + "auxiliary_loss_clip": 0.01335469, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.22871935, + "balance_loss_mlp": 1.01771712, + "epoch": 0.7966330978505937, + "flos": 18844643431440.0, + "grad_norm": 2.1267229616226304, + "language_loss": 0.61521691, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63887334, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12469482, + "step": 13250, + "time_per_iteration": 2.7919821739196777 + }, + { + "auxiliary_loss_clip": 0.01338843, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.23037124, + "balance_loss_mlp": 1.01625872, + "epoch": 0.7966932211032617, + "flos": 13156315103160.0, + "grad_norm": 2.1060750282203333, + "language_loss": 0.72515142, + "learning_rate": 4.180371972938206e-07, + "loss": 0.74883211, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12982178, + "step": 13251, + "time_per_iteration": 2.7499215602874756 + }, + { + "auxiliary_loss_clip": 0.01348101, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.23642206, + "balance_loss_mlp": 1.01884794, + "epoch": 0.7967533443559297, + "flos": 23954858587440.0, + "grad_norm": 2.9678981067737586, + "language_loss": 0.73270428, + "learning_rate": 4.177989389787624e-07, + "loss": 0.7565186, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14477539, + "step": 13252, + "time_per_iteration": 2.769763708114624 + }, + { + "auxiliary_loss_clip": 0.01332083, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.22605705, + "balance_loss_mlp": 1.01951444, + "epoch": 0.7968134676085976, + "flos": 30374406897120.0, + "grad_norm": 1.7498212413336969, + "language_loss": 0.66589177, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68953353, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12585449, + "step": 13253, + "time_per_iteration": 2.7945399284362793 + }, + { + "auxiliary_loss_clip": 0.01347112, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.23709416, + "balance_loss_mlp": 1.02417254, + "epoch": 0.7968735908612656, + "flos": 23080155978240.0, + "grad_norm": 1.6101830393750907, + "language_loss": 0.6797539, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.70360738, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.140625, + "step": 13254, + "time_per_iteration": 2.742330312728882 + }, + { + "auxiliary_loss_clip": 0.01336401, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.2286948, + "balance_loss_mlp": 1.02435803, + "epoch": 0.7969337141139335, + "flos": 23586954224400.0, + "grad_norm": 2.0789951114637373, + "language_loss": 0.69496262, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71870017, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12982178, + "step": 13255, + "time_per_iteration": 4.31689715385437 + }, + { + "auxiliary_loss_clip": 0.01330891, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.22426248, + "balance_loss_mlp": 1.02042508, + "epoch": 0.7969938373666016, + "flos": 19760668369920.0, + "grad_norm": 1.7617837421009142, + "language_loss": 0.79193044, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81556976, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12634277, + "step": 13256, + "time_per_iteration": 2.6850123405456543 + }, + { + "auxiliary_loss_clip": 0.01342151, + "auxiliary_loss_mlp": 0.0102742, + "balance_loss_clip": 1.23319077, + "balance_loss_mlp": 1.01413417, + "epoch": 0.7970539606192695, + "flos": 24139460502720.0, + "grad_norm": 1.643184694317934, + "language_loss": 0.66054165, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68423736, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13299561, + "step": 13257, + "time_per_iteration": 2.7639379501342773 + }, + { + "auxiliary_loss_clip": 0.01352862, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.23945308, + "balance_loss_mlp": 1.01829672, + "epoch": 0.7971140838719375, + "flos": 17973270707760.0, + "grad_norm": 1.8696018225634299, + "language_loss": 0.72526085, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74910486, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13250732, + "step": 13258, + "time_per_iteration": 2.6762101650238037 + }, + { + "auxiliary_loss_clip": 0.01342605, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.23238528, + "balance_loss_mlp": 1.01579928, + "epoch": 0.7971742071246054, + "flos": 19173946225320.0, + "grad_norm": 1.6278760597383626, + "language_loss": 0.69181091, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.71553445, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13952637, + "step": 13259, + "time_per_iteration": 2.731837034225464 + }, + { + "auxiliary_loss_clip": 0.01334062, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.22871017, + "balance_loss_mlp": 1.01604617, + "epoch": 0.7972343303772734, + "flos": 27131716343520.0, + "grad_norm": 1.6409885600356842, + "language_loss": 0.73577762, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75940615, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12750244, + "step": 13260, + "time_per_iteration": 2.7585184574127197 + }, + { + "auxiliary_loss_clip": 0.01331267, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.22494125, + "balance_loss_mlp": 1.01892209, + "epoch": 0.7972944536299413, + "flos": 21001651007760.0, + "grad_norm": 1.7311107724456871, + "language_loss": 0.78909868, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.81272894, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12841797, + "step": 13261, + "time_per_iteration": 2.8071835041046143 + }, + { + "auxiliary_loss_clip": 0.01323473, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.22140861, + "balance_loss_mlp": 1.01971078, + "epoch": 0.7973545768826094, + "flos": 21585043266840.0, + "grad_norm": 1.5597252864521776, + "language_loss": 0.76123512, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78477961, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.11273193, + "step": 13262, + "time_per_iteration": 2.8063409328460693 + }, + { + "auxiliary_loss_clip": 0.01351543, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.23926854, + "balance_loss_mlp": 1.01848125, + "epoch": 0.7974147001352773, + "flos": 20563446927600.0, + "grad_norm": 2.3169294116683092, + "language_loss": 0.70935959, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.73320162, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.1418457, + "step": 13263, + "time_per_iteration": 2.772036552429199 + }, + { + "auxiliary_loss_clip": 0.01351587, + "auxiliary_loss_mlp": 0.010391, + "balance_loss_clip": 1.23704958, + "balance_loss_mlp": 1.02495039, + "epoch": 0.7974748233879453, + "flos": 21001813441200.0, + "grad_norm": 1.6757708835745002, + "language_loss": 0.7161603, + "learning_rate": 4.149445215631153e-07, + "loss": 0.74006724, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 1.14501953, + "router_z_loss_mlp": 0.14135742, + "step": 13264, + "time_per_iteration": 2.7674057483673096 + }, + { + "auxiliary_loss_clip": 0.01332471, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.22719753, + "balance_loss_mlp": 1.02090549, + "epoch": 0.7975349466406133, + "flos": 22570352713440.0, + "grad_norm": 1.6429194248762777, + "language_loss": 0.77312732, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79678309, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12213135, + "step": 13265, + "time_per_iteration": 2.7091426849365234 + }, + { + "auxiliary_loss_clip": 0.01344869, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.23440075, + "balance_loss_mlp": 1.01629376, + "epoch": 0.7975950698932812, + "flos": 21694594286880.0, + "grad_norm": 1.7930987723723664, + "language_loss": 0.75415689, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77790105, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13238525, + "step": 13266, + "time_per_iteration": 2.8110544681549072 + }, + { + "auxiliary_loss_clip": 0.013363, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.22869563, + "balance_loss_mlp": 1.01602709, + "epoch": 0.7976551931459492, + "flos": 19608657986520.0, + "grad_norm": 1.9687793870860273, + "language_loss": 0.83835804, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86200726, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12597656, + "step": 13267, + "time_per_iteration": 2.7173471450805664 + }, + { + "auxiliary_loss_clip": 0.01337161, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.22989869, + "balance_loss_mlp": 1.01907516, + "epoch": 0.7977153163986171, + "flos": 21692401435440.0, + "grad_norm": 1.4754361017825581, + "language_loss": 0.76350212, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78719896, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13452148, + "step": 13268, + "time_per_iteration": 2.800755500793457 + }, + { + "auxiliary_loss_clip": 0.01340205, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.23279345, + "balance_loss_mlp": 1.01483881, + "epoch": 0.7977754396512852, + "flos": 23481991949040.0, + "grad_norm": 1.5351079548235251, + "language_loss": 0.77778435, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80146706, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13238525, + "step": 13269, + "time_per_iteration": 2.787513494491577 + }, + { + "auxiliary_loss_clip": 0.01335404, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.23067427, + "balance_loss_mlp": 1.01899815, + "epoch": 0.7978355629039531, + "flos": 22387293915840.0, + "grad_norm": 1.6552170556797259, + "language_loss": 0.8203367, + "learning_rate": 4.135205575764922e-07, + "loss": 0.8440029, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12219238, + "step": 13270, + "time_per_iteration": 2.791353464126587 + }, + { + "auxiliary_loss_clip": 0.01340014, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.2318747, + "balance_loss_mlp": 1.01790333, + "epoch": 0.7978956861566211, + "flos": 20271161976840.0, + "grad_norm": 1.7010029192555347, + "language_loss": 0.59858769, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.62230176, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13476562, + "step": 13271, + "time_per_iteration": 2.7701637744903564 + }, + { + "auxiliary_loss_clip": 0.01350307, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.23765779, + "balance_loss_mlp": 1.0201602, + "epoch": 0.797955809409289, + "flos": 28118487691080.0, + "grad_norm": 1.3944177940210292, + "language_loss": 0.73986268, + "learning_rate": 4.130463840939975e-07, + "loss": 0.7636956, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12854004, + "step": 13272, + "time_per_iteration": 2.8775370121002197 + }, + { + "auxiliary_loss_clip": 0.01335062, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.22839141, + "balance_loss_mlp": 1.01905441, + "epoch": 0.798015932661957, + "flos": 15563798000640.0, + "grad_norm": 1.869973733154601, + "language_loss": 0.71743339, + "learning_rate": 4.128093876144161e-07, + "loss": 0.74110198, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12744141, + "step": 13273, + "time_per_iteration": 2.7560956478118896 + }, + { + "auxiliary_loss_clip": 0.01345831, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.23518336, + "balance_loss_mlp": 1.01901126, + "epoch": 0.7980760559146249, + "flos": 23956482921840.0, + "grad_norm": 1.835141425851657, + "language_loss": 0.75925267, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.78303427, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13317871, + "step": 13274, + "time_per_iteration": 2.849247932434082 + }, + { + "auxiliary_loss_clip": 0.01332238, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.22854352, + "balance_loss_mlp": 1.01664281, + "epoch": 0.798136179167293, + "flos": 28043964704520.0, + "grad_norm": 1.6159826595583895, + "language_loss": 0.77863657, + "learning_rate": 4.12335575223518e-07, + "loss": 0.8022452, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.11962891, + "step": 13275, + "time_per_iteration": 2.9273879528045654 + }, + { + "auxiliary_loss_clip": 0.01347121, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.23623466, + "balance_loss_mlp": 1.02267683, + "epoch": 0.7981963024199609, + "flos": 35990486307000.0, + "grad_norm": 2.0307181587754686, + "language_loss": 0.64834791, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.67217821, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13232422, + "step": 13276, + "time_per_iteration": 2.8289551734924316 + }, + { + "auxiliary_loss_clip": 0.01334891, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.22992051, + "balance_loss_mlp": 1.01941037, + "epoch": 0.7982564256726289, + "flos": 25890449447160.0, + "grad_norm": 1.5572378471967088, + "language_loss": 0.6091454, + "learning_rate": 4.118620036501945e-07, + "loss": 0.6328131, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12475586, + "step": 13277, + "time_per_iteration": 2.822418689727783 + }, + { + "auxiliary_loss_clip": 0.01347919, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.23709846, + "balance_loss_mlp": 1.0186553, + "epoch": 0.7983165489252969, + "flos": 25744692751200.0, + "grad_norm": 1.9169423220251243, + "language_loss": 0.79465157, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81844425, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12695312, + "step": 13278, + "time_per_iteration": 2.738614082336426 + }, + { + "auxiliary_loss_clip": 0.01342627, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.23255181, + "balance_loss_mlp": 1.02303553, + "epoch": 0.7983766721779648, + "flos": 21913249635000.0, + "grad_norm": 2.3327154835463664, + "language_loss": 0.63600528, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65980405, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.14227295, + "step": 13279, + "time_per_iteration": 2.7303011417388916 + }, + { + "auxiliary_loss_clip": 0.01324122, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.22162044, + "balance_loss_mlp": 1.01555204, + "epoch": 0.7984367954306328, + "flos": 29352932382960.0, + "grad_norm": 1.991791152214885, + "language_loss": 0.7155543, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73907828, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.1270752, + "step": 13280, + "time_per_iteration": 2.770108461380005 + }, + { + "auxiliary_loss_clip": 0.01348214, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.23707914, + "balance_loss_mlp": 1.02585018, + "epoch": 0.7984969186833007, + "flos": 31364020829880.0, + "grad_norm": 2.0144641095877196, + "language_loss": 0.62692028, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.65079337, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 1.11083984, + "router_z_loss_mlp": 0.13262939, + "step": 13281, + "time_per_iteration": 2.904160499572754 + }, + { + "auxiliary_loss_clip": 0.01348719, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.23612833, + "balance_loss_mlp": 1.0227809, + "epoch": 0.7985570419359688, + "flos": 24317849338920.0, + "grad_norm": 1.6653562609981007, + "language_loss": 0.80615371, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8300066, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13787842, + "step": 13282, + "time_per_iteration": 2.8195252418518066 + }, + { + "auxiliary_loss_clip": 0.01342027, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.23127723, + "balance_loss_mlp": 1.01726186, + "epoch": 0.7986171651886367, + "flos": 15746491323000.0, + "grad_norm": 1.9403631979746314, + "language_loss": 0.71567869, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73940462, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13299561, + "step": 13283, + "time_per_iteration": 2.8022141456604004 + }, + { + "auxiliary_loss_clip": 0.01337251, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.22972202, + "balance_loss_mlp": 1.02261066, + "epoch": 0.7986772884413047, + "flos": 11623128731280.0, + "grad_norm": 2.1557726772387835, + "language_loss": 0.74165159, + "learning_rate": 4.102064006186967e-07, + "loss": 0.76537061, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12042236, + "step": 13284, + "time_per_iteration": 2.74676251411438 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.23084223, + "balance_loss_mlp": 1.01939416, + "epoch": 0.7987374116939726, + "flos": 22096186607520.0, + "grad_norm": 1.736978372707182, + "language_loss": 0.70712727, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.73081893, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.11505127, + "step": 13285, + "time_per_iteration": 4.214546203613281 + }, + { + "auxiliary_loss_clip": 0.01341565, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.23343718, + "balance_loss_mlp": 1.02050638, + "epoch": 0.7987975349466406, + "flos": 17894686885200.0, + "grad_norm": 1.552353100754058, + "language_loss": 0.73894119, + "learning_rate": 4.097339136128437e-07, + "loss": 0.7626884, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12652588, + "step": 13286, + "time_per_iteration": 4.320472002029419 + }, + { + "auxiliary_loss_clip": 0.01337897, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.22966671, + "balance_loss_mlp": 1.02254224, + "epoch": 0.7988576581993085, + "flos": 19724016002040.0, + "grad_norm": 1.9298735377373064, + "language_loss": 0.75278366, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.7765218, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13378906, + "step": 13287, + "time_per_iteration": 4.3307390213012695 + }, + { + "auxiliary_loss_clip": 0.01335984, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.22955298, + "balance_loss_mlp": 1.01690078, + "epoch": 0.7989177814519766, + "flos": 28042015503240.0, + "grad_norm": 1.4143546097177775, + "language_loss": 0.62145197, + "learning_rate": 4.092616678191863e-07, + "loss": 0.6451059, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12506104, + "step": 13288, + "time_per_iteration": 2.80926775932312 + }, + { + "auxiliary_loss_clip": 0.01333612, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.22814286, + "balance_loss_mlp": 1.02370226, + "epoch": 0.7989779047046445, + "flos": 28876207950360.0, + "grad_norm": 2.8423953278386627, + "language_loss": 0.70595872, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72965717, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.12536621, + "step": 13289, + "time_per_iteration": 2.7982683181762695 + }, + { + "auxiliary_loss_clip": 0.01327536, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.22438693, + "balance_loss_mlp": 1.01852155, + "epoch": 0.7990380279573125, + "flos": 18191479363920.0, + "grad_norm": 2.2457742654452093, + "language_loss": 0.62845969, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.65205026, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.13006592, + "step": 13290, + "time_per_iteration": 2.7108991146087646 + }, + { + "auxiliary_loss_clip": 0.01338839, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.22956228, + "balance_loss_mlp": 1.01857305, + "epoch": 0.7990981512099805, + "flos": 20883978315720.0, + "grad_norm": 1.82371678414729, + "language_loss": 0.71781492, + "learning_rate": 4.08553751558248e-07, + "loss": 0.74152684, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13800049, + "step": 13291, + "time_per_iteration": 2.7940289974212646 + }, + { + "auxiliary_loss_clip": 0.01329962, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.22386563, + "balance_loss_mlp": 1.01990485, + "epoch": 0.7991582744626484, + "flos": 26104962742560.0, + "grad_norm": 1.5325060927603462, + "language_loss": 0.63600862, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65962899, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12164307, + "step": 13292, + "time_per_iteration": 2.7817981243133545 + }, + { + "auxiliary_loss_clip": 0.01336392, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.22926843, + "balance_loss_mlp": 1.02078569, + "epoch": 0.7992183977153164, + "flos": 35302294206000.0, + "grad_norm": 1.6107350996283907, + "language_loss": 0.56013548, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58383077, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12365723, + "step": 13293, + "time_per_iteration": 4.357420921325684 + }, + { + "auxiliary_loss_clip": 0.01333464, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.2263391, + "balance_loss_mlp": 1.01744795, + "epoch": 0.7992785209679844, + "flos": 51861130678440.0, + "grad_norm": 2.6492021711445988, + "language_loss": 0.71952659, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.74316937, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13360596, + "step": 13294, + "time_per_iteration": 3.0751993656158447 + }, + { + "auxiliary_loss_clip": 0.0134103, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.23249388, + "balance_loss_mlp": 1.01987863, + "epoch": 0.7993386442206524, + "flos": 22570271496720.0, + "grad_norm": 1.780090841074748, + "language_loss": 0.72599208, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74972659, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12548828, + "step": 13295, + "time_per_iteration": 2.7102761268615723 + }, + { + "auxiliary_loss_clip": 0.01332593, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.22753167, + "balance_loss_mlp": 1.02256382, + "epoch": 0.7993987674733203, + "flos": 18804052052640.0, + "grad_norm": 1.8956191314298336, + "language_loss": 0.76342422, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78709775, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12188721, + "step": 13296, + "time_per_iteration": 2.8323447704315186 + }, + { + "auxiliary_loss_clip": 0.01154687, + "auxiliary_loss_mlp": 0.01008838, + "balance_loss_clip": 1.10997295, + "balance_loss_mlp": 1.00608432, + "epoch": 0.7994588907259883, + "flos": 69438917300760.0, + "grad_norm": 0.6927714090403915, + "language_loss": 0.6082195, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62985468, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.02758789, + "step": 13297, + "time_per_iteration": 3.2849488258361816 + }, + { + "auxiliary_loss_clip": 0.01331033, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.22501314, + "balance_loss_mlp": 1.02202654, + "epoch": 0.7995190139786562, + "flos": 13484318429520.0, + "grad_norm": 2.00882122470604, + "language_loss": 0.70352745, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72718078, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.1227417, + "step": 13298, + "time_per_iteration": 2.7588634490966797 + }, + { + "auxiliary_loss_clip": 0.01344409, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.23290515, + "balance_loss_mlp": 1.01634836, + "epoch": 0.7995791372313242, + "flos": 21657860702280.0, + "grad_norm": 1.9998642459797042, + "language_loss": 0.75717509, + "learning_rate": 4.066686308212037e-07, + "loss": 0.78092945, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14678955, + "step": 13299, + "time_per_iteration": 2.722342014312744 + }, + { + "auxiliary_loss_clip": 0.0133144, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.22689211, + "balance_loss_mlp": 1.01854908, + "epoch": 0.7996392604839921, + "flos": 26073792503280.0, + "grad_norm": 1.8057041876896611, + "language_loss": 0.78030634, + "learning_rate": 4.064332625220828e-07, + "loss": 0.80393052, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12445068, + "step": 13300, + "time_per_iteration": 2.8013179302215576 + }, + { + "auxiliary_loss_clip": 0.01344797, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.23387814, + "balance_loss_mlp": 1.01909184, + "epoch": 0.7996993837366602, + "flos": 24612124099320.0, + "grad_norm": 1.7271061853989362, + "language_loss": 0.63714439, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.66091371, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.1305542, + "step": 13301, + "time_per_iteration": 2.7671799659729004 + }, + { + "auxiliary_loss_clip": 0.01329899, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.22455883, + "balance_loss_mlp": 1.024647, + "epoch": 0.7997595069893281, + "flos": 20996453137680.0, + "grad_norm": 1.7004010815997603, + "language_loss": 0.71952367, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74320203, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.13287354, + "step": 13302, + "time_per_iteration": 2.8031976222991943 + }, + { + "auxiliary_loss_clip": 0.01342211, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.23231411, + "balance_loss_mlp": 1.01804006, + "epoch": 0.7998196302419961, + "flos": 24431989103640.0, + "grad_norm": 1.7604595566438308, + "language_loss": 0.83922935, + "learning_rate": 4.057275202296684e-07, + "loss": 0.86296356, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13165283, + "step": 13303, + "time_per_iteration": 2.83172607421875 + }, + { + "auxiliary_loss_clip": 0.0132867, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.22389579, + "balance_loss_mlp": 1.01718831, + "epoch": 0.7998797534946641, + "flos": 30270947131080.0, + "grad_norm": 1.7088539279338664, + "language_loss": 0.59025943, + "learning_rate": 4.054923936969166e-07, + "loss": 0.61383796, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11999512, + "step": 13304, + "time_per_iteration": 2.837597370147705 + }, + { + "auxiliary_loss_clip": 0.01339058, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.22799623, + "balance_loss_mlp": 1.01922107, + "epoch": 0.799939876747332, + "flos": 23519294050680.0, + "grad_norm": 2.037713587236292, + "language_loss": 0.69303286, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71674573, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13024902, + "step": 13305, + "time_per_iteration": 2.8043220043182373 + }, + { + "auxiliary_loss_clip": 0.01325833, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.2209667, + "balance_loss_mlp": 1.01644266, + "epoch": 0.8, + "flos": 19322870373360.0, + "grad_norm": 1.4711539654072125, + "language_loss": 0.6957981, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71934116, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12023926, + "step": 13306, + "time_per_iteration": 2.730687379837036 + }, + { + "auxiliary_loss_clip": 0.0133942, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.23033023, + "balance_loss_mlp": 1.0200913, + "epoch": 0.800060123252668, + "flos": 32418371134440.0, + "grad_norm": 1.3687998851918755, + "language_loss": 0.69639903, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.72012019, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12615967, + "step": 13307, + "time_per_iteration": 2.9612574577331543 + }, + { + "auxiliary_loss_clip": 0.01340237, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.23138118, + "balance_loss_mlp": 1.02511263, + "epoch": 0.800120246505336, + "flos": 20015366960520.0, + "grad_norm": 1.8016565577119887, + "language_loss": 0.77385449, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79763889, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13098145, + "step": 13308, + "time_per_iteration": 2.761218309402466 + }, + { + "auxiliary_loss_clip": 0.01346127, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.23380601, + "balance_loss_mlp": 1.01629806, + "epoch": 0.8001803697580039, + "flos": 31874555045160.0, + "grad_norm": 1.4496211646001365, + "language_loss": 0.79192531, + "learning_rate": 4.0431766816972e-07, + "loss": 0.81569391, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14428711, + "step": 13309, + "time_per_iteration": 2.866518259048462 + }, + { + "auxiliary_loss_clip": 0.01150408, + "auxiliary_loss_mlp": 0.01001983, + "balance_loss_clip": 1.10558534, + "balance_loss_mlp": 0.99893111, + "epoch": 0.8002404930106719, + "flos": 63407235686040.0, + "grad_norm": 0.902879461936018, + "language_loss": 0.64693964, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66846353, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.03051758, + "step": 13310, + "time_per_iteration": 3.2252352237701416 + }, + { + "auxiliary_loss_clip": 0.01334248, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.22629833, + "balance_loss_mlp": 1.02516031, + "epoch": 0.8003006162633398, + "flos": 27861311990520.0, + "grad_norm": 1.9474506985890585, + "language_loss": 0.83055317, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85427606, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12884521, + "step": 13311, + "time_per_iteration": 2.828171491622925 + }, + { + "auxiliary_loss_clip": 0.01334422, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.22721314, + "balance_loss_mlp": 1.01583564, + "epoch": 0.8003607395160078, + "flos": 18227928690000.0, + "grad_norm": 1.942282823462757, + "language_loss": 0.66400802, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68763781, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1272583, + "step": 13312, + "time_per_iteration": 2.7571256160736084 + }, + { + "auxiliary_loss_clip": 0.01341454, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.23124194, + "balance_loss_mlp": 1.02059722, + "epoch": 0.8004208627686757, + "flos": 20891937554280.0, + "grad_norm": 1.8871591247275765, + "language_loss": 0.75690287, + "learning_rate": 4.033789768462843e-07, + "loss": 0.7806648, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14141846, + "step": 13313, + "time_per_iteration": 2.847407817840576 + }, + { + "auxiliary_loss_clip": 0.0133604, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.22836924, + "balance_loss_mlp": 1.01945913, + "epoch": 0.8004809860213438, + "flos": 26442265383360.0, + "grad_norm": 1.3294076887139452, + "language_loss": 0.76226419, + "learning_rate": 4.031444553532575e-07, + "loss": 0.78594685, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12762451, + "step": 13314, + "time_per_iteration": 2.8909428119659424 + }, + { + "auxiliary_loss_clip": 0.01151921, + "auxiliary_loss_mlp": 0.01007255, + "balance_loss_clip": 1.10662687, + "balance_loss_mlp": 1.00408363, + "epoch": 0.8005411092740117, + "flos": 63663964694640.0, + "grad_norm": 0.780733481858197, + "language_loss": 0.53788936, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55948114, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.03173828, + "step": 13315, + "time_per_iteration": 3.2226364612579346 + }, + { + "auxiliary_loss_clip": 0.01331446, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.22438884, + "balance_loss_mlp": 1.01638007, + "epoch": 0.8006012325266797, + "flos": 36145014408720.0, + "grad_norm": 2.845391281100484, + "language_loss": 0.71461582, + "learning_rate": 4.026755940348603e-07, + "loss": 0.7382232, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12902832, + "step": 13316, + "time_per_iteration": 3.0703723430633545 + }, + { + "auxiliary_loss_clip": 0.01341874, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.23060417, + "balance_loss_mlp": 1.01804256, + "epoch": 0.8006613557793477, + "flos": 33845539413600.0, + "grad_norm": 1.6046873514722746, + "language_loss": 0.64730179, + "learning_rate": 4.024412542272706e-07, + "loss": 0.67102998, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 1.11181641, + "router_z_loss_mlp": 0.12902832, + "step": 13317, + "time_per_iteration": 3.01936411857605 + }, + { + "auxiliary_loss_clip": 0.01152854, + "auxiliary_loss_mlp": 0.01005026, + "balance_loss_clip": 1.1073842, + "balance_loss_mlp": 1.00226021, + "epoch": 0.8007214790320156, + "flos": 67366056892320.0, + "grad_norm": 0.7885353350241202, + "language_loss": 0.59124041, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61281919, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.02770996, + "step": 13318, + "time_per_iteration": 3.3037869930267334 + }, + { + "auxiliary_loss_clip": 0.013344, + "auxiliary_loss_mlp": 0.01025043, + "balance_loss_clip": 1.22669339, + "balance_loss_mlp": 1.012097, + "epoch": 0.8007816022846836, + "flos": 23190925249080.0, + "grad_norm": 1.8435804992052884, + "language_loss": 0.6656127, + "learning_rate": 4.019727563597366e-07, + "loss": 0.6892072, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12969971, + "step": 13319, + "time_per_iteration": 2.8307595252990723 + }, + { + "auxiliary_loss_clip": 0.01338506, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.22793508, + "balance_loss_mlp": 1.02112627, + "epoch": 0.8008417255373516, + "flos": 21986473154040.0, + "grad_norm": 1.9733355189816864, + "language_loss": 0.74050021, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76423562, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13916016, + "step": 13320, + "time_per_iteration": 2.770458459854126 + }, + { + "auxiliary_loss_clip": 0.01338364, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.2281481, + "balance_loss_mlp": 1.01853633, + "epoch": 0.8009018487900196, + "flos": 16731719552880.0, + "grad_norm": 1.9892136476164695, + "language_loss": 0.80979586, + "learning_rate": 4.015045008816138e-07, + "loss": 0.83349895, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.1340332, + "step": 13321, + "time_per_iteration": 2.735373020172119 + }, + { + "auxiliary_loss_clip": 0.01327674, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.22344089, + "balance_loss_mlp": 1.01841068, + "epoch": 0.8009619720426875, + "flos": 20818510993440.0, + "grad_norm": 2.0143924283182493, + "language_loss": 0.65569383, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.67927694, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12225342, + "step": 13322, + "time_per_iteration": 2.7503392696380615 + }, + { + "auxiliary_loss_clip": 0.01337919, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.23009706, + "balance_loss_mlp": 1.01588726, + "epoch": 0.8010220952953555, + "flos": 17935603130880.0, + "grad_norm": 1.770343170557277, + "language_loss": 0.77831995, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80198944, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13146973, + "step": 13323, + "time_per_iteration": 4.136154651641846 + }, + { + "auxiliary_loss_clip": 0.01341686, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.23038816, + "balance_loss_mlp": 1.01749134, + "epoch": 0.8010822185480234, + "flos": 24577705191240.0, + "grad_norm": 2.777910064259111, + "language_loss": 0.71722519, + "learning_rate": 4.00802572299932e-07, + "loss": 0.74094903, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13201904, + "step": 13324, + "time_per_iteration": 4.319352388381958 + }, + { + "auxiliary_loss_clip": 0.01340508, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.22962093, + "balance_loss_mlp": 1.0223949, + "epoch": 0.8011423418006914, + "flos": 21834340945560.0, + "grad_norm": 1.7359007674320976, + "language_loss": 0.76367223, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78743958, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13842773, + "step": 13325, + "time_per_iteration": 4.199001789093018 + }, + { + "auxiliary_loss_clip": 0.01318286, + "auxiliary_loss_mlp": 0.01027253, + "balance_loss_clip": 1.21613193, + "balance_loss_mlp": 1.01561213, + "epoch": 0.8012024650533593, + "flos": 23920520896080.0, + "grad_norm": 1.5233745530939276, + "language_loss": 0.79879326, + "learning_rate": 4.003349231059898e-07, + "loss": 0.8222487, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.11657715, + "step": 13326, + "time_per_iteration": 2.779677152633667 + }, + { + "auxiliary_loss_clip": 0.0132405, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.21969724, + "balance_loss_mlp": 1.0221653, + "epoch": 0.8012625883060274, + "flos": 23592395744640.0, + "grad_norm": 1.8700429843627544, + "language_loss": 0.66284502, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68642855, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12133789, + "step": 13327, + "time_per_iteration": 2.7281394004821777 + }, + { + "auxiliary_loss_clip": 0.01324314, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.22157669, + "balance_loss_mlp": 1.02116263, + "epoch": 0.8013227115586953, + "flos": 20818876468680.0, + "grad_norm": 1.5278135725566446, + "language_loss": 0.73846352, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.76204228, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.12402344, + "step": 13328, + "time_per_iteration": 2.8553097248077393 + }, + { + "auxiliary_loss_clip": 0.01338594, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.22826111, + "balance_loss_mlp": 1.02019835, + "epoch": 0.8013828348113633, + "flos": 15892613494200.0, + "grad_norm": 4.201908395068151, + "language_loss": 0.74333823, + "learning_rate": 3.996339042831798e-07, + "loss": 0.76706231, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13616943, + "step": 13329, + "time_per_iteration": 2.7375168800354004 + }, + { + "auxiliary_loss_clip": 0.01153157, + "auxiliary_loss_mlp": 0.01007912, + "balance_loss_clip": 1.10748899, + "balance_loss_mlp": 1.00524175, + "epoch": 0.8014429580640313, + "flos": 71080314616800.0, + "grad_norm": 0.7055095222311769, + "language_loss": 0.52984393, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.55145466, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.0267334, + "step": 13330, + "time_per_iteration": 3.3666539192199707 + }, + { + "auxiliary_loss_clip": 0.01341618, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.22926247, + "balance_loss_mlp": 1.02115822, + "epoch": 0.8015030813166992, + "flos": 23081617879200.0, + "grad_norm": 3.176084571056016, + "language_loss": 0.73090881, + "learning_rate": 3.991668618167519e-07, + "loss": 0.75468397, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14727783, + "step": 13331, + "time_per_iteration": 2.8071885108947754 + }, + { + "auxiliary_loss_clip": 0.01328172, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.22171175, + "balance_loss_mlp": 1.01747036, + "epoch": 0.8015632045693672, + "flos": 21877450042680.0, + "grad_norm": 2.1149815066774993, + "language_loss": 0.77673012, + "learning_rate": 3.989334316347401e-07, + "loss": 0.80030835, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12176514, + "step": 13332, + "time_per_iteration": 4.3301990032196045 + }, + { + "auxiliary_loss_clip": 0.01340011, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.23129857, + "balance_loss_mlp": 1.0160712, + "epoch": 0.8016233278220352, + "flos": 23661639644400.0, + "grad_norm": 2.0229908488791097, + "language_loss": 0.83727825, + "learning_rate": 3.987000621653338e-07, + "loss": 0.86096644, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12744141, + "step": 13333, + "time_per_iteration": 2.802342653274536 + }, + { + "auxiliary_loss_clip": 0.01341421, + "auxiliary_loss_mlp": 0.01025844, + "balance_loss_clip": 1.23211217, + "balance_loss_mlp": 1.01252878, + "epoch": 0.8016834510747032, + "flos": 16257512838600.0, + "grad_norm": 1.588925300659274, + "language_loss": 0.73892337, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.76259601, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13311768, + "step": 13334, + "time_per_iteration": 2.8756473064422607 + }, + { + "auxiliary_loss_clip": 0.01329184, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.22481084, + "balance_loss_mlp": 1.0169934, + "epoch": 0.8017435743273711, + "flos": 12279825726120.0, + "grad_norm": 2.138817259205547, + "language_loss": 0.75090432, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.77449226, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.12628174, + "step": 13335, + "time_per_iteration": 2.8398277759552 + }, + { + "auxiliary_loss_clip": 0.01329559, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.22303569, + "balance_loss_mlp": 1.01813626, + "epoch": 0.8018036975800391, + "flos": 17199835013160.0, + "grad_norm": 1.6670182138542275, + "language_loss": 0.75433254, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77794385, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13433838, + "step": 13336, + "time_per_iteration": 2.8475687503814697 + }, + { + "auxiliary_loss_clip": 0.01348001, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.23399544, + "balance_loss_mlp": 1.01622844, + "epoch": 0.801863820832707, + "flos": 20636832880080.0, + "grad_norm": 2.1659755646111103, + "language_loss": 0.75353134, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77731723, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.14349365, + "step": 13337, + "time_per_iteration": 2.778174877166748 + }, + { + "auxiliary_loss_clip": 0.0134397, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.23289275, + "balance_loss_mlp": 1.02617908, + "epoch": 0.801923944085375, + "flos": 30451691252160.0, + "grad_norm": 2.806661609144833, + "language_loss": 0.80836326, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.83219928, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13458252, + "step": 13338, + "time_per_iteration": 2.848332643508911 + }, + { + "auxiliary_loss_clip": 0.01337496, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.22631955, + "balance_loss_mlp": 1.01779687, + "epoch": 0.801984067338043, + "flos": 20015326352160.0, + "grad_norm": 2.194387811434763, + "language_loss": 0.75274891, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.77643764, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13568115, + "step": 13339, + "time_per_iteration": 2.809049367904663 + }, + { + "auxiliary_loss_clip": 0.01327939, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.22208285, + "balance_loss_mlp": 1.01495445, + "epoch": 0.802044190590711, + "flos": 22789211103360.0, + "grad_norm": 1.5400667028284565, + "language_loss": 0.79282236, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81637716, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12597656, + "step": 13340, + "time_per_iteration": 2.7736268043518066 + }, + { + "auxiliary_loss_clip": 0.01334645, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.22677445, + "balance_loss_mlp": 1.01809776, + "epoch": 0.8021043138433789, + "flos": 27605923057800.0, + "grad_norm": 1.702263196223203, + "language_loss": 0.67625093, + "learning_rate": 3.968352931252936e-07, + "loss": 0.69990051, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12219238, + "step": 13341, + "time_per_iteration": 2.777524948120117 + }, + { + "auxiliary_loss_clip": 0.01150303, + "auxiliary_loss_mlp": 0.01004898, + "balance_loss_clip": 1.1053164, + "balance_loss_mlp": 1.0021801, + "epoch": 0.8021644370960469, + "flos": 62076786185160.0, + "grad_norm": 0.9070543859334272, + "language_loss": 0.61644179, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63799381, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02722168, + "step": 13342, + "time_per_iteration": 3.2002902030944824 + }, + { + "auxiliary_loss_clip": 0.0133572, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.22762406, + "balance_loss_mlp": 1.01860559, + "epoch": 0.8022245603487148, + "flos": 23366268458280.0, + "grad_norm": 1.8970844978159611, + "language_loss": 0.64091259, + "learning_rate": 3.963697086102522e-07, + "loss": 0.66459084, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13494873, + "step": 13343, + "time_per_iteration": 2.7791898250579834 + }, + { + "auxiliary_loss_clip": 0.01319629, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.21685505, + "balance_loss_mlp": 1.01277399, + "epoch": 0.8022846836013828, + "flos": 10857002541480.0, + "grad_norm": 1.790153121154904, + "language_loss": 0.69463539, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71807939, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.11993408, + "step": 13344, + "time_per_iteration": 2.7347400188446045 + }, + { + "auxiliary_loss_clip": 0.01336614, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.22781014, + "balance_loss_mlp": 1.01720977, + "epoch": 0.8023448068540509, + "flos": 29246264556480.0, + "grad_norm": 1.4502035155461264, + "language_loss": 0.70252424, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72619617, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13360596, + "step": 13345, + "time_per_iteration": 2.8592021465301514 + }, + { + "auxiliary_loss_clip": 0.01151229, + "auxiliary_loss_mlp": 0.0100396, + "balance_loss_clip": 1.10570621, + "balance_loss_mlp": 1.00101554, + "epoch": 0.8024049301067188, + "flos": 64168245222480.0, + "grad_norm": 0.895006564785637, + "language_loss": 0.63069999, + "learning_rate": 3.956717879334059e-07, + "loss": 0.6522519, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02941895, + "step": 13346, + "time_per_iteration": 3.2560219764709473 + }, + { + "auxiliary_loss_clip": 0.01328697, + "auxiliary_loss_mlp": 0.010263, + "balance_loss_clip": 1.22372818, + "balance_loss_mlp": 1.01339591, + "epoch": 0.8024650533593868, + "flos": 28591313721120.0, + "grad_norm": 4.077299539792421, + "language_loss": 0.72581333, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74936324, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12902832, + "step": 13347, + "time_per_iteration": 2.8137569427490234 + }, + { + "auxiliary_loss_clip": 0.01339357, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.22957504, + "balance_loss_mlp": 1.01642907, + "epoch": 0.8025251766120547, + "flos": 16986621185280.0, + "grad_norm": 2.1954942516580056, + "language_loss": 0.73472542, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.75841153, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12817383, + "step": 13348, + "time_per_iteration": 2.7831645011901855 + }, + { + "auxiliary_loss_clip": 0.01332391, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.22430062, + "balance_loss_mlp": 1.0151124, + "epoch": 0.8025852998647227, + "flos": 22168760392800.0, + "grad_norm": 1.6473268790605395, + "language_loss": 0.76190293, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.78550398, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.1260376, + "step": 13349, + "time_per_iteration": 2.779876470565796 + }, + { + "auxiliary_loss_clip": 0.01332791, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.22531152, + "balance_loss_mlp": 1.01940656, + "epoch": 0.8026454231173906, + "flos": 22021988487840.0, + "grad_norm": 1.9462930488098513, + "language_loss": 0.8459059, + "learning_rate": 3.947420787800755e-07, + "loss": 0.86955023, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12231445, + "step": 13350, + "time_per_iteration": 2.7460622787475586 + }, + { + "auxiliary_loss_clip": 0.01332341, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.22516334, + "balance_loss_mlp": 1.01784432, + "epoch": 0.8027055463700586, + "flos": 22496357635560.0, + "grad_norm": 1.794188379467879, + "language_loss": 0.71767342, + "learning_rate": 3.945098036485679e-07, + "loss": 0.74130636, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13098145, + "step": 13351, + "time_per_iteration": 2.856849431991577 + }, + { + "auxiliary_loss_clip": 0.01327621, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.22193623, + "balance_loss_mlp": 1.0189395, + "epoch": 0.8027656696227266, + "flos": 28918789138800.0, + "grad_norm": 1.793609768101628, + "language_loss": 0.61953974, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.64313567, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.13024902, + "step": 13352, + "time_per_iteration": 2.823458194732666 + }, + { + "auxiliary_loss_clip": 0.01336167, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.22886503, + "balance_loss_mlp": 1.01874745, + "epoch": 0.8028257928753946, + "flos": 18594168110280.0, + "grad_norm": 1.9598482355906683, + "language_loss": 0.77084064, + "learning_rate": 3.940454360354046e-07, + "loss": 0.79452169, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.13201904, + "step": 13353, + "time_per_iteration": 2.754763603210449 + }, + { + "auxiliary_loss_clip": 0.01347818, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.23239625, + "balance_loss_mlp": 1.01715267, + "epoch": 0.8028859161280625, + "flos": 19134004580280.0, + "grad_norm": 2.0706110289079502, + "language_loss": 0.72976148, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75354761, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13641357, + "step": 13354, + "time_per_iteration": 2.785062313079834 + }, + { + "auxiliary_loss_clip": 0.01341114, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.23087215, + "balance_loss_mlp": 1.0180068, + "epoch": 0.8029460393807305, + "flos": 20234712650760.0, + "grad_norm": 2.252336169827505, + "language_loss": 0.66545928, + "learning_rate": 3.935813120140714e-07, + "loss": 0.68918073, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13018799, + "step": 13355, + "time_per_iteration": 2.7389161586761475 + }, + { + "auxiliary_loss_clip": 0.01343748, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.23222482, + "balance_loss_mlp": 1.01815486, + "epoch": 0.8030061626333984, + "flos": 49792006239120.0, + "grad_norm": 2.3205901374052402, + "language_loss": 0.69017875, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.71393633, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13867188, + "step": 13356, + "time_per_iteration": 3.0574984550476074 + }, + { + "auxiliary_loss_clip": 0.01334337, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.22587991, + "balance_loss_mlp": 1.01527548, + "epoch": 0.8030662858860664, + "flos": 21620111908680.0, + "grad_norm": 1.5798383245870153, + "language_loss": 0.77774554, + "learning_rate": 3.931174316549666e-07, + "loss": 0.80137312, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13134766, + "step": 13357, + "time_per_iteration": 2.8382859230041504 + }, + { + "auxiliary_loss_clip": 0.01340844, + "auxiliary_loss_mlp": 0.01030276, + "balance_loss_clip": 1.2280724, + "balance_loss_mlp": 1.01629269, + "epoch": 0.8031264091387345, + "flos": 25635629031480.0, + "grad_norm": 1.3924087441828135, + "language_loss": 0.77578259, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79949379, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13989258, + "step": 13358, + "time_per_iteration": 2.836947202682495 + }, + { + "auxiliary_loss_clip": 0.01330823, + "auxiliary_loss_mlp": 0.01026891, + "balance_loss_clip": 1.22368622, + "balance_loss_mlp": 1.01435614, + "epoch": 0.8031865323914024, + "flos": 19650995524800.0, + "grad_norm": 1.5617855647247778, + "language_loss": 0.85134327, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87492043, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12548828, + "step": 13359, + "time_per_iteration": 2.7513487339019775 + }, + { + "auxiliary_loss_clip": 0.01328263, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_clip": 1.2222048, + "balance_loss_mlp": 1.01605535, + "epoch": 0.8032466556440704, + "flos": 26173678733640.0, + "grad_norm": 1.9163603256356947, + "language_loss": 0.74039674, + "learning_rate": 3.924220681368928e-07, + "loss": 0.76396847, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12860107, + "step": 13360, + "time_per_iteration": 2.8452227115631104 + }, + { + "auxiliary_loss_clip": 0.01334606, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.22459817, + "balance_loss_mlp": 1.01679397, + "epoch": 0.8033067788967383, + "flos": 25525509494400.0, + "grad_norm": 1.618934898179169, + "language_loss": 0.70005643, + "learning_rate": 3.921904022048512e-07, + "loss": 0.72369933, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12878418, + "step": 13361, + "time_per_iteration": 2.881216287612915 + }, + { + "auxiliary_loss_clip": 0.01333608, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.22181654, + "balance_loss_mlp": 1.02180803, + "epoch": 0.8033669021494063, + "flos": 24029584615800.0, + "grad_norm": 1.518171471125105, + "language_loss": 0.70106602, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72475708, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13677979, + "step": 13362, + "time_per_iteration": 5.713644742965698 + }, + { + "auxiliary_loss_clip": 0.01353796, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.23850572, + "balance_loss_mlp": 1.02429473, + "epoch": 0.8034270254020742, + "flos": 13591839031560.0, + "grad_norm": 3.0464302529352363, + "language_loss": 0.79643816, + "learning_rate": 3.91727253254452e-07, + "loss": 0.82037282, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.15362549, + "step": 13363, + "time_per_iteration": 4.156592607498169 + }, + { + "auxiliary_loss_clip": 0.01336481, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.2260294, + "balance_loss_mlp": 1.01361156, + "epoch": 0.8034871486547422, + "flos": 27417788215200.0, + "grad_norm": 2.276533598989545, + "language_loss": 0.74739331, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77103293, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13873291, + "step": 13364, + "time_per_iteration": 2.7881453037261963 + }, + { + "auxiliary_loss_clip": 0.01334128, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.2278347, + "balance_loss_mlp": 1.01558232, + "epoch": 0.8035472719074102, + "flos": 32605044076080.0, + "grad_norm": 2.133612944519407, + "language_loss": 0.60877883, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.63240314, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1270752, + "step": 13365, + "time_per_iteration": 2.9000282287597656 + }, + { + "auxiliary_loss_clip": 0.01336879, + "auxiliary_loss_mlp": 0.01037227, + "balance_loss_clip": 1.22615778, + "balance_loss_mlp": 1.02362549, + "epoch": 0.8036073951600782, + "flos": 21292961357880.0, + "grad_norm": 1.7036936524269406, + "language_loss": 0.65885472, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68259585, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13592529, + "step": 13366, + "time_per_iteration": 2.7862563133239746 + }, + { + "auxiliary_loss_clip": 0.01332198, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.22591937, + "balance_loss_mlp": 1.0180006, + "epoch": 0.8036675184127461, + "flos": 18118418278320.0, + "grad_norm": 1.983988817755942, + "language_loss": 0.75566798, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77929413, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12426758, + "step": 13367, + "time_per_iteration": 2.6925811767578125 + }, + { + "auxiliary_loss_clip": 0.01328897, + "auxiliary_loss_mlp": 0.01026268, + "balance_loss_clip": 1.22159886, + "balance_loss_mlp": 1.01401925, + "epoch": 0.8037276416654141, + "flos": 26035759451160.0, + "grad_norm": 1.555808267246083, + "language_loss": 0.74511266, + "learning_rate": 3.905704482846428e-07, + "loss": 0.7686643, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12261963, + "step": 13368, + "time_per_iteration": 2.8336195945739746 + }, + { + "auxiliary_loss_clip": 0.01337964, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.22728729, + "balance_loss_mlp": 1.02041459, + "epoch": 0.803787764918082, + "flos": 18806204295720.0, + "grad_norm": 2.1086285109106417, + "language_loss": 0.70144844, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72516668, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.13433838, + "step": 13369, + "time_per_iteration": 2.74062442779541 + }, + { + "auxiliary_loss_clip": 0.01332778, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.22596443, + "balance_loss_mlp": 1.0179776, + "epoch": 0.80384788817075, + "flos": 20230083297720.0, + "grad_norm": 1.5747583907904386, + "language_loss": 0.74266869, + "learning_rate": 3.901081534434312e-07, + "loss": 0.7663002, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12384033, + "step": 13370, + "time_per_iteration": 4.315093755722046 + }, + { + "auxiliary_loss_clip": 0.0134569, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.23309982, + "balance_loss_mlp": 1.01931167, + "epoch": 0.8039080114234181, + "flos": 18519929382240.0, + "grad_norm": 2.3494759962625764, + "language_loss": 0.86919957, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89298403, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13446045, + "step": 13371, + "time_per_iteration": 2.8031837940216064 + }, + { + "auxiliary_loss_clip": 0.01339276, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.22631741, + "balance_loss_mlp": 1.01857734, + "epoch": 0.803968134676086, + "flos": 22387618782720.0, + "grad_norm": 1.9176166077885213, + "language_loss": 0.75246894, + "learning_rate": 3.89646102791259e-07, + "loss": 0.77618515, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13763428, + "step": 13372, + "time_per_iteration": 2.8828859329223633 + }, + { + "auxiliary_loss_clip": 0.01331235, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.22344542, + "balance_loss_mlp": 1.02051759, + "epoch": 0.804028257928754, + "flos": 23847947110800.0, + "grad_norm": 2.170431724516015, + "language_loss": 0.79352134, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81717587, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13708496, + "step": 13373, + "time_per_iteration": 2.8525941371917725 + }, + { + "auxiliary_loss_clip": 0.01331631, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.2254107, + "balance_loss_mlp": 1.01856232, + "epoch": 0.8040883811814219, + "flos": 23556068243640.0, + "grad_norm": 1.5600778273529683, + "language_loss": 0.74404639, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76766515, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.11676025, + "step": 13374, + "time_per_iteration": 2.7944653034210205 + }, + { + "auxiliary_loss_clip": 0.01341132, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.22919858, + "balance_loss_mlp": 1.01720333, + "epoch": 0.8041485044340899, + "flos": 19030991506200.0, + "grad_norm": 2.382208387074428, + "language_loss": 0.69527054, + "learning_rate": 3.889534848207452e-07, + "loss": 0.71898794, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.1340332, + "step": 13375, + "time_per_iteration": 2.7587947845458984 + }, + { + "auxiliary_loss_clip": 0.01150269, + "auxiliary_loss_mlp": 0.01008795, + "balance_loss_clip": 1.10389817, + "balance_loss_mlp": 1.0057795, + "epoch": 0.8042086276867578, + "flos": 70022959293600.0, + "grad_norm": 0.7285233033302039, + "language_loss": 0.55668336, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57827401, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.03015137, + "step": 13376, + "time_per_iteration": 3.391831636428833 + }, + { + "auxiliary_loss_clip": 0.01341493, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.23036671, + "balance_loss_mlp": 1.0194155, + "epoch": 0.8042687509394258, + "flos": 21877571867760.0, + "grad_norm": 1.7026328952345875, + "language_loss": 0.73156381, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75530183, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12896729, + "step": 13377, + "time_per_iteration": 2.784482479095459 + }, + { + "auxiliary_loss_clip": 0.01336044, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.22582912, + "balance_loss_mlp": 1.01658595, + "epoch": 0.8043288741920938, + "flos": 26620532394480.0, + "grad_norm": 1.627943838167686, + "language_loss": 0.70483166, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72848916, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13134766, + "step": 13378, + "time_per_iteration": 2.861163854598999 + }, + { + "auxiliary_loss_clip": 0.01337056, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.22741616, + "balance_loss_mlp": 1.01543856, + "epoch": 0.8043889974447618, + "flos": 33410218527000.0, + "grad_norm": 1.6134158601365571, + "language_loss": 0.69441271, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71806628, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12854004, + "step": 13379, + "time_per_iteration": 2.914884090423584 + }, + { + "auxiliary_loss_clip": 0.01346789, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.23327518, + "balance_loss_mlp": 1.01740909, + "epoch": 0.8044491206974297, + "flos": 20380834821960.0, + "grad_norm": 1.6283239110505638, + "language_loss": 0.7670694, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.79085028, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13897705, + "step": 13380, + "time_per_iteration": 2.8012635707855225 + }, + { + "auxiliary_loss_clip": 0.01329657, + "auxiliary_loss_mlp": 0.01026117, + "balance_loss_clip": 1.22158277, + "balance_loss_mlp": 1.01373124, + "epoch": 0.8045092439500977, + "flos": 23409012080160.0, + "grad_norm": 1.7843343558837124, + "language_loss": 0.6948452, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71840292, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12384033, + "step": 13381, + "time_per_iteration": 2.7088770866394043 + }, + { + "auxiliary_loss_clip": 0.01337036, + "auxiliary_loss_mlp": 0.01034113, + "balance_loss_clip": 1.22827959, + "balance_loss_mlp": 1.02058291, + "epoch": 0.8045693672027656, + "flos": 24102564484680.0, + "grad_norm": 1.714132993485657, + "language_loss": 0.64230198, + "learning_rate": 3.873395148176135e-07, + "loss": 0.66601336, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13531494, + "step": 13382, + "time_per_iteration": 2.7813353538513184 + }, + { + "auxiliary_loss_clip": 0.01330389, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.22255492, + "balance_loss_mlp": 1.01964116, + "epoch": 0.8046294904554336, + "flos": 27712672101000.0, + "grad_norm": 2.126969930675017, + "language_loss": 0.7622478, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78587031, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12219238, + "step": 13383, + "time_per_iteration": 2.9038023948669434 + }, + { + "auxiliary_loss_clip": 0.01330158, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.22254729, + "balance_loss_mlp": 1.02190244, + "epoch": 0.8046896137081017, + "flos": 24978444736320.0, + "grad_norm": 1.8086030441805188, + "language_loss": 0.69614351, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71978807, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12390137, + "step": 13384, + "time_per_iteration": 2.7917487621307373 + }, + { + "auxiliary_loss_clip": 0.0133832, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.22658229, + "balance_loss_mlp": 1.0189569, + "epoch": 0.8047497369607696, + "flos": 17679929939640.0, + "grad_norm": 2.391963828379274, + "language_loss": 0.79580557, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81951988, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14129639, + "step": 13385, + "time_per_iteration": 2.7646703720092773 + }, + { + "auxiliary_loss_clip": 0.01334029, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.22591662, + "balance_loss_mlp": 1.01942396, + "epoch": 0.8048098602134376, + "flos": 22387131482400.0, + "grad_norm": 1.59020564308875, + "language_loss": 0.72369623, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74736166, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13085938, + "step": 13386, + "time_per_iteration": 2.732048988342285 + }, + { + "auxiliary_loss_clip": 0.01150222, + "auxiliary_loss_mlp": 0.01008174, + "balance_loss_clip": 1.1047132, + "balance_loss_mlp": 1.00545633, + "epoch": 0.8048699834661055, + "flos": 71217259298640.0, + "grad_norm": 0.6630652879097876, + "language_loss": 0.51267332, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53425729, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.02722168, + "step": 13387, + "time_per_iteration": 3.3455517292022705 + }, + { + "auxiliary_loss_clip": 0.01336363, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.22758889, + "balance_loss_mlp": 1.02115405, + "epoch": 0.8049301067187735, + "flos": 23665700480400.0, + "grad_norm": 1.637937337816398, + "language_loss": 0.74130285, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76501769, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13952637, + "step": 13388, + "time_per_iteration": 2.7676570415496826 + }, + { + "auxiliary_loss_clip": 0.01330426, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.22361648, + "balance_loss_mlp": 1.01728225, + "epoch": 0.8049902299714414, + "flos": 24431786061840.0, + "grad_norm": 1.6026613400119205, + "language_loss": 0.71266246, + "learning_rate": 3.857285412741411e-07, + "loss": 0.7362653, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12567139, + "step": 13389, + "time_per_iteration": 2.809169292449951 + }, + { + "auxiliary_loss_clip": 0.01331696, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.22429645, + "balance_loss_mlp": 1.02342153, + "epoch": 0.8050503532241094, + "flos": 17496992967120.0, + "grad_norm": 1.9470856125863132, + "language_loss": 0.83120292, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.8548885, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13421631, + "step": 13390, + "time_per_iteration": 2.761915445327759 + }, + { + "auxiliary_loss_clip": 0.01148783, + "auxiliary_loss_mlp": 0.01005441, + "balance_loss_clip": 1.10301399, + "balance_loss_mlp": 1.0028069, + "epoch": 0.8051104764767774, + "flos": 57671696816280.0, + "grad_norm": 0.7825158096552, + "language_loss": 0.55547971, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57702196, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.02636719, + "step": 13391, + "time_per_iteration": 3.181807518005371 + }, + { + "auxiliary_loss_clip": 0.01325478, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.22068, + "balance_loss_mlp": 1.02012062, + "epoch": 0.8051705997294454, + "flos": 18007892657640.0, + "grad_norm": 1.470791426008259, + "language_loss": 0.85129106, + "learning_rate": 3.850390420667762e-07, + "loss": 0.87487143, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12420654, + "step": 13392, + "time_per_iteration": 2.8712449073791504 + }, + { + "auxiliary_loss_clip": 0.01336598, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.22560453, + "balance_loss_mlp": 1.01801026, + "epoch": 0.8052307229821133, + "flos": 26403623205840.0, + "grad_norm": 1.476692378212234, + "language_loss": 0.70469296, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72836578, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12689209, + "step": 13393, + "time_per_iteration": 2.8317277431488037 + }, + { + "auxiliary_loss_clip": 0.01337544, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.22774732, + "balance_loss_mlp": 1.02137566, + "epoch": 0.8052908462347813, + "flos": 21761361076680.0, + "grad_norm": 2.4952591527557035, + "language_loss": 0.7624737, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78619671, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13378906, + "step": 13394, + "time_per_iteration": 2.8404881954193115 + }, + { + "auxiliary_loss_clip": 0.01335064, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.22613406, + "balance_loss_mlp": 1.01607871, + "epoch": 0.8053509694874492, + "flos": 25447128713640.0, + "grad_norm": 1.5588502579594343, + "language_loss": 0.65218675, + "learning_rate": 3.843500940147304e-07, + "loss": 0.6758287, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1305542, + "step": 13395, + "time_per_iteration": 2.833416700363159 + }, + { + "auxiliary_loss_clip": 0.01149319, + "auxiliary_loss_mlp": 0.01003647, + "balance_loss_clip": 1.10421896, + "balance_loss_mlp": 1.000965, + "epoch": 0.8054110927401172, + "flos": 57683107765440.0, + "grad_norm": 0.8270674832877176, + "language_loss": 0.57403547, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59556514, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.02685547, + "step": 13396, + "time_per_iteration": 3.446115732192993 + }, + { + "auxiliary_loss_clip": 0.01331516, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.22232556, + "balance_loss_mlp": 1.02136087, + "epoch": 0.8054712159927853, + "flos": 19280248576560.0, + "grad_norm": 1.6477464605785448, + "language_loss": 0.77461493, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79828179, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13793945, + "step": 13397, + "time_per_iteration": 2.987921953201294 + }, + { + "auxiliary_loss_clip": 0.01334474, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.22649956, + "balance_loss_mlp": 1.01515603, + "epoch": 0.8055313392454532, + "flos": 17972133673680.0, + "grad_norm": 1.6693295142335478, + "language_loss": 0.70192659, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72554612, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12335205, + "step": 13398, + "time_per_iteration": 2.7046711444854736 + }, + { + "auxiliary_loss_clip": 0.01331192, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.22383189, + "balance_loss_mlp": 1.01742721, + "epoch": 0.8055914624981212, + "flos": 13482084969720.0, + "grad_norm": 2.074456193949891, + "language_loss": 0.69288647, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71650839, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13562012, + "step": 13399, + "time_per_iteration": 2.7326266765594482 + }, + { + "auxiliary_loss_clip": 0.01330192, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.22230744, + "balance_loss_mlp": 1.01891673, + "epoch": 0.8056515857507891, + "flos": 13228726455000.0, + "grad_norm": 2.27252833618814, + "language_loss": 0.72619027, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74981135, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13006592, + "step": 13400, + "time_per_iteration": 4.223674058914185 + }, + { + "auxiliary_loss_clip": 0.01328307, + "auxiliary_loss_mlp": 0.01027362, + "balance_loss_clip": 1.22166753, + "balance_loss_mlp": 1.01464272, + "epoch": 0.8057117090034571, + "flos": 23883462444600.0, + "grad_norm": 1.6742167744829526, + "language_loss": 0.63970089, + "learning_rate": 3.829738523169037e-07, + "loss": 0.6632576, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12713623, + "step": 13401, + "time_per_iteration": 4.303345203399658 + }, + { + "auxiliary_loss_clip": 0.01335858, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.22658575, + "balance_loss_mlp": 1.01827168, + "epoch": 0.805771832256125, + "flos": 21219169321800.0, + "grad_norm": 2.1391120331619438, + "language_loss": 0.84249932, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.86617434, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1338501, + "step": 13402, + "time_per_iteration": 4.285802125930786 + }, + { + "auxiliary_loss_clip": 0.01336342, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.22732508, + "balance_loss_mlp": 1.01471281, + "epoch": 0.805831955508793, + "flos": 17571353520240.0, + "grad_norm": 2.2058702778990598, + "language_loss": 0.68014038, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70378059, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12969971, + "step": 13403, + "time_per_iteration": 2.729560613632202 + }, + { + "auxiliary_loss_clip": 0.0131955, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.21528625, + "balance_loss_mlp": 1.02161455, + "epoch": 0.805892078761461, + "flos": 26913345253920.0, + "grad_norm": 1.6969583501985954, + "language_loss": 0.85029131, + "learning_rate": 3.822865591408084e-07, + "loss": 0.87382519, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.12225342, + "step": 13404, + "time_per_iteration": 2.8471999168395996 + }, + { + "auxiliary_loss_clip": 0.01320999, + "auxiliary_loss_mlp": 0.01031232, + "balance_loss_clip": 1.21587527, + "balance_loss_mlp": 1.01865506, + "epoch": 0.805952202014129, + "flos": 31513066803000.0, + "grad_norm": 1.509598410609517, + "language_loss": 0.7070089, + "learning_rate": 3.820575840915743e-07, + "loss": 0.73053122, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12579346, + "step": 13405, + "time_per_iteration": 2.971391439437866 + }, + { + "auxiliary_loss_clip": 0.01327292, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.22084463, + "balance_loss_mlp": 1.01530445, + "epoch": 0.8060123252667969, + "flos": 24395417952480.0, + "grad_norm": 2.096067380153727, + "language_loss": 0.75594085, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77948701, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12023926, + "step": 13406, + "time_per_iteration": 2.802809715270996 + }, + { + "auxiliary_loss_clip": 0.01337057, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.22736812, + "balance_loss_mlp": 1.02217174, + "epoch": 0.8060724485194649, + "flos": 23485565484720.0, + "grad_norm": 1.6694951787479786, + "language_loss": 0.76464832, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78838325, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14263916, + "step": 13407, + "time_per_iteration": 2.8456947803497314 + }, + { + "auxiliary_loss_clip": 0.01329169, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.22124457, + "balance_loss_mlp": 1.01813662, + "epoch": 0.8061325717721328, + "flos": 18629114927040.0, + "grad_norm": 1.6555600731991957, + "language_loss": 0.73932421, + "learning_rate": 3.81371027093822e-07, + "loss": 0.76293182, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13446045, + "step": 13408, + "time_per_iteration": 2.803499937057495 + }, + { + "auxiliary_loss_clip": 0.01332442, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.22485542, + "balance_loss_mlp": 1.014485, + "epoch": 0.8061926950248008, + "flos": 23587522741440.0, + "grad_norm": 1.9220696586842385, + "language_loss": 0.70842844, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.73203653, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13891602, + "step": 13409, + "time_per_iteration": 4.357933282852173 + }, + { + "auxiliary_loss_clip": 0.01332297, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.22309375, + "balance_loss_mlp": 1.01939118, + "epoch": 0.8062528182774689, + "flos": 11146769773920.0, + "grad_norm": 3.0248074757906083, + "language_loss": 0.77410674, + "learning_rate": 3.809136293070545e-07, + "loss": 0.79775286, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12927246, + "step": 13410, + "time_per_iteration": 2.695769786834717 + }, + { + "auxiliary_loss_clip": 0.01328764, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.22222221, + "balance_loss_mlp": 1.02106392, + "epoch": 0.8063129415301368, + "flos": 22352022232200.0, + "grad_norm": 1.7613860464246218, + "language_loss": 0.68957627, + "learning_rate": 3.806850225032117e-07, + "loss": 0.71320891, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.13446045, + "step": 13411, + "time_per_iteration": 2.784456729888916 + }, + { + "auxiliary_loss_clip": 0.01329715, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.22300053, + "balance_loss_mlp": 1.01565075, + "epoch": 0.8063730647828048, + "flos": 23993785023480.0, + "grad_norm": 1.7383513952397502, + "language_loss": 0.68461049, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70819741, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.13348389, + "step": 13412, + "time_per_iteration": 2.7606539726257324 + }, + { + "auxiliary_loss_clip": 0.01338315, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.22684622, + "balance_loss_mlp": 1.0215075, + "epoch": 0.8064331880354727, + "flos": 21326080798440.0, + "grad_norm": 1.6607447578457832, + "language_loss": 0.81448328, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83822465, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14312744, + "step": 13413, + "time_per_iteration": 2.80646014213562 + }, + { + "auxiliary_loss_clip": 0.01329504, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.22353888, + "balance_loss_mlp": 1.01589823, + "epoch": 0.8064933112881407, + "flos": 19687688501040.0, + "grad_norm": 1.8477332333980214, + "language_loss": 0.84996116, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87353849, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12329102, + "step": 13414, + "time_per_iteration": 2.7396605014801025 + }, + { + "auxiliary_loss_clip": 0.01325451, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.21863031, + "balance_loss_mlp": 1.01732385, + "epoch": 0.8065534345408086, + "flos": 19284228195840.0, + "grad_norm": 1.7687382725525793, + "language_loss": 0.67460507, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69815677, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12402344, + "step": 13415, + "time_per_iteration": 2.7941536903381348 + }, + { + "auxiliary_loss_clip": 0.01324846, + "auxiliary_loss_mlp": 0.010255, + "balance_loss_clip": 1.22015762, + "balance_loss_mlp": 1.01338279, + "epoch": 0.8066135577934767, + "flos": 19681840897200.0, + "grad_norm": 1.5587227519525704, + "language_loss": 0.76801664, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.79152012, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12121582, + "step": 13416, + "time_per_iteration": 2.8778743743896484 + }, + { + "auxiliary_loss_clip": 0.01344372, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.23117757, + "balance_loss_mlp": 1.0182898, + "epoch": 0.8066736810461446, + "flos": 21148747779600.0, + "grad_norm": 1.4700875205664057, + "language_loss": 0.65859038, + "learning_rate": 3.793146714797086e-07, + "loss": 0.68234611, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.12908936, + "step": 13417, + "time_per_iteration": 2.811397075653076 + }, + { + "auxiliary_loss_clip": 0.01340524, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.2296989, + "balance_loss_mlp": 1.02376366, + "epoch": 0.8067338042988126, + "flos": 22602984853680.0, + "grad_norm": 1.494677769960297, + "language_loss": 0.80879134, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.83256221, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12817383, + "step": 13418, + "time_per_iteration": 2.8012006282806396 + }, + { + "auxiliary_loss_clip": 0.01339346, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.2297194, + "balance_loss_mlp": 1.01621747, + "epoch": 0.8067939275514805, + "flos": 16512779946240.0, + "grad_norm": 1.5310098119363003, + "language_loss": 0.84485972, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.86855143, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1361084, + "step": 13419, + "time_per_iteration": 2.687434196472168 + }, + { + "auxiliary_loss_clip": 0.01333282, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.22357917, + "balance_loss_mlp": 1.0191071, + "epoch": 0.8068540508041485, + "flos": 28546864548120.0, + "grad_norm": 2.0709112061495336, + "language_loss": 0.75819123, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.78184515, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13012695, + "step": 13420, + "time_per_iteration": 2.834083318710327 + }, + { + "auxiliary_loss_clip": 0.01328712, + "auxiliary_loss_mlp": 0.01021668, + "balance_loss_clip": 1.22186136, + "balance_loss_mlp": 1.01000357, + "epoch": 0.8069141740568164, + "flos": 21657454618680.0, + "grad_norm": 1.7085165393111856, + "language_loss": 0.78584063, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80934441, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.11676025, + "step": 13421, + "time_per_iteration": 2.7545135021209717 + }, + { + "auxiliary_loss_clip": 0.01337643, + "auxiliary_loss_mlp": 0.01023096, + "balance_loss_clip": 1.22866535, + "balance_loss_mlp": 1.01081145, + "epoch": 0.8069742973094844, + "flos": 17533726551720.0, + "grad_norm": 1.7035456371961393, + "language_loss": 0.79887849, + "learning_rate": 3.78174402269098e-07, + "loss": 0.82248586, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12286377, + "step": 13422, + "time_per_iteration": 2.7526447772979736 + }, + { + "auxiliary_loss_clip": 0.01329104, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.22241795, + "balance_loss_mlp": 1.01837826, + "epoch": 0.8070344205621525, + "flos": 23372197278840.0, + "grad_norm": 1.4808880412129692, + "language_loss": 0.68434978, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70794976, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12530518, + "step": 13423, + "time_per_iteration": 2.8290183544158936 + }, + { + "auxiliary_loss_clip": 0.01344079, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.23200536, + "balance_loss_mlp": 1.02078032, + "epoch": 0.8070945438148204, + "flos": 22935414491280.0, + "grad_norm": 1.7383347440300017, + "language_loss": 0.8056823, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82946754, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 1.12060547, + "router_z_loss_mlp": 0.13653564, + "step": 13424, + "time_per_iteration": 2.8296780586242676 + }, + { + "auxiliary_loss_clip": 0.01337361, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.2262584, + "balance_loss_mlp": 1.01488101, + "epoch": 0.8071546670674884, + "flos": 25306082587440.0, + "grad_norm": 2.028668688266425, + "language_loss": 0.79065722, + "learning_rate": 3.774909786710232e-07, + "loss": 0.814309, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12921143, + "step": 13425, + "time_per_iteration": 2.8004894256591797 + }, + { + "auxiliary_loss_clip": 0.01332752, + "auxiliary_loss_mlp": 0.01032564, + "balance_loss_clip": 1.22383666, + "balance_loss_mlp": 1.01999378, + "epoch": 0.8072147903201563, + "flos": 18118255844880.0, + "grad_norm": 3.27279969144259, + "language_loss": 0.75730354, + "learning_rate": 3.772632938448923e-07, + "loss": 0.78095669, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12579346, + "step": 13426, + "time_per_iteration": 2.702782392501831 + }, + { + "auxiliary_loss_clip": 0.01331064, + "auxiliary_loss_mlp": 0.01026552, + "balance_loss_clip": 1.22199011, + "balance_loss_mlp": 1.01434469, + "epoch": 0.8072749135728243, + "flos": 26693877738600.0, + "grad_norm": 1.5784197117361296, + "language_loss": 0.73131031, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75488645, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12219238, + "step": 13427, + "time_per_iteration": 2.79811692237854 + }, + { + "auxiliary_loss_clip": 0.01329564, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.2216258, + "balance_loss_mlp": 1.02743483, + "epoch": 0.8073350368254922, + "flos": 19244408375880.0, + "grad_norm": 1.8060311748741886, + "language_loss": 0.70633745, + "learning_rate": 3.768081088042774e-07, + "loss": 0.73004824, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.14074707, + "step": 13428, + "time_per_iteration": 2.7765183448791504 + }, + { + "auxiliary_loss_clip": 0.01335181, + "auxiliary_loss_mlp": 0.01024523, + "balance_loss_clip": 1.22577655, + "balance_loss_mlp": 1.01257825, + "epoch": 0.8073951600781603, + "flos": 13338521125200.0, + "grad_norm": 1.7963009159190804, + "language_loss": 0.75376809, + "learning_rate": 3.765806086070544e-07, + "loss": 0.77736509, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.11938477, + "step": 13429, + "time_per_iteration": 2.844235420227051 + }, + { + "auxiliary_loss_clip": 0.01323093, + "auxiliary_loss_mlp": 0.01027222, + "balance_loss_clip": 1.218328, + "balance_loss_mlp": 1.01506233, + "epoch": 0.8074552833308282, + "flos": 22857886486080.0, + "grad_norm": 1.6313945602117281, + "language_loss": 0.67354882, + "learning_rate": 3.763531699700568e-07, + "loss": 0.697052, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12158203, + "step": 13430, + "time_per_iteration": 2.751107931137085 + }, + { + "auxiliary_loss_clip": 0.01330552, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.22287047, + "balance_loss_mlp": 1.01883411, + "epoch": 0.8075154065834962, + "flos": 20344263670800.0, + "grad_norm": 1.5939044818487718, + "language_loss": 0.80188882, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82551038, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12768555, + "step": 13431, + "time_per_iteration": 2.7674365043640137 + }, + { + "auxiliary_loss_clip": 0.01327441, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.22224665, + "balance_loss_mlp": 1.01550364, + "epoch": 0.8075755298361641, + "flos": 21913087201560.0, + "grad_norm": 1.7613609752539663, + "language_loss": 0.80507571, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.8286339, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.12860107, + "step": 13432, + "time_per_iteration": 2.7397499084472656 + }, + { + "auxiliary_loss_clip": 0.01343099, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.23052192, + "balance_loss_mlp": 1.01881087, + "epoch": 0.8076356530888321, + "flos": 15673552062480.0, + "grad_norm": 2.007256129217651, + "language_loss": 0.70699716, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.7307502, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13391113, + "step": 13433, + "time_per_iteration": 2.779182195663452 + }, + { + "auxiliary_loss_clip": 0.01331299, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.22348547, + "balance_loss_mlp": 1.0183413, + "epoch": 0.8076957763415, + "flos": 37784259481680.0, + "grad_norm": 1.5911448773103025, + "language_loss": 0.72852075, + "learning_rate": 3.754440311967828e-07, + "loss": 0.75214291, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12585449, + "step": 13434, + "time_per_iteration": 2.937596559524536 + }, + { + "auxiliary_loss_clip": 0.01337083, + "auxiliary_loss_mlp": 0.01028182, + "balance_loss_clip": 1.22981596, + "balance_loss_mlp": 1.01582611, + "epoch": 0.807755899594168, + "flos": 19615683232800.0, + "grad_norm": 2.25823444947812, + "language_loss": 0.68343914, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70709187, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12347412, + "step": 13435, + "time_per_iteration": 2.7251014709472656 + }, + { + "auxiliary_loss_clip": 0.01339196, + "auxiliary_loss_mlp": 0.01037645, + "balance_loss_clip": 1.22944593, + "balance_loss_mlp": 1.02384698, + "epoch": 0.8078160228468361, + "flos": 23300151402240.0, + "grad_norm": 1.5354080500952085, + "language_loss": 0.74987745, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77364588, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13800049, + "step": 13436, + "time_per_iteration": 2.8026106357574463 + }, + { + "auxiliary_loss_clip": 0.01323875, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.21893036, + "balance_loss_mlp": 1.02023149, + "epoch": 0.807876146099504, + "flos": 27168449928120.0, + "grad_norm": 1.7284489157935987, + "language_loss": 0.70618451, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72975099, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.12542725, + "step": 13437, + "time_per_iteration": 2.7902109622955322 + }, + { + "auxiliary_loss_clip": 0.01328937, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.22314453, + "balance_loss_mlp": 1.01415396, + "epoch": 0.807936269352172, + "flos": 27165485517840.0, + "grad_norm": 1.7535199443889795, + "language_loss": 0.72858012, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75213349, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12268066, + "step": 13438, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.01333758, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.22667825, + "balance_loss_mlp": 1.02102137, + "epoch": 0.8079963926048399, + "flos": 20745287474400.0, + "grad_norm": 1.7681657189427225, + "language_loss": 0.76832688, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79199982, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12530518, + "step": 13439, + "time_per_iteration": 4.255740642547607 + }, + { + "auxiliary_loss_clip": 0.01328427, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.2222693, + "balance_loss_mlp": 1.0171144, + "epoch": 0.8080565158575079, + "flos": 25015137712560.0, + "grad_norm": 1.421699462129643, + "language_loss": 0.78731191, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.81089598, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12854004, + "step": 13440, + "time_per_iteration": 4.402351140975952 + }, + { + "auxiliary_loss_clip": 0.01339576, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.22886944, + "balance_loss_mlp": 1.01584375, + "epoch": 0.8081166391101758, + "flos": 18702785138040.0, + "grad_norm": 1.7663947523432815, + "language_loss": 0.5904296, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.61411786, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13409424, + "step": 13441, + "time_per_iteration": 4.1859376430511475 + }, + { + "auxiliary_loss_clip": 0.01330208, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.22305799, + "balance_loss_mlp": 1.01433778, + "epoch": 0.8081767623628439, + "flos": 19833445197000.0, + "grad_norm": 2.107478536968885, + "language_loss": 0.76747918, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.79105967, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13500977, + "step": 13442, + "time_per_iteration": 2.767941951751709 + }, + { + "auxiliary_loss_clip": 0.0133375, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.22574699, + "balance_loss_mlp": 1.01904011, + "epoch": 0.8082368856155118, + "flos": 35779830805800.0, + "grad_norm": 1.436232066787027, + "language_loss": 0.70879143, + "learning_rate": 3.734020735906169e-07, + "loss": 0.73245221, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13287354, + "step": 13443, + "time_per_iteration": 2.8903205394744873 + }, + { + "auxiliary_loss_clip": 0.01324885, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.22020268, + "balance_loss_mlp": 1.02104616, + "epoch": 0.8082970088681798, + "flos": 17201987256240.0, + "grad_norm": 1.7913948009770784, + "language_loss": 0.82111794, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84470212, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.125, + "step": 13444, + "time_per_iteration": 2.739374876022339 + }, + { + "auxiliary_loss_clip": 0.01146145, + "auxiliary_loss_mlp": 0.01007511, + "balance_loss_clip": 1.10222554, + "balance_loss_mlp": 1.00436425, + "epoch": 0.8083571321208477, + "flos": 63567878668920.0, + "grad_norm": 0.8875276583223976, + "language_loss": 0.53659207, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.5581286, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.03149414, + "step": 13445, + "time_per_iteration": 3.1019127368927 + }, + { + "auxiliary_loss_clip": 0.01330953, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.22373998, + "balance_loss_mlp": 1.01780391, + "epoch": 0.8084172553735157, + "flos": 17935156438920.0, + "grad_norm": 2.0229289427433224, + "language_loss": 0.72241127, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74604094, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.14196777, + "step": 13446, + "time_per_iteration": 2.740992784500122 + }, + { + "auxiliary_loss_clip": 0.01336726, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.22662044, + "balance_loss_mlp": 1.01524389, + "epoch": 0.8084773786261836, + "flos": 24103417260240.0, + "grad_norm": 1.6670079977699042, + "language_loss": 0.71347201, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73713577, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.14404297, + "step": 13447, + "time_per_iteration": 2.9223742485046387 + }, + { + "auxiliary_loss_clip": 0.01342936, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.23190415, + "balance_loss_mlp": 1.02055526, + "epoch": 0.8085375018788516, + "flos": 15591394704240.0, + "grad_norm": 2.0860852882741203, + "language_loss": 0.7505368, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77431637, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14471436, + "step": 13448, + "time_per_iteration": 4.328446626663208 + }, + { + "auxiliary_loss_clip": 0.01146662, + "auxiliary_loss_mlp": 0.01004604, + "balance_loss_clip": 1.10216117, + "balance_loss_mlp": 1.00138497, + "epoch": 0.8085976251315197, + "flos": 67578296119560.0, + "grad_norm": 0.7572194234410711, + "language_loss": 0.63912201, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.66063476, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.03222656, + "step": 13449, + "time_per_iteration": 3.246480703353882 + }, + { + "auxiliary_loss_clip": 0.01330108, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.22264218, + "balance_loss_mlp": 1.01680732, + "epoch": 0.8086577483841876, + "flos": 22566129444000.0, + "grad_norm": 1.6112378947739208, + "language_loss": 0.73852038, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76212227, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13287354, + "step": 13450, + "time_per_iteration": 2.8061752319335938 + }, + { + "auxiliary_loss_clip": 0.01335403, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.22621095, + "balance_loss_mlp": 1.01495242, + "epoch": 0.8087178716368556, + "flos": 17972661582360.0, + "grad_norm": 1.6972640083360673, + "language_loss": 0.74193287, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.76556206, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12567139, + "step": 13451, + "time_per_iteration": 2.889469623565674 + }, + { + "auxiliary_loss_clip": 0.01341523, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.22863841, + "balance_loss_mlp": 1.01800466, + "epoch": 0.8087779948895235, + "flos": 21723734108160.0, + "grad_norm": 1.6527636885111536, + "language_loss": 0.80561137, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82934785, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14123535, + "step": 13452, + "time_per_iteration": 2.776638984680176 + }, + { + "auxiliary_loss_clip": 0.01335742, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.22637749, + "balance_loss_mlp": 1.02255249, + "epoch": 0.8088381181421915, + "flos": 29098477442520.0, + "grad_norm": 1.6768219652255594, + "language_loss": 0.78721976, + "learning_rate": 3.711390917482875e-07, + "loss": 0.81093043, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12774658, + "step": 13453, + "time_per_iteration": 2.87111234664917 + }, + { + "auxiliary_loss_clip": 0.01336018, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.22697675, + "balance_loss_mlp": 1.01714218, + "epoch": 0.8088982413948594, + "flos": 22203301125960.0, + "grad_norm": 2.0083773586980387, + "language_loss": 0.77834511, + "learning_rate": 3.709131331386892e-07, + "loss": 0.80201375, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.137146, + "step": 13454, + "time_per_iteration": 2.807586908340454 + }, + { + "auxiliary_loss_clip": 0.01328323, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.22058415, + "balance_loss_mlp": 1.01718783, + "epoch": 0.8089583646475275, + "flos": 28042421586840.0, + "grad_norm": 2.254575545211639, + "language_loss": 0.77546096, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.7990526, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.13653564, + "step": 13455, + "time_per_iteration": 2.969259262084961 + }, + { + "auxiliary_loss_clip": 0.0133507, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.22540617, + "balance_loss_mlp": 1.01891458, + "epoch": 0.8090184879001954, + "flos": 16622087316120.0, + "grad_norm": 1.7040767893132405, + "language_loss": 0.78160471, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80527925, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13458252, + "step": 13456, + "time_per_iteration": 2.8201169967651367 + }, + { + "auxiliary_loss_clip": 0.01328714, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.22268569, + "balance_loss_mlp": 1.02033138, + "epoch": 0.8090786111528634, + "flos": 27346067205480.0, + "grad_norm": 2.38100404096434, + "language_loss": 0.71675295, + "learning_rate": 3.702356279949801e-07, + "loss": 0.74036622, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.1229248, + "step": 13457, + "time_per_iteration": 2.8360249996185303 + }, + { + "auxiliary_loss_clip": 0.01333236, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.22599387, + "balance_loss_mlp": 1.01846337, + "epoch": 0.8091387344055313, + "flos": 21110714727480.0, + "grad_norm": 1.7656148354144543, + "language_loss": 0.73068285, + "learning_rate": 3.700099165373176e-07, + "loss": 0.75432044, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.12072754, + "step": 13458, + "time_per_iteration": 2.863699197769165 + }, + { + "auxiliary_loss_clip": 0.01329929, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.22198892, + "balance_loss_mlp": 1.0199641, + "epoch": 0.8091988576581993, + "flos": 11658684673440.0, + "grad_norm": 3.191992308539448, + "language_loss": 0.79923981, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.82286626, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12750244, + "step": 13459, + "time_per_iteration": 2.7367868423461914 + }, + { + "auxiliary_loss_clip": 0.013415, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.22932363, + "balance_loss_mlp": 1.01692295, + "epoch": 0.8092589809108672, + "flos": 22968127848240.0, + "grad_norm": 2.3438760814768274, + "language_loss": 0.80307502, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82679665, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13726807, + "step": 13460, + "time_per_iteration": 2.749293327331543 + }, + { + "auxiliary_loss_clip": 0.01333017, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.22348833, + "balance_loss_mlp": 1.02001786, + "epoch": 0.8093191041635353, + "flos": 13264891522560.0, + "grad_norm": 1.7050809665596682, + "language_loss": 0.84549797, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86916649, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13793945, + "step": 13461, + "time_per_iteration": 2.6924118995666504 + }, + { + "auxiliary_loss_clip": 0.01337521, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.22759676, + "balance_loss_mlp": 1.025316, + "epoch": 0.8093792274162032, + "flos": 25520961358080.0, + "grad_norm": 1.8172988237082233, + "language_loss": 0.76664424, + "learning_rate": 3.69107688886096e-07, + "loss": 0.79040706, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13433838, + "step": 13462, + "time_per_iteration": 2.7954471111297607 + }, + { + "auxiliary_loss_clip": 0.01333736, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.22386885, + "balance_loss_mlp": 1.01915622, + "epoch": 0.8094393506688712, + "flos": 23551276457160.0, + "grad_norm": 1.5503717198697546, + "language_loss": 0.83078361, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85445094, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13842773, + "step": 13463, + "time_per_iteration": 2.8227005004882812 + }, + { + "auxiliary_loss_clip": 0.01330685, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.22366381, + "balance_loss_mlp": 1.02325606, + "epoch": 0.8094994739215392, + "flos": 17060372613000.0, + "grad_norm": 1.8830628343959206, + "language_loss": 0.62702286, + "learning_rate": 3.686569460878779e-07, + "loss": 0.65068424, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12207031, + "step": 13464, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.01325551, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.22084403, + "balance_loss_mlp": 1.01728261, + "epoch": 0.8095595971742071, + "flos": 23556636760680.0, + "grad_norm": 1.5541125505737077, + "language_loss": 0.62175709, + "learning_rate": 3.684316674755341e-07, + "loss": 0.64530432, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11889648, + "step": 13465, + "time_per_iteration": 2.7551355361938477 + }, + { + "auxiliary_loss_clip": 0.01330332, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.22475779, + "balance_loss_mlp": 1.01756954, + "epoch": 0.8096197204268751, + "flos": 20377707978240.0, + "grad_norm": 1.7299797504950956, + "language_loss": 0.82461721, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84821951, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12335205, + "step": 13466, + "time_per_iteration": 2.760047435760498 + }, + { + "auxiliary_loss_clip": 0.01335158, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.2248311, + "balance_loss_mlp": 1.019454, + "epoch": 0.809679843679543, + "flos": 27824578405920.0, + "grad_norm": 2.0205597079512745, + "language_loss": 0.7731756, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.79685086, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12927246, + "step": 13467, + "time_per_iteration": 2.833944082260132 + }, + { + "auxiliary_loss_clip": 0.01333876, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.22560549, + "balance_loss_mlp": 1.01620126, + "epoch": 0.8097399669322111, + "flos": 22018699210680.0, + "grad_norm": 3.491399760203232, + "language_loss": 0.79220212, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81584215, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13922119, + "step": 13468, + "time_per_iteration": 2.8785579204559326 + }, + { + "auxiliary_loss_clip": 0.01325075, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.22017324, + "balance_loss_mlp": 1.01491785, + "epoch": 0.809800090184879, + "flos": 18993608187840.0, + "grad_norm": 1.6726259481366696, + "language_loss": 0.6822741, + "learning_rate": 3.675311718038978e-07, + "loss": 0.70579475, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12084961, + "step": 13469, + "time_per_iteration": 2.8967981338500977 + }, + { + "auxiliary_loss_clip": 0.01146415, + "auxiliary_loss_mlp": 0.01003264, + "balance_loss_clip": 1.10252309, + "balance_loss_mlp": 1.00072503, + "epoch": 0.809860213437547, + "flos": 66116018590200.0, + "grad_norm": 0.6924179469031198, + "language_loss": 0.54666209, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56815886, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.02539062, + "step": 13470, + "time_per_iteration": 3.3995981216430664 + }, + { + "auxiliary_loss_clip": 0.01328514, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.22077012, + "balance_loss_mlp": 1.02002454, + "epoch": 0.8099203366902149, + "flos": 20886861509280.0, + "grad_norm": 1.562290821769879, + "language_loss": 0.69530797, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71891332, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.11999512, + "step": 13471, + "time_per_iteration": 2.8168845176696777 + }, + { + "auxiliary_loss_clip": 0.0133142, + "auxiliary_loss_mlp": 0.01029015, + "balance_loss_clip": 1.22406101, + "balance_loss_mlp": 1.01646781, + "epoch": 0.8099804599428829, + "flos": 26036490401640.0, + "grad_norm": 1.6521895059602716, + "language_loss": 0.80454898, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.82815331, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12561035, + "step": 13472, + "time_per_iteration": 2.7772910594940186 + }, + { + "auxiliary_loss_clip": 0.01148787, + "auxiliary_loss_mlp": 0.01003019, + "balance_loss_clip": 1.10470629, + "balance_loss_mlp": 1.00032461, + "epoch": 0.8100405831955508, + "flos": 69319620274320.0, + "grad_norm": 0.7453295278712867, + "language_loss": 0.57792056, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59943861, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02697754, + "step": 13473, + "time_per_iteration": 3.2030720710754395 + }, + { + "auxiliary_loss_clip": 0.01339123, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.22911239, + "balance_loss_mlp": 1.01640821, + "epoch": 0.8101007064482189, + "flos": 15016652025840.0, + "grad_norm": 1.7167226677897336, + "language_loss": 0.74172288, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76540488, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12670898, + "step": 13474, + "time_per_iteration": 2.7442140579223633 + }, + { + "auxiliary_loss_clip": 0.01339864, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.22855306, + "balance_loss_mlp": 1.02498829, + "epoch": 0.8101608297008868, + "flos": 21071910116520.0, + "grad_norm": 1.6546602725160346, + "language_loss": 0.78612339, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80990571, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13391113, + "step": 13475, + "time_per_iteration": 2.870698928833008 + }, + { + "auxiliary_loss_clip": 0.01328614, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.22232962, + "balance_loss_mlp": 1.02106547, + "epoch": 0.8102209529535548, + "flos": 23736406281120.0, + "grad_norm": 1.754128243070746, + "language_loss": 0.75851756, + "learning_rate": 3.659576879869364e-07, + "loss": 0.78213978, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12548828, + "step": 13476, + "time_per_iteration": 2.8017289638519287 + }, + { + "auxiliary_loss_clip": 0.01344524, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.23214507, + "balance_loss_mlp": 1.02333558, + "epoch": 0.8102810762062228, + "flos": 10958634931320.0, + "grad_norm": 1.9705383553451252, + "language_loss": 0.74273694, + "learning_rate": 3.657331523685485e-07, + "loss": 0.76655352, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13806152, + "step": 13477, + "time_per_iteration": 4.185488224029541 + }, + { + "auxiliary_loss_clip": 0.01330097, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.22222424, + "balance_loss_mlp": 1.02073908, + "epoch": 0.8103411994588907, + "flos": 14653092757320.0, + "grad_norm": 1.8962452704099935, + "language_loss": 0.69535923, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.71898943, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12188721, + "step": 13478, + "time_per_iteration": 2.7133848667144775 + }, + { + "auxiliary_loss_clip": 0.01149733, + "auxiliary_loss_mlp": 0.01004512, + "balance_loss_clip": 1.10551846, + "balance_loss_mlp": 1.00172222, + "epoch": 0.8104013227115587, + "flos": 59167093786200.0, + "grad_norm": 0.7210456372787927, + "language_loss": 0.52199805, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54354048, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.0279541, + "step": 13479, + "time_per_iteration": 4.6639862060546875 + }, + { + "auxiliary_loss_clip": 0.01333235, + "auxiliary_loss_mlp": 0.01027465, + "balance_loss_clip": 1.22670245, + "balance_loss_mlp": 1.01461387, + "epoch": 0.8104614459642266, + "flos": 19833485805360.0, + "grad_norm": 1.6132811448036841, + "language_loss": 0.72028702, + "learning_rate": 3.650599173768072e-07, + "loss": 0.74389398, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12841797, + "step": 13480, + "time_per_iteration": 4.25104832649231 + }, + { + "auxiliary_loss_clip": 0.01335126, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.22621679, + "balance_loss_mlp": 1.02198946, + "epoch": 0.8105215692168947, + "flos": 25379752798440.0, + "grad_norm": 1.724271453225883, + "language_loss": 0.7983247, + "learning_rate": 3.648356296957327e-07, + "loss": 0.82202327, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12738037, + "step": 13481, + "time_per_iteration": 2.81414794921875 + }, + { + "auxiliary_loss_clip": 0.01330854, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.22217178, + "balance_loss_mlp": 1.01983428, + "epoch": 0.8105816924695626, + "flos": 20486121964200.0, + "grad_norm": 1.9069553046658854, + "language_loss": 0.72683752, + "learning_rate": 3.646114040202548e-07, + "loss": 0.75046945, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.125, + "step": 13482, + "time_per_iteration": 2.7221529483795166 + }, + { + "auxiliary_loss_clip": 0.01334223, + "auxiliary_loss_mlp": 0.01029062, + "balance_loss_clip": 1.22462702, + "balance_loss_mlp": 1.0158242, + "epoch": 0.8106418157222306, + "flos": 14542851395160.0, + "grad_norm": 2.4576218637420904, + "language_loss": 0.6547187, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67835152, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13232422, + "step": 13483, + "time_per_iteration": 2.7385342121124268 + }, + { + "auxiliary_loss_clip": 0.01326703, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.22003102, + "balance_loss_mlp": 1.01751041, + "epoch": 0.8107019389748985, + "flos": 22569540546240.0, + "grad_norm": 1.5638450063165554, + "language_loss": 0.76393652, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78750706, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.1282959, + "step": 13484, + "time_per_iteration": 2.735926389694214 + }, + { + "auxiliary_loss_clip": 0.01347875, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.23390806, + "balance_loss_mlp": 1.02316833, + "epoch": 0.8107620622275665, + "flos": 19614302548560.0, + "grad_norm": 1.434431817796532, + "language_loss": 0.72256911, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74641788, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13842773, + "step": 13485, + "time_per_iteration": 2.892676830291748 + }, + { + "auxiliary_loss_clip": 0.01323922, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.2197988, + "balance_loss_mlp": 1.01243854, + "epoch": 0.8108221854802344, + "flos": 16147799385120.0, + "grad_norm": 1.7559432374919577, + "language_loss": 0.75452369, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77800971, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12231445, + "step": 13486, + "time_per_iteration": 2.723276138305664 + }, + { + "auxiliary_loss_clip": 0.01342677, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.2300756, + "balance_loss_mlp": 1.01918757, + "epoch": 0.8108823087329025, + "flos": 21111242636160.0, + "grad_norm": 1.942693561956339, + "language_loss": 0.71915185, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74290204, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13153076, + "step": 13487, + "time_per_iteration": 4.23982310295105 + }, + { + "auxiliary_loss_clip": 0.01328153, + "auxiliary_loss_mlp": 0.01027499, + "balance_loss_clip": 1.22350705, + "balance_loss_mlp": 1.01552463, + "epoch": 0.8109424319855704, + "flos": 29204901618840.0, + "grad_norm": 2.2154493526918513, + "language_loss": 0.84298021, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86653674, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.11981201, + "step": 13488, + "time_per_iteration": 2.8255107402801514 + }, + { + "auxiliary_loss_clip": 0.01337589, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.22862101, + "balance_loss_mlp": 1.02258682, + "epoch": 0.8110025552382384, + "flos": 23117011387920.0, + "grad_norm": 1.8083673869045693, + "language_loss": 0.74420434, + "learning_rate": 3.630435611625502e-07, + "loss": 0.76793766, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1315918, + "step": 13489, + "time_per_iteration": 2.845294952392578 + }, + { + "auxiliary_loss_clip": 0.01327372, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.2217617, + "balance_loss_mlp": 1.02003968, + "epoch": 0.8110626784909064, + "flos": 22384735589160.0, + "grad_norm": 1.5473854635107098, + "language_loss": 0.72057736, + "learning_rate": 3.628198318377453e-07, + "loss": 0.74418116, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12957764, + "step": 13490, + "time_per_iteration": 2.8616790771484375 + }, + { + "auxiliary_loss_clip": 0.01339845, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.22805929, + "balance_loss_mlp": 1.02680779, + "epoch": 0.8111228017435743, + "flos": 23373334312920.0, + "grad_norm": 2.6161371503102346, + "language_loss": 0.71716118, + "learning_rate": 3.625961645949762e-07, + "loss": 0.74096465, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13690186, + "step": 13491, + "time_per_iteration": 2.760002613067627 + }, + { + "auxiliary_loss_clip": 0.01333306, + "auxiliary_loss_mlp": 0.01030073, + "balance_loss_clip": 1.22493315, + "balance_loss_mlp": 1.01718664, + "epoch": 0.8111829249962423, + "flos": 21291337023480.0, + "grad_norm": 1.404159953409626, + "language_loss": 0.67807359, + "learning_rate": 3.623725594427245e-07, + "loss": 0.70170736, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12884521, + "step": 13492, + "time_per_iteration": 2.807023525238037 + }, + { + "auxiliary_loss_clip": 0.0133531, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.22572589, + "balance_loss_mlp": 1.01803732, + "epoch": 0.8112430482489102, + "flos": 22350600939600.0, + "grad_norm": 1.5746397557903553, + "language_loss": 0.71880281, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74246514, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12896729, + "step": 13493, + "time_per_iteration": 2.771275520324707 + }, + { + "auxiliary_loss_clip": 0.01336874, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.22712576, + "balance_loss_mlp": 1.02929974, + "epoch": 0.8113031715015783, + "flos": 31144472097840.0, + "grad_norm": 1.7290249150676176, + "language_loss": 0.70662212, + "learning_rate": 3.619255354436885e-07, + "loss": 0.7304219, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13800049, + "step": 13494, + "time_per_iteration": 2.823258399963379 + }, + { + "auxiliary_loss_clip": 0.01341522, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.23015761, + "balance_loss_mlp": 1.02277815, + "epoch": 0.8113632947542462, + "flos": 25340623320600.0, + "grad_norm": 2.045104932051205, + "language_loss": 0.76800984, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.79179496, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.14215088, + "step": 13495, + "time_per_iteration": 2.8024895191192627 + }, + { + "auxiliary_loss_clip": 0.01337841, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.22728133, + "balance_loss_mlp": 1.0202862, + "epoch": 0.8114234180069142, + "flos": 28445069724840.0, + "grad_norm": 2.059654920702176, + "language_loss": 0.80048943, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82420045, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12976074, + "step": 13496, + "time_per_iteration": 2.768385887145996 + }, + { + "auxiliary_loss_clip": 0.01334263, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.22545052, + "balance_loss_mlp": 1.01639605, + "epoch": 0.8114835412595821, + "flos": 20343654545400.0, + "grad_norm": 1.7394250598018517, + "language_loss": 0.71197879, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73562455, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13909912, + "step": 13497, + "time_per_iteration": 2.8524656295776367 + }, + { + "auxiliary_loss_clip": 0.01337829, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.22782242, + "balance_loss_mlp": 1.02106452, + "epoch": 0.8115436645122501, + "flos": 22495870335240.0, + "grad_norm": 1.5841729943618457, + "language_loss": 0.77221787, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79592568, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.11895752, + "step": 13498, + "time_per_iteration": 2.797879695892334 + }, + { + "auxiliary_loss_clip": 0.01330961, + "auxiliary_loss_mlp": 0.01032693, + "balance_loss_clip": 1.22208858, + "balance_loss_mlp": 1.01921082, + "epoch": 0.811603787764918, + "flos": 13849339599000.0, + "grad_norm": 2.080948206807386, + "language_loss": 0.84271252, + "learning_rate": 3.608090626234055e-07, + "loss": 0.8663491, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13494873, + "step": 13499, + "time_per_iteration": 2.732088804244995 + }, + { + "auxiliary_loss_clip": 0.01333131, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.22505057, + "balance_loss_mlp": 1.01456213, + "epoch": 0.8116639110175861, + "flos": 21619421566560.0, + "grad_norm": 1.9120715894209666, + "language_loss": 0.76647925, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.79009664, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.14050293, + "step": 13500, + "time_per_iteration": 2.775233268737793 + }, + { + "auxiliary_loss_clip": 0.0114768, + "auxiliary_loss_mlp": 0.0100539, + "balance_loss_clip": 1.10306334, + "balance_loss_mlp": 1.00263619, + "epoch": 0.811724034270254, + "flos": 64476367433640.0, + "grad_norm": 0.8085427919670881, + "language_loss": 0.60025483, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62178552, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02758789, + "step": 13501, + "time_per_iteration": 3.26274037361145 + }, + { + "auxiliary_loss_clip": 0.0132662, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.22196293, + "balance_loss_mlp": 1.01685238, + "epoch": 0.811784157522922, + "flos": 24759545738040.0, + "grad_norm": 1.6305252099951921, + "language_loss": 0.79365551, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81722057, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.13018799, + "step": 13502, + "time_per_iteration": 2.7869741916656494 + }, + { + "auxiliary_loss_clip": 0.0132615, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.22052121, + "balance_loss_mlp": 1.01765108, + "epoch": 0.81184428077559, + "flos": 12171005656560.0, + "grad_norm": 1.8730516650514835, + "language_loss": 0.7152952, + "learning_rate": 3.599170031654635e-07, + "loss": 0.7388584, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12518311, + "step": 13503, + "time_per_iteration": 2.789194107055664 + }, + { + "auxiliary_loss_clip": 0.01331932, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.22231305, + "balance_loss_mlp": 1.01733971, + "epoch": 0.8119044040282579, + "flos": 44430300552960.0, + "grad_norm": 1.5287600023665064, + "language_loss": 0.680462, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70410192, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14703369, + "step": 13504, + "time_per_iteration": 3.0079429149627686 + }, + { + "auxiliary_loss_clip": 0.01333257, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.22220469, + "balance_loss_mlp": 1.01696575, + "epoch": 0.8119645272809259, + "flos": 52164582928200.0, + "grad_norm": 1.7851353509503076, + "language_loss": 0.74343669, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76707757, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13861084, + "step": 13505, + "time_per_iteration": 3.01871919631958 + }, + { + "auxiliary_loss_clip": 0.01337272, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.22872806, + "balance_loss_mlp": 1.01641941, + "epoch": 0.8120246505335939, + "flos": 30239898716880.0, + "grad_norm": 2.0517263854379277, + "language_loss": 0.7285502, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.75222206, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13500977, + "step": 13506, + "time_per_iteration": 2.866720199584961 + }, + { + "auxiliary_loss_clip": 0.01349459, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.23584485, + "balance_loss_mlp": 1.02321589, + "epoch": 0.8120847737862619, + "flos": 22132798367040.0, + "grad_norm": 2.0113212879773466, + "language_loss": 0.76449239, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78835201, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13287354, + "step": 13507, + "time_per_iteration": 2.7980988025665283 + }, + { + "auxiliary_loss_clip": 0.01344604, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.231323, + "balance_loss_mlp": 1.01893616, + "epoch": 0.8121448970389298, + "flos": 23300476269120.0, + "grad_norm": 1.7296145322823073, + "language_loss": 0.70317924, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72694957, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13482666, + "step": 13508, + "time_per_iteration": 2.7763969898223877 + }, + { + "auxiliary_loss_clip": 0.01329759, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.22191727, + "balance_loss_mlp": 1.01769483, + "epoch": 0.8122050202915978, + "flos": 22169288301480.0, + "grad_norm": 1.5958775196040995, + "language_loss": 0.76447189, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78807575, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.1295166, + "step": 13509, + "time_per_iteration": 2.8549158573150635 + }, + { + "auxiliary_loss_clip": 0.01337287, + "auxiliary_loss_mlp": 0.01032605, + "balance_loss_clip": 1.22675681, + "balance_loss_mlp": 1.01918852, + "epoch": 0.8122651435442657, + "flos": 23264270593200.0, + "grad_norm": 1.760376270455405, + "language_loss": 0.77637351, + "learning_rate": 3.58358293835491e-07, + "loss": 0.80007243, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13433838, + "step": 13510, + "time_per_iteration": 2.799281597137451 + }, + { + "auxiliary_loss_clip": 0.01344023, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.23229599, + "balance_loss_mlp": 1.01627803, + "epoch": 0.8123252667969337, + "flos": 16143860374200.0, + "grad_norm": 2.4953398275383805, + "language_loss": 0.7029382, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72667885, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13757324, + "step": 13511, + "time_per_iteration": 2.867304801940918 + }, + { + "auxiliary_loss_clip": 0.01340247, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.22928786, + "balance_loss_mlp": 1.02310312, + "epoch": 0.8123853900496016, + "flos": 21249568002240.0, + "grad_norm": 1.8088464937483966, + "language_loss": 0.79936147, + "learning_rate": 3.57913508447004e-07, + "loss": 0.8231287, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13372803, + "step": 13512, + "time_per_iteration": 2.796530246734619 + }, + { + "auxiliary_loss_clip": 0.01336582, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.22813296, + "balance_loss_mlp": 1.01944077, + "epoch": 0.8124455133022697, + "flos": 64388183056920.0, + "grad_norm": 1.622393293164857, + "language_loss": 0.63622427, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65991616, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1315918, + "step": 13513, + "time_per_iteration": 3.250875473022461 + }, + { + "auxiliary_loss_clip": 0.01338663, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.22714186, + "balance_loss_mlp": 1.0178628, + "epoch": 0.8125056365549376, + "flos": 23847541027200.0, + "grad_norm": 1.6694946324022992, + "language_loss": 0.72036016, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.74405897, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13336182, + "step": 13514, + "time_per_iteration": 2.826007127761841 + }, + { + "auxiliary_loss_clip": 0.01329832, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.22332072, + "balance_loss_mlp": 1.01774514, + "epoch": 0.8125657598076056, + "flos": 23555621551680.0, + "grad_norm": 1.574275066115788, + "language_loss": 0.63216543, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65577149, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13018799, + "step": 13515, + "time_per_iteration": 2.879901885986328 + }, + { + "auxiliary_loss_clip": 0.0132129, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.21802306, + "balance_loss_mlp": 1.01598406, + "epoch": 0.8126258830602736, + "flos": 20709122406840.0, + "grad_norm": 1.4711479012488924, + "language_loss": 0.75763708, + "learning_rate": 3.570246849544616e-07, + "loss": 0.78113621, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.12652588, + "step": 13516, + "time_per_iteration": 4.20979905128479 + }, + { + "auxiliary_loss_clip": 0.01339053, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.22800159, + "balance_loss_mlp": 1.02005005, + "epoch": 0.8126860063129415, + "flos": 23622794425080.0, + "grad_norm": 1.430575652505903, + "language_loss": 0.91390371, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93762189, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12701416, + "step": 13517, + "time_per_iteration": 2.8342947959899902 + }, + { + "auxiliary_loss_clip": 0.01334627, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.22628796, + "balance_loss_mlp": 1.02323627, + "epoch": 0.8127461295656095, + "flos": 25012295127360.0, + "grad_norm": 1.3437665947871342, + "language_loss": 0.78703362, + "learning_rate": 3.565806469852244e-07, + "loss": 0.8107394, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.1270752, + "step": 13518, + "time_per_iteration": 4.28130841255188 + }, + { + "auxiliary_loss_clip": 0.0133717, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.2294004, + "balance_loss_mlp": 1.01965165, + "epoch": 0.8128062528182775, + "flos": 27347610323160.0, + "grad_norm": 1.531582762223783, + "language_loss": 0.79394841, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81763279, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1161499, + "step": 13519, + "time_per_iteration": 4.216953992843628 + }, + { + "auxiliary_loss_clip": 0.0133425, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.2238363, + "balance_loss_mlp": 1.02074087, + "epoch": 0.8128663760709455, + "flos": 26512077800160.0, + "grad_norm": 1.6142481149325947, + "language_loss": 0.70479256, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72847217, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12994385, + "step": 13520, + "time_per_iteration": 2.9206607341766357 + }, + { + "auxiliary_loss_clip": 0.01335718, + "auxiliary_loss_mlp": 0.01032024, + "balance_loss_clip": 1.2270478, + "balance_loss_mlp": 1.01939976, + "epoch": 0.8129264993236134, + "flos": 17935765564320.0, + "grad_norm": 2.21944614172246, + "language_loss": 0.73046726, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.75414467, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12628174, + "step": 13521, + "time_per_iteration": 2.7347090244293213 + }, + { + "auxiliary_loss_clip": 0.01336442, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.2258575, + "balance_loss_mlp": 1.0148077, + "epoch": 0.8129866225762814, + "flos": 26183384131680.0, + "grad_norm": 1.5845022850335213, + "language_loss": 0.70191294, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.72556251, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.137146, + "step": 13522, + "time_per_iteration": 2.796635627746582 + }, + { + "auxiliary_loss_clip": 0.01326908, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.22206903, + "balance_loss_mlp": 1.02013969, + "epoch": 0.8130467458289493, + "flos": 21037085124840.0, + "grad_norm": 1.542445024659587, + "language_loss": 0.70488179, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72848082, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12860107, + "step": 13523, + "time_per_iteration": 2.7579758167266846 + }, + { + "auxiliary_loss_clip": 0.01329433, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.22208142, + "balance_loss_mlp": 1.01795483, + "epoch": 0.8131068690816173, + "flos": 15491305432080.0, + "grad_norm": 3.0091765719480152, + "language_loss": 0.70685834, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.73046559, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13336182, + "step": 13524, + "time_per_iteration": 4.231186151504517 + }, + { + "auxiliary_loss_clip": 0.01335017, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.22709048, + "balance_loss_mlp": 1.0150876, + "epoch": 0.8131669923342852, + "flos": 29357236869120.0, + "grad_norm": 1.7965639850876463, + "language_loss": 0.62423563, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64785677, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12011719, + "step": 13525, + "time_per_iteration": 2.842597007751465 + }, + { + "auxiliary_loss_clip": 0.01332152, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.22464132, + "balance_loss_mlp": 1.02624893, + "epoch": 0.8132271155869533, + "flos": 35262190127520.0, + "grad_norm": 1.6673034242402405, + "language_loss": 0.65796566, + "learning_rate": 3.548069885262628e-07, + "loss": 0.68168026, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.1305542, + "step": 13526, + "time_per_iteration": 2.9023637771606445 + }, + { + "auxiliary_loss_clip": 0.01330905, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.22319865, + "balance_loss_mlp": 1.01822639, + "epoch": 0.8132872388396212, + "flos": 27787641779520.0, + "grad_norm": 1.5642825669179523, + "language_loss": 0.75701118, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.78062844, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12579346, + "step": 13527, + "time_per_iteration": 2.885263442993164 + }, + { + "auxiliary_loss_clip": 0.01336166, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.22935331, + "balance_loss_mlp": 1.0169661, + "epoch": 0.8133473620922892, + "flos": 27825512398200.0, + "grad_norm": 1.5866501975606684, + "language_loss": 0.70585251, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.72951126, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12738037, + "step": 13528, + "time_per_iteration": 2.8730831146240234 + }, + { + "auxiliary_loss_clip": 0.01331407, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.22181821, + "balance_loss_mlp": 1.01921642, + "epoch": 0.8134074853449572, + "flos": 18994217313240.0, + "grad_norm": 1.762907346028149, + "language_loss": 0.68752027, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71115446, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12799072, + "step": 13529, + "time_per_iteration": 2.782771587371826 + }, + { + "auxiliary_loss_clip": 0.01328642, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.22237635, + "balance_loss_mlp": 1.01952481, + "epoch": 0.8134676085976251, + "flos": 24248158747200.0, + "grad_norm": 1.293471020257363, + "language_loss": 0.77605987, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79966998, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12860107, + "step": 13530, + "time_per_iteration": 2.878272533416748 + }, + { + "auxiliary_loss_clip": 0.0132854, + "auxiliary_loss_mlp": 0.01029931, + "balance_loss_clip": 1.2222259, + "balance_loss_mlp": 1.0169791, + "epoch": 0.8135277318502931, + "flos": 19067075357040.0, + "grad_norm": 1.8466387709260776, + "language_loss": 0.82250828, + "learning_rate": 3.537004792574052e-07, + "loss": 0.846093, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12939453, + "step": 13531, + "time_per_iteration": 2.788848638534546 + }, + { + "auxiliary_loss_clip": 0.01336414, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.22690606, + "balance_loss_mlp": 1.01368356, + "epoch": 0.813587855102961, + "flos": 17273545832520.0, + "grad_norm": 1.942135991027814, + "language_loss": 0.71992218, + "learning_rate": 3.534793646536065e-07, + "loss": 0.74356532, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.14215088, + "step": 13532, + "time_per_iteration": 2.764427900314331 + }, + { + "auxiliary_loss_clip": 0.01330377, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.22265482, + "balance_loss_mlp": 1.01488078, + "epoch": 0.8136479783556291, + "flos": 20162504340720.0, + "grad_norm": 1.8627451511697477, + "language_loss": 0.77007186, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.79365396, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12957764, + "step": 13533, + "time_per_iteration": 2.7360455989837646 + }, + { + "auxiliary_loss_clip": 0.01340713, + "auxiliary_loss_mlp": 0.01038283, + "balance_loss_clip": 1.22858477, + "balance_loss_mlp": 1.02448452, + "epoch": 0.813708101608297, + "flos": 22057178954760.0, + "grad_norm": 1.6765491371117036, + "language_loss": 0.76730061, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.79109061, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13806152, + "step": 13534, + "time_per_iteration": 2.8243906497955322 + }, + { + "auxiliary_loss_clip": 0.0133195, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.22479939, + "balance_loss_mlp": 1.01562786, + "epoch": 0.813768224860965, + "flos": 16176857989680.0, + "grad_norm": 1.9051211735105893, + "language_loss": 0.93121833, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95481104, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.11712646, + "step": 13535, + "time_per_iteration": 2.8553316593170166 + }, + { + "auxiliary_loss_clip": 0.0132925, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.22469366, + "balance_loss_mlp": 1.01496458, + "epoch": 0.8138283481136329, + "flos": 24357628550520.0, + "grad_norm": 1.5962930054644577, + "language_loss": 0.70475364, + "learning_rate": 3.52595530684499e-07, + "loss": 0.7283209, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12506104, + "step": 13536, + "time_per_iteration": 2.759876251220703 + }, + { + "auxiliary_loss_clip": 0.01333069, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.2253294, + "balance_loss_mlp": 1.01810074, + "epoch": 0.8138884713663009, + "flos": 25521367441680.0, + "grad_norm": 1.539256070482472, + "language_loss": 0.75667739, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.78032541, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13635254, + "step": 13537, + "time_per_iteration": 2.800278663635254 + }, + { + "auxiliary_loss_clip": 0.0132866, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.22231054, + "balance_loss_mlp": 1.02157235, + "epoch": 0.8139485946189688, + "flos": 22459339792440.0, + "grad_norm": 1.5738720253319822, + "language_loss": 0.76209331, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78572106, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12554932, + "step": 13538, + "time_per_iteration": 2.7221028804779053 + }, + { + "auxiliary_loss_clip": 0.01334645, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.22556114, + "balance_loss_mlp": 1.01683259, + "epoch": 0.8140087178716369, + "flos": 21255212564280.0, + "grad_norm": 1.5890696744495703, + "language_loss": 0.77726108, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80089641, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12042236, + "step": 13539, + "time_per_iteration": 2.8034138679504395 + }, + { + "auxiliary_loss_clip": 0.01325779, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.22091174, + "balance_loss_mlp": 1.0197885, + "epoch": 0.8140688411243048, + "flos": 39422326912200.0, + "grad_norm": 3.05461528860448, + "language_loss": 0.66017544, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.6837638, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.13262939, + "step": 13540, + "time_per_iteration": 2.922374963760376 + }, + { + "auxiliary_loss_clip": 0.01334291, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.22647786, + "balance_loss_mlp": 1.01895523, + "epoch": 0.8141289643769728, + "flos": 25422049728360.0, + "grad_norm": 1.5126854067597304, + "language_loss": 0.67663848, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.70028996, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.11889648, + "step": 13541, + "time_per_iteration": 2.9040591716766357 + }, + { + "auxiliary_loss_clip": 0.01329797, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.22295547, + "balance_loss_mlp": 1.02235937, + "epoch": 0.8141890876296408, + "flos": 12571907635080.0, + "grad_norm": 1.83222824860853, + "language_loss": 0.69379967, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71745193, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13079834, + "step": 13542, + "time_per_iteration": 2.761660099029541 + }, + { + "auxiliary_loss_clip": 0.01345196, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.23183465, + "balance_loss_mlp": 1.01924396, + "epoch": 0.8142492108823087, + "flos": 14970172434840.0, + "grad_norm": 2.509004891457799, + "language_loss": 0.79929739, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.82307678, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13500977, + "step": 13543, + "time_per_iteration": 2.7074928283691406 + }, + { + "auxiliary_loss_clip": 0.01342897, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.23063636, + "balance_loss_mlp": 1.02621436, + "epoch": 0.8143093341349767, + "flos": 12425826072240.0, + "grad_norm": 2.046430319955787, + "language_loss": 0.78618842, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.81002319, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14355469, + "step": 13544, + "time_per_iteration": 2.831195116043091 + }, + { + "auxiliary_loss_clip": 0.01347339, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.23251402, + "balance_loss_mlp": 1.02018547, + "epoch": 0.8143694573876447, + "flos": 11914236039600.0, + "grad_norm": 5.837104608280474, + "language_loss": 0.74092823, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.76474714, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14367676, + "step": 13545, + "time_per_iteration": 2.6840298175811768 + }, + { + "auxiliary_loss_clip": 0.0132417, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.218961, + "balance_loss_mlp": 1.01829839, + "epoch": 0.8144295806403127, + "flos": 21217585595760.0, + "grad_norm": 1.5800838735733884, + "language_loss": 0.76873589, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.79228562, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12512207, + "step": 13546, + "time_per_iteration": 2.7699246406555176 + }, + { + "auxiliary_loss_clip": 0.01335463, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.22606587, + "balance_loss_mlp": 1.0150373, + "epoch": 0.8144897038929806, + "flos": 19870097564880.0, + "grad_norm": 2.6220113234094984, + "language_loss": 0.71253073, + "learning_rate": 3.501701426337178e-07, + "loss": 0.73616207, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.12640381, + "step": 13547, + "time_per_iteration": 2.7342171669006348 + }, + { + "auxiliary_loss_clip": 0.01339827, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.22847521, + "balance_loss_mlp": 1.02118027, + "epoch": 0.8145498271456486, + "flos": 24577136674200.0, + "grad_norm": 1.8801814926244504, + "language_loss": 0.70587546, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72962821, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14263916, + "step": 13548, + "time_per_iteration": 2.822312355041504 + }, + { + "auxiliary_loss_clip": 0.01340992, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.23177052, + "balance_loss_mlp": 1.01711261, + "epoch": 0.8146099503983165, + "flos": 20197816632720.0, + "grad_norm": 2.5241792329188857, + "language_loss": 0.77133286, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79504848, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.13452148, + "step": 13549, + "time_per_iteration": 2.7118611335754395 + }, + { + "auxiliary_loss_clip": 0.01335968, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.22642565, + "balance_loss_mlp": 1.02038717, + "epoch": 0.8146700736509845, + "flos": 19541728763280.0, + "grad_norm": 1.8964654573577586, + "language_loss": 0.71259314, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73629123, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.13464355, + "step": 13550, + "time_per_iteration": 2.7751379013061523 + }, + { + "auxiliary_loss_clip": 0.01326131, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.22178459, + "balance_loss_mlp": 1.01712036, + "epoch": 0.8147301969036524, + "flos": 18045966318120.0, + "grad_norm": 1.7098237038837512, + "language_loss": 0.71612066, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.73967797, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12469482, + "step": 13551, + "time_per_iteration": 2.7492544651031494 + }, + { + "auxiliary_loss_clip": 0.01342603, + "auxiliary_loss_mlp": 0.01037262, + "balance_loss_clip": 1.22873735, + "balance_loss_mlp": 1.02338636, + "epoch": 0.8147903201563205, + "flos": 18009313950240.0, + "grad_norm": 1.7291634112807333, + "language_loss": 0.68898535, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.71278393, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13891602, + "step": 13552, + "time_per_iteration": 2.766282081604004 + }, + { + "auxiliary_loss_clip": 0.01335295, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.226547, + "balance_loss_mlp": 1.0271554, + "epoch": 0.8148504434089884, + "flos": 20263243346640.0, + "grad_norm": 1.8133519589154325, + "language_loss": 0.82667065, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.85042095, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12567139, + "step": 13553, + "time_per_iteration": 2.794072151184082 + }, + { + "auxiliary_loss_clip": 0.01334961, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.22601247, + "balance_loss_mlp": 1.01828992, + "epoch": 0.8149105666616564, + "flos": 12498521682600.0, + "grad_norm": 1.70718894141079, + "language_loss": 0.68226373, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70592713, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13104248, + "step": 13554, + "time_per_iteration": 2.7745397090911865 + }, + { + "auxiliary_loss_clip": 0.01337543, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.22785687, + "balance_loss_mlp": 1.01825094, + "epoch": 0.8149706899143244, + "flos": 32529871355760.0, + "grad_norm": 1.6775190239714286, + "language_loss": 0.66327524, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68697101, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13793945, + "step": 13555, + "time_per_iteration": 4.304272651672363 + }, + { + "auxiliary_loss_clip": 0.01342216, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.23001993, + "balance_loss_mlp": 1.02167487, + "epoch": 0.8150308131669923, + "flos": 19390205680200.0, + "grad_norm": 2.080102773724194, + "language_loss": 0.74006915, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.76384139, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13336182, + "step": 13556, + "time_per_iteration": 4.364816665649414 + }, + { + "auxiliary_loss_clip": 0.01334265, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.22714329, + "balance_loss_mlp": 1.01813483, + "epoch": 0.8150909364196604, + "flos": 17425962299520.0, + "grad_norm": 1.6741024044990591, + "language_loss": 0.8059749, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82961869, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.11981201, + "step": 13557, + "time_per_iteration": 4.25559401512146 + }, + { + "auxiliary_loss_clip": 0.01341466, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.22984767, + "balance_loss_mlp": 1.01968682, + "epoch": 0.8151510596723283, + "flos": 27168612361560.0, + "grad_norm": 1.5137575510793675, + "language_loss": 0.65704447, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68079215, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.1361084, + "step": 13558, + "time_per_iteration": 2.800940990447998 + }, + { + "auxiliary_loss_clip": 0.01150111, + "auxiliary_loss_mlp": 0.01007118, + "balance_loss_clip": 1.10637689, + "balance_loss_mlp": 1.00423265, + "epoch": 0.8152111829249963, + "flos": 64232737944120.0, + "grad_norm": 1.8663809430511522, + "language_loss": 0.57058674, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.59215903, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02880859, + "step": 13559, + "time_per_iteration": 3.1699180603027344 + }, + { + "auxiliary_loss_clip": 0.01152632, + "auxiliary_loss_mlp": 0.01006396, + "balance_loss_clip": 1.10771644, + "balance_loss_mlp": 1.00371408, + "epoch": 0.8152713061776642, + "flos": 67086993285720.0, + "grad_norm": 0.6793744496708543, + "language_loss": 0.55344313, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57503343, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02685547, + "step": 13560, + "time_per_iteration": 3.0672805309295654 + }, + { + "auxiliary_loss_clip": 0.01332006, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.2249434, + "balance_loss_mlp": 1.01936102, + "epoch": 0.8153314294303322, + "flos": 14395064281200.0, + "grad_norm": 1.6055362109556977, + "language_loss": 0.67827165, + "learning_rate": 3.470942348696948e-07, + "loss": 0.70191252, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1272583, + "step": 13561, + "time_per_iteration": 2.807431221008301 + }, + { + "auxiliary_loss_clip": 0.01339442, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.22785878, + "balance_loss_mlp": 1.01915169, + "epoch": 0.8153915526830001, + "flos": 25628238309960.0, + "grad_norm": 1.4937065090215906, + "language_loss": 0.81584382, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83956003, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13031006, + "step": 13562, + "time_per_iteration": 2.8146746158599854 + }, + { + "auxiliary_loss_clip": 0.01333953, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.22481847, + "balance_loss_mlp": 1.01354527, + "epoch": 0.8154516759356681, + "flos": 23374593172080.0, + "grad_norm": 1.4305008134855577, + "language_loss": 0.72033024, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74392915, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12390137, + "step": 13563, + "time_per_iteration": 2.77858829498291 + }, + { + "auxiliary_loss_clip": 0.01328722, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.2200408, + "balance_loss_mlp": 1.01502466, + "epoch": 0.815511799188336, + "flos": 28155302492400.0, + "grad_norm": 1.5496364129294344, + "language_loss": 0.70684946, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.73042977, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.14282227, + "step": 13564, + "time_per_iteration": 4.371116638183594 + }, + { + "auxiliary_loss_clip": 0.01330942, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.22261846, + "balance_loss_mlp": 1.01841426, + "epoch": 0.8155719224410041, + "flos": 16987920652800.0, + "grad_norm": 2.0287870270359534, + "language_loss": 0.70671141, + "learning_rate": 3.462176595017854e-07, + "loss": 0.73032975, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12469482, + "step": 13565, + "time_per_iteration": 2.757239818572998 + }, + { + "auxiliary_loss_clip": 0.01330668, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.22319305, + "balance_loss_mlp": 1.02455521, + "epoch": 0.815632045693672, + "flos": 24687215602920.0, + "grad_norm": 1.701431234425794, + "language_loss": 0.79100817, + "learning_rate": 3.459986724180188e-07, + "loss": 0.81468993, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12957764, + "step": 13566, + "time_per_iteration": 2.8108928203582764 + }, + { + "auxiliary_loss_clip": 0.01324546, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.21959615, + "balance_loss_mlp": 1.01968515, + "epoch": 0.81569216894634, + "flos": 19943158650480.0, + "grad_norm": 1.4974243853038027, + "language_loss": 0.82634199, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84989935, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.1151123, + "step": 13567, + "time_per_iteration": 2.838052272796631 + }, + { + "auxiliary_loss_clip": 0.01329414, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.22300506, + "balance_loss_mlp": 1.01593339, + "epoch": 0.8157522921990079, + "flos": 21804510782160.0, + "grad_norm": 1.847660500341346, + "language_loss": 0.80332929, + "learning_rate": 3.455608864184771e-07, + "loss": 0.82689774, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.11505127, + "step": 13568, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.01325911, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.22027302, + "balance_loss_mlp": 1.01703715, + "epoch": 0.8158124154516759, + "flos": 18512051360400.0, + "grad_norm": 1.7003062761723036, + "language_loss": 0.77381533, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79736859, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12371826, + "step": 13569, + "time_per_iteration": 2.760963201522827 + }, + { + "auxiliary_loss_clip": 0.01330346, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.22378552, + "balance_loss_mlp": 1.0211935, + "epoch": 0.815872538704344, + "flos": 26836101507240.0, + "grad_norm": 2.2837926659787864, + "language_loss": 0.58870143, + "learning_rate": 3.451233513649199e-07, + "loss": 0.61233962, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.1229248, + "step": 13570, + "time_per_iteration": 2.7783703804016113 + }, + { + "auxiliary_loss_clip": 0.01343849, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.23210561, + "balance_loss_mlp": 1.02398586, + "epoch": 0.8159326619570119, + "flos": 21730718746080.0, + "grad_norm": 1.9362426413772582, + "language_loss": 0.82675922, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.85057008, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13256836, + "step": 13571, + "time_per_iteration": 2.7844321727752686 + }, + { + "auxiliary_loss_clip": 0.0132804, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.22145486, + "balance_loss_mlp": 1.02144027, + "epoch": 0.8159927852096799, + "flos": 13844182337280.0, + "grad_norm": 2.993238214201589, + "language_loss": 0.79277945, + "learning_rate": 3.446860673237142e-07, + "loss": 0.81640601, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.13165283, + "step": 13572, + "time_per_iteration": 2.794711112976074 + }, + { + "auxiliary_loss_clip": 0.01331374, + "auxiliary_loss_mlp": 0.01034333, + "balance_loss_clip": 1.2207948, + "balance_loss_mlp": 1.02153611, + "epoch": 0.8160529084623478, + "flos": 24505009580880.0, + "grad_norm": 1.3477366955570742, + "language_loss": 0.65149724, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.67515433, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12792969, + "step": 13573, + "time_per_iteration": 2.7746894359588623 + }, + { + "auxiliary_loss_clip": 0.01325771, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.21871614, + "balance_loss_mlp": 1.01930571, + "epoch": 0.8161130317150158, + "flos": 24831672831360.0, + "grad_norm": 1.469053694827216, + "language_loss": 0.75449288, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77806664, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12304688, + "step": 13574, + "time_per_iteration": 2.8208305835723877 + }, + { + "auxiliary_loss_clip": 0.01334943, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.22462821, + "balance_loss_mlp": 1.01797485, + "epoch": 0.8161731549676837, + "flos": 30962753376120.0, + "grad_norm": 1.6685151680938997, + "language_loss": 0.60108733, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.62474513, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12878418, + "step": 13575, + "time_per_iteration": 2.809267520904541 + }, + { + "auxiliary_loss_clip": 0.01331473, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.22348976, + "balance_loss_mlp": 1.0180397, + "epoch": 0.8162332782203517, + "flos": 18556662966840.0, + "grad_norm": 1.8439889102114053, + "language_loss": 0.74394453, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76757514, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13543701, + "step": 13576, + "time_per_iteration": 2.7333872318267822 + }, + { + "auxiliary_loss_clip": 0.01148136, + "auxiliary_loss_mlp": 0.01009239, + "balance_loss_clip": 1.1036092, + "balance_loss_mlp": 1.00635457, + "epoch": 0.8162934014730197, + "flos": 70401567282480.0, + "grad_norm": 0.824424315082298, + "language_loss": 0.58756095, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60913467, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02880859, + "step": 13577, + "time_per_iteration": 3.225492000579834 + }, + { + "auxiliary_loss_clip": 0.01320388, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.21627891, + "balance_loss_mlp": 1.01913381, + "epoch": 0.8163535247256877, + "flos": 21219778447200.0, + "grad_norm": 8.952296883651297, + "language_loss": 0.71112347, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73463625, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.11767578, + "step": 13578, + "time_per_iteration": 2.738342523574829 + }, + { + "auxiliary_loss_clip": 0.01326354, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.21917558, + "balance_loss_mlp": 1.02117705, + "epoch": 0.8164136479783556, + "flos": 21102917922360.0, + "grad_norm": 1.618598384332908, + "language_loss": 0.73470092, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75829905, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12280273, + "step": 13579, + "time_per_iteration": 2.9294800758361816 + }, + { + "auxiliary_loss_clip": 0.01330871, + "auxiliary_loss_mlp": 0.01028102, + "balance_loss_clip": 1.22022295, + "balance_loss_mlp": 1.01514435, + "epoch": 0.8164737712310236, + "flos": 21724992967320.0, + "grad_norm": 1.970776400795675, + "language_loss": 0.79352778, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81711751, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12963867, + "step": 13580, + "time_per_iteration": 2.753491163253784 + }, + { + "auxiliary_loss_clip": 0.01326348, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.221174, + "balance_loss_mlp": 1.02127075, + "epoch": 0.8165338944836915, + "flos": 19541566329840.0, + "grad_norm": 1.6173810847510781, + "language_loss": 0.69522297, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71882498, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12591553, + "step": 13581, + "time_per_iteration": 2.723111152648926 + }, + { + "auxiliary_loss_clip": 0.0133183, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.22350299, + "balance_loss_mlp": 1.0172224, + "epoch": 0.8165940177363595, + "flos": 22933587115080.0, + "grad_norm": 1.6424548525286697, + "language_loss": 0.59600961, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61962259, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12243652, + "step": 13582, + "time_per_iteration": 2.7408688068389893 + }, + { + "auxiliary_loss_clip": 0.01321084, + "auxiliary_loss_mlp": 0.01026376, + "balance_loss_clip": 1.21844053, + "balance_loss_mlp": 1.01412165, + "epoch": 0.8166541409890276, + "flos": 23375973856320.0, + "grad_norm": 1.472472235583796, + "language_loss": 0.82638228, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84985685, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.12249756, + "step": 13583, + "time_per_iteration": 2.845839262008667 + }, + { + "auxiliary_loss_clip": 0.0132946, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.22058272, + "balance_loss_mlp": 1.01909494, + "epoch": 0.8167142642416955, + "flos": 18446584038120.0, + "grad_norm": 1.7759289687539708, + "language_loss": 0.74636459, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76997638, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1262207, + "step": 13584, + "time_per_iteration": 2.7035741806030273 + }, + { + "auxiliary_loss_clip": 0.01343155, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.23282146, + "balance_loss_mlp": 1.0175581, + "epoch": 0.8167743874943635, + "flos": 21219940880640.0, + "grad_norm": 1.7474607985559238, + "language_loss": 0.74666548, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.77040571, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13323975, + "step": 13585, + "time_per_iteration": 2.7795846462249756 + }, + { + "auxiliary_loss_clip": 0.01334439, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.22657371, + "balance_loss_mlp": 1.01967716, + "epoch": 0.8168345107470314, + "flos": 18702338446080.0, + "grad_norm": 1.6203516144474963, + "language_loss": 0.69710761, + "learning_rate": 3.416321129478068e-07, + "loss": 0.72078395, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13513184, + "step": 13586, + "time_per_iteration": 2.721737861633301 + }, + { + "auxiliary_loss_clip": 0.01331726, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.22392881, + "balance_loss_mlp": 1.01964593, + "epoch": 0.8168946339996994, + "flos": 16257431621880.0, + "grad_norm": 2.8462637502192636, + "language_loss": 0.61034489, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63397956, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12115479, + "step": 13587, + "time_per_iteration": 2.7690300941467285 + }, + { + "auxiliary_loss_clip": 0.01341784, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.22960615, + "balance_loss_mlp": 1.01867914, + "epoch": 0.8169547572523673, + "flos": 26947073819880.0, + "grad_norm": 2.7661198265647404, + "language_loss": 0.70382977, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.72756493, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13049316, + "step": 13588, + "time_per_iteration": 2.837348461151123 + }, + { + "auxiliary_loss_clip": 0.01334627, + "auxiliary_loss_mlp": 0.0102767, + "balance_loss_clip": 1.22517645, + "balance_loss_mlp": 1.01374626, + "epoch": 0.8170148805050353, + "flos": 18956915211600.0, + "grad_norm": 1.7146796953611856, + "language_loss": 0.73196721, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.7555902, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13909912, + "step": 13589, + "time_per_iteration": 2.7792320251464844 + }, + { + "auxiliary_loss_clip": 0.01326725, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.21995139, + "balance_loss_mlp": 1.01795805, + "epoch": 0.8170750037577033, + "flos": 21839904290880.0, + "grad_norm": 1.8668488224394133, + "language_loss": 0.73732275, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.7609005, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13104248, + "step": 13590, + "time_per_iteration": 2.7865474224090576 + }, + { + "auxiliary_loss_clip": 0.01340295, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.22755766, + "balance_loss_mlp": 1.02143693, + "epoch": 0.8171351270103713, + "flos": 33513272209440.0, + "grad_norm": 1.6459898386599765, + "language_loss": 0.65223968, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67600244, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14556885, + "step": 13591, + "time_per_iteration": 2.95625901222229 + }, + { + "auxiliary_loss_clip": 0.01337306, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.22541678, + "balance_loss_mlp": 1.0227145, + "epoch": 0.8171952502630392, + "flos": 22712982565680.0, + "grad_norm": 1.8800072665855716, + "language_loss": 0.6859715, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70970404, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13238525, + "step": 13592, + "time_per_iteration": 2.7992429733276367 + }, + { + "auxiliary_loss_clip": 0.01333744, + "auxiliary_loss_mlp": 0.01026981, + "balance_loss_clip": 1.22406185, + "balance_loss_mlp": 1.01373124, + "epoch": 0.8172553735157072, + "flos": 26729311855680.0, + "grad_norm": 1.800076947238928, + "language_loss": 0.66307712, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68668437, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13250732, + "step": 13593, + "time_per_iteration": 4.301794528961182 + }, + { + "auxiliary_loss_clip": 0.01328737, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.22064722, + "balance_loss_mlp": 1.01878905, + "epoch": 0.8173154967683751, + "flos": 15965349712920.0, + "grad_norm": 1.9143973805358032, + "language_loss": 0.69646859, + "learning_rate": 3.398925286280188e-07, + "loss": 0.72006679, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12298584, + "step": 13594, + "time_per_iteration": 2.7680768966674805 + }, + { + "auxiliary_loss_clip": 0.01342609, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.23064256, + "balance_loss_mlp": 1.02113581, + "epoch": 0.8173756200210431, + "flos": 25991269669800.0, + "grad_norm": 2.139961566914426, + "language_loss": 0.66614753, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.68991655, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.13171387, + "step": 13595, + "time_per_iteration": 4.23368763923645 + }, + { + "auxiliary_loss_clip": 0.0134131, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.22765815, + "balance_loss_mlp": 1.017102, + "epoch": 0.8174357432737112, + "flos": 25669723072680.0, + "grad_norm": 1.4292948086084427, + "language_loss": 0.78643894, + "learning_rate": 3.394582618976658e-07, + "loss": 0.8101579, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13494873, + "step": 13596, + "time_per_iteration": 4.206372499465942 + }, + { + "auxiliary_loss_clip": 0.01324962, + "auxiliary_loss_mlp": 0.01024904, + "balance_loss_clip": 1.21816111, + "balance_loss_mlp": 1.01193428, + "epoch": 0.8174958665263791, + "flos": 21840107332680.0, + "grad_norm": 2.18872739500604, + "language_loss": 0.58737004, + "learning_rate": 3.392412229802362e-07, + "loss": 0.61086863, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12982178, + "step": 13597, + "time_per_iteration": 2.74299693107605 + }, + { + "auxiliary_loss_clip": 0.01327778, + "auxiliary_loss_mlp": 0.01036973, + "balance_loss_clip": 1.22160876, + "balance_loss_mlp": 1.02414024, + "epoch": 0.8175559897790471, + "flos": 22460639259960.0, + "grad_norm": 1.4503432426727958, + "language_loss": 0.82626975, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84991729, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12841797, + "step": 13598, + "time_per_iteration": 2.7543697357177734 + }, + { + "auxiliary_loss_clip": 0.013344, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.22443414, + "balance_loss_mlp": 1.01468694, + "epoch": 0.817616113031715, + "flos": 23620114273320.0, + "grad_norm": 1.9302065773196626, + "language_loss": 0.82733893, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.85095167, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12200928, + "step": 13599, + "time_per_iteration": 2.7740089893341064 + }, + { + "auxiliary_loss_clip": 0.01326133, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.21938932, + "balance_loss_mlp": 1.01867247, + "epoch": 0.817676236284383, + "flos": 27677968934400.0, + "grad_norm": 1.6626846334308023, + "language_loss": 0.8385601, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86213207, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12390137, + "step": 13600, + "time_per_iteration": 2.887568950653076 + }, + { + "auxiliary_loss_clip": 0.01335052, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.22453332, + "balance_loss_mlp": 1.01345062, + "epoch": 0.8177363595370509, + "flos": 24686403435720.0, + "grad_norm": 1.5847191111393424, + "language_loss": 0.73829997, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76191783, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13275146, + "step": 13601, + "time_per_iteration": 4.426939010620117 + }, + { + "auxiliary_loss_clip": 0.01341235, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.22731018, + "balance_loss_mlp": 1.01672482, + "epoch": 0.817796482789719, + "flos": 17350627145760.0, + "grad_norm": 2.3075460324730512, + "language_loss": 0.68359828, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70731091, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13305664, + "step": 13602, + "time_per_iteration": 2.9093141555786133 + }, + { + "auxiliary_loss_clip": 0.01327493, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.22159302, + "balance_loss_mlp": 1.0171504, + "epoch": 0.8178566060423869, + "flos": 17782090238160.0, + "grad_norm": 2.1805249451965056, + "language_loss": 0.84231693, + "learning_rate": 3.379403122624718e-07, + "loss": 0.86588573, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12231445, + "step": 13603, + "time_per_iteration": 2.749221086502075 + }, + { + "auxiliary_loss_clip": 0.0133079, + "auxiliary_loss_mlp": 0.01026211, + "balance_loss_clip": 1.22212768, + "balance_loss_mlp": 1.01338387, + "epoch": 0.8179167292950549, + "flos": 24978728994840.0, + "grad_norm": 1.6542360560407812, + "language_loss": 0.69719517, + "learning_rate": 3.377237143507159e-07, + "loss": 0.72076523, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12835693, + "step": 13604, + "time_per_iteration": 2.8260672092437744 + }, + { + "auxiliary_loss_clip": 0.01329426, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.22223711, + "balance_loss_mlp": 1.01759362, + "epoch": 0.8179768525477228, + "flos": 22861947322080.0, + "grad_norm": 2.3906127108221393, + "language_loss": 0.74093139, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76453382, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13214111, + "step": 13605, + "time_per_iteration": 2.734138250350952 + }, + { + "auxiliary_loss_clip": 0.01323508, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.21856165, + "balance_loss_mlp": 1.02901697, + "epoch": 0.8180369758003908, + "flos": 18519766948800.0, + "grad_norm": 1.7411981700102388, + "language_loss": 0.74563074, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76928967, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.13372803, + "step": 13606, + "time_per_iteration": 2.7318472862243652 + }, + { + "auxiliary_loss_clip": 0.01324863, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.21810412, + "balance_loss_mlp": 1.01785779, + "epoch": 0.8180970990530587, + "flos": 33188598768600.0, + "grad_norm": 1.6820708561389397, + "language_loss": 0.65781295, + "learning_rate": 3.370742988503916e-07, + "loss": 0.68136752, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12768555, + "step": 13607, + "time_per_iteration": 2.8017091751098633 + }, + { + "auxiliary_loss_clip": 0.01329504, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.22001183, + "balance_loss_mlp": 1.01761079, + "epoch": 0.8181572223057267, + "flos": 25015543796160.0, + "grad_norm": 1.6681667256062487, + "language_loss": 0.70475924, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72836107, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1305542, + "step": 13608, + "time_per_iteration": 2.7469711303710938 + }, + { + "auxiliary_loss_clip": 0.01327049, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.21988165, + "balance_loss_mlp": 1.01902092, + "epoch": 0.8182173455583948, + "flos": 28554661353240.0, + "grad_norm": 1.7539707611484794, + "language_loss": 0.80367583, + "learning_rate": 3.366416704613735e-07, + "loss": 0.82726192, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12561035, + "step": 13609, + "time_per_iteration": 2.7796685695648193 + }, + { + "auxiliary_loss_clip": 0.01148628, + "auxiliary_loss_mlp": 0.01003025, + "balance_loss_clip": 1.10451818, + "balance_loss_mlp": 1.00009274, + "epoch": 0.8182774688110627, + "flos": 72042883381800.0, + "grad_norm": 0.7488044686343968, + "language_loss": 0.55859149, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.58010805, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02929688, + "step": 13610, + "time_per_iteration": 3.3073995113372803 + }, + { + "auxiliary_loss_clip": 0.01320084, + "auxiliary_loss_mlp": 0.0102665, + "balance_loss_clip": 1.21668267, + "balance_loss_mlp": 1.01478291, + "epoch": 0.8183375920637307, + "flos": 19760099852880.0, + "grad_norm": 1.7473077551308343, + "language_loss": 0.78381515, + "learning_rate": 3.362092943712107e-07, + "loss": 0.80728257, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.11877441, + "step": 13611, + "time_per_iteration": 2.7347781658172607 + }, + { + "auxiliary_loss_clip": 0.01346654, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.23243999, + "balance_loss_mlp": 1.02060091, + "epoch": 0.8183977153163986, + "flos": 22346458886880.0, + "grad_norm": 1.9575598345380463, + "language_loss": 0.77659893, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.80042005, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.14849854, + "step": 13612, + "time_per_iteration": 2.7571499347686768 + }, + { + "auxiliary_loss_clip": 0.01325543, + "auxiliary_loss_mlp": 0.0102682, + "balance_loss_clip": 1.21981931, + "balance_loss_mlp": 1.01445222, + "epoch": 0.8184578385690666, + "flos": 17716988391120.0, + "grad_norm": 1.886652930866004, + "language_loss": 0.86596692, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.8894906, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.1237793, + "step": 13613, + "time_per_iteration": 2.8698225021362305 + }, + { + "auxiliary_loss_clip": 0.01326593, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.22051692, + "balance_loss_mlp": 1.02152717, + "epoch": 0.8185179618217345, + "flos": 25706497265640.0, + "grad_norm": 1.3399026454384715, + "language_loss": 0.72874904, + "learning_rate": 3.355612034397746e-07, + "loss": 0.75236237, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13220215, + "step": 13614, + "time_per_iteration": 2.85026478767395 + }, + { + "auxiliary_loss_clip": 0.01336808, + "auxiliary_loss_mlp": 0.01032499, + "balance_loss_clip": 1.22712815, + "balance_loss_mlp": 1.01917148, + "epoch": 0.8185780850744026, + "flos": 25966190684520.0, + "grad_norm": 1.6879596602398512, + "language_loss": 0.81483185, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83852494, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13342285, + "step": 13615, + "time_per_iteration": 2.7858479022979736 + }, + { + "auxiliary_loss_clip": 0.01332327, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.22426867, + "balance_loss_mlp": 1.01569772, + "epoch": 0.8186382083270705, + "flos": 25233914885760.0, + "grad_norm": 1.7040916415336314, + "language_loss": 0.75545335, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77906656, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13305664, + "step": 13616, + "time_per_iteration": 2.771196126937866 + }, + { + "auxiliary_loss_clip": 0.01322664, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.21730065, + "balance_loss_mlp": 1.01825595, + "epoch": 0.8186983315797385, + "flos": 22419601189200.0, + "grad_norm": 1.765310983447568, + "language_loss": 0.75287586, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77641416, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.12915039, + "step": 13617, + "time_per_iteration": 2.7327334880828857 + }, + { + "auxiliary_loss_clip": 0.01319977, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.21480823, + "balance_loss_mlp": 1.02097583, + "epoch": 0.8187584548324064, + "flos": 22023125521920.0, + "grad_norm": 1.8478503677440092, + "language_loss": 0.68679154, + "learning_rate": 3.346979658556415e-07, + "loss": 0.7103194, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11846924, + "step": 13618, + "time_per_iteration": 2.7732014656066895 + }, + { + "auxiliary_loss_clip": 0.01342089, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.2295773, + "balance_loss_mlp": 1.01690006, + "epoch": 0.8188185780850744, + "flos": 29247645240720.0, + "grad_norm": 2.12863365304998, + "language_loss": 0.70595407, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72968364, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.1394043, + "step": 13619, + "time_per_iteration": 2.755887269973755 + }, + { + "auxiliary_loss_clip": 0.01334943, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.22549069, + "balance_loss_mlp": 1.02066207, + "epoch": 0.8188787013377423, + "flos": 20700391609440.0, + "grad_norm": 1.939816763897435, + "language_loss": 0.74017417, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76386052, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13024902, + "step": 13620, + "time_per_iteration": 2.734022855758667 + }, + { + "auxiliary_loss_clip": 0.01325766, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.21970761, + "balance_loss_mlp": 1.01309228, + "epoch": 0.8189388245904103, + "flos": 23738314874040.0, + "grad_norm": 1.6166587258146188, + "language_loss": 0.76645625, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78997457, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12982178, + "step": 13621, + "time_per_iteration": 2.7138254642486572 + }, + { + "auxiliary_loss_clip": 0.0132998, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.22306108, + "balance_loss_mlp": 1.0153029, + "epoch": 0.8189989478430784, + "flos": 28260752068080.0, + "grad_norm": 2.202357489803282, + "language_loss": 0.66531539, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.68889928, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.13104248, + "step": 13622, + "time_per_iteration": 2.7997946739196777 + }, + { + "auxiliary_loss_clip": 0.01330812, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.22257113, + "balance_loss_mlp": 1.01255274, + "epoch": 0.8190590710957463, + "flos": 21402715419720.0, + "grad_norm": 2.235072169288142, + "language_loss": 0.74799937, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77156723, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13421631, + "step": 13623, + "time_per_iteration": 2.716464042663574 + }, + { + "auxiliary_loss_clip": 0.01334255, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.22500515, + "balance_loss_mlp": 1.01888204, + "epoch": 0.8191191943484143, + "flos": 38803094452440.0, + "grad_norm": 1.8360585698555105, + "language_loss": 0.63970435, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.66336608, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13049316, + "step": 13624, + "time_per_iteration": 2.933242082595825 + }, + { + "auxiliary_loss_clip": 0.01317369, + "auxiliary_loss_mlp": 0.01028775, + "balance_loss_clip": 1.21229458, + "balance_loss_mlp": 1.01624656, + "epoch": 0.8191793176010822, + "flos": 25451920500120.0, + "grad_norm": 1.6004254015491666, + "language_loss": 0.78564322, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80910462, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12530518, + "step": 13625, + "time_per_iteration": 2.7975118160247803 + }, + { + "auxiliary_loss_clip": 0.01345292, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.22980332, + "balance_loss_mlp": 1.01765323, + "epoch": 0.8192394408537502, + "flos": 25088564273400.0, + "grad_norm": 2.2306657045843616, + "language_loss": 0.76236671, + "learning_rate": 3.329745223345244e-07, + "loss": 0.78612578, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.12976074, + "step": 13626, + "time_per_iteration": 2.8837196826934814 + }, + { + "auxiliary_loss_clip": 0.01328312, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.22154832, + "balance_loss_mlp": 1.02134943, + "epoch": 0.8192995641064181, + "flos": 27679633877160.0, + "grad_norm": 1.667853776500283, + "language_loss": 0.74225426, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.76586676, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11590576, + "step": 13627, + "time_per_iteration": 2.7943193912506104 + }, + { + "auxiliary_loss_clip": 0.01337733, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.22685814, + "balance_loss_mlp": 1.01641202, + "epoch": 0.8193596873590862, + "flos": 21293529874920.0, + "grad_norm": 1.5569797785199118, + "language_loss": 0.69049817, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71417141, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13183594, + "step": 13628, + "time_per_iteration": 2.7174949645996094 + }, + { + "auxiliary_loss_clip": 0.01341163, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.22799206, + "balance_loss_mlp": 1.01900268, + "epoch": 0.8194198106117541, + "flos": 17496992967120.0, + "grad_norm": 1.6867100246682505, + "language_loss": 0.85849237, + "learning_rate": 3.323292738168171e-07, + "loss": 0.88223004, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13598633, + "step": 13629, + "time_per_iteration": 2.760469675064087 + }, + { + "auxiliary_loss_clip": 0.01333238, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.22486234, + "balance_loss_mlp": 1.01632655, + "epoch": 0.8194799338644221, + "flos": 15272122175280.0, + "grad_norm": 2.0936077897355654, + "language_loss": 0.74689484, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.77052552, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1348877, + "step": 13630, + "time_per_iteration": 2.6636438369750977 + }, + { + "auxiliary_loss_clip": 0.01335971, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.22640574, + "balance_loss_mlp": 1.01921821, + "epoch": 0.81954005711709, + "flos": 14722702132320.0, + "grad_norm": 2.322204749727805, + "language_loss": 0.7263751, + "learning_rate": 3.31899424315957e-07, + "loss": 0.75005811, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13110352, + "step": 13631, + "time_per_iteration": 2.7269539833068848 + }, + { + "auxiliary_loss_clip": 0.01333174, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.22408259, + "balance_loss_mlp": 1.01609778, + "epoch": 0.819600180369758, + "flos": 23079018944160.0, + "grad_norm": 1.5040417017813197, + "language_loss": 0.76989746, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.79351532, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12530518, + "step": 13632, + "time_per_iteration": 4.187687397003174 + }, + { + "auxiliary_loss_clip": 0.01324586, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.2177794, + "balance_loss_mlp": 1.01454711, + "epoch": 0.8196603036224259, + "flos": 27605557582560.0, + "grad_norm": 1.7218890470828077, + "language_loss": 0.65785348, + "learning_rate": 3.314698278332588e-07, + "loss": 0.68136454, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.11987305, + "step": 13633, + "time_per_iteration": 4.272634029388428 + }, + { + "auxiliary_loss_clip": 0.01323805, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.21925986, + "balance_loss_mlp": 1.02062273, + "epoch": 0.8197204268750939, + "flos": 28587537143640.0, + "grad_norm": 1.4153852202593042, + "language_loss": 0.75931728, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.78287947, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11798096, + "step": 13634, + "time_per_iteration": 2.7796778678894043 + }, + { + "auxiliary_loss_clip": 0.01320515, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.21630359, + "balance_loss_mlp": 1.017156, + "epoch": 0.819780550127762, + "flos": 23263539642720.0, + "grad_norm": 1.7977070284255323, + "language_loss": 0.81839919, + "learning_rate": 3.310404844338841e-07, + "loss": 0.84189278, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.11682129, + "step": 13635, + "time_per_iteration": 4.16079568862915 + }, + { + "auxiliary_loss_clip": 0.01331147, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.22162342, + "balance_loss_mlp": 1.01555419, + "epoch": 0.8198406733804299, + "flos": 26690588461440.0, + "grad_norm": 1.6963946169820976, + "language_loss": 0.75863528, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78223872, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13653564, + "step": 13636, + "time_per_iteration": 2.7428512573242188 + }, + { + "auxiliary_loss_clip": 0.01326192, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.22009861, + "balance_loss_mlp": 1.01808822, + "epoch": 0.8199007966330979, + "flos": 20088915346440.0, + "grad_norm": 1.9205680418590567, + "language_loss": 0.81065512, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83422613, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12811279, + "step": 13637, + "time_per_iteration": 2.8650460243225098 + }, + { + "auxiliary_loss_clip": 0.01326974, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.22006273, + "balance_loss_mlp": 1.01665783, + "epoch": 0.8199609198857658, + "flos": 31908811519800.0, + "grad_norm": 2.0122906146926973, + "language_loss": 0.71582103, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73938245, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12506104, + "step": 13638, + "time_per_iteration": 2.862018585205078 + }, + { + "auxiliary_loss_clip": 0.01334925, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.22329092, + "balance_loss_mlp": 1.01765418, + "epoch": 0.8200210431384338, + "flos": 26475953340960.0, + "grad_norm": 1.8907321836875535, + "language_loss": 0.79831654, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.82198584, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14355469, + "step": 13639, + "time_per_iteration": 2.781362771987915 + }, + { + "auxiliary_loss_clip": 0.01324813, + "auxiliary_loss_mlp": 0.01027419, + "balance_loss_clip": 1.21810055, + "balance_loss_mlp": 1.01490819, + "epoch": 0.8200811663911017, + "flos": 22096633299480.0, + "grad_norm": 1.8416873016581818, + "language_loss": 0.7976265, + "learning_rate": 3.299682336022589e-07, + "loss": 0.82114887, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12518311, + "step": 13640, + "time_per_iteration": 4.274795770645142 + }, + { + "auxiliary_loss_clip": 0.01340992, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.22738218, + "balance_loss_mlp": 1.021469, + "epoch": 0.8201412896437698, + "flos": 37600307300160.0, + "grad_norm": 1.8319760622421892, + "language_loss": 0.63269562, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65645146, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13140869, + "step": 13641, + "time_per_iteration": 2.887242555618286 + }, + { + "auxiliary_loss_clip": 0.01323851, + "auxiliary_loss_mlp": 0.01031943, + "balance_loss_clip": 1.2171731, + "balance_loss_mlp": 1.01863348, + "epoch": 0.8202014128964377, + "flos": 19651117349880.0, + "grad_norm": 1.9066907428410442, + "language_loss": 0.73572445, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75928241, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13311768, + "step": 13642, + "time_per_iteration": 2.708942174911499 + }, + { + "auxiliary_loss_clip": 0.01327026, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.22008348, + "balance_loss_mlp": 1.02109206, + "epoch": 0.8202615361491057, + "flos": 31473612458280.0, + "grad_norm": 1.7061039789633516, + "language_loss": 0.70710337, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.73071849, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13397217, + "step": 13643, + "time_per_iteration": 2.820676326751709 + }, + { + "auxiliary_loss_clip": 0.01326822, + "auxiliary_loss_mlp": 0.01032219, + "balance_loss_clip": 1.22041965, + "balance_loss_mlp": 1.01939845, + "epoch": 0.8203216594017736, + "flos": 24720700518720.0, + "grad_norm": 1.634269931976162, + "language_loss": 0.65643084, + "learning_rate": 3.291115727880256e-07, + "loss": 0.68002123, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.1282959, + "step": 13644, + "time_per_iteration": 2.7948031425476074 + }, + { + "auxiliary_loss_clip": 0.01331246, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.22131062, + "balance_loss_mlp": 1.02223945, + "epoch": 0.8203817826544416, + "flos": 26037505610640.0, + "grad_norm": 1.5911835136188497, + "language_loss": 0.71042007, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.73407853, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12359619, + "step": 13645, + "time_per_iteration": 2.799633026123047 + }, + { + "auxiliary_loss_clip": 0.01323111, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.21745121, + "balance_loss_mlp": 1.0139699, + "epoch": 0.8204419059071095, + "flos": 25959652738560.0, + "grad_norm": 1.6899424115804689, + "language_loss": 0.71414852, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73764533, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12609863, + "step": 13646, + "time_per_iteration": 2.7964534759521484 + }, + { + "auxiliary_loss_clip": 0.01334762, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.22495699, + "balance_loss_mlp": 1.02058923, + "epoch": 0.8205020291597775, + "flos": 23584477114440.0, + "grad_norm": 3.5573674857047406, + "language_loss": 0.78863168, + "learning_rate": 3.284697424316132e-07, + "loss": 0.81231678, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.13183594, + "step": 13647, + "time_per_iteration": 2.8342137336730957 + }, + { + "auxiliary_loss_clip": 0.01320387, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.21658325, + "balance_loss_mlp": 1.01678824, + "epoch": 0.8205621524124456, + "flos": 26805093701400.0, + "grad_norm": 1.3194432388133681, + "language_loss": 0.68086547, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.70435941, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.12225342, + "step": 13648, + "time_per_iteration": 2.8226897716522217 + }, + { + "auxiliary_loss_clip": 0.01329353, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.22023702, + "balance_loss_mlp": 1.01665211, + "epoch": 0.8206222756651135, + "flos": 27533755356120.0, + "grad_norm": 1.97951006585519, + "language_loss": 0.79667461, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82026386, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12896729, + "step": 13649, + "time_per_iteration": 2.9060726165771484 + }, + { + "auxiliary_loss_clip": 0.01332742, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.22441936, + "balance_loss_mlp": 1.02367938, + "epoch": 0.8206823989177815, + "flos": 21183572771280.0, + "grad_norm": 1.5345438951759593, + "language_loss": 0.68921214, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71291089, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13470459, + "step": 13650, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.01331553, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.22229517, + "balance_loss_mlp": 1.01850045, + "epoch": 0.8207425221704494, + "flos": 11513293452720.0, + "grad_norm": 2.7718370691841754, + "language_loss": 0.61167896, + "learning_rate": 3.276148560452001e-07, + "loss": 0.6353184, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13897705, + "step": 13651, + "time_per_iteration": 2.70320463180542 + }, + { + "auxiliary_loss_clip": 0.01332778, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.22344112, + "balance_loss_mlp": 1.02075279, + "epoch": 0.8208026454231174, + "flos": 19796752220760.0, + "grad_norm": 3.2074613224614743, + "language_loss": 0.72656131, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.75023079, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13415527, + "step": 13652, + "time_per_iteration": 2.7496631145477295 + }, + { + "auxiliary_loss_clip": 0.01317948, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.21484125, + "balance_loss_mlp": 1.01449203, + "epoch": 0.8208627686757853, + "flos": 15671562252840.0, + "grad_norm": 1.949251480213158, + "language_loss": 0.7310521, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75449342, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.11682129, + "step": 13653, + "time_per_iteration": 2.703181028366089 + }, + { + "auxiliary_loss_clip": 0.01340001, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.22758067, + "balance_loss_mlp": 1.01924229, + "epoch": 0.8209228919284534, + "flos": 37489091337360.0, + "grad_norm": 1.819623301014745, + "language_loss": 0.63677597, + "learning_rate": 3.269743571056451e-07, + "loss": 0.66051257, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14416504, + "step": 13654, + "time_per_iteration": 2.89477801322937 + }, + { + "auxiliary_loss_clip": 0.01332801, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.22254682, + "balance_loss_mlp": 1.01921749, + "epoch": 0.8209830151811213, + "flos": 23118270247080.0, + "grad_norm": 1.8287766371749765, + "language_loss": 0.70030856, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72395402, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12518311, + "step": 13655, + "time_per_iteration": 2.8054230213165283 + }, + { + "auxiliary_loss_clip": 0.01325752, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.21960378, + "balance_loss_mlp": 1.01875544, + "epoch": 0.8210431384337893, + "flos": 21293164399680.0, + "grad_norm": 1.8884375868974161, + "language_loss": 0.82478064, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84835035, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12463379, + "step": 13656, + "time_per_iteration": 2.8401613235473633 + }, + { + "auxiliary_loss_clip": 0.01316379, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.21271348, + "balance_loss_mlp": 1.02049327, + "epoch": 0.8211032616864572, + "flos": 11505131172360.0, + "grad_norm": 2.541498152333093, + "language_loss": 0.74358976, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76708126, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.1229248, + "step": 13657, + "time_per_iteration": 2.7224464416503906 + }, + { + "auxiliary_loss_clip": 0.01329717, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.22293806, + "balance_loss_mlp": 1.02314043, + "epoch": 0.8211633849391252, + "flos": 29827017272160.0, + "grad_norm": 1.702640341130259, + "language_loss": 0.55794817, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.58160508, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12854004, + "step": 13658, + "time_per_iteration": 2.804767370223999 + }, + { + "auxiliary_loss_clip": 0.01330364, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.2213347, + "balance_loss_mlp": 1.02286911, + "epoch": 0.8212235081917931, + "flos": 13119378476760.0, + "grad_norm": 2.059820615486463, + "language_loss": 0.7929616, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81661993, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.1260376, + "step": 13659, + "time_per_iteration": 2.7320821285247803 + }, + { + "auxiliary_loss_clip": 0.01321518, + "auxiliary_loss_mlp": 0.01025211, + "balance_loss_clip": 1.21790719, + "balance_loss_mlp": 1.01362395, + "epoch": 0.8212836314444611, + "flos": 40522750724160.0, + "grad_norm": 1.6020016742064487, + "language_loss": 0.59522259, + "learning_rate": 3.256950723599887e-07, + "loss": 0.61868989, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.11590576, + "step": 13660, + "time_per_iteration": 2.911923408508301 + }, + { + "auxiliary_loss_clip": 0.01336079, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.22574353, + "balance_loss_mlp": 1.01358175, + "epoch": 0.8213437546971292, + "flos": 18775196489880.0, + "grad_norm": 1.9693979666367851, + "language_loss": 0.73797691, + "learning_rate": 3.254820804029075e-07, + "loss": 0.76161289, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13946533, + "step": 13661, + "time_per_iteration": 2.753300189971924 + }, + { + "auxiliary_loss_clip": 0.01333737, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.22248316, + "balance_loss_mlp": 1.01820612, + "epoch": 0.8214038779497971, + "flos": 19686998158920.0, + "grad_norm": 1.8363500257278924, + "language_loss": 0.75208485, + "learning_rate": 3.252691519437143e-07, + "loss": 0.7757355, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13122559, + "step": 13662, + "time_per_iteration": 2.845038414001465 + }, + { + "auxiliary_loss_clip": 0.0114899, + "auxiliary_loss_mlp": 0.01000335, + "balance_loss_clip": 1.10413837, + "balance_loss_mlp": 0.99748629, + "epoch": 0.8214640012024651, + "flos": 71619420136320.0, + "grad_norm": 0.7453739460958045, + "language_loss": 0.54106325, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56255651, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02844238, + "step": 13663, + "time_per_iteration": 3.340428590774536 + }, + { + "auxiliary_loss_clip": 0.01328956, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.22073972, + "balance_loss_mlp": 1.02089715, + "epoch": 0.821524124455133, + "flos": 14761344309840.0, + "grad_norm": 1.777690494559244, + "language_loss": 0.6616621, + "learning_rate": 3.248434855512838e-07, + "loss": 0.68528879, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12805176, + "step": 13664, + "time_per_iteration": 2.7631337642669678 + }, + { + "auxiliary_loss_clip": 0.01321738, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.21724796, + "balance_loss_mlp": 1.01568508, + "epoch": 0.821584247707801, + "flos": 25087711497840.0, + "grad_norm": 1.497814192689324, + "language_loss": 0.75231218, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77580786, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12133789, + "step": 13665, + "time_per_iteration": 2.80759334564209 + }, + { + "auxiliary_loss_clip": 0.01335196, + "auxiliary_loss_mlp": 0.01028363, + "balance_loss_clip": 1.22644424, + "balance_loss_mlp": 1.01599503, + "epoch": 0.8216443709604689, + "flos": 36837673429320.0, + "grad_norm": 3.995982355045327, + "language_loss": 0.65517032, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67880595, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12359619, + "step": 13666, + "time_per_iteration": 2.954556465148926 + }, + { + "auxiliary_loss_clip": 0.01332355, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.22462726, + "balance_loss_mlp": 1.01748109, + "epoch": 0.821704494213137, + "flos": 25087183589160.0, + "grad_norm": 1.5806962550708947, + "language_loss": 0.76833701, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79196274, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12744141, + "step": 13667, + "time_per_iteration": 2.7804245948791504 + }, + { + "auxiliary_loss_clip": 0.01334248, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.22442162, + "balance_loss_mlp": 1.02060127, + "epoch": 0.8217646174658049, + "flos": 14360239289520.0, + "grad_norm": 1.7691845911764126, + "language_loss": 0.77182853, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79550999, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13299561, + "step": 13668, + "time_per_iteration": 2.8949012756347656 + }, + { + "auxiliary_loss_clip": 0.01321775, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.2155602, + "balance_loss_mlp": 1.01843882, + "epoch": 0.8218247407184729, + "flos": 22095618090480.0, + "grad_norm": 2.0067654272011954, + "language_loss": 0.74093091, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.76445901, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12597656, + "step": 13669, + "time_per_iteration": 2.789567232131958 + }, + { + "auxiliary_loss_clip": 0.01331889, + "auxiliary_loss_mlp": 0.01027856, + "balance_loss_clip": 1.22408295, + "balance_loss_mlp": 1.01545227, + "epoch": 0.8218848639711408, + "flos": 16768453137480.0, + "grad_norm": 1.6541808686775235, + "language_loss": 0.78873032, + "learning_rate": 3.235680111625161e-07, + "loss": 0.81232774, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.1239624, + "step": 13670, + "time_per_iteration": 2.7338149547576904 + }, + { + "auxiliary_loss_clip": 0.01338665, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.22717953, + "balance_loss_mlp": 1.02326357, + "epoch": 0.8219449872238088, + "flos": 26000406550800.0, + "grad_norm": 2.0798018346243023, + "language_loss": 0.74967843, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77343988, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14202881, + "step": 13671, + "time_per_iteration": 4.236250638961792 + }, + { + "auxiliary_loss_clip": 0.01341915, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.22871172, + "balance_loss_mlp": 1.01747954, + "epoch": 0.8220051104764767, + "flos": 20783117484720.0, + "grad_norm": 1.854755148261178, + "language_loss": 0.76509959, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78883952, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14593506, + "step": 13672, + "time_per_iteration": 4.275604009628296 + }, + { + "auxiliary_loss_clip": 0.013313, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.22239327, + "balance_loss_mlp": 1.01672053, + "epoch": 0.8220652337291448, + "flos": 14578975854360.0, + "grad_norm": 1.8470892694415642, + "language_loss": 0.74472749, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76834571, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13800049, + "step": 13673, + "time_per_iteration": 2.8277230262756348 + }, + { + "auxiliary_loss_clip": 0.01334629, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.22239137, + "balance_loss_mlp": 1.01957047, + "epoch": 0.8221253569818128, + "flos": 23811213526200.0, + "grad_norm": 1.5261595495812665, + "language_loss": 0.79778647, + "learning_rate": 3.227189662052254e-07, + "loss": 0.82146084, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13238525, + "step": 13674, + "time_per_iteration": 4.241067409515381 + }, + { + "auxiliary_loss_clip": 0.01326176, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.21826684, + "balance_loss_mlp": 1.01889145, + "epoch": 0.8221854802344807, + "flos": 21293245616400.0, + "grad_norm": 2.0625832356788587, + "language_loss": 0.70951033, + "learning_rate": 3.225068639524484e-07, + "loss": 0.73308647, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12561035, + "step": 13675, + "time_per_iteration": 2.805330514907837 + }, + { + "auxiliary_loss_clip": 0.01325043, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.22019327, + "balance_loss_mlp": 1.01769257, + "epoch": 0.8222456034871487, + "flos": 20961343887480.0, + "grad_norm": 1.5667971510506955, + "language_loss": 0.74563837, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76919383, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12811279, + "step": 13676, + "time_per_iteration": 2.733318328857422 + }, + { + "auxiliary_loss_clip": 0.01330413, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.22335744, + "balance_loss_mlp": 1.01862359, + "epoch": 0.8223057267398166, + "flos": 21402512377920.0, + "grad_norm": 2.1351695808473012, + "language_loss": 0.80679762, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.83041584, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12799072, + "step": 13677, + "time_per_iteration": 2.746366262435913 + }, + { + "auxiliary_loss_clip": 0.01336723, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.22726679, + "balance_loss_mlp": 1.0206089, + "epoch": 0.8223658499924846, + "flos": 15272447042160.0, + "grad_norm": 1.6305630989940805, + "language_loss": 0.70462519, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72833526, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13684082, + "step": 13678, + "time_per_iteration": 2.685765504837036 + }, + { + "auxiliary_loss_clip": 0.01329737, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.22259688, + "balance_loss_mlp": 1.01772821, + "epoch": 0.8224259732451525, + "flos": 31255931710800.0, + "grad_norm": 1.5035554754524452, + "language_loss": 0.71859276, + "learning_rate": 3.216590911288133e-07, + "loss": 0.74219859, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13104248, + "step": 13679, + "time_per_iteration": 4.251555919647217 + }, + { + "auxiliary_loss_clip": 0.01323044, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.21684062, + "balance_loss_mlp": 1.01463282, + "epoch": 0.8224860964978206, + "flos": 21578952012840.0, + "grad_norm": 1.9549453430743813, + "language_loss": 0.70102191, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72452354, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12487793, + "step": 13680, + "time_per_iteration": 2.7445266246795654 + }, + { + "auxiliary_loss_clip": 0.01331268, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.22459519, + "balance_loss_mlp": 1.01978683, + "epoch": 0.8225462197504885, + "flos": 25488654084720.0, + "grad_norm": 1.6317734376937914, + "language_loss": 0.60016727, + "learning_rate": 3.21235586541986e-07, + "loss": 0.6237992, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12139893, + "step": 13681, + "time_per_iteration": 2.769165277481079 + }, + { + "auxiliary_loss_clip": 0.0133929, + "auxiliary_loss_mlp": 0.01036001, + "balance_loss_clip": 1.2279458, + "balance_loss_mlp": 1.02312064, + "epoch": 0.8226063430031565, + "flos": 39392862224040.0, + "grad_norm": 1.685809421710218, + "language_loss": 0.69500166, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71875453, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12878418, + "step": 13682, + "time_per_iteration": 2.9525182247161865 + }, + { + "auxiliary_loss_clip": 0.01328447, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.22008181, + "balance_loss_mlp": 1.01844168, + "epoch": 0.8226664662558244, + "flos": 22819772217240.0, + "grad_norm": 1.9006536612706195, + "language_loss": 0.80041617, + "learning_rate": 3.20812336590816e-07, + "loss": 0.82402158, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13641357, + "step": 13683, + "time_per_iteration": 2.7491135597229004 + }, + { + "auxiliary_loss_clip": 0.01321303, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.21658266, + "balance_loss_mlp": 1.01682472, + "epoch": 0.8227265895084924, + "flos": 25671063148560.0, + "grad_norm": 1.9108536527855708, + "language_loss": 0.86356044, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88706315, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.12139893, + "step": 13684, + "time_per_iteration": 2.9352002143859863 + }, + { + "auxiliary_loss_clip": 0.01317324, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.21343422, + "balance_loss_mlp": 1.01821625, + "epoch": 0.8227867127611603, + "flos": 26185170899520.0, + "grad_norm": 1.5970813834275137, + "language_loss": 0.79944408, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82292545, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.12597656, + "step": 13685, + "time_per_iteration": 2.8572516441345215 + }, + { + "auxiliary_loss_clip": 0.01331499, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.22303367, + "balance_loss_mlp": 1.02168274, + "epoch": 0.8228468360138284, + "flos": 22023166130280.0, + "grad_norm": 1.5567779981441552, + "language_loss": 0.6883893, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.71204859, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12762451, + "step": 13686, + "time_per_iteration": 2.7332348823547363 + }, + { + "auxiliary_loss_clip": 0.01330721, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.22115862, + "balance_loss_mlp": 1.01873469, + "epoch": 0.8229069592664963, + "flos": 14907547697760.0, + "grad_norm": 1.8949984386534813, + "language_loss": 0.77976376, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80339181, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13360596, + "step": 13687, + "time_per_iteration": 2.730043649673462 + }, + { + "auxiliary_loss_clip": 0.01327815, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.21933079, + "balance_loss_mlp": 1.02034581, + "epoch": 0.8229670825191643, + "flos": 15673755104280.0, + "grad_norm": 1.8986891932652825, + "language_loss": 0.72733396, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.75095367, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.13806152, + "step": 13688, + "time_per_iteration": 2.6667110919952393 + }, + { + "auxiliary_loss_clip": 0.01333144, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.22483683, + "balance_loss_mlp": 1.01809502, + "epoch": 0.8230272057718323, + "flos": 23188488747480.0, + "grad_norm": 1.586680112462177, + "language_loss": 0.73476684, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75840461, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12530518, + "step": 13689, + "time_per_iteration": 2.8039233684539795 + }, + { + "auxiliary_loss_clip": 0.01331421, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.22194016, + "balance_loss_mlp": 1.01861656, + "epoch": 0.8230873290245002, + "flos": 21037328775000.0, + "grad_norm": 1.7988186025750033, + "language_loss": 0.69580317, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.7194308, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.1272583, + "step": 13690, + "time_per_iteration": 2.7253000736236572 + }, + { + "auxiliary_loss_clip": 0.01335223, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.22639179, + "balance_loss_mlp": 1.02016938, + "epoch": 0.8231474522771682, + "flos": 21254928305760.0, + "grad_norm": 1.733365351191975, + "language_loss": 0.85774505, + "learning_rate": 3.191218844260988e-07, + "loss": 0.88142586, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12683105, + "step": 13691, + "time_per_iteration": 2.8293585777282715 + }, + { + "auxiliary_loss_clip": 0.01330699, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.22158146, + "balance_loss_mlp": 1.01763439, + "epoch": 0.8232075755298361, + "flos": 23847337985400.0, + "grad_norm": 1.9480992449418029, + "language_loss": 0.76747143, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79108405, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12927246, + "step": 13692, + "time_per_iteration": 2.7711009979248047 + }, + { + "auxiliary_loss_clip": 0.01326938, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.2206831, + "balance_loss_mlp": 1.02155912, + "epoch": 0.8232676987825042, + "flos": 21659119561440.0, + "grad_norm": 1.4356757735628456, + "language_loss": 0.71562129, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73923022, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.1239624, + "step": 13693, + "time_per_iteration": 2.754225015640259 + }, + { + "auxiliary_loss_clip": 0.01321685, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.21744466, + "balance_loss_mlp": 1.0247004, + "epoch": 0.8233278220351721, + "flos": 26328288052080.0, + "grad_norm": 1.3726182542106324, + "language_loss": 0.84077251, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.86435616, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11987305, + "step": 13694, + "time_per_iteration": 2.7549898624420166 + }, + { + "auxiliary_loss_clip": 0.01340183, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.22939992, + "balance_loss_mlp": 1.02066565, + "epoch": 0.8233879452878401, + "flos": 21730840571160.0, + "grad_norm": 1.5576108817751897, + "language_loss": 0.76981568, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79355693, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13262939, + "step": 13695, + "time_per_iteration": 2.73492431640625 + }, + { + "auxiliary_loss_clip": 0.01328952, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.22200346, + "balance_loss_mlp": 1.02186632, + "epoch": 0.823448068540508, + "flos": 20562553543680.0, + "grad_norm": 3.6439361787730356, + "language_loss": 0.81394023, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83757609, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12768555, + "step": 13696, + "time_per_iteration": 2.74660062789917 + }, + { + "auxiliary_loss_clip": 0.01146445, + "auxiliary_loss_mlp": 0.01004863, + "balance_loss_clip": 1.1026119, + "balance_loss_mlp": 1.00190687, + "epoch": 0.823508191793176, + "flos": 67291209038880.0, + "grad_norm": 0.7456495023144072, + "language_loss": 0.63857985, + "learning_rate": 3.178567221188393e-07, + "loss": 0.66009295, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02954102, + "step": 13697, + "time_per_iteration": 3.4289612770080566 + }, + { + "auxiliary_loss_clip": 0.01321225, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.21800423, + "balance_loss_mlp": 1.01413357, + "epoch": 0.8235683150458439, + "flos": 17932557503880.0, + "grad_norm": 1.7318999573198857, + "language_loss": 0.73389268, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75736272, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.11633301, + "step": 13698, + "time_per_iteration": 2.790637969970703 + }, + { + "auxiliary_loss_clip": 0.01329655, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.22074604, + "balance_loss_mlp": 1.01858616, + "epoch": 0.823628438298512, + "flos": 18920709535680.0, + "grad_norm": 1.8477473750841775, + "language_loss": 0.72206652, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74568176, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1328125, + "step": 13699, + "time_per_iteration": 2.674748659133911 + }, + { + "auxiliary_loss_clip": 0.0132283, + "auxiliary_loss_mlp": 0.01026549, + "balance_loss_clip": 1.21659625, + "balance_loss_mlp": 1.01369274, + "epoch": 0.8236885615511799, + "flos": 18700957761840.0, + "grad_norm": 1.8857227589704422, + "language_loss": 0.82283044, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84632421, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12866211, + "step": 13700, + "time_per_iteration": 2.7131168842315674 + }, + { + "auxiliary_loss_clip": 0.01336239, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.22636044, + "balance_loss_mlp": 1.0207355, + "epoch": 0.8237486848038479, + "flos": 23700159996840.0, + "grad_norm": 2.0060382833632247, + "language_loss": 0.732952, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75664783, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12615967, + "step": 13701, + "time_per_iteration": 2.7547569274902344 + }, + { + "auxiliary_loss_clip": 0.01332976, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.22279286, + "balance_loss_mlp": 1.01830888, + "epoch": 0.8238088080565159, + "flos": 23446639048680.0, + "grad_norm": 1.7127024214360689, + "language_loss": 0.69577193, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71942163, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13684082, + "step": 13702, + "time_per_iteration": 2.7722156047821045 + }, + { + "auxiliary_loss_clip": 0.01328979, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.22064543, + "balance_loss_mlp": 1.01743996, + "epoch": 0.8238689313091838, + "flos": 22751584134840.0, + "grad_norm": 1.8484184497170082, + "language_loss": 0.75339377, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.77699208, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13421631, + "step": 13703, + "time_per_iteration": 2.7685086727142334 + }, + { + "auxiliary_loss_clip": 0.0134271, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.22849536, + "balance_loss_mlp": 1.0209136, + "epoch": 0.8239290545618518, + "flos": 25635669639840.0, + "grad_norm": 1.929410856659086, + "language_loss": 0.70433921, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.72811449, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.13891602, + "step": 13704, + "time_per_iteration": 2.7509918212890625 + }, + { + "auxiliary_loss_clip": 0.01328232, + "auxiliary_loss_mlp": 0.01026553, + "balance_loss_clip": 1.22057056, + "balance_loss_mlp": 1.01416087, + "epoch": 0.8239891778145197, + "flos": 26031454965000.0, + "grad_norm": 1.8621946867328594, + "language_loss": 0.64596057, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66950846, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12390137, + "step": 13705, + "time_per_iteration": 2.7764580249786377 + }, + { + "auxiliary_loss_clip": 0.01330341, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.22015285, + "balance_loss_mlp": 1.01559746, + "epoch": 0.8240493010671878, + "flos": 21838036306320.0, + "grad_norm": 1.5391568031320948, + "language_loss": 0.69622922, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71982616, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13769531, + "step": 13706, + "time_per_iteration": 2.7323269844055176 + }, + { + "auxiliary_loss_clip": 0.01334276, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.22578037, + "balance_loss_mlp": 1.02051973, + "epoch": 0.8241094243198557, + "flos": 18556581750120.0, + "grad_norm": 1.7166699450139282, + "language_loss": 0.69876778, + "learning_rate": 3.157532220876475e-07, + "loss": 0.72244561, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12982178, + "step": 13707, + "time_per_iteration": 2.7480719089508057 + }, + { + "auxiliary_loss_clip": 0.01333938, + "auxiliary_loss_mlp": 0.01032693, + "balance_loss_clip": 1.225191, + "balance_loss_mlp": 1.01967525, + "epoch": 0.8241695475725237, + "flos": 25452651450600.0, + "grad_norm": 1.7652868076396484, + "language_loss": 0.7960943, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81976062, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13024902, + "step": 13708, + "time_per_iteration": 2.787182569503784 + }, + { + "auxiliary_loss_clip": 0.01332388, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.22245145, + "balance_loss_mlp": 1.01386726, + "epoch": 0.8242296708251916, + "flos": 18994379746680.0, + "grad_norm": 2.1310260964028416, + "language_loss": 0.68833447, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.71193337, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1362915, + "step": 13709, + "time_per_iteration": 2.887955665588379 + }, + { + "auxiliary_loss_clip": 0.01328369, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.21980202, + "balance_loss_mlp": 1.02204072, + "epoch": 0.8242897940778596, + "flos": 22605949263960.0, + "grad_norm": 1.7410423164349011, + "language_loss": 0.82529551, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84893107, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.1315918, + "step": 13710, + "time_per_iteration": 4.240397930145264 + }, + { + "auxiliary_loss_clip": 0.01325374, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.21762896, + "balance_loss_mlp": 1.01825428, + "epoch": 0.8243499173305275, + "flos": 21472852703400.0, + "grad_norm": 2.003249866831005, + "language_loss": 0.7832762, + "learning_rate": 3.149136098993257e-07, + "loss": 0.8068437, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13116455, + "step": 13711, + "time_per_iteration": 4.326573371887207 + }, + { + "auxiliary_loss_clip": 0.01330096, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.22216046, + "balance_loss_mlp": 1.01576972, + "epoch": 0.8244100405831956, + "flos": 20015082702000.0, + "grad_norm": 1.9485552362237513, + "language_loss": 0.66094601, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.68453455, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13006592, + "step": 13712, + "time_per_iteration": 4.314928293228149 + }, + { + "auxiliary_loss_clip": 0.01330313, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.22152817, + "balance_loss_mlp": 1.01775658, + "epoch": 0.8244701638358635, + "flos": 26436092912640.0, + "grad_norm": 1.8448428379551194, + "language_loss": 0.74717283, + "learning_rate": 3.14494187165202e-07, + "loss": 0.77077889, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12536621, + "step": 13713, + "time_per_iteration": 2.7745749950408936 + }, + { + "auxiliary_loss_clip": 0.01330708, + "auxiliary_loss_mlp": 0.01027734, + "balance_loss_clip": 1.22120273, + "balance_loss_mlp": 1.01416159, + "epoch": 0.8245302870885315, + "flos": 17644577039280.0, + "grad_norm": 1.8621361302744617, + "language_loss": 0.81434882, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.8379333, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.13555908, + "step": 13714, + "time_per_iteration": 2.7023043632507324 + }, + { + "auxiliary_loss_clip": 0.01325672, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.21847761, + "balance_loss_mlp": 1.01724863, + "epoch": 0.8245904103411995, + "flos": 26214594979320.0, + "grad_norm": 2.3959745722381998, + "language_loss": 0.66295713, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68651974, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13336182, + "step": 13715, + "time_per_iteration": 2.7811970710754395 + }, + { + "auxiliary_loss_clip": 0.01335844, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.22528267, + "balance_loss_mlp": 1.01679611, + "epoch": 0.8246505335938674, + "flos": 24210206911800.0, + "grad_norm": 1.6485915753737623, + "language_loss": 0.75201118, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77566981, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13232422, + "step": 13716, + "time_per_iteration": 2.8074147701263428 + }, + { + "auxiliary_loss_clip": 0.01146257, + "auxiliary_loss_mlp": 0.01010946, + "balance_loss_clip": 1.1021781, + "balance_loss_mlp": 1.0081799, + "epoch": 0.8247106568465354, + "flos": 67109815184040.0, + "grad_norm": 0.7130225061655627, + "language_loss": 0.5904094, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61198139, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02770996, + "step": 13717, + "time_per_iteration": 3.3603241443634033 + }, + { + "auxiliary_loss_clip": 0.01332716, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.2256999, + "balance_loss_mlp": 1.01594532, + "epoch": 0.8247707800992033, + "flos": 12572110676880.0, + "grad_norm": 2.03534329029252, + "language_loss": 0.80490434, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.82851309, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12213135, + "step": 13718, + "time_per_iteration": 4.24186372756958 + }, + { + "auxiliary_loss_clip": 0.0132297, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.2168107, + "balance_loss_mlp": 1.020383, + "epoch": 0.8248309033518714, + "flos": 15927763352760.0, + "grad_norm": 1.6277303096168452, + "language_loss": 0.6892767, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71283615, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12597656, + "step": 13719, + "time_per_iteration": 2.8610901832580566 + }, + { + "auxiliary_loss_clip": 0.01330609, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.22201109, + "balance_loss_mlp": 1.01763988, + "epoch": 0.8248910266045393, + "flos": 17568998235360.0, + "grad_norm": 2.5881521708954436, + "language_loss": 0.70512289, + "learning_rate": 3.13028221321197e-07, + "loss": 0.72874194, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13665771, + "step": 13720, + "time_per_iteration": 2.6835811138153076 + }, + { + "auxiliary_loss_clip": 0.01337027, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.22695756, + "balance_loss_mlp": 1.01783681, + "epoch": 0.8249511498572073, + "flos": 28625326545600.0, + "grad_norm": 1.5660452786899202, + "language_loss": 0.75699437, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.78067625, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13330078, + "step": 13721, + "time_per_iteration": 2.8613440990448 + }, + { + "auxiliary_loss_clip": 0.01328888, + "auxiliary_loss_mlp": 0.0102611, + "balance_loss_clip": 1.22140098, + "balance_loss_mlp": 1.01380777, + "epoch": 0.8250112731098752, + "flos": 25562080645560.0, + "grad_norm": 1.752803528857138, + "language_loss": 0.77668786, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80023777, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.1229248, + "step": 13722, + "time_per_iteration": 2.8662898540496826 + }, + { + "auxiliary_loss_clip": 0.01323461, + "auxiliary_loss_mlp": 0.01028603, + "balance_loss_clip": 1.21729255, + "balance_loss_mlp": 1.01624072, + "epoch": 0.8250713963625432, + "flos": 27752004620640.0, + "grad_norm": 1.5973772289198565, + "language_loss": 0.63025236, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.65377301, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12365723, + "step": 13723, + "time_per_iteration": 2.84325909614563 + }, + { + "auxiliary_loss_clip": 0.01332148, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.22386205, + "balance_loss_mlp": 1.01810431, + "epoch": 0.8251315196152111, + "flos": 21614873430240.0, + "grad_norm": 1.449994075293861, + "language_loss": 0.74588156, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76951396, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12988281, + "step": 13724, + "time_per_iteration": 2.8148632049560547 + }, + { + "auxiliary_loss_clip": 0.01330546, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.22127569, + "balance_loss_mlp": 1.02056026, + "epoch": 0.8251916428678792, + "flos": 28584532125000.0, + "grad_norm": 1.7893950577273057, + "language_loss": 0.64332134, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66697764, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.14532471, + "step": 13725, + "time_per_iteration": 2.784921169281006 + }, + { + "auxiliary_loss_clip": 0.01325746, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.21943879, + "balance_loss_mlp": 1.01564765, + "epoch": 0.8252517661205471, + "flos": 23080318411680.0, + "grad_norm": 1.4755450519377178, + "language_loss": 0.81820327, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.8417421, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12487793, + "step": 13726, + "time_per_iteration": 2.77618408203125 + }, + { + "auxiliary_loss_clip": 0.01321928, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.21775699, + "balance_loss_mlp": 1.01887488, + "epoch": 0.8253118893732151, + "flos": 31765938017400.0, + "grad_norm": 1.5584545224025694, + "language_loss": 0.70992422, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.73345125, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.11914062, + "step": 13727, + "time_per_iteration": 2.8446784019470215 + }, + { + "auxiliary_loss_clip": 0.01335138, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.22523534, + "balance_loss_mlp": 1.01956034, + "epoch": 0.8253720126258831, + "flos": 18301030383960.0, + "grad_norm": 1.8014895017286532, + "language_loss": 0.63554299, + "learning_rate": 3.113566701515036e-07, + "loss": 0.65922707, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13720703, + "step": 13728, + "time_per_iteration": 2.792752742767334 + }, + { + "auxiliary_loss_clip": 0.01341893, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.22837639, + "balance_loss_mlp": 1.02095556, + "epoch": 0.825432135878551, + "flos": 26803022675040.0, + "grad_norm": 1.9261995325139771, + "language_loss": 0.71841288, + "learning_rate": 3.111480143230092e-07, + "loss": 0.74217808, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.13671875, + "step": 13729, + "time_per_iteration": 2.787856340408325 + }, + { + "auxiliary_loss_clip": 0.0114708, + "auxiliary_loss_mlp": 0.01006002, + "balance_loss_clip": 1.10252047, + "balance_loss_mlp": 1.00322461, + "epoch": 0.825492259131219, + "flos": 54231125413680.0, + "grad_norm": 0.8705283130760683, + "language_loss": 0.62682712, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64835793, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02783203, + "step": 13730, + "time_per_iteration": 3.08229398727417 + }, + { + "auxiliary_loss_clip": 0.01327137, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.21938872, + "balance_loss_mlp": 1.01754379, + "epoch": 0.825552382383887, + "flos": 43763126601240.0, + "grad_norm": 2.6572702151418413, + "language_loss": 0.63509721, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65866864, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12463379, + "step": 13731, + "time_per_iteration": 2.9651432037353516 + }, + { + "auxiliary_loss_clip": 0.01343901, + "auxiliary_loss_mlp": 0.01028743, + "balance_loss_clip": 1.22928965, + "balance_loss_mlp": 1.01489139, + "epoch": 0.825612505636555, + "flos": 12606448368240.0, + "grad_norm": 2.0258498370891016, + "language_loss": 0.70134282, + "learning_rate": 3.105224311177812e-07, + "loss": 0.72506928, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.13842773, + "step": 13732, + "time_per_iteration": 2.7436563968658447 + }, + { + "auxiliary_loss_clip": 0.0134264, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.22985041, + "balance_loss_mlp": 1.02059197, + "epoch": 0.8256726288892229, + "flos": 17599153265640.0, + "grad_norm": 2.2225101124828464, + "language_loss": 0.71578431, + "learning_rate": 3.103140315024817e-07, + "loss": 0.73955154, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13464355, + "step": 13733, + "time_per_iteration": 2.7214713096618652 + }, + { + "auxiliary_loss_clip": 0.01325144, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.21902585, + "balance_loss_mlp": 1.01555443, + "epoch": 0.8257327521418909, + "flos": 23811132309480.0, + "grad_norm": 1.4302713936501943, + "language_loss": 0.82609308, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84963036, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13037109, + "step": 13734, + "time_per_iteration": 2.760209560394287 + }, + { + "auxiliary_loss_clip": 0.01324518, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.21945429, + "balance_loss_mlp": 1.01573634, + "epoch": 0.8257928753945588, + "flos": 19285811921880.0, + "grad_norm": 2.476214537548908, + "language_loss": 0.8267625, + "learning_rate": 3.098974244989676e-07, + "loss": 0.85029691, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.13189697, + "step": 13735, + "time_per_iteration": 2.7646069526672363 + }, + { + "auxiliary_loss_clip": 0.01330694, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.22184825, + "balance_loss_mlp": 1.01968265, + "epoch": 0.8258529986472268, + "flos": 18483480056160.0, + "grad_norm": 1.8490820515675195, + "language_loss": 0.71050501, + "learning_rate": 3.096892171265497e-07, + "loss": 0.73412406, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.11529541, + "step": 13736, + "time_per_iteration": 2.7814157009124756 + }, + { + "auxiliary_loss_clip": 0.0114661, + "auxiliary_loss_mlp": 0.01009437, + "balance_loss_clip": 1.10261929, + "balance_loss_mlp": 1.00655198, + "epoch": 0.8259131218998947, + "flos": 62151187346640.0, + "grad_norm": 0.8674491460497192, + "language_loss": 0.68073398, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70229447, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02880859, + "step": 13737, + "time_per_iteration": 3.2573623657226562 + }, + { + "auxiliary_loss_clip": 0.01333497, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.22520947, + "balance_loss_mlp": 1.02087367, + "epoch": 0.8259732451525628, + "flos": 22163521914360.0, + "grad_norm": 2.3894087013660132, + "language_loss": 0.6983, + "learning_rate": 3.0927299467987e-07, + "loss": 0.72197306, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12939453, + "step": 13738, + "time_per_iteration": 2.8437225818634033 + }, + { + "auxiliary_loss_clip": 0.01338072, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.22819781, + "balance_loss_mlp": 1.0171802, + "epoch": 0.8260333684052307, + "flos": 38368626341400.0, + "grad_norm": 1.9067848325346066, + "language_loss": 0.63286567, + "learning_rate": 3.090649796213911e-07, + "loss": 0.6565634, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14538574, + "step": 13739, + "time_per_iteration": 2.859135150909424 + }, + { + "auxiliary_loss_clip": 0.0114743, + "auxiliary_loss_mlp": 0.01001901, + "balance_loss_clip": 1.10338092, + "balance_loss_mlp": 0.99865848, + "epoch": 0.8260934916578987, + "flos": 62200306481040.0, + "grad_norm": 0.8327667864718675, + "language_loss": 0.5940032, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61549652, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.0324707, + "step": 13740, + "time_per_iteration": 3.2792091369628906 + }, + { + "auxiliary_loss_clip": 0.0134773, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.23386168, + "balance_loss_mlp": 1.01843762, + "epoch": 0.8261536149105667, + "flos": 22570677580320.0, + "grad_norm": 1.911420737517773, + "language_loss": 0.75809073, + "learning_rate": 3.086491418735959e-07, + "loss": 0.78188837, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13598633, + "step": 13741, + "time_per_iteration": 2.7135844230651855 + }, + { + "auxiliary_loss_clip": 0.01329946, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.22158575, + "balance_loss_mlp": 1.01751995, + "epoch": 0.8262137381632346, + "flos": 32531901773760.0, + "grad_norm": 1.7666121991376322, + "language_loss": 0.63100314, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.65460742, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12963867, + "step": 13742, + "time_per_iteration": 2.8191142082214355 + }, + { + "auxiliary_loss_clip": 0.01343241, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.22814941, + "balance_loss_mlp": 1.02194643, + "epoch": 0.8262738614159026, + "flos": 14140446907320.0, + "grad_norm": 2.9551468420939417, + "language_loss": 0.66540456, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68920767, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.15124512, + "step": 13743, + "time_per_iteration": 2.7393743991851807 + }, + { + "auxiliary_loss_clip": 0.01336122, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.22762692, + "balance_loss_mlp": 1.02040994, + "epoch": 0.8263339846685706, + "flos": 19829709227880.0, + "grad_norm": 3.478688808496621, + "language_loss": 0.67136359, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.69505984, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13079834, + "step": 13744, + "time_per_iteration": 2.956268548965454 + }, + { + "auxiliary_loss_clip": 0.0133862, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.22929585, + "balance_loss_mlp": 1.01736128, + "epoch": 0.8263941079212386, + "flos": 22750934401080.0, + "grad_norm": 2.24513089834837, + "language_loss": 0.75345433, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77713716, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12322998, + "step": 13745, + "time_per_iteration": 2.8194668292999268 + }, + { + "auxiliary_loss_clip": 0.01319459, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.21500993, + "balance_loss_mlp": 1.02151108, + "epoch": 0.8264542311739065, + "flos": 20125486497600.0, + "grad_norm": 1.8828797232669967, + "language_loss": 0.79270166, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81622946, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.11816406, + "step": 13746, + "time_per_iteration": 2.769865036010742 + }, + { + "auxiliary_loss_clip": 0.01343325, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.23032141, + "balance_loss_mlp": 1.01935863, + "epoch": 0.8265143544265745, + "flos": 16841595439800.0, + "grad_norm": 2.0903879214565233, + "language_loss": 0.68528569, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70905173, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13928223, + "step": 13747, + "time_per_iteration": 2.834975004196167 + }, + { + "auxiliary_loss_clip": 0.01331172, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.22109461, + "balance_loss_mlp": 1.015167, + "epoch": 0.8265744776792424, + "flos": 22023775255680.0, + "grad_norm": 1.9975798161619727, + "language_loss": 0.75530946, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77890772, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13500977, + "step": 13748, + "time_per_iteration": 2.7735636234283447 + }, + { + "auxiliary_loss_clip": 0.01325206, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.22097921, + "balance_loss_mlp": 1.02026248, + "epoch": 0.8266346009319104, + "flos": 19249565637600.0, + "grad_norm": 1.6913649922958103, + "language_loss": 0.63987666, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66345072, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.11938477, + "step": 13749, + "time_per_iteration": 5.648097038269043 + }, + { + "auxiliary_loss_clip": 0.01327699, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.22018933, + "balance_loss_mlp": 1.01451492, + "epoch": 0.8266947241845783, + "flos": 24171645951000.0, + "grad_norm": 1.788099577303531, + "language_loss": 0.73765874, + "learning_rate": 3.067810476598132e-07, + "loss": 0.76120615, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12530518, + "step": 13750, + "time_per_iteration": 2.85726261138916 + }, + { + "auxiliary_loss_clip": 0.0133414, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.22443581, + "balance_loss_mlp": 1.02312338, + "epoch": 0.8267548474372464, + "flos": 21110795944200.0, + "grad_norm": 1.9904645929458824, + "language_loss": 0.66089487, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68460065, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13305664, + "step": 13751, + "time_per_iteration": 4.229750394821167 + }, + { + "auxiliary_loss_clip": 0.01327243, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.22074699, + "balance_loss_mlp": 1.01981544, + "epoch": 0.8268149706899143, + "flos": 39975645357720.0, + "grad_norm": 1.4066630265538647, + "language_loss": 0.6063512, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62994516, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12316895, + "step": 13752, + "time_per_iteration": 2.8769476413726807 + }, + { + "auxiliary_loss_clip": 0.01146546, + "auxiliary_loss_mlp": 0.01003453, + "balance_loss_clip": 1.10247946, + "balance_loss_mlp": 1.00052059, + "epoch": 0.8268750939425823, + "flos": 65795592045960.0, + "grad_norm": 0.7810570247783749, + "language_loss": 0.5751102, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59661013, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02929688, + "step": 13753, + "time_per_iteration": 3.2832164764404297 + }, + { + "auxiliary_loss_clip": 0.01144671, + "auxiliary_loss_mlp": 0.01003425, + "balance_loss_clip": 1.10110068, + "balance_loss_mlp": 1.00075459, + "epoch": 0.8269352171952503, + "flos": 52993675703160.0, + "grad_norm": 0.699385677849569, + "language_loss": 0.55028045, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.57176149, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.0267334, + "step": 13754, + "time_per_iteration": 3.31112003326416 + }, + { + "auxiliary_loss_clip": 0.0132565, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.21894848, + "balance_loss_mlp": 1.0206089, + "epoch": 0.8269953404479182, + "flos": 23081374229040.0, + "grad_norm": 1.6371646499031587, + "language_loss": 0.69651413, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.72008932, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.11248779, + "step": 13755, + "time_per_iteration": 2.797661542892456 + }, + { + "auxiliary_loss_clip": 0.01328388, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.22129357, + "balance_loss_mlp": 1.02037358, + "epoch": 0.8270554637005862, + "flos": 14214076509960.0, + "grad_norm": 2.4248682777430384, + "language_loss": 0.69768238, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72129029, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12023926, + "step": 13756, + "time_per_iteration": 4.222565174102783 + }, + { + "auxiliary_loss_clip": 0.01333123, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.2249918, + "balance_loss_mlp": 1.01828301, + "epoch": 0.8271155869532542, + "flos": 21767046247080.0, + "grad_norm": 1.997130038837409, + "language_loss": 0.72925454, + "learning_rate": 3.053316807931623e-07, + "loss": 0.75289649, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12768555, + "step": 13757, + "time_per_iteration": 2.9194483757019043 + }, + { + "auxiliary_loss_clip": 0.01342846, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.23133576, + "balance_loss_mlp": 1.01804698, + "epoch": 0.8271757102059222, + "flos": 15124456886400.0, + "grad_norm": 2.2634319126954963, + "language_loss": 0.68863261, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71239102, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14953613, + "step": 13758, + "time_per_iteration": 2.7768311500549316 + }, + { + "auxiliary_loss_clip": 0.01324197, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.21878576, + "balance_loss_mlp": 1.01746058, + "epoch": 0.8272358334585901, + "flos": 24139054419120.0, + "grad_norm": 1.5718893833013299, + "language_loss": 0.69701195, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72054505, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.11651611, + "step": 13759, + "time_per_iteration": 2.8188765048980713 + }, + { + "auxiliary_loss_clip": 0.01327704, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.22037101, + "balance_loss_mlp": 1.02088141, + "epoch": 0.8272959567112581, + "flos": 18995557389120.0, + "grad_norm": 1.7159202375598153, + "language_loss": 0.71415275, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73777592, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13739014, + "step": 13760, + "time_per_iteration": 2.737356424331665 + }, + { + "auxiliary_loss_clip": 0.01326188, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.22106028, + "balance_loss_mlp": 1.01864231, + "epoch": 0.827356079963926, + "flos": 20636548621560.0, + "grad_norm": 1.6590563555060722, + "language_loss": 0.77529931, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79886937, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12176514, + "step": 13761, + "time_per_iteration": 2.7678678035736084 + }, + { + "auxiliary_loss_clip": 0.0132445, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.21974587, + "balance_loss_mlp": 1.01719308, + "epoch": 0.827416203216594, + "flos": 22421428565400.0, + "grad_norm": 1.728558382534958, + "language_loss": 0.70194542, + "learning_rate": 3.042983464482387e-07, + "loss": 0.7254827, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12091064, + "step": 13762, + "time_per_iteration": 2.7570884227752686 + }, + { + "auxiliary_loss_clip": 0.01320551, + "auxiliary_loss_mlp": 0.01025286, + "balance_loss_clip": 1.2137593, + "balance_loss_mlp": 1.0129714, + "epoch": 0.827476326469262, + "flos": 19030910289480.0, + "grad_norm": 1.666316328699956, + "language_loss": 0.70354193, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72700036, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12304688, + "step": 13763, + "time_per_iteration": 2.7280399799346924 + }, + { + "auxiliary_loss_clip": 0.01148885, + "auxiliary_loss_mlp": 0.01007829, + "balance_loss_clip": 1.10512733, + "balance_loss_mlp": 1.00489604, + "epoch": 0.82753644972193, + "flos": 68516476241400.0, + "grad_norm": 0.8529786422845114, + "language_loss": 0.65134573, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67291296, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02929688, + "step": 13764, + "time_per_iteration": 3.3271901607513428 + }, + { + "auxiliary_loss_clip": 0.01328364, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.22153747, + "balance_loss_mlp": 1.02312636, + "epoch": 0.8275965729745979, + "flos": 18409972278600.0, + "grad_norm": 2.160678837463764, + "language_loss": 0.78028721, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.80393577, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13366699, + "step": 13765, + "time_per_iteration": 2.713940143585205 + }, + { + "auxiliary_loss_clip": 0.0133601, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.22425032, + "balance_loss_mlp": 1.0206089, + "epoch": 0.8276566962272659, + "flos": 28517968377000.0, + "grad_norm": 1.4770154460477891, + "language_loss": 0.62145555, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64515638, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13464355, + "step": 13766, + "time_per_iteration": 2.934546947479248 + }, + { + "auxiliary_loss_clip": 0.01329661, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.22084475, + "balance_loss_mlp": 1.0191704, + "epoch": 0.8277168194799339, + "flos": 20235118734360.0, + "grad_norm": 1.5674799155021315, + "language_loss": 0.82516587, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84878212, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12799072, + "step": 13767, + "time_per_iteration": 2.7424116134643555 + }, + { + "auxiliary_loss_clip": 0.01335286, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.22601438, + "balance_loss_mlp": 1.01540136, + "epoch": 0.8277769427326018, + "flos": 28482656085000.0, + "grad_norm": 1.535126120187132, + "language_loss": 0.68948525, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71311688, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12469482, + "step": 13768, + "time_per_iteration": 2.8672940731048584 + }, + { + "auxiliary_loss_clip": 0.01323037, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.21723306, + "balance_loss_mlp": 1.01558256, + "epoch": 0.8278370659852698, + "flos": 27203234311440.0, + "grad_norm": 1.9186379079671132, + "language_loss": 0.74573851, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76925004, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12536621, + "step": 13769, + "time_per_iteration": 2.913243532180786 + }, + { + "auxiliary_loss_clip": 0.01334651, + "auxiliary_loss_mlp": 0.01029036, + "balance_loss_clip": 1.2247014, + "balance_loss_mlp": 1.01613164, + "epoch": 0.8278971892379378, + "flos": 37823267134440.0, + "grad_norm": 1.7073379288549166, + "language_loss": 0.74475873, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76839566, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12908936, + "step": 13770, + "time_per_iteration": 2.885930299758911 + }, + { + "auxiliary_loss_clip": 0.01335319, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.22555709, + "balance_loss_mlp": 1.01803017, + "epoch": 0.8279573124906058, + "flos": 22564667543040.0, + "grad_norm": 1.6704234315807267, + "language_loss": 0.76039892, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.7840699, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13739014, + "step": 13771, + "time_per_iteration": 2.7474730014801025 + }, + { + "auxiliary_loss_clip": 0.01330747, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.22152376, + "balance_loss_mlp": 1.01780915, + "epoch": 0.8280174357432737, + "flos": 36071506631160.0, + "grad_norm": 1.5456679030835203, + "language_loss": 0.72631401, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74992532, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12573242, + "step": 13772, + "time_per_iteration": 2.8270866870880127 + }, + { + "auxiliary_loss_clip": 0.01319711, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.2135998, + "balance_loss_mlp": 1.016958, + "epoch": 0.8280775589959417, + "flos": 22965569521560.0, + "grad_norm": 2.317089200547393, + "language_loss": 0.74977529, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.77327442, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13238525, + "step": 13773, + "time_per_iteration": 2.770712375640869 + }, + { + "auxiliary_loss_clip": 0.01325474, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.21934962, + "balance_loss_mlp": 1.02027512, + "epoch": 0.8281376822486096, + "flos": 26068310374680.0, + "grad_norm": 1.672096142789642, + "language_loss": 0.76014948, + "learning_rate": 3.01824904601915e-07, + "loss": 0.7837289, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12194824, + "step": 13774, + "time_per_iteration": 2.739346981048584 + }, + { + "auxiliary_loss_clip": 0.01342598, + "auxiliary_loss_mlp": 0.01036323, + "balance_loss_clip": 1.22873497, + "balance_loss_mlp": 1.02313876, + "epoch": 0.8281978055012776, + "flos": 20672673080760.0, + "grad_norm": 1.6841789700065093, + "language_loss": 0.74928176, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77307099, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.13183594, + "step": 13775, + "time_per_iteration": 2.735758066177368 + }, + { + "auxiliary_loss_clip": 0.01343506, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.22926772, + "balance_loss_mlp": 1.01932263, + "epoch": 0.8282579287539455, + "flos": 29321356060080.0, + "grad_norm": 1.7620177720241323, + "language_loss": 0.74038553, + "learning_rate": 3.01413565459353e-07, + "loss": 0.76415598, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14215088, + "step": 13776, + "time_per_iteration": 2.796299457550049 + }, + { + "auxiliary_loss_clip": 0.01330579, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.2207222, + "balance_loss_mlp": 1.01695609, + "epoch": 0.8283180520066136, + "flos": 15710488688880.0, + "grad_norm": 2.255585164309452, + "language_loss": 0.77099741, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79459631, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12347412, + "step": 13777, + "time_per_iteration": 2.7176871299743652 + }, + { + "auxiliary_loss_clip": 0.01324788, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.22011662, + "balance_loss_mlp": 1.01620293, + "epoch": 0.8283781752592815, + "flos": 24797010273120.0, + "grad_norm": 1.5758503223298308, + "language_loss": 0.82475454, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84828067, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1161499, + "step": 13778, + "time_per_iteration": 2.891310691833496 + }, + { + "auxiliary_loss_clip": 0.01318276, + "auxiliary_loss_mlp": 0.01026943, + "balance_loss_clip": 1.21405828, + "balance_loss_mlp": 1.01477718, + "epoch": 0.8284382985119495, + "flos": 18986623549920.0, + "grad_norm": 1.6817668852161682, + "language_loss": 0.74184442, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76529658, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.1217041, + "step": 13779, + "time_per_iteration": 2.8012163639068604 + }, + { + "auxiliary_loss_clip": 0.01147274, + "auxiliary_loss_mlp": 0.0100919, + "balance_loss_clip": 1.10359788, + "balance_loss_mlp": 1.00648355, + "epoch": 0.8284984217646175, + "flos": 61051145991120.0, + "grad_norm": 0.7715884666886865, + "language_loss": 0.56719822, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58876282, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02709961, + "step": 13780, + "time_per_iteration": 3.2961666584014893 + }, + { + "auxiliary_loss_clip": 0.01326006, + "auxiliary_loss_mlp": 0.01023986, + "balance_loss_clip": 1.2190876, + "balance_loss_mlp": 1.01115298, + "epoch": 0.8285585450172854, + "flos": 19718818131960.0, + "grad_norm": 3.053983368583736, + "language_loss": 0.80162537, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82512522, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12835693, + "step": 13781, + "time_per_iteration": 2.936615467071533 + }, + { + "auxiliary_loss_clip": 0.01333814, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.22436166, + "balance_loss_mlp": 1.01897526, + "epoch": 0.8286186682699535, + "flos": 21693863336400.0, + "grad_norm": 2.0287528034135582, + "language_loss": 0.75930959, + "learning_rate": 3.001810941346543e-07, + "loss": 0.7829808, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.14343262, + "step": 13782, + "time_per_iteration": 2.8428854942321777 + }, + { + "auxiliary_loss_clip": 0.01332123, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.22260523, + "balance_loss_mlp": 1.0180763, + "epoch": 0.8286787915226214, + "flos": 25781101468920.0, + "grad_norm": 1.49072292153758, + "language_loss": 0.76285243, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78648293, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12854004, + "step": 13783, + "time_per_iteration": 2.817845106124878 + }, + { + "auxiliary_loss_clip": 0.01333576, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.22486448, + "balance_loss_mlp": 1.01436353, + "epoch": 0.8287389147752894, + "flos": 21293367441480.0, + "grad_norm": 1.5539373240941667, + "language_loss": 0.73954028, + "learning_rate": 2.997707859351304e-07, + "loss": 0.76314896, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12927246, + "step": 13784, + "time_per_iteration": 2.71470308303833 + }, + { + "auxiliary_loss_clip": 0.01334361, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.22250819, + "balance_loss_mlp": 1.02287674, + "epoch": 0.8287990380279573, + "flos": 33551142828120.0, + "grad_norm": 1.3959086786498232, + "language_loss": 0.70111519, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72483623, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14868164, + "step": 13785, + "time_per_iteration": 2.832357883453369 + }, + { + "auxiliary_loss_clip": 0.01328507, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.2199018, + "balance_loss_mlp": 1.01798165, + "epoch": 0.8288591612806253, + "flos": 22971538950480.0, + "grad_norm": 1.4288869781432356, + "language_loss": 0.6840831, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70767522, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12719727, + "step": 13786, + "time_per_iteration": 2.789469003677368 + }, + { + "auxiliary_loss_clip": 0.01339301, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.22628105, + "balance_loss_mlp": 1.02226448, + "epoch": 0.8289192845332932, + "flos": 18593721418320.0, + "grad_norm": 1.7109694269198883, + "language_loss": 0.77070451, + "learning_rate": 2.991558072017426e-07, + "loss": 0.79445672, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.13671875, + "step": 13787, + "time_per_iteration": 2.7498667240142822 + }, + { + "auxiliary_loss_clip": 0.01326165, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.21913958, + "balance_loss_mlp": 1.0215466, + "epoch": 0.8289794077859612, + "flos": 15454653064200.0, + "grad_norm": 1.746198221095269, + "language_loss": 0.80549049, + "learning_rate": 2.989509432726163e-07, + "loss": 0.8290894, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1217041, + "step": 13788, + "time_per_iteration": 4.248944282531738 + }, + { + "auxiliary_loss_clip": 0.01328897, + "auxiliary_loss_mlp": 0.01034588, + "balance_loss_clip": 1.22131991, + "balance_loss_mlp": 1.02214217, + "epoch": 0.8290395310386292, + "flos": 28883923538760.0, + "grad_norm": 1.6158421131718066, + "language_loss": 0.71274823, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73638308, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12438965, + "step": 13789, + "time_per_iteration": 5.810671329498291 + }, + { + "auxiliary_loss_clip": 0.01330788, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.22045553, + "balance_loss_mlp": 1.02013326, + "epoch": 0.8290996542912972, + "flos": 36583340313960.0, + "grad_norm": 1.7254516661618544, + "language_loss": 0.68287563, + "learning_rate": 2.985414089339813e-07, + "loss": 0.70651329, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1282959, + "step": 13790, + "time_per_iteration": 2.835062265396118 + }, + { + "auxiliary_loss_clip": 0.01330374, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.22031796, + "balance_loss_mlp": 1.01611602, + "epoch": 0.8291597775439651, + "flos": 23628276553680.0, + "grad_norm": 1.5665822007155703, + "language_loss": 0.77392447, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.7975328, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.14361572, + "step": 13791, + "time_per_iteration": 2.736481189727783 + }, + { + "auxiliary_loss_clip": 0.01323159, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.21745002, + "balance_loss_mlp": 1.01457965, + "epoch": 0.8292199007966331, + "flos": 21402471769560.0, + "grad_norm": 1.40767327467941, + "language_loss": 0.70005262, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72356164, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13165283, + "step": 13792, + "time_per_iteration": 2.7284293174743652 + }, + { + "auxiliary_loss_clip": 0.01335092, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.22431588, + "balance_loss_mlp": 1.01700187, + "epoch": 0.829280024049301, + "flos": 28773560351520.0, + "grad_norm": 2.184845375700832, + "language_loss": 0.65649867, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.68015105, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.1315918, + "step": 13793, + "time_per_iteration": 2.8444995880126953 + }, + { + "auxiliary_loss_clip": 0.01333022, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.22204757, + "balance_loss_mlp": 1.01908505, + "epoch": 0.829340147301969, + "flos": 19942874391960.0, + "grad_norm": 2.1448824531745228, + "language_loss": 0.66420507, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68786407, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13775635, + "step": 13794, + "time_per_iteration": 2.778961181640625 + }, + { + "auxiliary_loss_clip": 0.01333256, + "auxiliary_loss_mlp": 0.01031103, + "balance_loss_clip": 1.22349095, + "balance_loss_mlp": 1.01753736, + "epoch": 0.829400270554637, + "flos": 25234077319200.0, + "grad_norm": 1.8626353744715425, + "language_loss": 0.66058695, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68423057, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13574219, + "step": 13795, + "time_per_iteration": 4.238582134246826 + }, + { + "auxiliary_loss_clip": 0.01316419, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.21380007, + "balance_loss_mlp": 1.01877785, + "epoch": 0.829460393807305, + "flos": 24469128771840.0, + "grad_norm": 1.8745803625719, + "language_loss": 0.66796458, + "learning_rate": 2.973143546338661e-07, + "loss": 0.69143933, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.1227417, + "step": 13796, + "time_per_iteration": 2.8053884506225586 + }, + { + "auxiliary_loss_clip": 0.01327432, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.22059524, + "balance_loss_mlp": 1.01840401, + "epoch": 0.829520517059973, + "flos": 15126730954560.0, + "grad_norm": 1.6374958267916178, + "language_loss": 0.71608174, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73966753, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12744141, + "step": 13797, + "time_per_iteration": 2.7448959350585938 + }, + { + "auxiliary_loss_clip": 0.01327374, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.21924341, + "balance_loss_mlp": 1.01644456, + "epoch": 0.8295806403126409, + "flos": 21585002658480.0, + "grad_norm": 3.6998482295409505, + "language_loss": 0.72185749, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74541438, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.11889648, + "step": 13798, + "time_per_iteration": 2.73121976852417 + }, + { + "auxiliary_loss_clip": 0.01316147, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.21254301, + "balance_loss_mlp": 1.01557732, + "epoch": 0.8296407635653089, + "flos": 21731693346720.0, + "grad_norm": 1.5816548254638791, + "language_loss": 0.76275551, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78619844, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.12567139, + "step": 13799, + "time_per_iteration": 2.896885871887207 + }, + { + "auxiliary_loss_clip": 0.01320974, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.2146976, + "balance_loss_mlp": 1.0173589, + "epoch": 0.8297008868179768, + "flos": 11184762217680.0, + "grad_norm": 2.0579649230304904, + "language_loss": 0.67272687, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69623315, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12304688, + "step": 13800, + "time_per_iteration": 2.819920778274536 + }, + { + "auxiliary_loss_clip": 0.01336813, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.22566903, + "balance_loss_mlp": 1.01964986, + "epoch": 0.8297610100706448, + "flos": 20668449811320.0, + "grad_norm": 1.6899640515991858, + "language_loss": 0.74902868, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.77273262, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13946533, + "step": 13801, + "time_per_iteration": 2.8692729473114014 + }, + { + "auxiliary_loss_clip": 0.01329782, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.21958923, + "balance_loss_mlp": 1.02120662, + "epoch": 0.8298211333233128, + "flos": 20381240905560.0, + "grad_norm": 1.4438326698208472, + "language_loss": 0.73852324, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.76215696, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12402344, + "step": 13802, + "time_per_iteration": 2.7360830307006836 + }, + { + "auxiliary_loss_clip": 0.01327685, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.21893775, + "balance_loss_mlp": 1.01921034, + "epoch": 0.8298812565759808, + "flos": 21513931382520.0, + "grad_norm": 1.6472205616019882, + "language_loss": 0.75178075, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77537274, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1229248, + "step": 13803, + "time_per_iteration": 2.810546875 + }, + { + "auxiliary_loss_clip": 0.01326397, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.21790612, + "balance_loss_mlp": 1.02115786, + "epoch": 0.8299413798286487, + "flos": 22823630011440.0, + "grad_norm": 1.7350723322134964, + "language_loss": 0.79437971, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81798363, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12835693, + "step": 13804, + "time_per_iteration": 2.7306056022644043 + }, + { + "auxiliary_loss_clip": 0.01329919, + "auxiliary_loss_mlp": 0.01024544, + "balance_loss_clip": 1.22263646, + "balance_loss_mlp": 1.01237273, + "epoch": 0.8300015030813167, + "flos": 29685199587120.0, + "grad_norm": 1.6082087610846936, + "language_loss": 0.73376322, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75730783, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.12182617, + "step": 13805, + "time_per_iteration": 2.8711678981781006 + }, + { + "auxiliary_loss_clip": 0.01331849, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.22165048, + "balance_loss_mlp": 1.01996136, + "epoch": 0.8300616263339846, + "flos": 19724300260560.0, + "grad_norm": 2.4759427571028456, + "language_loss": 0.77955043, + "learning_rate": 2.952744302396906e-07, + "loss": 0.80319333, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12487793, + "step": 13806, + "time_per_iteration": 2.7987585067749023 + }, + { + "auxiliary_loss_clip": 0.01333043, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.22100425, + "balance_loss_mlp": 1.01919532, + "epoch": 0.8301217495866526, + "flos": 19906790541120.0, + "grad_norm": 2.5816061274584157, + "language_loss": 0.63721371, + "learning_rate": 2.950707932112444e-07, + "loss": 0.66087103, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13470459, + "step": 13807, + "time_per_iteration": 2.809408664703369 + }, + { + "auxiliary_loss_clip": 0.01328933, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.22216845, + "balance_loss_mlp": 1.01541066, + "epoch": 0.8301818728393207, + "flos": 19720117599480.0, + "grad_norm": 10.333203508256558, + "language_loss": 0.73313689, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75670898, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12872314, + "step": 13808, + "time_per_iteration": 2.769650459289551 + }, + { + "auxiliary_loss_clip": 0.01340133, + "auxiliary_loss_mlp": 0.01041553, + "balance_loss_clip": 1.22722733, + "balance_loss_mlp": 1.02700305, + "epoch": 0.8302419960919886, + "flos": 28299597287400.0, + "grad_norm": 1.7095391180380346, + "language_loss": 0.67270195, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.69651878, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14532471, + "step": 13809, + "time_per_iteration": 2.837662696838379 + }, + { + "auxiliary_loss_clip": 0.01328555, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.21950328, + "balance_loss_mlp": 1.01242876, + "epoch": 0.8303021193446566, + "flos": 18228050515080.0, + "grad_norm": 2.3299307388816066, + "language_loss": 0.74266833, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76620317, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12512207, + "step": 13810, + "time_per_iteration": 2.869685649871826 + }, + { + "auxiliary_loss_clip": 0.01321464, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.21697128, + "balance_loss_mlp": 1.01872349, + "epoch": 0.8303622425973245, + "flos": 23116524087600.0, + "grad_norm": 1.4883558710797975, + "language_loss": 0.81440127, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83792603, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12286377, + "step": 13811, + "time_per_iteration": 2.7084317207336426 + }, + { + "auxiliary_loss_clip": 0.01331108, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.22143769, + "balance_loss_mlp": 1.02184606, + "epoch": 0.8304223658499925, + "flos": 19577568963960.0, + "grad_norm": 1.8410158208749163, + "language_loss": 0.73676181, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.76041782, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12640381, + "step": 13812, + "time_per_iteration": 2.7453527450561523 + }, + { + "auxiliary_loss_clip": 0.01316113, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.21128738, + "balance_loss_mlp": 1.02319908, + "epoch": 0.8304824891026604, + "flos": 24431542411680.0, + "grad_norm": 1.6163780568656252, + "language_loss": 0.78407216, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80759209, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12689209, + "step": 13813, + "time_per_iteration": 2.8396780490875244 + }, + { + "auxiliary_loss_clip": 0.0133434, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.22357607, + "balance_loss_mlp": 1.01439619, + "epoch": 0.8305426123553284, + "flos": 22386806615520.0, + "grad_norm": 1.8230875568424103, + "language_loss": 0.71005863, + "learning_rate": 2.93647144674658e-07, + "loss": 0.73368621, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14013672, + "step": 13814, + "time_per_iteration": 2.704775333404541 + }, + { + "auxiliary_loss_clip": 0.01350208, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.23232675, + "balance_loss_mlp": 1.01846063, + "epoch": 0.8306027356079964, + "flos": 14907791347920.0, + "grad_norm": 2.125607884042678, + "language_loss": 0.6817717, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70560694, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.1484375, + "step": 13815, + "time_per_iteration": 2.718951463699341 + }, + { + "auxiliary_loss_clip": 0.01330486, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.22270799, + "balance_loss_mlp": 1.01818752, + "epoch": 0.8306628588606644, + "flos": 19649127540240.0, + "grad_norm": 1.763308763208901, + "language_loss": 0.76132089, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.7849369, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12921143, + "step": 13816, + "time_per_iteration": 2.7146308422088623 + }, + { + "auxiliary_loss_clip": 0.01328225, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.22039962, + "balance_loss_mlp": 1.02073836, + "epoch": 0.8307229821133323, + "flos": 24395174302320.0, + "grad_norm": 1.7043094216684778, + "language_loss": 0.82096028, + "learning_rate": 2.930379800094371e-07, + "loss": 0.84457457, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12451172, + "step": 13817, + "time_per_iteration": 2.786036968231201 + }, + { + "auxiliary_loss_clip": 0.01334134, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.22355843, + "balance_loss_mlp": 1.02228916, + "epoch": 0.8307831053660003, + "flos": 21001975874640.0, + "grad_norm": 1.4317320746776125, + "language_loss": 0.78376174, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80746841, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.14245605, + "step": 13818, + "time_per_iteration": 2.784198522567749 + }, + { + "auxiliary_loss_clip": 0.01335406, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.22608376, + "balance_loss_mlp": 1.02124596, + "epoch": 0.8308432286186682, + "flos": 21402268727760.0, + "grad_norm": 1.7327971141761898, + "language_loss": 0.81786907, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84156597, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13049316, + "step": 13819, + "time_per_iteration": 2.8190441131591797 + }, + { + "auxiliary_loss_clip": 0.01144542, + "auxiliary_loss_mlp": 0.0100105, + "balance_loss_clip": 1.10131061, + "balance_loss_mlp": 0.99836749, + "epoch": 0.8309033518713362, + "flos": 62546201112960.0, + "grad_norm": 0.7640574450904414, + "language_loss": 0.56268787, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58414376, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02685547, + "step": 13820, + "time_per_iteration": 3.2265114784240723 + }, + { + "auxiliary_loss_clip": 0.01316562, + "auxiliary_loss_mlp": 0.01023598, + "balance_loss_clip": 1.21096623, + "balance_loss_mlp": 1.01093757, + "epoch": 0.8309634751240043, + "flos": 16983047649600.0, + "grad_norm": 2.03540181229891, + "language_loss": 0.68664539, + "learning_rate": 2.922266666860831e-07, + "loss": 0.71004701, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12664795, + "step": 13821, + "time_per_iteration": 2.8297476768493652 + }, + { + "auxiliary_loss_clip": 0.01335446, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.22298813, + "balance_loss_mlp": 1.0198406, + "epoch": 0.8310235983766722, + "flos": 22679619474960.0, + "grad_norm": 2.1151432661109193, + "language_loss": 0.69249433, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71617985, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13275146, + "step": 13822, + "time_per_iteration": 2.790590286254883 + }, + { + "auxiliary_loss_clip": 0.01325614, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.21920526, + "balance_loss_mlp": 1.0181942, + "epoch": 0.8310837216293402, + "flos": 30817768239000.0, + "grad_norm": 1.9096850936534904, + "language_loss": 0.62395567, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64751959, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12591553, + "step": 13823, + "time_per_iteration": 2.819387674331665 + }, + { + "auxiliary_loss_clip": 0.01144363, + "auxiliary_loss_mlp": 0.01006345, + "balance_loss_clip": 1.10191679, + "balance_loss_mlp": 1.00359154, + "epoch": 0.8311438448820081, + "flos": 71292675669120.0, + "grad_norm": 0.8654015595644099, + "language_loss": 0.62027591, + "learning_rate": 2.916188616354669e-07, + "loss": 0.641783, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02758789, + "step": 13824, + "time_per_iteration": 3.3292722702026367 + }, + { + "auxiliary_loss_clip": 0.01331928, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.22349215, + "balance_loss_mlp": 1.02235556, + "epoch": 0.8312039681346761, + "flos": 20892059379360.0, + "grad_norm": 1.6098888224653756, + "language_loss": 0.74281251, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76648563, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13031006, + "step": 13825, + "time_per_iteration": 2.8101885318756104 + }, + { + "auxiliary_loss_clip": 0.01334441, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.22391224, + "balance_loss_mlp": 1.01776218, + "epoch": 0.831264091387344, + "flos": 17021730435480.0, + "grad_norm": 1.824423902196964, + "language_loss": 0.80172312, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82537544, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13024902, + "step": 13826, + "time_per_iteration": 2.8933517932891846 + }, + { + "auxiliary_loss_clip": 0.01331465, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.22255611, + "balance_loss_mlp": 1.01607847, + "epoch": 0.831324214640012, + "flos": 24424151690160.0, + "grad_norm": 1.5117940530024518, + "language_loss": 0.68086791, + "learning_rate": 2.910116396226914e-07, + "loss": 0.7044695, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12615967, + "step": 13827, + "time_per_iteration": 4.234933614730835 + }, + { + "auxiliary_loss_clip": 0.01325117, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.21805358, + "balance_loss_mlp": 1.01791036, + "epoch": 0.83138433789268, + "flos": 13548567501000.0, + "grad_norm": 1.7503153430990002, + "language_loss": 0.74538261, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.76893139, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11859131, + "step": 13828, + "time_per_iteration": 5.760280132293701 + }, + { + "auxiliary_loss_clip": 0.01334697, + "auxiliary_loss_mlp": 0.01036982, + "balance_loss_clip": 1.22458887, + "balance_loss_mlp": 1.02460194, + "epoch": 0.831444461145348, + "flos": 44500316011560.0, + "grad_norm": 1.649978165217186, + "language_loss": 0.67936134, + "learning_rate": 2.906071489597657e-07, + "loss": 0.70307815, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12384033, + "step": 13829, + "time_per_iteration": 3.0384206771850586 + }, + { + "auxiliary_loss_clip": 0.01338892, + "auxiliary_loss_mlp": 0.010272, + "balance_loss_clip": 1.22701335, + "balance_loss_mlp": 1.01418853, + "epoch": 0.8315045843980159, + "flos": 22709409030000.0, + "grad_norm": 1.7680188318024679, + "language_loss": 0.82855058, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.85221148, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13012695, + "step": 13830, + "time_per_iteration": 2.8160006999969482 + }, + { + "auxiliary_loss_clip": 0.01326481, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.21988261, + "balance_loss_mlp": 1.01869786, + "epoch": 0.8315647076506839, + "flos": 16877922940800.0, + "grad_norm": 2.032263776961619, + "language_loss": 0.74625868, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76983649, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12609863, + "step": 13831, + "time_per_iteration": 2.728442907333374 + }, + { + "auxiliary_loss_clip": 0.01333412, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.22485256, + "balance_loss_mlp": 1.02087271, + "epoch": 0.8316248309033518, + "flos": 13812768447840.0, + "grad_norm": 1.65142449430114, + "language_loss": 0.71398365, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73765719, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.13067627, + "step": 13832, + "time_per_iteration": 2.6885595321655273 + }, + { + "auxiliary_loss_clip": 0.01327452, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.21963453, + "balance_loss_mlp": 1.01476002, + "epoch": 0.8316849541560198, + "flos": 23517629107920.0, + "grad_norm": 1.6148133155855802, + "language_loss": 0.84617579, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86972427, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12652588, + "step": 13833, + "time_per_iteration": 2.8371543884277344 + }, + { + "auxiliary_loss_clip": 0.01335366, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.22462368, + "balance_loss_mlp": 1.01874757, + "epoch": 0.8317450774086879, + "flos": 23776957051560.0, + "grad_norm": 1.4148999315856456, + "language_loss": 0.76475298, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78842539, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13110352, + "step": 13834, + "time_per_iteration": 4.262632131576538 + }, + { + "auxiliary_loss_clip": 0.01326344, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.22058821, + "balance_loss_mlp": 1.0182929, + "epoch": 0.8318052006613558, + "flos": 16219723436640.0, + "grad_norm": 1.8092673785646438, + "language_loss": 0.7979874, + "learning_rate": 2.893952329045459e-07, + "loss": 0.82155961, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12573242, + "step": 13835, + "time_per_iteration": 2.683323860168457 + }, + { + "auxiliary_loss_clip": 0.01339772, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.2289058, + "balance_loss_mlp": 1.01748812, + "epoch": 0.8318653239140238, + "flos": 19979364326400.0, + "grad_norm": 1.763008377228907, + "language_loss": 0.80773556, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.83144802, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13983154, + "step": 13836, + "time_per_iteration": 2.839162588119507 + }, + { + "auxiliary_loss_clip": 0.0132706, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.22108436, + "balance_loss_mlp": 1.0172677, + "epoch": 0.8319254471666917, + "flos": 17708216985360.0, + "grad_norm": 1.7754615174987358, + "language_loss": 0.77643824, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.80000949, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12792969, + "step": 13837, + "time_per_iteration": 2.687803030014038 + }, + { + "auxiliary_loss_clip": 0.0133884, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.22598195, + "balance_loss_mlp": 1.01530814, + "epoch": 0.8319855704193597, + "flos": 19541282071320.0, + "grad_norm": 1.676259710165636, + "language_loss": 0.83621144, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85989797, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14483643, + "step": 13838, + "time_per_iteration": 2.749168634414673 + }, + { + "auxiliary_loss_clip": 0.01329504, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.22276771, + "balance_loss_mlp": 1.02042282, + "epoch": 0.8320456936720276, + "flos": 21182963645880.0, + "grad_norm": 1.6603326352936778, + "language_loss": 0.74556416, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76919496, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.13146973, + "step": 13839, + "time_per_iteration": 2.7313456535339355 + }, + { + "auxiliary_loss_clip": 0.01332506, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.22338343, + "balance_loss_mlp": 1.01784694, + "epoch": 0.8321058169246957, + "flos": 33256827459360.0, + "grad_norm": 1.3584053016909075, + "language_loss": 0.67749608, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70113468, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13500977, + "step": 13840, + "time_per_iteration": 2.9190025329589844 + }, + { + "auxiliary_loss_clip": 0.01333, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.22346699, + "balance_loss_mlp": 1.01550198, + "epoch": 0.8321659401773636, + "flos": 14211964875240.0, + "grad_norm": 2.3392835510222385, + "language_loss": 0.79561996, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81923711, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13208008, + "step": 13841, + "time_per_iteration": 2.844656229019165 + }, + { + "auxiliary_loss_clip": 0.0132988, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.22278881, + "balance_loss_mlp": 1.01804543, + "epoch": 0.8322260634300316, + "flos": 15162408721800.0, + "grad_norm": 2.3216533643249395, + "language_loss": 0.68346107, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70707071, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13043213, + "step": 13842, + "time_per_iteration": 2.7763609886169434 + }, + { + "auxiliary_loss_clip": 0.01333662, + "auxiliary_loss_mlp": 0.01034966, + "balance_loss_clip": 1.22697675, + "balance_loss_mlp": 1.02120328, + "epoch": 0.8322861866826995, + "flos": 25306245020880.0, + "grad_norm": 1.6743416769687278, + "language_loss": 0.72902608, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75271237, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13763428, + "step": 13843, + "time_per_iteration": 2.8630940914154053 + }, + { + "auxiliary_loss_clip": 0.01325677, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.2191844, + "balance_loss_mlp": 1.01715517, + "epoch": 0.8323463099353675, + "flos": 17023801461840.0, + "grad_norm": 1.7688128286076876, + "language_loss": 0.77792311, + "learning_rate": 2.875817378128975e-07, + "loss": 0.80148351, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13214111, + "step": 13844, + "time_per_iteration": 2.771195650100708 + }, + { + "auxiliary_loss_clip": 0.01144243, + "auxiliary_loss_mlp": 0.01009169, + "balance_loss_clip": 1.10095644, + "balance_loss_mlp": 1.00645089, + "epoch": 0.8324064331880354, + "flos": 55620382465800.0, + "grad_norm": 0.8312874857283313, + "language_loss": 0.5521102, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57364428, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02722168, + "step": 13845, + "time_per_iteration": 3.237637758255005 + }, + { + "auxiliary_loss_clip": 0.01335248, + "auxiliary_loss_mlp": 0.0103966, + "balance_loss_clip": 1.22518182, + "balance_loss_mlp": 1.02648163, + "epoch": 0.8324665564407034, + "flos": 26143970395320.0, + "grad_norm": 2.271197900042131, + "language_loss": 0.7602815, + "learning_rate": 2.871794529934555e-07, + "loss": 0.78403056, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13171387, + "step": 13846, + "time_per_iteration": 2.7903494834899902 + }, + { + "auxiliary_loss_clip": 0.01340099, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.22677016, + "balance_loss_mlp": 1.01633072, + "epoch": 0.8325266796933715, + "flos": 22053605419080.0, + "grad_norm": 1.8579541011977292, + "language_loss": 0.79264092, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.81634283, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13757324, + "step": 13847, + "time_per_iteration": 2.8580827713012695 + }, + { + "auxiliary_loss_clip": 0.01326744, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.22101259, + "balance_loss_mlp": 1.01499748, + "epoch": 0.8325868029460394, + "flos": 22821315334920.0, + "grad_norm": 1.593503397323348, + "language_loss": 0.74724293, + "learning_rate": 2.867774279753175e-07, + "loss": 0.77078176, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.121521, + "step": 13848, + "time_per_iteration": 2.862557888031006 + }, + { + "auxiliary_loss_clip": 0.01329573, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.22159243, + "balance_loss_mlp": 1.01489687, + "epoch": 0.8326469261987074, + "flos": 14761628568360.0, + "grad_norm": 2.1276456794516894, + "language_loss": 0.64295608, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.66653335, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13244629, + "step": 13849, + "time_per_iteration": 2.8182218074798584 + }, + { + "auxiliary_loss_clip": 0.01335196, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.22464502, + "balance_loss_mlp": 1.01894498, + "epoch": 0.8327070494513753, + "flos": 22930825746600.0, + "grad_norm": 2.1509764673098424, + "language_loss": 0.79920805, + "learning_rate": 2.863756628194638e-07, + "loss": 0.82287967, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13024902, + "step": 13850, + "time_per_iteration": 2.832118272781372 + }, + { + "auxiliary_loss_clip": 0.01318374, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.21459293, + "balance_loss_mlp": 1.01684129, + "epoch": 0.8327671727040433, + "flos": 20669911712280.0, + "grad_norm": 1.657716924776357, + "language_loss": 0.78501832, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80849135, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 1.03759766, + "router_z_loss_mlp": 0.12084961, + "step": 13851, + "time_per_iteration": 2.837057113647461 + }, + { + "auxiliary_loss_clip": 0.01144842, + "auxiliary_loss_mlp": 0.01000947, + "balance_loss_clip": 1.10106993, + "balance_loss_mlp": 0.99824089, + "epoch": 0.8328272959567112, + "flos": 56075276581920.0, + "grad_norm": 0.7645409233677495, + "language_loss": 0.55888116, + "learning_rate": 2.859741575868344e-07, + "loss": 0.58033907, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02709961, + "step": 13852, + "time_per_iteration": 3.2338104248046875 + }, + { + "auxiliary_loss_clip": 0.01326134, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.22054899, + "balance_loss_mlp": 1.0129658, + "epoch": 0.8328874192093793, + "flos": 32309023156200.0, + "grad_norm": 1.5124540187060835, + "language_loss": 0.6740644, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69758016, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12481689, + "step": 13853, + "time_per_iteration": 2.9043872356414795 + }, + { + "auxiliary_loss_clip": 0.01327328, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.21919751, + "balance_loss_mlp": 1.01811695, + "epoch": 0.8329475424620472, + "flos": 23517832149720.0, + "grad_norm": 1.5964749423540565, + "language_loss": 0.78658152, + "learning_rate": 2.855729123383286e-07, + "loss": 0.81015933, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12341309, + "step": 13854, + "time_per_iteration": 2.8509397506713867 + }, + { + "auxiliary_loss_clip": 0.01142041, + "auxiliary_loss_mlp": 0.01001189, + "balance_loss_clip": 1.09843969, + "balance_loss_mlp": 0.99865037, + "epoch": 0.8330076657147152, + "flos": 67856083885800.0, + "grad_norm": 0.7677475982165962, + "language_loss": 0.58741432, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60884666, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02539062, + "step": 13855, + "time_per_iteration": 3.13802433013916 + }, + { + "auxiliary_loss_clip": 0.01329272, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.22223115, + "balance_loss_mlp": 1.01454246, + "epoch": 0.8330677889673831, + "flos": 22897909347840.0, + "grad_norm": 2.09426859845419, + "language_loss": 0.71919221, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.74276024, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12994385, + "step": 13856, + "time_per_iteration": 2.7699718475341797 + }, + { + "auxiliary_loss_clip": 0.01330721, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.2222991, + "balance_loss_mlp": 1.01761508, + "epoch": 0.8331279122200511, + "flos": 27350858991960.0, + "grad_norm": 1.561825774381853, + "language_loss": 0.76091707, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.78452659, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12615967, + "step": 13857, + "time_per_iteration": 2.926931858062744 + }, + { + "auxiliary_loss_clip": 0.01321061, + "auxiliary_loss_mlp": 0.01024216, + "balance_loss_clip": 1.21726573, + "balance_loss_mlp": 1.01226509, + "epoch": 0.833188035472719, + "flos": 19942996217040.0, + "grad_norm": 6.305445168446271, + "language_loss": 0.73527372, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75872648, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.11956787, + "step": 13858, + "time_per_iteration": 2.715430974960327 + }, + { + "auxiliary_loss_clip": 0.01338836, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.22606647, + "balance_loss_mlp": 1.01970124, + "epoch": 0.833248158725387, + "flos": 15236972316720.0, + "grad_norm": 1.831040932534898, + "language_loss": 0.73583436, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75955522, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13555908, + "step": 13859, + "time_per_iteration": 2.7538204193115234 + }, + { + "auxiliary_loss_clip": 0.01323212, + "auxiliary_loss_mlp": 0.01026176, + "balance_loss_clip": 1.21868849, + "balance_loss_mlp": 1.01414824, + "epoch": 0.8333082819780551, + "flos": 24540971606640.0, + "grad_norm": 1.4757856533635747, + "language_loss": 0.79338622, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81688011, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12023926, + "step": 13860, + "time_per_iteration": 2.917778730392456 + }, + { + "auxiliary_loss_clip": 0.01323953, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.21787786, + "balance_loss_mlp": 1.01568627, + "epoch": 0.833368405230723, + "flos": 31473693675000.0, + "grad_norm": 2.414099598027101, + "language_loss": 0.82372677, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84725308, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.13012695, + "step": 13861, + "time_per_iteration": 2.94012188911438 + }, + { + "auxiliary_loss_clip": 0.01333982, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.22462296, + "balance_loss_mlp": 1.01755774, + "epoch": 0.833428528483391, + "flos": 14906816747280.0, + "grad_norm": 1.7289851855188132, + "language_loss": 0.79346234, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81711018, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.13256836, + "step": 13862, + "time_per_iteration": 2.8290741443634033 + }, + { + "auxiliary_loss_clip": 0.01333649, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.22295618, + "balance_loss_mlp": 1.01776528, + "epoch": 0.8334886517360589, + "flos": 22204884852000.0, + "grad_norm": 1.9386336325079407, + "language_loss": 0.75560653, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77924883, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.1282959, + "step": 13863, + "time_per_iteration": 2.7738938331604004 + }, + { + "auxiliary_loss_clip": 0.01318887, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.21567988, + "balance_loss_mlp": 1.01586533, + "epoch": 0.8335487749887269, + "flos": 30378670774920.0, + "grad_norm": 2.0950528573409812, + "language_loss": 0.74896765, + "learning_rate": 2.835705879864232e-07, + "loss": 0.77243674, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.12139893, + "step": 13864, + "time_per_iteration": 4.280742168426514 + }, + { + "auxiliary_loss_clip": 0.01330686, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.22144186, + "balance_loss_mlp": 1.01988113, + "epoch": 0.8336088982413948, + "flos": 24686565869160.0, + "grad_norm": 1.7109913777962051, + "language_loss": 0.69556856, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71921504, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.14093018, + "step": 13865, + "time_per_iteration": 2.880406618118286 + }, + { + "auxiliary_loss_clip": 0.01329448, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.2212038, + "balance_loss_mlp": 1.02016044, + "epoch": 0.8336690214940629, + "flos": 38183902601040.0, + "grad_norm": 1.542310881089739, + "language_loss": 0.75519866, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77882373, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12896729, + "step": 13866, + "time_per_iteration": 4.4387288093566895 + }, + { + "auxiliary_loss_clip": 0.01143846, + "auxiliary_loss_mlp": 0.01008094, + "balance_loss_clip": 1.10071492, + "balance_loss_mlp": 1.00532866, + "epoch": 0.8337291447467308, + "flos": 55576786068360.0, + "grad_norm": 1.1085542060808407, + "language_loss": 0.63230407, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65382349, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02770996, + "step": 13867, + "time_per_iteration": 4.613127946853638 + }, + { + "auxiliary_loss_clip": 0.01324016, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.21926808, + "balance_loss_mlp": 1.01883888, + "epoch": 0.8337892679993988, + "flos": 24138851377320.0, + "grad_norm": 1.7226340504723452, + "language_loss": 0.72545666, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74900222, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11694336, + "step": 13868, + "time_per_iteration": 2.7836482524871826 + }, + { + "auxiliary_loss_clip": 0.01324785, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.21748638, + "balance_loss_mlp": 1.01825023, + "epoch": 0.8338493912520667, + "flos": 28189802617200.0, + "grad_norm": 1.5045679087032342, + "language_loss": 0.80711687, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.83067954, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13238525, + "step": 13869, + "time_per_iteration": 3.0077271461486816 + }, + { + "auxiliary_loss_clip": 0.01325993, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.21907592, + "balance_loss_mlp": 1.02254987, + "epoch": 0.8339095145047347, + "flos": 22162750355520.0, + "grad_norm": 1.4928445147589382, + "language_loss": 0.82576388, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84937656, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12738037, + "step": 13870, + "time_per_iteration": 2.7337520122528076 + }, + { + "auxiliary_loss_clip": 0.01335243, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.22369611, + "balance_loss_mlp": 1.01487494, + "epoch": 0.8339696377574026, + "flos": 17311050975960.0, + "grad_norm": 3.1954826973368053, + "language_loss": 0.71473503, + "learning_rate": 2.821728331750264e-07, + "loss": 0.73837584, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13952637, + "step": 13871, + "time_per_iteration": 2.8652966022491455 + }, + { + "auxiliary_loss_clip": 0.01323522, + "auxiliary_loss_mlp": 0.01031673, + "balance_loss_clip": 1.21807849, + "balance_loss_mlp": 1.0185895, + "epoch": 0.8340297610100706, + "flos": 20673322814520.0, + "grad_norm": 2.6018566134175787, + "language_loss": 0.69371367, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.71726561, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13092041, + "step": 13872, + "time_per_iteration": 2.7191736698150635 + }, + { + "auxiliary_loss_clip": 0.01328141, + "auxiliary_loss_mlp": 0.01025576, + "balance_loss_clip": 1.22012568, + "balance_loss_mlp": 1.0135715, + "epoch": 0.8340898842627387, + "flos": 20519485054920.0, + "grad_norm": 1.8605081450762102, + "language_loss": 0.73501205, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75854921, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12005615, + "step": 13873, + "time_per_iteration": 4.252940893173218 + }, + { + "auxiliary_loss_clip": 0.01334739, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.22388554, + "balance_loss_mlp": 1.01707578, + "epoch": 0.8341500075154066, + "flos": 21429784214640.0, + "grad_norm": 2.383848182042397, + "language_loss": 0.7576828, + "learning_rate": 2.81574772350013e-07, + "loss": 0.78134739, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14642334, + "step": 13874, + "time_per_iteration": 2.7364842891693115 + }, + { + "auxiliary_loss_clip": 0.01323874, + "auxiliary_loss_mlp": 0.01025734, + "balance_loss_clip": 1.21823382, + "balance_loss_mlp": 1.01374722, + "epoch": 0.8342101307680746, + "flos": 22096105390800.0, + "grad_norm": 2.45050203065446, + "language_loss": 0.66271138, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68620741, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11981201, + "step": 13875, + "time_per_iteration": 2.7513198852539062 + }, + { + "auxiliary_loss_clip": 0.01325696, + "auxiliary_loss_mlp": 0.01034122, + "balance_loss_clip": 1.21924162, + "balance_loss_mlp": 1.02174807, + "epoch": 0.8342702540207425, + "flos": 21876475442040.0, + "grad_norm": 1.6678137018657335, + "language_loss": 0.79716378, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.82076198, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12365723, + "step": 13876, + "time_per_iteration": 2.825424909591675 + }, + { + "auxiliary_loss_clip": 0.01322853, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.21692026, + "balance_loss_mlp": 1.02141857, + "epoch": 0.8343303772734105, + "flos": 22533619128840.0, + "grad_norm": 1.9018797389364597, + "language_loss": 0.87544262, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89901423, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12902832, + "step": 13877, + "time_per_iteration": 2.7505979537963867 + }, + { + "auxiliary_loss_clip": 0.01323958, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.21636677, + "balance_loss_mlp": 1.01722348, + "epoch": 0.8343905005260784, + "flos": 14943956415480.0, + "grad_norm": 1.7927480649545107, + "language_loss": 0.69797999, + "learning_rate": 2.807782702318828e-07, + "loss": 0.72151315, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12121582, + "step": 13878, + "time_per_iteration": 2.7755112648010254 + }, + { + "auxiliary_loss_clip": 0.01328069, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.21986055, + "balance_loss_mlp": 1.01769006, + "epoch": 0.8344506237787465, + "flos": 15016611417480.0, + "grad_norm": 2.3553149995379337, + "language_loss": 0.79658616, + "learning_rate": 2.805793076661309e-07, + "loss": 0.82016921, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12542725, + "step": 13879, + "time_per_iteration": 2.751281261444092 + }, + { + "auxiliary_loss_clip": 0.01323373, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.21667457, + "balance_loss_mlp": 1.01879609, + "epoch": 0.8345107470314144, + "flos": 17564084623800.0, + "grad_norm": 1.9870148390375564, + "language_loss": 0.83249134, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85602927, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11633301, + "step": 13880, + "time_per_iteration": 2.7677817344665527 + }, + { + "auxiliary_loss_clip": 0.0133254, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.22266126, + "balance_loss_mlp": 1.01895976, + "epoch": 0.8345708702840824, + "flos": 25192145864520.0, + "grad_norm": 2.0065421110792756, + "language_loss": 0.78346145, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80710173, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12518311, + "step": 13881, + "time_per_iteration": 2.854046106338501 + }, + { + "auxiliary_loss_clip": 0.01316239, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.21359158, + "balance_loss_mlp": 1.01667595, + "epoch": 0.8346309935367503, + "flos": 15087479651640.0, + "grad_norm": 2.717441939567261, + "language_loss": 0.78506947, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80851543, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.11676025, + "step": 13882, + "time_per_iteration": 2.73073148727417 + }, + { + "auxiliary_loss_clip": 0.0132503, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.21818447, + "balance_loss_mlp": 1.0235163, + "epoch": 0.8346911167894183, + "flos": 22935901791600.0, + "grad_norm": 1.7357975437512967, + "language_loss": 0.8049587, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82857168, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12762451, + "step": 13883, + "time_per_iteration": 2.8762049674987793 + }, + { + "auxiliary_loss_clip": 0.01328016, + "auxiliary_loss_mlp": 0.01029832, + "balance_loss_clip": 1.21826959, + "balance_loss_mlp": 1.01655257, + "epoch": 0.8347512400420862, + "flos": 20197816632720.0, + "grad_norm": 1.9134376007186944, + "language_loss": 0.74307597, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76665443, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13287354, + "step": 13884, + "time_per_iteration": 2.711977958679199 + }, + { + "auxiliary_loss_clip": 0.0134383, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.22726285, + "balance_loss_mlp": 1.02391505, + "epoch": 0.8348113632947542, + "flos": 25959977605440.0, + "grad_norm": 1.904826528343848, + "language_loss": 0.70667017, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.73049212, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.14471436, + "step": 13885, + "time_per_iteration": 2.8475754261016846 + }, + { + "auxiliary_loss_clip": 0.01329305, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.21993208, + "balance_loss_mlp": 1.01815295, + "epoch": 0.8348714865474223, + "flos": 34211291533560.0, + "grad_norm": 1.6068167813839065, + "language_loss": 0.70689672, + "learning_rate": 2.791883957449912e-07, + "loss": 0.73050439, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13311768, + "step": 13886, + "time_per_iteration": 2.832679510116577 + }, + { + "auxiliary_loss_clip": 0.0132884, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.22217298, + "balance_loss_mlp": 1.01513278, + "epoch": 0.8349316098000902, + "flos": 24395620994280.0, + "grad_norm": 2.2385273128133973, + "language_loss": 0.79814696, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.82171082, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12420654, + "step": 13887, + "time_per_iteration": 2.799086570739746 + }, + { + "auxiliary_loss_clip": 0.01337615, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.2253716, + "balance_loss_mlp": 1.01643741, + "epoch": 0.8349917330527582, + "flos": 23037087489480.0, + "grad_norm": 2.2355129127162945, + "language_loss": 0.64821523, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.67189884, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.14294434, + "step": 13888, + "time_per_iteration": 2.9743621349334717 + }, + { + "auxiliary_loss_clip": 0.01333307, + "auxiliary_loss_mlp": 0.01026176, + "balance_loss_clip": 1.22194004, + "balance_loss_mlp": 1.01330185, + "epoch": 0.8350518563054261, + "flos": 13629831475320.0, + "grad_norm": 2.0684490235791406, + "language_loss": 0.67875123, + "learning_rate": 2.785932692855244e-07, + "loss": 0.70234609, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12872314, + "step": 13889, + "time_per_iteration": 2.7534921169281006 + }, + { + "auxiliary_loss_clip": 0.0132123, + "auxiliary_loss_mlp": 0.0102415, + "balance_loss_clip": 1.21458054, + "balance_loss_mlp": 1.01223516, + "epoch": 0.8351119795580941, + "flos": 21584799616680.0, + "grad_norm": 2.042148121009912, + "language_loss": 0.68943042, + "learning_rate": 2.783950243408399e-07, + "loss": 0.71288419, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.11920166, + "step": 13890, + "time_per_iteration": 2.7708144187927246 + }, + { + "auxiliary_loss_clip": 0.01329879, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.22165108, + "balance_loss_mlp": 1.0247004, + "epoch": 0.835172102810762, + "flos": 20041907846760.0, + "grad_norm": 2.391407186785062, + "language_loss": 0.59732497, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.62100708, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13641357, + "step": 13891, + "time_per_iteration": 2.702904224395752 + }, + { + "auxiliary_loss_clip": 0.01327659, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.21974587, + "balance_loss_mlp": 1.01823115, + "epoch": 0.8352322260634301, + "flos": 25116079760280.0, + "grad_norm": 1.5848748279843965, + "language_loss": 0.72002012, + "learning_rate": 2.779987303092846e-07, + "loss": 0.74360085, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.1217041, + "step": 13892, + "time_per_iteration": 2.8337666988372803 + }, + { + "auxiliary_loss_clip": 0.01321554, + "auxiliary_loss_mlp": 0.01030102, + "balance_loss_clip": 1.21564329, + "balance_loss_mlp": 1.01759696, + "epoch": 0.835292349316098, + "flos": 24869218583160.0, + "grad_norm": 4.279576297927104, + "language_loss": 0.65989351, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68341005, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12506104, + "step": 13893, + "time_per_iteration": 2.756540298461914 + }, + { + "auxiliary_loss_clip": 0.01322508, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.21401775, + "balance_loss_mlp": 1.01692617, + "epoch": 0.835352472568766, + "flos": 19870422431760.0, + "grad_norm": 2.434451925827604, + "language_loss": 0.7870698, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.81058866, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12463379, + "step": 13894, + "time_per_iteration": 2.789414882659912 + }, + { + "auxiliary_loss_clip": 0.01314232, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.21179581, + "balance_loss_mlp": 1.01890957, + "epoch": 0.8354125958214339, + "flos": 22059899714880.0, + "grad_norm": 1.6202463745142746, + "language_loss": 0.73105115, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.75450486, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.12219238, + "step": 13895, + "time_per_iteration": 2.8708534240722656 + }, + { + "auxiliary_loss_clip": 0.01332918, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.22183549, + "balance_loss_mlp": 1.02820754, + "epoch": 0.8354727190741019, + "flos": 21402837244800.0, + "grad_norm": 2.839286551133823, + "language_loss": 0.72207212, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74582851, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.1449585, + "step": 13896, + "time_per_iteration": 2.745626449584961 + }, + { + "auxiliary_loss_clip": 0.01318499, + "auxiliary_loss_mlp": 0.01027382, + "balance_loss_clip": 1.21262193, + "balance_loss_mlp": 1.01498997, + "epoch": 0.8355328423267698, + "flos": 50849767645920.0, + "grad_norm": 5.400922388179368, + "language_loss": 0.58864212, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6121009, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12390137, + "step": 13897, + "time_per_iteration": 3.0849902629852295 + }, + { + "auxiliary_loss_clip": 0.01142186, + "auxiliary_loss_mlp": 0.01007034, + "balance_loss_clip": 1.09844136, + "balance_loss_mlp": 1.00429201, + "epoch": 0.8355929655794379, + "flos": 65567329497720.0, + "grad_norm": 0.7247113521799036, + "language_loss": 0.57676291, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.5982551, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02746582, + "step": 13898, + "time_per_iteration": 3.264134168624878 + }, + { + "auxiliary_loss_clip": 0.01332072, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.22095895, + "balance_loss_mlp": 1.02218354, + "epoch": 0.8356530888321058, + "flos": 19175042651040.0, + "grad_norm": 1.7412123402139985, + "language_loss": 0.80048752, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82416862, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13848877, + "step": 13899, + "time_per_iteration": 2.851905107498169 + }, + { + "auxiliary_loss_clip": 0.01330556, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.22174001, + "balance_loss_mlp": 1.02254963, + "epoch": 0.8357132120847738, + "flos": 44134442066520.0, + "grad_norm": 1.6124898413039428, + "language_loss": 0.6926502, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71630186, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12072754, + "step": 13900, + "time_per_iteration": 2.967434883117676 + }, + { + "auxiliary_loss_clip": 0.01327373, + "auxiliary_loss_mlp": 0.01028836, + "balance_loss_clip": 1.22043645, + "balance_loss_mlp": 1.01652157, + "epoch": 0.8357733353374418, + "flos": 24395133693960.0, + "grad_norm": 1.4170390232548993, + "language_loss": 0.71403027, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73759234, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12310791, + "step": 13901, + "time_per_iteration": 2.8962528705596924 + }, + { + "auxiliary_loss_clip": 0.01338399, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.22616565, + "balance_loss_mlp": 1.02288342, + "epoch": 0.8358334585901097, + "flos": 20920346425080.0, + "grad_norm": 2.0432673468490883, + "language_loss": 0.8024922, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82624245, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13757324, + "step": 13902, + "time_per_iteration": 2.8088107109069824 + }, + { + "auxiliary_loss_clip": 0.0132115, + "auxiliary_loss_mlp": 0.01032801, + "balance_loss_clip": 1.21619046, + "balance_loss_mlp": 1.0200752, + "epoch": 0.8358935818427777, + "flos": 19248794078760.0, + "grad_norm": 1.4203848045985927, + "language_loss": 0.62442082, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64796036, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12731934, + "step": 13903, + "time_per_iteration": 4.197567701339722 + }, + { + "auxiliary_loss_clip": 0.01329777, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.22268796, + "balance_loss_mlp": 1.0257771, + "epoch": 0.8359537050954456, + "flos": 24139419894360.0, + "grad_norm": 1.6953312586979883, + "language_loss": 0.74316692, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76684201, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11962891, + "step": 13904, + "time_per_iteration": 2.7559657096862793 + }, + { + "auxiliary_loss_clip": 0.01319399, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.2135222, + "balance_loss_mlp": 1.01935315, + "epoch": 0.8360138283481137, + "flos": 16184776619880.0, + "grad_norm": 1.6549596859870452, + "language_loss": 0.72978967, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.75330943, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13238525, + "step": 13905, + "time_per_iteration": 4.324832201004028 + }, + { + "auxiliary_loss_clip": 0.01317835, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.21381903, + "balance_loss_mlp": 1.02536869, + "epoch": 0.8360739516007816, + "flos": 22203829034640.0, + "grad_norm": 1.5028680950193527, + "language_loss": 0.6658622, + "learning_rate": 2.752319888771e-07, + "loss": 0.6894052, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11096191, + "step": 13906, + "time_per_iteration": 4.319117307662964 + }, + { + "auxiliary_loss_clip": 0.01322774, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.21613526, + "balance_loss_mlp": 1.01527643, + "epoch": 0.8361340748534496, + "flos": 20928062013480.0, + "grad_norm": 1.5013196946615797, + "language_loss": 0.74037981, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76388407, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12384033, + "step": 13907, + "time_per_iteration": 2.821657657623291 + }, + { + "auxiliary_loss_clip": 0.01332052, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.22094488, + "balance_loss_mlp": 1.01620245, + "epoch": 0.8361941981061175, + "flos": 26178754778640.0, + "grad_norm": 1.6793381415509419, + "language_loss": 0.75702637, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.78063887, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12994385, + "step": 13908, + "time_per_iteration": 2.8322227001190186 + }, + { + "auxiliary_loss_clip": 0.01329963, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.22147799, + "balance_loss_mlp": 1.02000082, + "epoch": 0.8362543213587855, + "flos": 24423908040000.0, + "grad_norm": 2.2741279677425914, + "language_loss": 0.71492171, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73855603, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13458252, + "step": 13909, + "time_per_iteration": 2.7301526069641113 + }, + { + "auxiliary_loss_clip": 0.01332113, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.22095871, + "balance_loss_mlp": 1.01994276, + "epoch": 0.8363144446114534, + "flos": 17206941476160.0, + "grad_norm": 1.9852683435130463, + "language_loss": 0.73462349, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75826961, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 1.11181641, + "router_z_loss_mlp": 0.12567139, + "step": 13910, + "time_per_iteration": 2.7308456897735596 + }, + { + "auxiliary_loss_clip": 0.01330419, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.22232461, + "balance_loss_mlp": 1.01751757, + "epoch": 0.8363745678641215, + "flos": 19283862720600.0, + "grad_norm": 1.5618055763470067, + "language_loss": 0.73514843, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75874877, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12091064, + "step": 13911, + "time_per_iteration": 2.763364315032959 + }, + { + "auxiliary_loss_clip": 0.01325211, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.21619546, + "balance_loss_mlp": 1.02033091, + "epoch": 0.8364346911167894, + "flos": 11878639489080.0, + "grad_norm": 1.828056593777159, + "language_loss": 0.7872718, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81084758, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12042236, + "step": 13912, + "time_per_iteration": 4.303877115249634 + }, + { + "auxiliary_loss_clip": 0.0132418, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.21682513, + "balance_loss_mlp": 1.01787353, + "epoch": 0.8364948143694574, + "flos": 20229433563960.0, + "grad_norm": 1.5356351788062812, + "language_loss": 0.79178762, + "learning_rate": 2.738534240246797e-07, + "loss": 0.81533372, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12561035, + "step": 13913, + "time_per_iteration": 2.858184337615967 + }, + { + "auxiliary_loss_clip": 0.01328984, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.21938455, + "balance_loss_mlp": 1.01950264, + "epoch": 0.8365549376221254, + "flos": 21617472365280.0, + "grad_norm": 1.8835079624028017, + "language_loss": 0.73952955, + "learning_rate": 2.736567479515153e-07, + "loss": 0.76315022, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13568115, + "step": 13914, + "time_per_iteration": 2.800018310546875 + }, + { + "auxiliary_loss_clip": 0.01325986, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.218117, + "balance_loss_mlp": 1.02143097, + "epoch": 0.8366150608747933, + "flos": 23299379843400.0, + "grad_norm": 1.5282682249667587, + "language_loss": 0.71600711, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73961008, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12890625, + "step": 13915, + "time_per_iteration": 2.762336254119873 + }, + { + "auxiliary_loss_clip": 0.01333046, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.22345448, + "balance_loss_mlp": 1.01642764, + "epoch": 0.8366751841274613, + "flos": 15271837916760.0, + "grad_norm": 1.7634895154609387, + "language_loss": 0.73153192, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.75515187, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12530518, + "step": 13916, + "time_per_iteration": 2.7408270835876465 + }, + { + "auxiliary_loss_clip": 0.01328689, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.21898103, + "balance_loss_mlp": 1.01639032, + "epoch": 0.8367353073801292, + "flos": 13228888888440.0, + "grad_norm": 1.7304357287138727, + "language_loss": 0.75450933, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.77809024, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13012695, + "step": 13917, + "time_per_iteration": 2.777531385421753 + }, + { + "auxiliary_loss_clip": 0.01317456, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.21408796, + "balance_loss_mlp": 1.01795471, + "epoch": 0.8367954306327973, + "flos": 24210206911800.0, + "grad_norm": 1.4988754318130821, + "language_loss": 0.79424381, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81771469, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11682129, + "step": 13918, + "time_per_iteration": 2.848386526107788 + }, + { + "auxiliary_loss_clip": 0.01327741, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.22138429, + "balance_loss_mlp": 1.01740527, + "epoch": 0.8368555538854652, + "flos": 24540078222720.0, + "grad_norm": 1.600763748817598, + "language_loss": 0.68035328, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70393491, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.13012695, + "step": 13919, + "time_per_iteration": 2.8188557624816895 + }, + { + "auxiliary_loss_clip": 0.01321829, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.21462941, + "balance_loss_mlp": 1.01662147, + "epoch": 0.8369156771381332, + "flos": 20263283955000.0, + "grad_norm": 1.9481060751429777, + "language_loss": 0.74200958, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.76551551, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.121521, + "step": 13920, + "time_per_iteration": 2.8273110389709473 + }, + { + "auxiliary_loss_clip": 0.01328422, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.21963143, + "balance_loss_mlp": 1.02052939, + "epoch": 0.8369758003908011, + "flos": 21840635241360.0, + "grad_norm": 1.6999524794976866, + "language_loss": 0.69303584, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71665502, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12957764, + "step": 13921, + "time_per_iteration": 2.809706687927246 + }, + { + "auxiliary_loss_clip": 0.01336608, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.22578025, + "balance_loss_mlp": 1.01756752, + "epoch": 0.8370359236434691, + "flos": 21723490458000.0, + "grad_norm": 4.24954745042225, + "language_loss": 0.85327315, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87694478, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12982178, + "step": 13922, + "time_per_iteration": 2.7988481521606445 + }, + { + "auxiliary_loss_clip": 0.01319335, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.2144196, + "balance_loss_mlp": 1.01864243, + "epoch": 0.837096046896137, + "flos": 23154110447760.0, + "grad_norm": 1.4724350675738438, + "language_loss": 0.72055852, + "learning_rate": 2.71889610027088e-07, + "loss": 0.74405682, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11865234, + "step": 13923, + "time_per_iteration": 2.874413013458252 + }, + { + "auxiliary_loss_clip": 0.01319659, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.21402073, + "balance_loss_mlp": 1.01728988, + "epoch": 0.8371561701488051, + "flos": 24497497034280.0, + "grad_norm": 1.781777307305185, + "language_loss": 0.76699269, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.79049456, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.13238525, + "step": 13924, + "time_per_iteration": 2.8335483074188232 + }, + { + "auxiliary_loss_clip": 0.01324264, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.21714163, + "balance_loss_mlp": 1.01738191, + "epoch": 0.837216293401473, + "flos": 29212901465760.0, + "grad_norm": 1.4822386502630358, + "language_loss": 0.64915621, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.67269802, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12530518, + "step": 13925, + "time_per_iteration": 2.8718478679656982 + }, + { + "auxiliary_loss_clip": 0.01329286, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.21982169, + "balance_loss_mlp": 1.02242279, + "epoch": 0.837276416654141, + "flos": 25270810903800.0, + "grad_norm": 1.4422204671611076, + "language_loss": 0.74753833, + "learning_rate": 2.713017433265543e-07, + "loss": 0.77118123, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12585449, + "step": 13926, + "time_per_iteration": 2.797217607498169 + }, + { + "auxiliary_loss_clip": 0.0133214, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.22276211, + "balance_loss_mlp": 1.02042627, + "epoch": 0.837336539906809, + "flos": 13886113791960.0, + "grad_norm": 1.7024389995371283, + "language_loss": 0.71244025, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73609757, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.1315918, + "step": 13927, + "time_per_iteration": 2.7654106616973877 + }, + { + "auxiliary_loss_clip": 0.01142889, + "auxiliary_loss_mlp": 0.01009564, + "balance_loss_clip": 1.09875178, + "balance_loss_mlp": 1.00701261, + "epoch": 0.8373966631594769, + "flos": 68887223189640.0, + "grad_norm": 0.713501946334025, + "language_loss": 0.58931452, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.61083907, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.0255127, + "step": 13928, + "time_per_iteration": 3.3794214725494385 + }, + { + "auxiliary_loss_clip": 0.01332553, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.22444963, + "balance_loss_mlp": 1.01777887, + "epoch": 0.8374567864121449, + "flos": 20453855299200.0, + "grad_norm": 2.0516359328106537, + "language_loss": 0.69903004, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72266018, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.12683105, + "step": 13929, + "time_per_iteration": 2.8246207237243652 + }, + { + "auxiliary_loss_clip": 0.01338713, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.22782743, + "balance_loss_mlp": 1.01920855, + "epoch": 0.8375169096648128, + "flos": 41912088993000.0, + "grad_norm": 1.856978204244209, + "language_loss": 0.67225468, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69597244, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 1.10986328, + "router_z_loss_mlp": 0.13873291, + "step": 13930, + "time_per_iteration": 3.023355007171631 + }, + { + "auxiliary_loss_clip": 0.01325142, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.21968246, + "balance_loss_mlp": 1.01604581, + "epoch": 0.8375770329174809, + "flos": 20013945667920.0, + "grad_norm": 1.5660970075026515, + "language_loss": 0.71440744, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73794258, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12316895, + "step": 13931, + "time_per_iteration": 2.7291300296783447 + }, + { + "auxiliary_loss_clip": 0.01321612, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.21530354, + "balance_loss_mlp": 1.01887071, + "epoch": 0.8376371561701488, + "flos": 22788723803040.0, + "grad_norm": 1.584570281471906, + "language_loss": 0.71779156, + "learning_rate": 2.701277800409705e-07, + "loss": 0.74132186, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12548828, + "step": 13932, + "time_per_iteration": 2.7851815223693848 + }, + { + "auxiliary_loss_clip": 0.0132807, + "auxiliary_loss_mlp": 0.01033797, + "balance_loss_clip": 1.22162414, + "balance_loss_mlp": 1.02225757, + "epoch": 0.8376972794228168, + "flos": 23919708728880.0, + "grad_norm": 1.9885855462667643, + "language_loss": 0.67137134, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69499004, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11541748, + "step": 13933, + "time_per_iteration": 2.901787519454956 + }, + { + "auxiliary_loss_clip": 0.01324832, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.21972406, + "balance_loss_mlp": 1.02308583, + "epoch": 0.8377574026754847, + "flos": 13738610936520.0, + "grad_norm": 1.971424289343509, + "language_loss": 0.76177776, + "learning_rate": 2.697369836420933e-07, + "loss": 0.7853862, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12921143, + "step": 13934, + "time_per_iteration": 2.7892441749572754 + }, + { + "auxiliary_loss_clip": 0.0132581, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.21925449, + "balance_loss_mlp": 1.01815295, + "epoch": 0.8378175259281527, + "flos": 21656155151160.0, + "grad_norm": 1.438390379209671, + "language_loss": 0.77505094, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79861736, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12677002, + "step": 13935, + "time_per_iteration": 2.7985727787017822 + }, + { + "auxiliary_loss_clip": 0.01326854, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.21906316, + "balance_loss_mlp": 1.01543546, + "epoch": 0.8378776491808206, + "flos": 15452825688000.0, + "grad_norm": 2.6175209420606227, + "language_loss": 0.56534255, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.58889401, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12860107, + "step": 13936, + "time_per_iteration": 2.7493386268615723 + }, + { + "auxiliary_loss_clip": 0.01323592, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.2168169, + "balance_loss_mlp": 1.01807213, + "epoch": 0.8379377724334887, + "flos": 14724488900160.0, + "grad_norm": 2.064022043314609, + "language_loss": 0.89877951, + "learning_rate": 2.691512811503882e-07, + "loss": 0.92231953, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12341309, + "step": 13937, + "time_per_iteration": 2.791991949081421 + }, + { + "auxiliary_loss_clip": 0.01327511, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.22058105, + "balance_loss_mlp": 1.01625371, + "epoch": 0.8379978956861566, + "flos": 24540646739760.0, + "grad_norm": 1.823238352862273, + "language_loss": 0.81803977, + "learning_rate": 2.689561782445313e-07, + "loss": 0.84160626, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12890625, + "step": 13938, + "time_per_iteration": 2.794661521911621 + }, + { + "auxiliary_loss_clip": 0.01336607, + "auxiliary_loss_mlp": 0.01032158, + "balance_loss_clip": 1.22664738, + "balance_loss_mlp": 1.01888382, + "epoch": 0.8380580189388246, + "flos": 18957240078480.0, + "grad_norm": 1.853394347531918, + "language_loss": 0.71098524, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.7346729, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13275146, + "step": 13939, + "time_per_iteration": 2.7770519256591797 + }, + { + "auxiliary_loss_clip": 0.01332028, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.22202539, + "balance_loss_mlp": 1.0226419, + "epoch": 0.8381181421914926, + "flos": 26545725149400.0, + "grad_norm": 1.5873129417022935, + "language_loss": 0.7649405, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78862059, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13354492, + "step": 13940, + "time_per_iteration": 2.783518075942993 + }, + { + "auxiliary_loss_clip": 0.01325266, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.21925056, + "balance_loss_mlp": 1.02481341, + "epoch": 0.8381782654441605, + "flos": 23296293608040.0, + "grad_norm": 1.61254801726071, + "language_loss": 0.7675975, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.7912246, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.1262207, + "step": 13941, + "time_per_iteration": 2.850940704345703 + }, + { + "auxiliary_loss_clip": 0.01338467, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.22623801, + "balance_loss_mlp": 1.01806498, + "epoch": 0.8382383886968285, + "flos": 26764218064080.0, + "grad_norm": 1.9623840095124894, + "language_loss": 0.732288, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75598991, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13665771, + "step": 13942, + "time_per_iteration": 4.353259325027466 + }, + { + "auxiliary_loss_clip": 0.01344963, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.22959638, + "balance_loss_mlp": 1.0262382, + "epoch": 0.8382985119494964, + "flos": 26110201221000.0, + "grad_norm": 1.7787034096070256, + "language_loss": 0.79865301, + "learning_rate": 2.679816484834554e-07, + "loss": 0.82250679, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.14178467, + "step": 13943, + "time_per_iteration": 2.8272221088409424 + }, + { + "auxiliary_loss_clip": 0.01328204, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.22085929, + "balance_loss_mlp": 1.01551914, + "epoch": 0.8383586352021645, + "flos": 16439434602120.0, + "grad_norm": 3.1820393210228786, + "language_loss": 0.85093629, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87449706, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12329102, + "step": 13944, + "time_per_iteration": 4.313309907913208 + }, + { + "auxiliary_loss_clip": 0.01142095, + "auxiliary_loss_mlp": 0.01000822, + "balance_loss_clip": 1.09806836, + "balance_loss_mlp": 0.99810374, + "epoch": 0.8384187584548324, + "flos": 64210907627640.0, + "grad_norm": 0.6242865539768657, + "language_loss": 0.50242746, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52385664, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02722168, + "step": 13945, + "time_per_iteration": 4.793781757354736 + }, + { + "auxiliary_loss_clip": 0.01323507, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.2181685, + "balance_loss_mlp": 1.02055216, + "epoch": 0.8384788817075004, + "flos": 22388065474680.0, + "grad_norm": 2.3524531426840105, + "language_loss": 0.65111631, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67467916, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12213135, + "step": 13946, + "time_per_iteration": 2.828590154647827 + }, + { + "auxiliary_loss_clip": 0.01331381, + "auxiliary_loss_mlp": 0.01031439, + "balance_loss_clip": 1.22216105, + "balance_loss_mlp": 1.01827228, + "epoch": 0.8385390049601683, + "flos": 29502831131640.0, + "grad_norm": 1.6061357755090475, + "language_loss": 0.67534119, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69896936, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1315918, + "step": 13947, + "time_per_iteration": 2.9172873497009277 + }, + { + "auxiliary_loss_clip": 0.01326345, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.21676934, + "balance_loss_mlp": 1.01752639, + "epoch": 0.8385991282128363, + "flos": 32714229620880.0, + "grad_norm": 1.404376691501896, + "language_loss": 0.69842756, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.7220003, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.13397217, + "step": 13948, + "time_per_iteration": 2.976814031600952 + }, + { + "auxiliary_loss_clip": 0.01319426, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.21518373, + "balance_loss_mlp": 1.02001429, + "epoch": 0.8386592514655042, + "flos": 25445707421040.0, + "grad_norm": 3.591389645481309, + "language_loss": 0.84914219, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87264919, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11248779, + "step": 13949, + "time_per_iteration": 2.7559666633605957 + }, + { + "auxiliary_loss_clip": 0.01319859, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.21543527, + "balance_loss_mlp": 1.0139451, + "epoch": 0.8387193747181723, + "flos": 22020729628680.0, + "grad_norm": 1.6382195449165653, + "language_loss": 0.70888329, + "learning_rate": 2.66620065513385e-07, + "loss": 0.73233891, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.11755371, + "step": 13950, + "time_per_iteration": 4.207491874694824 + }, + { + "auxiliary_loss_clip": 0.01324893, + "auxiliary_loss_mlp": 0.01024956, + "balance_loss_clip": 1.21811247, + "balance_loss_mlp": 1.01187301, + "epoch": 0.8387794979708402, + "flos": 18154786387680.0, + "grad_norm": 1.5421134246022963, + "language_loss": 0.65039706, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.6738956, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13092041, + "step": 13951, + "time_per_iteration": 2.733612060546875 + }, + { + "auxiliary_loss_clip": 0.01328674, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.22073185, + "balance_loss_mlp": 1.01576245, + "epoch": 0.8388396212235082, + "flos": 25416770641560.0, + "grad_norm": 2.5924503831433574, + "language_loss": 0.70184255, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72541547, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12841797, + "step": 13952, + "time_per_iteration": 2.874540090560913 + }, + { + "auxiliary_loss_clip": 0.01326622, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.21976995, + "balance_loss_mlp": 1.0180999, + "epoch": 0.8388997444761762, + "flos": 22278148979400.0, + "grad_norm": 1.8161958118468766, + "language_loss": 0.73101521, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.75458694, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12451172, + "step": 13953, + "time_per_iteration": 2.813595771789551 + }, + { + "auxiliary_loss_clip": 0.0132286, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.21690381, + "balance_loss_mlp": 1.01536679, + "epoch": 0.8389598677288441, + "flos": 19577975047560.0, + "grad_norm": 2.0921432019958197, + "language_loss": 0.68195021, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70545542, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12286377, + "step": 13954, + "time_per_iteration": 2.7000746726989746 + }, + { + "auxiliary_loss_clip": 0.01330144, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.22226095, + "balance_loss_mlp": 1.01779342, + "epoch": 0.8390199909815121, + "flos": 17389269323280.0, + "grad_norm": 1.6611357911413547, + "language_loss": 0.73316324, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75676316, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12060547, + "step": 13955, + "time_per_iteration": 2.7714974880218506 + }, + { + "auxiliary_loss_clip": 0.01327813, + "auxiliary_loss_mlp": 0.01027329, + "balance_loss_clip": 1.2205621, + "balance_loss_mlp": 1.01455545, + "epoch": 0.83908011423418, + "flos": 24644390764320.0, + "grad_norm": 2.7682473201993973, + "language_loss": 0.67160285, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.69515425, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12780762, + "step": 13956, + "time_per_iteration": 2.766713857650757 + }, + { + "auxiliary_loss_clip": 0.01335497, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.22461212, + "balance_loss_mlp": 1.01438165, + "epoch": 0.8391402374868481, + "flos": 24723867970800.0, + "grad_norm": 2.8529970076413793, + "language_loss": 0.80523622, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.8288697, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13470459, + "step": 13957, + "time_per_iteration": 2.7650306224823 + }, + { + "auxiliary_loss_clip": 0.0114044, + "auxiliary_loss_mlp": 0.01011535, + "balance_loss_clip": 1.09704828, + "balance_loss_mlp": 1.00893629, + "epoch": 0.839200360739516, + "flos": 56887395062400.0, + "grad_norm": 0.752056620583708, + "language_loss": 0.53458309, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55610287, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02600098, + "step": 13958, + "time_per_iteration": 3.366943120956421 + }, + { + "auxiliary_loss_clip": 0.0132599, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.21936774, + "balance_loss_mlp": 1.01964128, + "epoch": 0.839260483992184, + "flos": 18337398493320.0, + "grad_norm": 1.7534224208084868, + "language_loss": 0.73590422, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75949574, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13531494, + "step": 13959, + "time_per_iteration": 2.8449342250823975 + }, + { + "auxiliary_loss_clip": 0.01320314, + "auxiliary_loss_mlp": 0.0102594, + "balance_loss_clip": 1.21691656, + "balance_loss_mlp": 1.0138402, + "epoch": 0.8393206072448519, + "flos": 27093480249600.0, + "grad_norm": 1.735363055216751, + "language_loss": 0.55854177, + "learning_rate": 2.646805346545169e-07, + "loss": 0.58200431, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12109375, + "step": 13960, + "time_per_iteration": 2.931009292602539 + }, + { + "auxiliary_loss_clip": 0.01142161, + "auxiliary_loss_mlp": 0.01017845, + "balance_loss_clip": 1.09846139, + "balance_loss_mlp": 1.01510322, + "epoch": 0.8393807304975199, + "flos": 61534797472080.0, + "grad_norm": 0.8683861923715849, + "language_loss": 0.60763478, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62923479, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02746582, + "step": 13961, + "time_per_iteration": 3.3206968307495117 + }, + { + "auxiliary_loss_clip": 0.01330251, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.22126603, + "balance_loss_mlp": 1.01894176, + "epoch": 0.8394408537501878, + "flos": 14897923516440.0, + "grad_norm": 2.8596919964508225, + "language_loss": 0.68382609, + "learning_rate": 2.642934178894405e-07, + "loss": 0.7074433, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12524414, + "step": 13962, + "time_per_iteration": 2.7490875720977783 + }, + { + "auxiliary_loss_clip": 0.01331943, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.22189999, + "balance_loss_mlp": 1.01717675, + "epoch": 0.8395009770028559, + "flos": 17415566559360.0, + "grad_norm": 1.7976200931229056, + "language_loss": 0.73252439, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75614434, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12878418, + "step": 13963, + "time_per_iteration": 2.846343517303467 + }, + { + "auxiliary_loss_clip": 0.01326339, + "auxiliary_loss_mlp": 0.01035694, + "balance_loss_clip": 1.21863449, + "balance_loss_mlp": 1.02292669, + "epoch": 0.8395611002555238, + "flos": 27930109198320.0, + "grad_norm": 1.5168015179944419, + "language_loss": 0.76688051, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.79050082, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12768555, + "step": 13964, + "time_per_iteration": 2.92144775390625 + }, + { + "auxiliary_loss_clip": 0.01337158, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.22588515, + "balance_loss_mlp": 1.02067399, + "epoch": 0.8396212235081918, + "flos": 11103579460080.0, + "grad_norm": 2.1750767200806447, + "language_loss": 0.78827441, + "learning_rate": 2.637132363964161e-07, + "loss": 0.81198895, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13616943, + "step": 13965, + "time_per_iteration": 2.7804758548736572 + }, + { + "auxiliary_loss_clip": 0.01321139, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.2148937, + "balance_loss_mlp": 1.02070212, + "epoch": 0.8396813467608598, + "flos": 35742406879080.0, + "grad_norm": 1.4907813258992864, + "language_loss": 0.66154462, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68508625, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12322998, + "step": 13966, + "time_per_iteration": 2.8611791133880615 + }, + { + "auxiliary_loss_clip": 0.01319793, + "auxiliary_loss_mlp": 0.01024718, + "balance_loss_clip": 1.21350145, + "balance_loss_mlp": 1.01225507, + "epoch": 0.8397414700135277, + "flos": 26182287705960.0, + "grad_norm": 1.42015266593234, + "language_loss": 0.74542391, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76886904, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12469482, + "step": 13967, + "time_per_iteration": 2.797320604324341 + }, + { + "auxiliary_loss_clip": 0.01326605, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.22012293, + "balance_loss_mlp": 1.017349, + "epoch": 0.8398015932661957, + "flos": 18337763968560.0, + "grad_norm": 1.835398753252212, + "language_loss": 0.83265525, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85621262, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.11773682, + "step": 13968, + "time_per_iteration": 2.7785072326660156 + }, + { + "auxiliary_loss_clip": 0.01331217, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.22159266, + "balance_loss_mlp": 1.01997352, + "epoch": 0.8398617165188637, + "flos": 17383381111080.0, + "grad_norm": 1.7852334607264093, + "language_loss": 0.77785093, + "learning_rate": 2.629405828689075e-07, + "loss": 0.801494, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13116455, + "step": 13969, + "time_per_iteration": 2.692455530166626 + }, + { + "auxiliary_loss_clip": 0.01337483, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.22664237, + "balance_loss_mlp": 1.01611495, + "epoch": 0.8399218397715317, + "flos": 22934845974240.0, + "grad_norm": 2.5842842718895502, + "language_loss": 0.7765286, + "learning_rate": 2.627475841423923e-07, + "loss": 0.80020005, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13562012, + "step": 13970, + "time_per_iteration": 2.7783584594726562 + }, + { + "auxiliary_loss_clip": 0.01325706, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.21820343, + "balance_loss_mlp": 1.02213359, + "epoch": 0.8399819630241996, + "flos": 23154882006600.0, + "grad_norm": 2.0830323853072734, + "language_loss": 0.72775871, + "learning_rate": 2.625546512926633e-07, + "loss": 0.75136483, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12774658, + "step": 13971, + "time_per_iteration": 2.766127586364746 + }, + { + "auxiliary_loss_clip": 0.01332475, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.22345281, + "balance_loss_mlp": 1.01791275, + "epoch": 0.8400420862768676, + "flos": 16401563983440.0, + "grad_norm": 1.6859247465006706, + "language_loss": 0.77498215, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79862159, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13586426, + "step": 13972, + "time_per_iteration": 2.8017590045928955 + }, + { + "auxiliary_loss_clip": 0.01320899, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.21521568, + "balance_loss_mlp": 1.01816058, + "epoch": 0.8401022095295355, + "flos": 21292392840840.0, + "grad_norm": 1.2688757968837059, + "language_loss": 0.68520689, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70871663, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11914062, + "step": 13973, + "time_per_iteration": 2.74821400642395 + }, + { + "auxiliary_loss_clip": 0.01324281, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.2158432, + "balance_loss_mlp": 1.0189923, + "epoch": 0.8401623327822035, + "flos": 17315802154080.0, + "grad_norm": 1.8128678431005523, + "language_loss": 0.78721386, + "learning_rate": 2.619762480773382e-07, + "loss": 0.81077802, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13140869, + "step": 13974, + "time_per_iteration": 2.7679460048675537 + }, + { + "auxiliary_loss_clip": 0.01329864, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.22008252, + "balance_loss_mlp": 1.02261567, + "epoch": 0.8402224560348714, + "flos": 22241780870040.0, + "grad_norm": 1.4825303304618809, + "language_loss": 0.72734928, + "learning_rate": 2.617835788078868e-07, + "loss": 0.75100315, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12921143, + "step": 13975, + "time_per_iteration": 2.7961795330047607 + }, + { + "auxiliary_loss_clip": 0.01324871, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.21712673, + "balance_loss_mlp": 1.01655674, + "epoch": 0.8402825792875395, + "flos": 20234793867480.0, + "grad_norm": 1.6557003930841654, + "language_loss": 0.72607934, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74962318, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12976074, + "step": 13976, + "time_per_iteration": 2.918788194656372 + }, + { + "auxiliary_loss_clip": 0.01325099, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.21790802, + "balance_loss_mlp": 1.01832497, + "epoch": 0.8403427025402074, + "flos": 23294588056920.0, + "grad_norm": 1.6865069468378915, + "language_loss": 0.72234583, + "learning_rate": 2.61398438016311e-07, + "loss": 0.74589491, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11486816, + "step": 13977, + "time_per_iteration": 2.8518078327178955 + }, + { + "auxiliary_loss_clip": 0.01327886, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.2191174, + "balance_loss_mlp": 1.01719141, + "epoch": 0.8404028257928754, + "flos": 32682572081280.0, + "grad_norm": 1.3606293489475862, + "language_loss": 0.68624455, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70981574, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1204834, + "step": 13978, + "time_per_iteration": 2.8622658252716064 + }, + { + "auxiliary_loss_clip": 0.01315862, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.21188831, + "balance_loss_mlp": 1.01986945, + "epoch": 0.8404629490455434, + "flos": 16184898444960.0, + "grad_norm": 1.6418668899183282, + "language_loss": 0.78306365, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80654174, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12078857, + "step": 13979, + "time_per_iteration": 2.7750136852264404 + }, + { + "auxiliary_loss_clip": 0.01329297, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.22106123, + "balance_loss_mlp": 1.01888561, + "epoch": 0.8405230722982113, + "flos": 15198573789360.0, + "grad_norm": 1.7114382932504761, + "language_loss": 0.78060293, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80421257, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12774658, + "step": 13980, + "time_per_iteration": 2.734473705291748 + }, + { + "auxiliary_loss_clip": 0.01320135, + "auxiliary_loss_mlp": 0.01026992, + "balance_loss_clip": 1.21610153, + "balance_loss_mlp": 1.01497555, + "epoch": 0.8405831955508793, + "flos": 27817390726200.0, + "grad_norm": 1.5031472775990942, + "language_loss": 0.86747265, + "learning_rate": 2.606289476268757e-07, + "loss": 0.89094388, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.12023926, + "step": 13981, + "time_per_iteration": 4.214267730712891 + }, + { + "auxiliary_loss_clip": 0.01329313, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.22282457, + "balance_loss_mlp": 1.01821017, + "epoch": 0.8406433188035473, + "flos": 23774804808480.0, + "grad_norm": 1.8831647290358444, + "language_loss": 0.68121397, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.70480579, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11669922, + "step": 13982, + "time_per_iteration": 2.7852063179016113 + }, + { + "auxiliary_loss_clip": 0.01337703, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.22925258, + "balance_loss_mlp": 1.0166899, + "epoch": 0.8407034420562153, + "flos": 29211723823320.0, + "grad_norm": 1.8249000832193158, + "language_loss": 0.68535984, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70903391, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13018799, + "step": 13983, + "time_per_iteration": 4.391988515853882 + }, + { + "auxiliary_loss_clip": 0.01330262, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.22134447, + "balance_loss_mlp": 1.01981413, + "epoch": 0.8407635653088832, + "flos": 26365793195520.0, + "grad_norm": 2.1714045138996476, + "language_loss": 0.79410625, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.81774139, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13421631, + "step": 13984, + "time_per_iteration": 4.3332319259643555 + }, + { + "auxiliary_loss_clip": 0.01326111, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.21825111, + "balance_loss_mlp": 1.01723778, + "epoch": 0.8408236885615512, + "flos": 21473583653880.0, + "grad_norm": 1.8173383212368461, + "language_loss": 0.60479343, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62835109, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12426758, + "step": 13985, + "time_per_iteration": 2.774867534637451 + }, + { + "auxiliary_loss_clip": 0.01329942, + "auxiliary_loss_mlp": 0.01026592, + "balance_loss_clip": 1.22022986, + "balance_loss_mlp": 1.0132941, + "epoch": 0.8408838118142191, + "flos": 22968574540200.0, + "grad_norm": 1.7853184607038284, + "language_loss": 0.82064927, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.84421462, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13299561, + "step": 13986, + "time_per_iteration": 2.7514607906341553 + }, + { + "auxiliary_loss_clip": 0.0133088, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.22322536, + "balance_loss_mlp": 1.01971579, + "epoch": 0.8409439350668871, + "flos": 26806068302040.0, + "grad_norm": 1.8124280822254912, + "language_loss": 0.66164917, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.68528605, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13104248, + "step": 13987, + "time_per_iteration": 2.971874713897705 + }, + { + "auxiliary_loss_clip": 0.01323459, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.21724677, + "balance_loss_mlp": 1.02023888, + "epoch": 0.841004058319555, + "flos": 26584367326920.0, + "grad_norm": 2.113406937616832, + "language_loss": 0.67462254, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69818497, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12554932, + "step": 13988, + "time_per_iteration": 2.8394975662231445 + }, + { + "auxiliary_loss_clip": 0.01334714, + "auxiliary_loss_mlp": 0.01034575, + "balance_loss_clip": 1.22441697, + "balance_loss_mlp": 1.02146232, + "epoch": 0.8410641815722231, + "flos": 14505995985480.0, + "grad_norm": 3.666052150738997, + "language_loss": 0.81590271, + "learning_rate": 2.590931332560622e-07, + "loss": 0.83959556, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13128662, + "step": 13989, + "time_per_iteration": 4.231957912445068 + }, + { + "auxiliary_loss_clip": 0.01330483, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.22092521, + "balance_loss_mlp": 1.01784849, + "epoch": 0.841124304824891, + "flos": 29172391303680.0, + "grad_norm": 1.6866429172494461, + "language_loss": 0.75668204, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.78029382, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12860107, + "step": 13990, + "time_per_iteration": 2.7965307235717773 + }, + { + "auxiliary_loss_clip": 0.01317526, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.21400213, + "balance_loss_mlp": 1.021101, + "epoch": 0.841184428077559, + "flos": 22416230695320.0, + "grad_norm": 1.4863598630791492, + "language_loss": 0.80863106, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.83213633, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.11914062, + "step": 13991, + "time_per_iteration": 2.786689281463623 + }, + { + "auxiliary_loss_clip": 0.01327133, + "auxiliary_loss_mlp": 0.01026959, + "balance_loss_clip": 1.22012174, + "balance_loss_mlp": 1.01508021, + "epoch": 0.841244551330227, + "flos": 22967518722840.0, + "grad_norm": 2.014552909308471, + "language_loss": 0.70653689, + "learning_rate": 2.585182919204105e-07, + "loss": 0.7300778, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.11889648, + "step": 13992, + "time_per_iteration": 2.739868640899658 + }, + { + "auxiliary_loss_clip": 0.0132814, + "auxiliary_loss_mlp": 0.01023899, + "balance_loss_clip": 1.21976161, + "balance_loss_mlp": 1.01169848, + "epoch": 0.8413046745828949, + "flos": 21037694250240.0, + "grad_norm": 1.4995601749433503, + "language_loss": 0.76473361, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78825402, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12194824, + "step": 13993, + "time_per_iteration": 2.7604615688323975 + }, + { + "auxiliary_loss_clip": 0.01342525, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.22846842, + "balance_loss_mlp": 1.0216887, + "epoch": 0.841364797835563, + "flos": 27057721265640.0, + "grad_norm": 1.7569377833279587, + "language_loss": 0.74272156, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76650828, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14465332, + "step": 13994, + "time_per_iteration": 2.8207499980926514 + }, + { + "auxiliary_loss_clip": 0.01320813, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.21727574, + "balance_loss_mlp": 1.0171237, + "epoch": 0.8414249210882309, + "flos": 17900331447240.0, + "grad_norm": 1.4926604144928468, + "language_loss": 0.59597957, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61947572, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11682129, + "step": 13995, + "time_per_iteration": 2.8376266956329346 + }, + { + "auxiliary_loss_clip": 0.01323975, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.21714067, + "balance_loss_mlp": 1.01491785, + "epoch": 0.8414850443408989, + "flos": 25445910462840.0, + "grad_norm": 1.9037552843202437, + "language_loss": 0.72082007, + "learning_rate": 2.577527613603163e-07, + "loss": 0.74433696, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12780762, + "step": 13996, + "time_per_iteration": 2.792131185531616 + }, + { + "auxiliary_loss_clip": 0.01321478, + "auxiliary_loss_mlp": 0.01033912, + "balance_loss_clip": 1.2138381, + "balance_loss_mlp": 1.02184772, + "epoch": 0.8415451675935668, + "flos": 23225059898640.0, + "grad_norm": 1.6635813246825684, + "language_loss": 0.64492816, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66848201, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12060547, + "step": 13997, + "time_per_iteration": 2.776127338409424 + }, + { + "auxiliary_loss_clip": 0.01340625, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.2287221, + "balance_loss_mlp": 1.01941681, + "epoch": 0.8416052908462348, + "flos": 18550449887760.0, + "grad_norm": 2.4638867430142346, + "language_loss": 0.81883562, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84257066, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13458252, + "step": 13998, + "time_per_iteration": 2.756314516067505 + }, + { + "auxiliary_loss_clip": 0.01327209, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.21861935, + "balance_loss_mlp": 1.01764631, + "epoch": 0.8416654140989027, + "flos": 26110932171480.0, + "grad_norm": 1.577803221755089, + "language_loss": 0.80626583, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82984388, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12927246, + "step": 13999, + "time_per_iteration": 2.9457359313964844 + }, + { + "auxiliary_loss_clip": 0.01339483, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.22892165, + "balance_loss_mlp": 1.02276206, + "epoch": 0.8417255373515707, + "flos": 26439016714560.0, + "grad_norm": 1.9126829140758053, + "language_loss": 0.66659296, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69035178, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13653564, + "step": 14000, + "time_per_iteration": 2.837233066558838 + }, + { + "auxiliary_loss_clip": 0.01334603, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.22403693, + "balance_loss_mlp": 1.01790011, + "epoch": 0.8417856606042387, + "flos": 24723177628680.0, + "grad_norm": 1.538304680139934, + "language_loss": 0.79992682, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.82358313, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13116455, + "step": 14001, + "time_per_iteration": 2.8121633529663086 + }, + { + "auxiliary_loss_clip": 0.01325949, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.21740937, + "balance_loss_mlp": 1.02173817, + "epoch": 0.8418457838569067, + "flos": 20855853703440.0, + "grad_norm": 1.6607514987646035, + "language_loss": 0.78690928, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.81051511, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12890625, + "step": 14002, + "time_per_iteration": 2.809030294418335 + }, + { + "auxiliary_loss_clip": 0.01321542, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.21460867, + "balance_loss_mlp": 1.01653433, + "epoch": 0.8419059071095746, + "flos": 28666892525040.0, + "grad_norm": 1.7854462228333647, + "language_loss": 0.78347003, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80698144, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13067627, + "step": 14003, + "time_per_iteration": 2.963097095489502 + }, + { + "auxiliary_loss_clip": 0.01326218, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.21851468, + "balance_loss_mlp": 1.02068138, + "epoch": 0.8419660303622426, + "flos": 21658713477840.0, + "grad_norm": 1.6137233549614114, + "language_loss": 0.65932751, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.68292177, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12524414, + "step": 14004, + "time_per_iteration": 2.787703037261963 + }, + { + "auxiliary_loss_clip": 0.01333579, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.2236414, + "balance_loss_mlp": 1.02206647, + "epoch": 0.8420261536149106, + "flos": 25306569887760.0, + "grad_norm": 1.8495047790483476, + "language_loss": 0.76698208, + "learning_rate": 2.560341831785724e-07, + "loss": 0.7906729, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13427734, + "step": 14005, + "time_per_iteration": 2.9217426776885986 + }, + { + "auxiliary_loss_clip": 0.01333146, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.22213578, + "balance_loss_mlp": 1.01811349, + "epoch": 0.8420862768675785, + "flos": 18767115426240.0, + "grad_norm": 1.9166324551649723, + "language_loss": 0.77606577, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79971325, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13494873, + "step": 14006, + "time_per_iteration": 2.744722604751587 + }, + { + "auxiliary_loss_clip": 0.01325502, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.21818137, + "balance_loss_mlp": 1.01913762, + "epoch": 0.8421464001202466, + "flos": 18332038189800.0, + "grad_norm": 1.716008883412255, + "language_loss": 0.77613962, + "learning_rate": 2.556530041751932e-07, + "loss": 0.7997117, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12579346, + "step": 14007, + "time_per_iteration": 2.79189133644104 + }, + { + "auxiliary_loss_clip": 0.01330441, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.2209146, + "balance_loss_mlp": 1.01791716, + "epoch": 0.8422065233729145, + "flos": 31543059399840.0, + "grad_norm": 2.0218813013767765, + "language_loss": 0.65684968, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68046445, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13122559, + "step": 14008, + "time_per_iteration": 2.9643285274505615 + }, + { + "auxiliary_loss_clip": 0.01145183, + "auxiliary_loss_mlp": 0.01007066, + "balance_loss_clip": 1.10138869, + "balance_loss_mlp": 1.00434768, + "epoch": 0.8422666466255825, + "flos": 64312621234200.0, + "grad_norm": 0.7082709424712657, + "language_loss": 0.56993306, + "learning_rate": 2.552720897550631e-07, + "loss": 0.59145558, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02722168, + "step": 14009, + "time_per_iteration": 3.3729333877563477 + }, + { + "auxiliary_loss_clip": 0.01325615, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.21965742, + "balance_loss_mlp": 1.0208962, + "epoch": 0.8423267698782504, + "flos": 24322153825080.0, + "grad_norm": 1.2815835679754217, + "language_loss": 0.78028882, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80387163, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11773682, + "step": 14010, + "time_per_iteration": 2.7683539390563965 + }, + { + "auxiliary_loss_clip": 0.01331824, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.22165334, + "balance_loss_mlp": 1.02159214, + "epoch": 0.8423868931309184, + "flos": 18300664908720.0, + "grad_norm": 1.6224797271294513, + "language_loss": 0.72596943, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74964297, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1394043, + "step": 14011, + "time_per_iteration": 2.864877700805664 + }, + { + "auxiliary_loss_clip": 0.01324707, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.21714246, + "balance_loss_mlp": 1.02155709, + "epoch": 0.8424470163835863, + "flos": 23555499726600.0, + "grad_norm": 1.7811838028356075, + "language_loss": 0.84438884, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86797673, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12524414, + "step": 14012, + "time_per_iteration": 2.785257339477539 + }, + { + "auxiliary_loss_clip": 0.01311735, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.21073389, + "balance_loss_mlp": 1.01704013, + "epoch": 0.8425071396362543, + "flos": 23774845416840.0, + "grad_norm": 2.087228250171384, + "language_loss": 0.68329239, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70669365, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.11358643, + "step": 14013, + "time_per_iteration": 2.7397568225860596 + }, + { + "auxiliary_loss_clip": 0.01336287, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.2239567, + "balance_loss_mlp": 1.02017403, + "epoch": 0.8425672628889223, + "flos": 16183071068760.0, + "grad_norm": 17.918684752518306, + "language_loss": 0.78882122, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81251609, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13049316, + "step": 14014, + "time_per_iteration": 2.7162399291992188 + }, + { + "auxiliary_loss_clip": 0.01326421, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.21874118, + "balance_loss_mlp": 1.01656222, + "epoch": 0.8426273861415903, + "flos": 23154394706280.0, + "grad_norm": 1.6023013942100774, + "language_loss": 0.67979681, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.70335317, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12640381, + "step": 14015, + "time_per_iteration": 2.7127673625946045 + }, + { + "auxiliary_loss_clip": 0.01332614, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.2238934, + "balance_loss_mlp": 1.01527822, + "epoch": 0.8426875093942582, + "flos": 17462330408880.0, + "grad_norm": 8.1759957374782, + "language_loss": 0.76627153, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78988326, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13269043, + "step": 14016, + "time_per_iteration": 2.797271251678467 + }, + { + "auxiliary_loss_clip": 0.01325411, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.21684515, + "balance_loss_mlp": 1.01899588, + "epoch": 0.8427476326469262, + "flos": 19644579403920.0, + "grad_norm": 3.9351609280917534, + "language_loss": 0.79480082, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81837445, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.1295166, + "step": 14017, + "time_per_iteration": 2.709092617034912 + }, + { + "auxiliary_loss_clip": 0.01328088, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.21993244, + "balance_loss_mlp": 1.01858544, + "epoch": 0.8428077558995941, + "flos": 11944066203000.0, + "grad_norm": 2.462922575836365, + "language_loss": 0.62458956, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64817685, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12060547, + "step": 14018, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.01325743, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.21714616, + "balance_loss_mlp": 1.01979399, + "epoch": 0.8428678791522621, + "flos": 10455288395760.0, + "grad_norm": 1.9269321785947844, + "language_loss": 0.79630935, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81988502, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12036133, + "step": 14019, + "time_per_iteration": 4.244307041168213 + }, + { + "auxiliary_loss_clip": 0.01328619, + "auxiliary_loss_mlp": 0.01034428, + "balance_loss_clip": 1.21919048, + "balance_loss_mlp": 1.02180982, + "epoch": 0.8429280024049302, + "flos": 28773154267920.0, + "grad_norm": 2.3758588188073855, + "language_loss": 0.7890805, + "learning_rate": 2.531817924498265e-07, + "loss": 0.81271094, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.1262207, + "step": 14020, + "time_per_iteration": 2.908891439437866 + }, + { + "auxiliary_loss_clip": 0.01333989, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.22556448, + "balance_loss_mlp": 1.01412082, + "epoch": 0.8429881256575981, + "flos": 19541891196720.0, + "grad_norm": 1.5103155325179558, + "language_loss": 0.71187627, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.7354902, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1328125, + "step": 14021, + "time_per_iteration": 4.313607931137085 + }, + { + "auxiliary_loss_clip": 0.01335112, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.22503257, + "balance_loss_mlp": 1.02023602, + "epoch": 0.8430482489102661, + "flos": 24796401147720.0, + "grad_norm": 1.530730692610556, + "language_loss": 0.69721365, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.72090042, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.13317871, + "step": 14022, + "time_per_iteration": 4.272261142730713 + }, + { + "auxiliary_loss_clip": 0.01333912, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.22384071, + "balance_loss_mlp": 1.02163744, + "epoch": 0.843108372162934, + "flos": 21549365499600.0, + "grad_norm": 1.8821661027063419, + "language_loss": 0.72307938, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74676698, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13208008, + "step": 14023, + "time_per_iteration": 2.78841495513916 + }, + { + "auxiliary_loss_clip": 0.01325695, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.21858382, + "balance_loss_mlp": 1.01983356, + "epoch": 0.843168495415602, + "flos": 24614276342400.0, + "grad_norm": 1.4183024656660654, + "language_loss": 0.66896546, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69255543, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1348877, + "step": 14024, + "time_per_iteration": 2.798722982406616 + }, + { + "auxiliary_loss_clip": 0.01324037, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.21788132, + "balance_loss_mlp": 1.02049255, + "epoch": 0.8432286186682699, + "flos": 15127015213080.0, + "grad_norm": 1.8449813248286815, + "language_loss": 0.80929238, + "learning_rate": 2.522343063158261e-07, + "loss": 0.8328687, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13098145, + "step": 14025, + "time_per_iteration": 2.7865231037139893 + }, + { + "auxiliary_loss_clip": 0.01322688, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.21773624, + "balance_loss_mlp": 1.02192652, + "epoch": 0.843288741920938, + "flos": 20306596093920.0, + "grad_norm": 1.5149126794094894, + "language_loss": 0.77924705, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.80280042, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.10723877, + "step": 14026, + "time_per_iteration": 2.7668168544769287 + }, + { + "auxiliary_loss_clip": 0.01319877, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.21429884, + "balance_loss_mlp": 1.02479053, + "epoch": 0.8433488651736059, + "flos": 23337453503880.0, + "grad_norm": 1.4133135862926915, + "language_loss": 0.82718861, + "learning_rate": 2.518557757400945e-07, + "loss": 0.8507812, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.14599609, + "step": 14027, + "time_per_iteration": 4.356168508529663 + }, + { + "auxiliary_loss_clip": 0.01324806, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.21788716, + "balance_loss_mlp": 1.02010465, + "epoch": 0.8434089884262739, + "flos": 39465476617680.0, + "grad_norm": 1.4856249470217524, + "language_loss": 0.56626678, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58984435, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12835693, + "step": 14028, + "time_per_iteration": 2.908022165298462 + }, + { + "auxiliary_loss_clip": 0.0132841, + "auxiliary_loss_mlp": 0.01029536, + "balance_loss_clip": 1.220994, + "balance_loss_mlp": 1.01759148, + "epoch": 0.8434691116789418, + "flos": 23774114466360.0, + "grad_norm": 1.8232174576364546, + "language_loss": 0.63863909, + "learning_rate": 2.51477510323578e-07, + "loss": 0.66221857, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.1194458, + "step": 14029, + "time_per_iteration": 2.748358964920044 + }, + { + "auxiliary_loss_clip": 0.01319521, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.21603203, + "balance_loss_mlp": 1.02105308, + "epoch": 0.8435292349316098, + "flos": 22676167764360.0, + "grad_norm": 1.5627364555764542, + "language_loss": 0.75674844, + "learning_rate": 2.51288477067956e-07, + "loss": 0.78027385, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.11962891, + "step": 14030, + "time_per_iteration": 2.7957303524017334 + }, + { + "auxiliary_loss_clip": 0.01316525, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.21182525, + "balance_loss_mlp": 1.01693642, + "epoch": 0.8435893581842777, + "flos": 18848298183840.0, + "grad_norm": 1.6464764360849078, + "language_loss": 0.83871222, + "learning_rate": 2.510995101236502e-07, + "loss": 0.86216921, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.12225342, + "step": 14031, + "time_per_iteration": 2.7863855361938477 + }, + { + "auxiliary_loss_clip": 0.01322802, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.21662164, + "balance_loss_mlp": 1.01735902, + "epoch": 0.8436494814369457, + "flos": 20709122406840.0, + "grad_norm": 3.3762041155150784, + "language_loss": 0.80502999, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82854927, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11761475, + "step": 14032, + "time_per_iteration": 2.7969491481781006 + }, + { + "auxiliary_loss_clip": 0.01327923, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.21924782, + "balance_loss_mlp": 1.02135825, + "epoch": 0.8437096046896138, + "flos": 22679294608080.0, + "grad_norm": 1.355637104477443, + "language_loss": 0.75465143, + "learning_rate": 2.507217751976478e-07, + "loss": 0.7782855, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.14129639, + "step": 14033, + "time_per_iteration": 2.7613730430603027 + }, + { + "auxiliary_loss_clip": 0.0132998, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.22151041, + "balance_loss_mlp": 1.0171324, + "epoch": 0.8437697279422817, + "flos": 16184451753000.0, + "grad_norm": 1.7595259652638986, + "language_loss": 0.83781374, + "learning_rate": 2.505330072302743e-07, + "loss": 0.86139882, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.11395264, + "step": 14034, + "time_per_iteration": 2.818160057067871 + }, + { + "auxiliary_loss_clip": 0.01327057, + "auxiliary_loss_mlp": 0.01028463, + "balance_loss_clip": 1.21790409, + "balance_loss_mlp": 1.01562989, + "epoch": 0.8438298511949497, + "flos": 28771570541880.0, + "grad_norm": 1.5094657482811826, + "language_loss": 0.78560054, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80915576, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12823486, + "step": 14035, + "time_per_iteration": 2.834750175476074 + }, + { + "auxiliary_loss_clip": 0.01328428, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.22061896, + "balance_loss_mlp": 1.0179801, + "epoch": 0.8438899744476176, + "flos": 33730181398080.0, + "grad_norm": 1.3050822128557125, + "language_loss": 0.72499025, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74857986, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12554932, + "step": 14036, + "time_per_iteration": 2.8729052543640137 + }, + { + "auxiliary_loss_clip": 0.01314935, + "auxiliary_loss_mlp": 0.0102535, + "balance_loss_clip": 1.21146274, + "balance_loss_mlp": 1.01457977, + "epoch": 0.8439500977002856, + "flos": 25115186376360.0, + "grad_norm": 1.6328052052692206, + "language_loss": 0.69670904, + "learning_rate": 2.49967101396557e-07, + "loss": 0.72011185, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.10760498, + "step": 14037, + "time_per_iteration": 2.7894248962402344 + }, + { + "auxiliary_loss_clip": 0.01322036, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.21571457, + "balance_loss_mlp": 1.01802039, + "epoch": 0.8440102209529535, + "flos": 32856372172800.0, + "grad_norm": 2.280718525462674, + "language_loss": 0.69029272, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71381438, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12115479, + "step": 14038, + "time_per_iteration": 2.832354784011841 + }, + { + "auxiliary_loss_clip": 0.01326622, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.21832848, + "balance_loss_mlp": 1.02284312, + "epoch": 0.8440703442056215, + "flos": 23735188030320.0, + "grad_norm": 1.506454109890129, + "language_loss": 0.76266968, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78629375, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12969971, + "step": 14039, + "time_per_iteration": 2.8052737712860107 + }, + { + "auxiliary_loss_clip": 0.01339223, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.22731948, + "balance_loss_mlp": 1.02068424, + "epoch": 0.8441304674582895, + "flos": 20198588191560.0, + "grad_norm": 1.9923570996401154, + "language_loss": 0.79382622, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81755316, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.12792969, + "step": 14040, + "time_per_iteration": 2.7634341716766357 + }, + { + "auxiliary_loss_clip": 0.01327591, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.22063518, + "balance_loss_mlp": 1.02031159, + "epoch": 0.8441905907109575, + "flos": 20223423526680.0, + "grad_norm": 2.1925247150645606, + "language_loss": 0.69264984, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71626031, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.13140869, + "step": 14041, + "time_per_iteration": 2.7720630168914795 + }, + { + "auxiliary_loss_clip": 0.01334991, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.22453618, + "balance_loss_mlp": 1.02596426, + "epoch": 0.8442507139636254, + "flos": 13520402280360.0, + "grad_norm": 1.8268127766267988, + "language_loss": 0.69638395, + "learning_rate": 2.490252523307341e-07, + "loss": 0.72012317, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12969971, + "step": 14042, + "time_per_iteration": 2.751438856124878 + }, + { + "auxiliary_loss_clip": 0.01319898, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.21519637, + "balance_loss_mlp": 1.02062035, + "epoch": 0.8443108372162934, + "flos": 18224395762680.0, + "grad_norm": 1.700754334991989, + "language_loss": 0.75027287, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.77379537, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11737061, + "step": 14043, + "time_per_iteration": 2.8062074184417725 + }, + { + "auxiliary_loss_clip": 0.01323537, + "auxiliary_loss_mlp": 0.01028197, + "balance_loss_clip": 1.21640909, + "balance_loss_mlp": 1.01617503, + "epoch": 0.8443709604689613, + "flos": 16109157207600.0, + "grad_norm": 1.9364833742595706, + "language_loss": 0.71941394, + "learning_rate": 2.486489774343865e-07, + "loss": 0.74293125, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12017822, + "step": 14044, + "time_per_iteration": 2.8305845260620117 + }, + { + "auxiliary_loss_clip": 0.01317721, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.21215081, + "balance_loss_mlp": 1.01802635, + "epoch": 0.8444310837216293, + "flos": 18516355846560.0, + "grad_norm": 1.5492510843045844, + "language_loss": 0.74893737, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77242243, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12762451, + "step": 14045, + "time_per_iteration": 2.755573034286499 + }, + { + "auxiliary_loss_clip": 0.01324164, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.21671367, + "balance_loss_mlp": 1.02234602, + "epoch": 0.8444912069742974, + "flos": 14944037632200.0, + "grad_norm": 1.7207925497609342, + "language_loss": 0.78351831, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80710548, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12207031, + "step": 14046, + "time_per_iteration": 2.918876886367798 + }, + { + "auxiliary_loss_clip": 0.01326101, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.21701169, + "balance_loss_mlp": 1.01763082, + "epoch": 0.8445513302269653, + "flos": 20125121022360.0, + "grad_norm": 1.9397611859548725, + "language_loss": 0.77891469, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80248916, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13702393, + "step": 14047, + "time_per_iteration": 2.9382545948028564 + }, + { + "auxiliary_loss_clip": 0.01323521, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.21764648, + "balance_loss_mlp": 1.02102709, + "epoch": 0.8446114534796333, + "flos": 31176576329400.0, + "grad_norm": 2.1601168380497935, + "language_loss": 0.72046697, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74403775, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12542725, + "step": 14048, + "time_per_iteration": 2.9189305305480957 + }, + { + "auxiliary_loss_clip": 0.01327907, + "auxiliary_loss_mlp": 0.01039657, + "balance_loss_clip": 1.22065067, + "balance_loss_mlp": 1.02687132, + "epoch": 0.8446715767323012, + "flos": 23953234253040.0, + "grad_norm": 1.4429458114927465, + "language_loss": 0.73553348, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75920904, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12780762, + "step": 14049, + "time_per_iteration": 2.804918050765991 + }, + { + "auxiliary_loss_clip": 0.01142992, + "auxiliary_loss_mlp": 0.01004486, + "balance_loss_clip": 1.09895706, + "balance_loss_mlp": 1.00173175, + "epoch": 0.8447316999849692, + "flos": 68000094422280.0, + "grad_norm": 0.8075290373769206, + "language_loss": 0.60765839, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62913316, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02758789, + "step": 14050, + "time_per_iteration": 3.1923635005950928 + }, + { + "auxiliary_loss_clip": 0.0132156, + "auxiliary_loss_mlp": 0.01027365, + "balance_loss_clip": 1.21446502, + "balance_loss_mlp": 1.01479459, + "epoch": 0.8447918232376371, + "flos": 22424027500440.0, + "grad_norm": 2.1573510389464574, + "language_loss": 0.72337562, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74686491, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12573242, + "step": 14051, + "time_per_iteration": 2.7853360176086426 + }, + { + "auxiliary_loss_clip": 0.01320069, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.21457911, + "balance_loss_mlp": 1.02084327, + "epoch": 0.8448519464903052, + "flos": 23699185396200.0, + "grad_norm": 1.8534054843535226, + "language_loss": 0.75103265, + "learning_rate": 2.471465348753547e-07, + "loss": 0.77456546, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12353516, + "step": 14052, + "time_per_iteration": 2.720451831817627 + }, + { + "auxiliary_loss_clip": 0.01312397, + "auxiliary_loss_mlp": 0.01024206, + "balance_loss_clip": 1.21133137, + "balance_loss_mlp": 1.01331043, + "epoch": 0.8449120697429731, + "flos": 13739951012400.0, + "grad_norm": 1.6443206368953036, + "language_loss": 0.73965812, + "learning_rate": 2.469590285884575e-07, + "loss": 0.76302415, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.10894775, + "step": 14053, + "time_per_iteration": 2.7512295246124268 + }, + { + "auxiliary_loss_clip": 0.01320466, + "auxiliary_loss_mlp": 0.01024781, + "balance_loss_clip": 1.21430564, + "balance_loss_mlp": 1.01235354, + "epoch": 0.8449721929956411, + "flos": 20891734512480.0, + "grad_norm": 2.4210454409577316, + "language_loss": 0.74300385, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76645631, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12432861, + "step": 14054, + "time_per_iteration": 2.7406957149505615 + }, + { + "auxiliary_loss_clip": 0.01339262, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.22669339, + "balance_loss_mlp": 1.01542974, + "epoch": 0.845032316248309, + "flos": 33223220718480.0, + "grad_norm": 1.3892049474166381, + "language_loss": 0.78345037, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.8071267, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12957764, + "step": 14055, + "time_per_iteration": 2.8557660579681396 + }, + { + "auxiliary_loss_clip": 0.01320144, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.21463108, + "balance_loss_mlp": 1.02079153, + "epoch": 0.845092439500977, + "flos": 23590324718280.0, + "grad_norm": 1.694419563385666, + "language_loss": 0.73155701, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75509036, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12402344, + "step": 14056, + "time_per_iteration": 2.8259451389312744 + }, + { + "auxiliary_loss_clip": 0.01334761, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.22353983, + "balance_loss_mlp": 1.0212431, + "epoch": 0.8451525627536449, + "flos": 13337830783080.0, + "grad_norm": 2.298957668588145, + "language_loss": 0.67251229, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69620085, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12860107, + "step": 14057, + "time_per_iteration": 2.802845001220703 + }, + { + "auxiliary_loss_clip": 0.01327679, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.21878767, + "balance_loss_mlp": 1.0183996, + "epoch": 0.8452126860063129, + "flos": 27823685022000.0, + "grad_norm": 1.9288532704370989, + "language_loss": 0.77526724, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79885513, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12719727, + "step": 14058, + "time_per_iteration": 4.2562384605407715 + }, + { + "auxiliary_loss_clip": 0.01330715, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.22151828, + "balance_loss_mlp": 1.02302742, + "epoch": 0.845272809258981, + "flos": 27130295050920.0, + "grad_norm": 1.484760682503585, + "language_loss": 0.69845206, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72211772, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12835693, + "step": 14059, + "time_per_iteration": 4.378859758377075 + }, + { + "auxiliary_loss_clip": 0.01335314, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.22280645, + "balance_loss_mlp": 1.02024329, + "epoch": 0.8453329325116489, + "flos": 18337154843160.0, + "grad_norm": 1.8984416714308112, + "language_loss": 0.5760566, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59975302, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14093018, + "step": 14060, + "time_per_iteration": 4.2386579513549805 + }, + { + "auxiliary_loss_clip": 0.01335443, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.22416043, + "balance_loss_mlp": 1.02059102, + "epoch": 0.8453930557643169, + "flos": 22680512858880.0, + "grad_norm": 1.631446471696984, + "language_loss": 0.76007771, + "learning_rate": 2.454613720076277e-07, + "loss": 0.7837739, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13604736, + "step": 14061, + "time_per_iteration": 2.8264970779418945 + }, + { + "auxiliary_loss_clip": 0.01332687, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.22227907, + "balance_loss_mlp": 1.01777577, + "epoch": 0.8454531790169848, + "flos": 22491768890880.0, + "grad_norm": 2.0766102071616164, + "language_loss": 0.71669286, + "learning_rate": 2.452744642558013e-07, + "loss": 0.74033147, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13397217, + "step": 14062, + "time_per_iteration": 2.799557685852051 + }, + { + "auxiliary_loss_clip": 0.01143716, + "auxiliary_loss_mlp": 0.01006644, + "balance_loss_clip": 1.10007262, + "balance_loss_mlp": 1.0038662, + "epoch": 0.8455133022696528, + "flos": 58291538401440.0, + "grad_norm": 0.7213851548401365, + "language_loss": 0.52754533, + "learning_rate": 2.450876230433432e-07, + "loss": 0.5490489, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02783203, + "step": 14063, + "time_per_iteration": 3.339625835418701 + }, + { + "auxiliary_loss_clip": 0.01316802, + "auxiliary_loss_mlp": 0.01025329, + "balance_loss_clip": 1.21327651, + "balance_loss_mlp": 1.01388526, + "epoch": 0.8455734255223207, + "flos": 21366306702000.0, + "grad_norm": 1.720695780373687, + "language_loss": 0.82066548, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84408683, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.11437988, + "step": 14064, + "time_per_iteration": 2.7726070880889893 + }, + { + "auxiliary_loss_clip": 0.01335018, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.22557151, + "balance_loss_mlp": 1.01770806, + "epoch": 0.8456335487749888, + "flos": 20454180166080.0, + "grad_norm": 1.724946742256883, + "language_loss": 0.72672904, + "learning_rate": 2.447141402648685e-07, + "loss": 0.75039256, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.1362915, + "step": 14065, + "time_per_iteration": 2.807060956954956 + }, + { + "auxiliary_loss_clip": 0.01320576, + "auxiliary_loss_mlp": 0.01026868, + "balance_loss_clip": 1.21544611, + "balance_loss_mlp": 1.01545405, + "epoch": 0.8456936720276567, + "flos": 28846905695640.0, + "grad_norm": 1.4761470489027397, + "language_loss": 0.77563965, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79911411, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11419678, + "step": 14066, + "time_per_iteration": 4.423535346984863 + }, + { + "auxiliary_loss_clip": 0.01329469, + "auxiliary_loss_mlp": 0.01027708, + "balance_loss_clip": 1.22185183, + "balance_loss_mlp": 1.01477373, + "epoch": 0.8457537952803247, + "flos": 22677832707120.0, + "grad_norm": 1.5187341305224478, + "language_loss": 0.7033397, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72691143, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12927246, + "step": 14067, + "time_per_iteration": 2.900799036026001 + }, + { + "auxiliary_loss_clip": 0.01315731, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.21045661, + "balance_loss_mlp": 1.01397252, + "epoch": 0.8458139185329926, + "flos": 33809577387840.0, + "grad_norm": 2.1862910547994225, + "language_loss": 0.71312654, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73655403, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13049316, + "step": 14068, + "time_per_iteration": 2.955703020095825 + }, + { + "auxiliary_loss_clip": 0.01145352, + "auxiliary_loss_mlp": 0.01008317, + "balance_loss_clip": 1.10112929, + "balance_loss_mlp": 1.00553954, + "epoch": 0.8458740417856606, + "flos": 70313051392920.0, + "grad_norm": 0.6969539847639108, + "language_loss": 0.60526377, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62680048, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02783203, + "step": 14069, + "time_per_iteration": 3.405254364013672 + }, + { + "auxiliary_loss_clip": 0.01327762, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.21773529, + "balance_loss_mlp": 1.01872337, + "epoch": 0.8459341650383285, + "flos": 24176275304040.0, + "grad_norm": 1.4599646574099654, + "language_loss": 0.74742472, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.77101374, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.12408447, + "step": 14070, + "time_per_iteration": 2.7971127033233643 + }, + { + "auxiliary_loss_clip": 0.01321654, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.21452618, + "balance_loss_mlp": 1.01692247, + "epoch": 0.8459942882909965, + "flos": 38188369520640.0, + "grad_norm": 1.6318236414630942, + "language_loss": 0.66981411, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69333017, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13037109, + "step": 14071, + "time_per_iteration": 2.8951022624969482 + }, + { + "auxiliary_loss_clip": 0.01144755, + "auxiliary_loss_mlp": 0.01005283, + "balance_loss_clip": 1.10026288, + "balance_loss_mlp": 1.00274372, + "epoch": 0.8460544115436646, + "flos": 64133054755560.0, + "grad_norm": 1.0885385114152628, + "language_loss": 0.61040562, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63190603, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.02539062, + "step": 14072, + "time_per_iteration": 3.098278284072876 + }, + { + "auxiliary_loss_clip": 0.01330986, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.22128785, + "balance_loss_mlp": 1.01930916, + "epoch": 0.8461145347963325, + "flos": 24176234695680.0, + "grad_norm": 1.6757297080613223, + "language_loss": 0.72651112, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.75015843, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14428711, + "step": 14073, + "time_per_iteration": 2.793501377105713 + }, + { + "auxiliary_loss_clip": 0.01343572, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.22885466, + "balance_loss_mlp": 1.01788628, + "epoch": 0.8461746580490005, + "flos": 34900092759960.0, + "grad_norm": 1.923712755912018, + "language_loss": 0.78275371, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80651128, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.1428833, + "step": 14074, + "time_per_iteration": 2.890875816345215 + }, + { + "auxiliary_loss_clip": 0.01324798, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.2165451, + "balance_loss_mlp": 1.02098179, + "epoch": 0.8462347813016684, + "flos": 25562567945880.0, + "grad_norm": 6.438886922376025, + "language_loss": 0.75722963, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.78082466, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13708496, + "step": 14075, + "time_per_iteration": 2.8130011558532715 + }, + { + "auxiliary_loss_clip": 0.01319804, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.21344411, + "balance_loss_mlp": 1.0176791, + "epoch": 0.8462949045543364, + "flos": 21330425892960.0, + "grad_norm": 2.05472052684209, + "language_loss": 0.73224294, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75574625, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12860107, + "step": 14076, + "time_per_iteration": 2.879279851913452 + }, + { + "auxiliary_loss_clip": 0.01331998, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.2210294, + "balance_loss_mlp": 1.01821017, + "epoch": 0.8463550278070043, + "flos": 22642520415120.0, + "grad_norm": 1.8023214672105397, + "language_loss": 0.77633619, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79996431, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12609863, + "step": 14077, + "time_per_iteration": 2.8506593704223633 + }, + { + "auxiliary_loss_clip": 0.0133072, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.21918929, + "balance_loss_mlp": 1.02045107, + "epoch": 0.8464151510596724, + "flos": 13009461981480.0, + "grad_norm": 2.2448092943878373, + "language_loss": 0.76051027, + "learning_rate": 2.422929943924643e-07, + "loss": 0.78415704, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.1350708, + "step": 14078, + "time_per_iteration": 2.8870465755462646 + }, + { + "auxiliary_loss_clip": 0.01319842, + "auxiliary_loss_mlp": 0.01023761, + "balance_loss_clip": 1.21474767, + "balance_loss_mlp": 1.01092196, + "epoch": 0.8464752743123403, + "flos": 15709067396280.0, + "grad_norm": 2.486786635385561, + "language_loss": 0.85011661, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87355268, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.128479, + "step": 14079, + "time_per_iteration": 2.9352710247039795 + }, + { + "auxiliary_loss_clip": 0.01345454, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.22971547, + "balance_loss_mlp": 1.02175641, + "epoch": 0.8465353975650083, + "flos": 21659119561440.0, + "grad_norm": 2.3391774474861653, + "language_loss": 0.59087878, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61470133, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.15039062, + "step": 14080, + "time_per_iteration": 3.0301156044006348 + }, + { + "auxiliary_loss_clip": 0.0133299, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.22139299, + "balance_loss_mlp": 1.01741183, + "epoch": 0.8465955208176762, + "flos": 18520335465840.0, + "grad_norm": 2.1249935726430627, + "language_loss": 0.66239357, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68602949, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13189697, + "step": 14081, + "time_per_iteration": 2.8463199138641357 + }, + { + "auxiliary_loss_clip": 0.01331202, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.22088003, + "balance_loss_mlp": 1.02094448, + "epoch": 0.8466556440703442, + "flos": 24205090258440.0, + "grad_norm": 2.69334653669311, + "language_loss": 0.72940481, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75304782, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.121521, + "step": 14082, + "time_per_iteration": 2.8562862873077393 + }, + { + "auxiliary_loss_clip": 0.0132879, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.21825051, + "balance_loss_mlp": 1.01819682, + "epoch": 0.8467157673230121, + "flos": 20380753605240.0, + "grad_norm": 1.6735164662549247, + "language_loss": 0.75678682, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78038317, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12652588, + "step": 14083, + "time_per_iteration": 2.768648386001587 + }, + { + "auxiliary_loss_clip": 0.01332229, + "auxiliary_loss_mlp": 0.01030584, + "balance_loss_clip": 1.2197423, + "balance_loss_mlp": 1.01660085, + "epoch": 0.8467758905756801, + "flos": 28479285591120.0, + "grad_norm": 2.1299914654012393, + "language_loss": 0.66064644, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68427455, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14001465, + "step": 14084, + "time_per_iteration": 2.844146251678467 + }, + { + "auxiliary_loss_clip": 0.01320441, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.21307242, + "balance_loss_mlp": 1.01629901, + "epoch": 0.8468360138283482, + "flos": 11696311641960.0, + "grad_norm": 2.2295678387060427, + "language_loss": 0.70141339, + "learning_rate": 2.409939651426938e-07, + "loss": 0.72490853, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12786865, + "step": 14085, + "time_per_iteration": 2.770246744155884 + }, + { + "auxiliary_loss_clip": 0.01321645, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.21292996, + "balance_loss_mlp": 1.01566243, + "epoch": 0.8468961370810161, + "flos": 24613220525040.0, + "grad_norm": 1.5044352028732695, + "language_loss": 0.71497154, + "learning_rate": 2.408086562860634e-07, + "loss": 0.7384643, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.11956787, + "step": 14086, + "time_per_iteration": 2.829374313354492 + }, + { + "auxiliary_loss_clip": 0.01324359, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.21726739, + "balance_loss_mlp": 1.01359749, + "epoch": 0.8469562603336841, + "flos": 19614586807080.0, + "grad_norm": 2.43639311970601, + "language_loss": 0.74926686, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77277273, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12646484, + "step": 14087, + "time_per_iteration": 2.9162752628326416 + }, + { + "auxiliary_loss_clip": 0.01325354, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.21807265, + "balance_loss_mlp": 1.01200128, + "epoch": 0.847016383586352, + "flos": 22644225966240.0, + "grad_norm": 1.3910346854047169, + "language_loss": 0.73945248, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76294756, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.121521, + "step": 14088, + "time_per_iteration": 2.85884165763855 + }, + { + "auxiliary_loss_clip": 0.01326454, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.21822023, + "balance_loss_mlp": 1.01936591, + "epoch": 0.84707650683902, + "flos": 20965161073320.0, + "grad_norm": 2.1527903563680133, + "language_loss": 0.72874486, + "learning_rate": 2.402531299965387e-07, + "loss": 0.75233078, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12762451, + "step": 14089, + "time_per_iteration": 2.839641571044922 + }, + { + "auxiliary_loss_clip": 0.01319627, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.21504855, + "balance_loss_mlp": 1.01588881, + "epoch": 0.8471366300916879, + "flos": 24097772698200.0, + "grad_norm": 1.4088707593793894, + "language_loss": 0.79183775, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81531, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1171875, + "step": 14090, + "time_per_iteration": 2.813110589981079 + }, + { + "auxiliary_loss_clip": 0.01327038, + "auxiliary_loss_mlp": 0.01032934, + "balance_loss_clip": 1.21688318, + "balance_loss_mlp": 1.0186944, + "epoch": 0.847196753344356, + "flos": 18337520318400.0, + "grad_norm": 1.812572755221106, + "language_loss": 0.76970172, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.79330146, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14239502, + "step": 14091, + "time_per_iteration": 2.954577684402466 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01003841, + "balance_loss_clip": 1.10039485, + "balance_loss_mlp": 1.00102806, + "epoch": 0.8472568765970239, + "flos": 49581367719120.0, + "grad_norm": 0.8451020091904744, + "language_loss": 0.59469283, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61617774, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.02807617, + "step": 14092, + "time_per_iteration": 3.4396419525146484 + }, + { + "auxiliary_loss_clip": 0.01324407, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.21515059, + "balance_loss_mlp": 1.01778483, + "epoch": 0.8473169998496919, + "flos": 19283497245360.0, + "grad_norm": 1.996054652095563, + "language_loss": 0.7032097, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72676897, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1373291, + "step": 14093, + "time_per_iteration": 2.7860634326934814 + }, + { + "auxiliary_loss_clip": 0.01316987, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.21097684, + "balance_loss_mlp": 1.01329386, + "epoch": 0.8473771231023598, + "flos": 17680011156360.0, + "grad_norm": 2.0579677184594827, + "language_loss": 0.83231527, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85573846, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12042236, + "step": 14094, + "time_per_iteration": 2.7734549045562744 + }, + { + "auxiliary_loss_clip": 0.01317377, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.21357834, + "balance_loss_mlp": 1.01799595, + "epoch": 0.8474372463550278, + "flos": 26365427720280.0, + "grad_norm": 1.5307790146052696, + "language_loss": 0.71611094, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73958737, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.12268066, + "step": 14095, + "time_per_iteration": 2.825971841812134 + }, + { + "auxiliary_loss_clip": 0.01320892, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.21445012, + "balance_loss_mlp": 1.01871729, + "epoch": 0.8474973696076957, + "flos": 23407022270520.0, + "grad_norm": 1.7171322991716664, + "language_loss": 0.80807698, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.83159876, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12573242, + "step": 14096, + "time_per_iteration": 4.213594436645508 + }, + { + "auxiliary_loss_clip": 0.0133277, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.22083676, + "balance_loss_mlp": 1.01667404, + "epoch": 0.8475574928603637, + "flos": 25080239559600.0, + "grad_norm": 1.9973456238686522, + "language_loss": 0.7738201, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79744434, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12982178, + "step": 14097, + "time_per_iteration": 4.3905792236328125 + }, + { + "auxiliary_loss_clip": 0.01321009, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.21443474, + "balance_loss_mlp": 1.02190447, + "epoch": 0.8476176161130318, + "flos": 19970471095560.0, + "grad_norm": 1.7814143473543038, + "language_loss": 0.80586958, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82941806, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11932373, + "step": 14098, + "time_per_iteration": 2.8210248947143555 + }, + { + "auxiliary_loss_clip": 0.01323604, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.21754742, + "balance_loss_mlp": 1.02192152, + "epoch": 0.8476777393656997, + "flos": 21290524856280.0, + "grad_norm": 1.6648552809310568, + "language_loss": 0.71770281, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.7412864, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12841797, + "step": 14099, + "time_per_iteration": 4.220022916793823 + }, + { + "auxiliary_loss_clip": 0.0132853, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.22046685, + "balance_loss_mlp": 1.01563263, + "epoch": 0.8477378626183677, + "flos": 29977565754600.0, + "grad_norm": 1.9732974822871596, + "language_loss": 0.64224935, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.66582876, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13787842, + "step": 14100, + "time_per_iteration": 2.927455186843872 + }, + { + "auxiliary_loss_clip": 0.01330585, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.21974051, + "balance_loss_mlp": 1.0191077, + "epoch": 0.8477979858710356, + "flos": 24242351751720.0, + "grad_norm": 2.012408715704018, + "language_loss": 0.73737001, + "learning_rate": 2.380370324111085e-07, + "loss": 0.7610032, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13635254, + "step": 14101, + "time_per_iteration": 2.8600990772247314 + }, + { + "auxiliary_loss_clip": 0.0132835, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.22084594, + "balance_loss_mlp": 1.0152781, + "epoch": 0.8478581091237036, + "flos": 25599260922120.0, + "grad_norm": 1.5772024282691224, + "language_loss": 0.71373564, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73729408, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12219238, + "step": 14102, + "time_per_iteration": 2.9243037700653076 + }, + { + "auxiliary_loss_clip": 0.01333434, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.22132635, + "balance_loss_mlp": 1.01430416, + "epoch": 0.8479182323763715, + "flos": 12060886119480.0, + "grad_norm": 2.3112027315105874, + "language_loss": 0.82426214, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.84787768, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13812256, + "step": 14103, + "time_per_iteration": 2.916046380996704 + }, + { + "auxiliary_loss_clip": 0.01324915, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.21869445, + "balance_loss_mlp": 1.0207237, + "epoch": 0.8479783556290396, + "flos": 21438311970240.0, + "grad_norm": 1.9707498907137169, + "language_loss": 0.79061651, + "learning_rate": 2.374845108533079e-07, + "loss": 0.81419796, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12506104, + "step": 14104, + "time_per_iteration": 2.780095338821411 + }, + { + "auxiliary_loss_clip": 0.01332182, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.22181988, + "balance_loss_mlp": 1.02307594, + "epoch": 0.8480384788817075, + "flos": 19646934688800.0, + "grad_norm": 1.757181945789485, + "language_loss": 0.78913742, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.81282806, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13800049, + "step": 14105, + "time_per_iteration": 2.814270257949829 + }, + { + "auxiliary_loss_clip": 0.01337591, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.22350717, + "balance_loss_mlp": 1.01616883, + "epoch": 0.8480986021343755, + "flos": 22493961742320.0, + "grad_norm": 1.8079302591296793, + "language_loss": 0.50707096, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.53075147, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14306641, + "step": 14106, + "time_per_iteration": 4.629802227020264 + }, + { + "auxiliary_loss_clip": 0.01320683, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.21295047, + "balance_loss_mlp": 1.01915395, + "epoch": 0.8481587253870434, + "flos": 22095618090480.0, + "grad_norm": 2.4342572276060293, + "language_loss": 0.75635982, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77988297, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12469482, + "step": 14107, + "time_per_iteration": 2.7652883529663086 + }, + { + "auxiliary_loss_clip": 0.01325174, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.21728039, + "balance_loss_mlp": 1.01792264, + "epoch": 0.8482188486397114, + "flos": 33589135271880.0, + "grad_norm": 1.5017013585272176, + "language_loss": 0.73395032, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75750321, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12194824, + "step": 14108, + "time_per_iteration": 2.8676633834838867 + }, + { + "auxiliary_loss_clip": 0.01316833, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.21206737, + "balance_loss_mlp": 1.01420557, + "epoch": 0.8482789718923793, + "flos": 20923838744040.0, + "grad_norm": 1.6862236689252734, + "language_loss": 0.72442102, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74786723, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.13568115, + "step": 14109, + "time_per_iteration": 2.797011137008667 + }, + { + "auxiliary_loss_clip": 0.01325084, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.21908998, + "balance_loss_mlp": 1.01995361, + "epoch": 0.8483390951450474, + "flos": 12900073394880.0, + "grad_norm": 2.4611844331402994, + "language_loss": 0.74684012, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.77042234, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13195801, + "step": 14110, + "time_per_iteration": 2.758673906326294 + }, + { + "auxiliary_loss_clip": 0.013266, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.22010756, + "balance_loss_mlp": 1.02115273, + "epoch": 0.8483992183977154, + "flos": 25086980547360.0, + "grad_norm": 1.623600671234546, + "language_loss": 0.76359165, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78719288, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12371826, + "step": 14111, + "time_per_iteration": 2.806786060333252 + }, + { + "auxiliary_loss_clip": 0.01316737, + "auxiliary_loss_mlp": 0.01027998, + "balance_loss_clip": 1.21086276, + "balance_loss_mlp": 1.01659584, + "epoch": 0.8484593416503833, + "flos": 25563055246200.0, + "grad_norm": 1.5060021287921126, + "language_loss": 0.67386544, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69731283, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11413574, + "step": 14112, + "time_per_iteration": 2.77852201461792 + }, + { + "auxiliary_loss_clip": 0.0132843, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.21839261, + "balance_loss_mlp": 1.0156312, + "epoch": 0.8485194649030513, + "flos": 27204168303720.0, + "grad_norm": 1.4371381808581019, + "language_loss": 0.74173975, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.76530826, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12780762, + "step": 14113, + "time_per_iteration": 2.8901045322418213 + }, + { + "auxiliary_loss_clip": 0.01322381, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.2154187, + "balance_loss_mlp": 1.01613188, + "epoch": 0.8485795881557192, + "flos": 24211140904080.0, + "grad_norm": 2.4366342022644965, + "language_loss": 0.66842413, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.69193172, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12243652, + "step": 14114, + "time_per_iteration": 2.895359516143799 + }, + { + "auxiliary_loss_clip": 0.01328839, + "auxiliary_loss_mlp": 0.01030325, + "balance_loss_clip": 1.21868563, + "balance_loss_mlp": 1.01767111, + "epoch": 0.8486397114083872, + "flos": 21146554928160.0, + "grad_norm": 1.632758214571333, + "language_loss": 0.79133874, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.81493038, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12646484, + "step": 14115, + "time_per_iteration": 2.9599435329437256 + }, + { + "auxiliary_loss_clip": 0.01327789, + "auxiliary_loss_mlp": 0.0102694, + "balance_loss_clip": 1.21839511, + "balance_loss_mlp": 1.01478648, + "epoch": 0.8486998346610551, + "flos": 19979607976560.0, + "grad_norm": 1.9326884826461386, + "language_loss": 0.79305154, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81659889, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.121521, + "step": 14116, + "time_per_iteration": 2.8067071437835693 + }, + { + "auxiliary_loss_clip": 0.0132835, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.2168864, + "balance_loss_mlp": 1.02272511, + "epoch": 0.8487599579137232, + "flos": 19797117696000.0, + "grad_norm": 1.8123776895661816, + "language_loss": 0.68850839, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.71215284, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.1338501, + "step": 14117, + "time_per_iteration": 2.9209132194519043 + }, + { + "auxiliary_loss_clip": 0.0132725, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.21694648, + "balance_loss_mlp": 1.01374388, + "epoch": 0.8488200811663911, + "flos": 26401674004560.0, + "grad_norm": 1.925837063755584, + "language_loss": 0.65119618, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67473489, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12884521, + "step": 14118, + "time_per_iteration": 2.913005828857422 + }, + { + "auxiliary_loss_clip": 0.01322348, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.2155838, + "balance_loss_mlp": 1.01541293, + "epoch": 0.8488802044190591, + "flos": 16363206064440.0, + "grad_norm": 1.5017757238929343, + "language_loss": 0.73441386, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75791204, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12054443, + "step": 14119, + "time_per_iteration": 2.811004638671875 + }, + { + "auxiliary_loss_clip": 0.01326329, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.21743131, + "balance_loss_mlp": 1.01751447, + "epoch": 0.848940327671727, + "flos": 19213644220200.0, + "grad_norm": 2.28875271672725, + "language_loss": 0.78008872, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80366313, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13604736, + "step": 14120, + "time_per_iteration": 2.8517661094665527 + }, + { + "auxiliary_loss_clip": 0.01331861, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.22215188, + "balance_loss_mlp": 1.01522934, + "epoch": 0.849000450924395, + "flos": 21876150575160.0, + "grad_norm": 2.4603735806978797, + "language_loss": 0.75521731, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77882242, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13439941, + "step": 14121, + "time_per_iteration": 2.817505121231079 + }, + { + "auxiliary_loss_clip": 0.01143882, + "auxiliary_loss_mlp": 0.01004355, + "balance_loss_clip": 1.10017967, + "balance_loss_mlp": 1.00191164, + "epoch": 0.8490605741770629, + "flos": 71183043432360.0, + "grad_norm": 0.8208503210707395, + "language_loss": 0.60232359, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.623806, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02441406, + "step": 14122, + "time_per_iteration": 3.3226544857025146 + }, + { + "auxiliary_loss_clip": 0.01324769, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.21554208, + "balance_loss_mlp": 1.01884389, + "epoch": 0.849120697429731, + "flos": 24978850819920.0, + "grad_norm": 1.7223462041982773, + "language_loss": 0.79981244, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.82337391, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12536621, + "step": 14123, + "time_per_iteration": 2.9088733196258545 + }, + { + "auxiliary_loss_clip": 0.01316501, + "auxiliary_loss_mlp": 0.01026495, + "balance_loss_clip": 1.21321034, + "balance_loss_mlp": 1.0147109, + "epoch": 0.8491808206823989, + "flos": 23036559580800.0, + "grad_norm": 1.9513308930973425, + "language_loss": 0.83253837, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85596836, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.11779785, + "step": 14124, + "time_per_iteration": 2.824800491333008 + }, + { + "auxiliary_loss_clip": 0.01323027, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.21527922, + "balance_loss_mlp": 1.01771021, + "epoch": 0.8492409439350669, + "flos": 23883868528200.0, + "grad_norm": 1.872923952324214, + "language_loss": 0.72425139, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.74779105, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13214111, + "step": 14125, + "time_per_iteration": 2.869041919708252 + }, + { + "auxiliary_loss_clip": 0.01337226, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.22418022, + "balance_loss_mlp": 1.02172804, + "epoch": 0.8493010671877349, + "flos": 22425123926160.0, + "grad_norm": 1.535639949794557, + "language_loss": 0.73535365, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75908071, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13745117, + "step": 14126, + "time_per_iteration": 2.785541534423828 + }, + { + "auxiliary_loss_clip": 0.01322533, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.21514034, + "balance_loss_mlp": 1.01551259, + "epoch": 0.8493611904404028, + "flos": 17533848376800.0, + "grad_norm": 1.396243708953329, + "language_loss": 0.67889917, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.70240521, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12548828, + "step": 14127, + "time_per_iteration": 3.003169536590576 + }, + { + "auxiliary_loss_clip": 0.01332428, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.22117269, + "balance_loss_mlp": 1.01529849, + "epoch": 0.8494213136930708, + "flos": 19467571251960.0, + "grad_norm": 2.3778040255089548, + "language_loss": 0.6969257, + "learning_rate": 2.330860086502211e-07, + "loss": 0.72053462, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.1317749, + "step": 14128, + "time_per_iteration": 2.822244882583618 + }, + { + "auxiliary_loss_clip": 0.01323185, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.21579146, + "balance_loss_mlp": 1.01853669, + "epoch": 0.8494814369457387, + "flos": 18774912231360.0, + "grad_norm": 1.839052163227681, + "language_loss": 0.78335214, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80689704, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12786865, + "step": 14129, + "time_per_iteration": 2.7420029640197754 + }, + { + "auxiliary_loss_clip": 0.01324558, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.21681595, + "balance_loss_mlp": 1.02365136, + "epoch": 0.8495415601984068, + "flos": 23336763161760.0, + "grad_norm": 1.6324084841048008, + "language_loss": 0.68118602, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70479453, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12658691, + "step": 14130, + "time_per_iteration": 2.767159938812256 + }, + { + "auxiliary_loss_clip": 0.01323195, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.21383893, + "balance_loss_mlp": 1.02043676, + "epoch": 0.8496016834510747, + "flos": 26618258326320.0, + "grad_norm": 1.6673011179457435, + "language_loss": 0.71666551, + "learning_rate": 2.3253890747186e-07, + "loss": 0.74023151, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12963867, + "step": 14131, + "time_per_iteration": 2.7783570289611816 + }, + { + "auxiliary_loss_clip": 0.01330975, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.21980214, + "balance_loss_mlp": 1.01411247, + "epoch": 0.8496618067037427, + "flos": 25485446024280.0, + "grad_norm": 1.9096270530461688, + "language_loss": 0.68560231, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70917946, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12634277, + "step": 14132, + "time_per_iteration": 2.796995162963867 + }, + { + "auxiliary_loss_clip": 0.01318236, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.2129097, + "balance_loss_mlp": 1.01948369, + "epoch": 0.8497219299564106, + "flos": 25380118273680.0, + "grad_norm": 1.5918648451300423, + "language_loss": 0.70145619, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72495097, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11767578, + "step": 14133, + "time_per_iteration": 2.764720916748047 + }, + { + "auxiliary_loss_clip": 0.0114459, + "auxiliary_loss_mlp": 0.0100445, + "balance_loss_clip": 1.10043836, + "balance_loss_mlp": 1.00187516, + "epoch": 0.8497820532090786, + "flos": 67797316942920.0, + "grad_norm": 0.7277393556069491, + "language_loss": 0.57621086, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59770131, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02575684, + "step": 14134, + "time_per_iteration": 3.363107681274414 + }, + { + "auxiliary_loss_clip": 0.01330744, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.2201519, + "balance_loss_mlp": 1.01835775, + "epoch": 0.8498421764617465, + "flos": 23445786273120.0, + "grad_norm": 2.212985868394998, + "language_loss": 0.79133159, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81494915, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12658691, + "step": 14135, + "time_per_iteration": 4.249821901321411 + }, + { + "auxiliary_loss_clip": 0.01330687, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.22009552, + "balance_loss_mlp": 1.01649499, + "epoch": 0.8499022997144146, + "flos": 17717150824560.0, + "grad_norm": 1.752750628847471, + "language_loss": 0.63308227, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65668851, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.13452148, + "step": 14136, + "time_per_iteration": 2.723497152328491 + }, + { + "auxiliary_loss_clip": 0.01334305, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.22395658, + "balance_loss_mlp": 1.01739454, + "epoch": 0.8499624229670825, + "flos": 18593193509640.0, + "grad_norm": 1.7273758763795803, + "language_loss": 0.84206295, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86571503, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1350708, + "step": 14137, + "time_per_iteration": 4.4234373569488525 + }, + { + "auxiliary_loss_clip": 0.01316345, + "auxiliary_loss_mlp": 0.01024897, + "balance_loss_clip": 1.21269512, + "balance_loss_mlp": 1.01348877, + "epoch": 0.8500225462197505, + "flos": 24350319045720.0, + "grad_norm": 2.7038699974901816, + "language_loss": 0.78789806, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.81131053, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.11413574, + "step": 14138, + "time_per_iteration": 4.42099666595459 + }, + { + "auxiliary_loss_clip": 0.0132284, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.21433151, + "balance_loss_mlp": 1.01592278, + "epoch": 0.8500826694724185, + "flos": 16549960222800.0, + "grad_norm": 1.7829598241987408, + "language_loss": 0.64727008, + "learning_rate": 2.310829204839073e-07, + "loss": 0.67078388, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12615967, + "step": 14139, + "time_per_iteration": 2.787447214126587 + }, + { + "auxiliary_loss_clip": 0.01322556, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.21476126, + "balance_loss_mlp": 1.01748967, + "epoch": 0.8501427927250864, + "flos": 16293718514520.0, + "grad_norm": 1.5559682182971963, + "language_loss": 0.70488352, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72840166, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.11773682, + "step": 14140, + "time_per_iteration": 2.7708191871643066 + }, + { + "auxiliary_loss_clip": 0.01334939, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.22225296, + "balance_loss_mlp": 1.02006662, + "epoch": 0.8502029159777544, + "flos": 26693593480080.0, + "grad_norm": 1.8596115116951841, + "language_loss": 0.64317477, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66685498, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13018799, + "step": 14141, + "time_per_iteration": 2.8766744136810303 + }, + { + "auxiliary_loss_clip": 0.01327082, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.2186439, + "balance_loss_mlp": 1.01702332, + "epoch": 0.8502630392304223, + "flos": 35597665392120.0, + "grad_norm": 1.680832363025347, + "language_loss": 0.71147549, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.73504651, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13018799, + "step": 14142, + "time_per_iteration": 2.918549060821533 + }, + { + "auxiliary_loss_clip": 0.01328126, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.21935821, + "balance_loss_mlp": 1.01378798, + "epoch": 0.8503231624830904, + "flos": 21654043516440.0, + "grad_norm": 1.5137414352276084, + "language_loss": 0.6568563, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.68040144, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12597656, + "step": 14143, + "time_per_iteration": 2.8646860122680664 + }, + { + "auxiliary_loss_clip": 0.01334852, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.22323895, + "balance_loss_mlp": 1.02232385, + "epoch": 0.8503832857357583, + "flos": 22422362557680.0, + "grad_norm": 2.557818892491556, + "language_loss": 0.68078101, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.70448959, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13684082, + "step": 14144, + "time_per_iteration": 2.905517816543579 + }, + { + "auxiliary_loss_clip": 0.01316696, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.21206379, + "balance_loss_mlp": 1.01726043, + "epoch": 0.8504434089884263, + "flos": 18702663312960.0, + "grad_norm": 4.802734766310455, + "language_loss": 0.65313476, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67660064, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.12634277, + "step": 14145, + "time_per_iteration": 4.225741386413574 + }, + { + "auxiliary_loss_clip": 0.01319379, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.21208596, + "balance_loss_mlp": 1.01933002, + "epoch": 0.8505035322410942, + "flos": 20012402550240.0, + "grad_norm": 1.6402100109101172, + "language_loss": 0.85777807, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.88129866, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13354492, + "step": 14146, + "time_per_iteration": 2.733900308609009 + }, + { + "auxiliary_loss_clip": 0.01319479, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.21228671, + "balance_loss_mlp": 1.01560068, + "epoch": 0.8505636554937622, + "flos": 20816886659040.0, + "grad_norm": 1.706526870451071, + "language_loss": 0.84033895, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.86381078, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12121582, + "step": 14147, + "time_per_iteration": 2.7686474323272705 + }, + { + "auxiliary_loss_clip": 0.0133398, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.22091973, + "balance_loss_mlp": 1.02394772, + "epoch": 0.8506237787464301, + "flos": 14178642392880.0, + "grad_norm": 2.4030032143045155, + "language_loss": 0.85067987, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87439346, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13446045, + "step": 14148, + "time_per_iteration": 2.7657077312469482 + }, + { + "auxiliary_loss_clip": 0.0131938, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.21318173, + "balance_loss_mlp": 1.01524365, + "epoch": 0.8506839019990982, + "flos": 23263620859440.0, + "grad_norm": 1.661255093932797, + "language_loss": 0.72593439, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74941123, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.13049316, + "step": 14149, + "time_per_iteration": 3.119020938873291 + }, + { + "auxiliary_loss_clip": 0.01325814, + "auxiliary_loss_mlp": 0.01024949, + "balance_loss_clip": 1.21810055, + "balance_loss_mlp": 1.01214623, + "epoch": 0.8507440252517661, + "flos": 23664441621240.0, + "grad_norm": 1.5175322161659783, + "language_loss": 0.76085174, + "learning_rate": 2.290879486935804e-07, + "loss": 0.7843594, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12805176, + "step": 14150, + "time_per_iteration": 2.9136831760406494 + }, + { + "auxiliary_loss_clip": 0.01322121, + "auxiliary_loss_mlp": 0.01035141, + "balance_loss_clip": 1.21691823, + "balance_loss_mlp": 1.02272582, + "epoch": 0.8508041485044341, + "flos": 18666010945080.0, + "grad_norm": 1.8077929665455823, + "language_loss": 0.73279977, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.75637245, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12402344, + "step": 14151, + "time_per_iteration": 2.824056625366211 + }, + { + "auxiliary_loss_clip": 0.01142196, + "auxiliary_loss_mlp": 0.01002835, + "balance_loss_clip": 1.0980041, + "balance_loss_mlp": 0.99983084, + "epoch": 0.8508642717571021, + "flos": 52523570433240.0, + "grad_norm": 0.935060794156024, + "language_loss": 0.59657121, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61802155, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.0300293, + "step": 14152, + "time_per_iteration": 3.0281782150268555 + }, + { + "auxiliary_loss_clip": 0.0114466, + "auxiliary_loss_mlp": 0.01007199, + "balance_loss_clip": 1.10098553, + "balance_loss_mlp": 1.00424278, + "epoch": 0.85092439500977, + "flos": 69312351377880.0, + "grad_norm": 0.6882657344548468, + "language_loss": 0.61300087, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63451946, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02954102, + "step": 14153, + "time_per_iteration": 3.262744903564453 + }, + { + "auxiliary_loss_clip": 0.01319706, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.21257567, + "balance_loss_mlp": 1.01885438, + "epoch": 0.850984518262438, + "flos": 24395174302320.0, + "grad_norm": 1.504578939766499, + "language_loss": 0.8106811, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.83420676, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.14001465, + "step": 14154, + "time_per_iteration": 2.9524526596069336 + }, + { + "auxiliary_loss_clip": 0.01312812, + "auxiliary_loss_mlp": 0.01027639, + "balance_loss_clip": 1.20954847, + "balance_loss_mlp": 1.01633239, + "epoch": 0.851044641515106, + "flos": 23300070185520.0, + "grad_norm": 1.6397707565550623, + "language_loss": 0.79640251, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81980705, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.11291504, + "step": 14155, + "time_per_iteration": 2.913330316543579 + }, + { + "auxiliary_loss_clip": 0.0133151, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.22035885, + "balance_loss_mlp": 1.01729321, + "epoch": 0.851104764767774, + "flos": 22054620628080.0, + "grad_norm": 1.664473408827114, + "language_loss": 0.70995289, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.73357272, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13165283, + "step": 14156, + "time_per_iteration": 2.9521560668945312 + }, + { + "auxiliary_loss_clip": 0.01315838, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.21106827, + "balance_loss_mlp": 1.01688838, + "epoch": 0.8511648880204419, + "flos": 20709690923880.0, + "grad_norm": 1.7099667261783158, + "language_loss": 0.73895955, + "learning_rate": 2.278226512621386e-07, + "loss": 0.76241308, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1262207, + "step": 14157, + "time_per_iteration": 2.864602565765381 + }, + { + "auxiliary_loss_clip": 0.01316624, + "auxiliary_loss_mlp": 0.01023657, + "balance_loss_clip": 1.21237099, + "balance_loss_mlp": 1.01250482, + "epoch": 0.8512250112731099, + "flos": 24030031307760.0, + "grad_norm": 1.9185369791866744, + "language_loss": 0.79557461, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81897736, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11151123, + "step": 14158, + "time_per_iteration": 2.9608609676361084 + }, + { + "auxiliary_loss_clip": 0.01330326, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.22236323, + "balance_loss_mlp": 1.02438831, + "epoch": 0.8512851345257778, + "flos": 22020242328360.0, + "grad_norm": 1.9305192898584436, + "language_loss": 0.79070508, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81438261, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13031006, + "step": 14159, + "time_per_iteration": 2.864537000656128 + }, + { + "auxiliary_loss_clip": 0.01327326, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.21929014, + "balance_loss_mlp": 1.01734447, + "epoch": 0.8513452577784458, + "flos": 14834364787080.0, + "grad_norm": 10.04756398517756, + "language_loss": 0.71463907, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73821694, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13116455, + "step": 14160, + "time_per_iteration": 2.945803165435791 + }, + { + "auxiliary_loss_clip": 0.01339121, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.22567701, + "balance_loss_mlp": 1.01715732, + "epoch": 0.8514053810311137, + "flos": 33042070513800.0, + "grad_norm": 2.040981417657672, + "language_loss": 0.70324862, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72695315, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.14154053, + "step": 14161, + "time_per_iteration": 3.07840895652771 + }, + { + "auxiliary_loss_clip": 0.01333697, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.22244883, + "balance_loss_mlp": 1.02054346, + "epoch": 0.8514655042837818, + "flos": 27570367115640.0, + "grad_norm": 2.168466553929014, + "language_loss": 0.78052914, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80420232, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13079834, + "step": 14162, + "time_per_iteration": 2.9852941036224365 + }, + { + "auxiliary_loss_clip": 0.01320546, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.21356583, + "balance_loss_mlp": 1.02138972, + "epoch": 0.8515256275364497, + "flos": 35563165267320.0, + "grad_norm": 1.8314603653756703, + "language_loss": 0.77311814, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.79666471, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12713623, + "step": 14163, + "time_per_iteration": 3.0130317211151123 + }, + { + "auxiliary_loss_clip": 0.01146337, + "auxiliary_loss_mlp": 0.0100486, + "balance_loss_clip": 1.10218632, + "balance_loss_mlp": 1.00195169, + "epoch": 0.8515857507891177, + "flos": 70222447495800.0, + "grad_norm": 0.6876242251828883, + "language_loss": 0.55083311, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57234514, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02905273, + "step": 14164, + "time_per_iteration": 3.3784725666046143 + }, + { + "auxiliary_loss_clip": 0.01324869, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.21666336, + "balance_loss_mlp": 1.02282524, + "epoch": 0.8516458740417857, + "flos": 22680553467240.0, + "grad_norm": 1.6154026295024508, + "language_loss": 0.73170543, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.75530249, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.12023926, + "step": 14165, + "time_per_iteration": 2.8415725231170654 + }, + { + "auxiliary_loss_clip": 0.01316418, + "auxiliary_loss_mlp": 0.01029667, + "balance_loss_clip": 1.21093857, + "balance_loss_mlp": 1.01727557, + "epoch": 0.8517059972944536, + "flos": 22752599343840.0, + "grad_norm": 1.6192681425470319, + "language_loss": 0.67359722, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69705808, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12390137, + "step": 14166, + "time_per_iteration": 2.879171371459961 + }, + { + "auxiliary_loss_clip": 0.01328715, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.22017694, + "balance_loss_mlp": 1.01713181, + "epoch": 0.8517661205471216, + "flos": 21694147594920.0, + "grad_norm": 1.7189319088757748, + "language_loss": 0.73628157, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75986856, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12841797, + "step": 14167, + "time_per_iteration": 2.802293062210083 + }, + { + "auxiliary_loss_clip": 0.01325126, + "auxiliary_loss_mlp": 0.01025151, + "balance_loss_clip": 1.21841621, + "balance_loss_mlp": 1.0123415, + "epoch": 0.8518262437997896, + "flos": 25380402532200.0, + "grad_norm": 1.755949216943061, + "language_loss": 0.80535418, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82885695, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12817383, + "step": 14168, + "time_per_iteration": 2.956406354904175 + }, + { + "auxiliary_loss_clip": 0.0132208, + "auxiliary_loss_mlp": 0.01024362, + "balance_loss_clip": 1.21482015, + "balance_loss_mlp": 1.01254809, + "epoch": 0.8518863670524576, + "flos": 27241186146840.0, + "grad_norm": 1.9755010264255843, + "language_loss": 0.76331156, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78677601, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.11828613, + "step": 14169, + "time_per_iteration": 2.8990843296051025 + }, + { + "auxiliary_loss_clip": 0.01329195, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.21972477, + "balance_loss_mlp": 1.01536119, + "epoch": 0.8519464903051255, + "flos": 20964430122840.0, + "grad_norm": 1.8891004710464805, + "language_loss": 0.64134133, + "learning_rate": 2.254815511000452e-07, + "loss": 0.66491592, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12915039, + "step": 14170, + "time_per_iteration": 2.80936598777771 + }, + { + "auxiliary_loss_clip": 0.0131866, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.21118021, + "balance_loss_mlp": 1.01739001, + "epoch": 0.8520066135577935, + "flos": 18446299779600.0, + "grad_norm": 2.222750799728182, + "language_loss": 0.86687815, + "learning_rate": 2.253019373106384e-07, + "loss": 0.8903656, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.1270752, + "step": 14171, + "time_per_iteration": 2.7299418449401855 + }, + { + "auxiliary_loss_clip": 0.0132925, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.22072661, + "balance_loss_mlp": 1.01699996, + "epoch": 0.8520667368104614, + "flos": 29135941977600.0, + "grad_norm": 1.7481295153179706, + "language_loss": 0.54944873, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.57303751, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12628174, + "step": 14172, + "time_per_iteration": 2.825507402420044 + }, + { + "auxiliary_loss_clip": 0.01316544, + "auxiliary_loss_mlp": 0.01025219, + "balance_loss_clip": 1.21174109, + "balance_loss_mlp": 1.0141865, + "epoch": 0.8521268600631294, + "flos": 16038695057040.0, + "grad_norm": 2.1411593201912873, + "language_loss": 0.70116198, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.72457963, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11022949, + "step": 14173, + "time_per_iteration": 2.8993399143218994 + }, + { + "auxiliary_loss_clip": 0.01323044, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.21428609, + "balance_loss_mlp": 1.01327431, + "epoch": 0.8521869833157973, + "flos": 22459908309480.0, + "grad_norm": 2.155489323382975, + "language_loss": 0.77442145, + "learning_rate": 2.247634997500205e-07, + "loss": 0.79791713, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13244629, + "step": 14174, + "time_per_iteration": 4.2694666385650635 + }, + { + "auxiliary_loss_clip": 0.01330802, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.22078884, + "balance_loss_mlp": 1.0185585, + "epoch": 0.8522471065684654, + "flos": 24977348310600.0, + "grad_norm": 1.601061746344353, + "language_loss": 0.8269729, + "learning_rate": 2.245841551883676e-07, + "loss": 0.85059732, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13098145, + "step": 14175, + "time_per_iteration": 2.822936773300171 + }, + { + "auxiliary_loss_clip": 0.01336153, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.22573853, + "balance_loss_mlp": 1.01459932, + "epoch": 0.8523072298211333, + "flos": 17714876756400.0, + "grad_norm": 2.3375110696237833, + "language_loss": 0.65863812, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.68227512, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.1293335, + "step": 14176, + "time_per_iteration": 4.318183422088623 + }, + { + "auxiliary_loss_clip": 0.01319702, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.2143358, + "balance_loss_mlp": 1.01808119, + "epoch": 0.8523673530738013, + "flos": 25451555024880.0, + "grad_norm": 1.7394751153902948, + "language_loss": 0.78853345, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.81203866, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12738037, + "step": 14177, + "time_per_iteration": 2.883063554763794 + }, + { + "auxiliary_loss_clip": 0.0132967, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.2210182, + "balance_loss_mlp": 1.01375365, + "epoch": 0.8524274763264693, + "flos": 31435701231240.0, + "grad_norm": 1.7422219875230296, + "language_loss": 0.73657709, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.76014674, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13537598, + "step": 14178, + "time_per_iteration": 2.940619945526123 + }, + { + "auxiliary_loss_clip": 0.01329089, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.21954274, + "balance_loss_mlp": 1.02260721, + "epoch": 0.8524875995791372, + "flos": 17717191432920.0, + "grad_norm": 1.7530080523803564, + "language_loss": 0.75109065, + "learning_rate": 2.238674502491935e-07, + "loss": 0.77473873, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13122559, + "step": 14179, + "time_per_iteration": 2.7989907264709473 + }, + { + "auxiliary_loss_clip": 0.01324265, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.21809912, + "balance_loss_mlp": 1.01683474, + "epoch": 0.8525477228318052, + "flos": 21692076568560.0, + "grad_norm": 2.2359430210922495, + "language_loss": 0.82574499, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.84928346, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12750244, + "step": 14180, + "time_per_iteration": 2.7562830448150635 + }, + { + "auxiliary_loss_clip": 0.01322605, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.21476436, + "balance_loss_mlp": 1.02031279, + "epoch": 0.8526078460844732, + "flos": 24832484998560.0, + "grad_norm": 2.260530559717102, + "language_loss": 0.61272299, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63627207, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.11993408, + "step": 14181, + "time_per_iteration": 2.8073573112487793 + }, + { + "auxiliary_loss_clip": 0.01319697, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.2146349, + "balance_loss_mlp": 1.02037024, + "epoch": 0.8526679693371412, + "flos": 13520361672000.0, + "grad_norm": 2.177903522750237, + "language_loss": 0.72008002, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74360287, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12213135, + "step": 14182, + "time_per_iteration": 2.7236316204071045 + }, + { + "auxiliary_loss_clip": 0.01327311, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.22084713, + "balance_loss_mlp": 1.01892531, + "epoch": 0.8527280925898091, + "flos": 23519415875760.0, + "grad_norm": 1.4829948608117196, + "language_loss": 0.70728016, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.73087317, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13067627, + "step": 14183, + "time_per_iteration": 2.851992130279541 + }, + { + "auxiliary_loss_clip": 0.0132018, + "auxiliary_loss_mlp": 0.01030412, + "balance_loss_clip": 1.21450734, + "balance_loss_mlp": 1.01840222, + "epoch": 0.8527882158424771, + "flos": 20307895561440.0, + "grad_norm": 1.6851679682109852, + "language_loss": 0.72426015, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74776608, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12017822, + "step": 14184, + "time_per_iteration": 4.576665878295898 + }, + { + "auxiliary_loss_clip": 0.01320173, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.21312308, + "balance_loss_mlp": 1.01757216, + "epoch": 0.852848339095145, + "flos": 17207063301240.0, + "grad_norm": 1.7251864689786038, + "language_loss": 0.77004838, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.79354537, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.1194458, + "step": 14185, + "time_per_iteration": 2.8493950366973877 + }, + { + "auxiliary_loss_clip": 0.01330832, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.22007763, + "balance_loss_mlp": 1.01380062, + "epoch": 0.852908462347813, + "flos": 18373563560880.0, + "grad_norm": 1.9302994721203401, + "language_loss": 0.80066955, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.82425153, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13568115, + "step": 14186, + "time_per_iteration": 2.805800199508667 + }, + { + "auxiliary_loss_clip": 0.0132982, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.21979523, + "balance_loss_mlp": 1.01692903, + "epoch": 0.8529685856004809, + "flos": 18629561619000.0, + "grad_norm": 1.6268742508064151, + "language_loss": 0.62516725, + "learning_rate": 2.224372736588449e-07, + "loss": 0.6487658, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13098145, + "step": 14187, + "time_per_iteration": 2.723198175430298 + }, + { + "auxiliary_loss_clip": 0.01333466, + "auxiliary_loss_mlp": 0.01027107, + "balance_loss_clip": 1.22112703, + "balance_loss_mlp": 1.01345718, + "epoch": 0.853028708853149, + "flos": 29613844052640.0, + "grad_norm": 1.5903184479255283, + "language_loss": 0.76948392, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.79308963, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13641357, + "step": 14188, + "time_per_iteration": 2.8539204597473145 + }, + { + "auxiliary_loss_clip": 0.01328646, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.21776271, + "balance_loss_mlp": 1.01899886, + "epoch": 0.8530888321058169, + "flos": 26357955782040.0, + "grad_norm": 7.046125670809352, + "language_loss": 0.78338432, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80700183, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.14111328, + "step": 14189, + "time_per_iteration": 2.806906223297119 + }, + { + "auxiliary_loss_clip": 0.01328818, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.21954083, + "balance_loss_mlp": 1.02152753, + "epoch": 0.8531489553584849, + "flos": 20526875776440.0, + "grad_norm": 2.2035433351180025, + "language_loss": 0.79821837, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.82184774, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12591553, + "step": 14190, + "time_per_iteration": 2.767634391784668 + }, + { + "auxiliary_loss_clip": 0.01323531, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.21688652, + "balance_loss_mlp": 1.02102494, + "epoch": 0.8532090786111529, + "flos": 20709203623560.0, + "grad_norm": 1.9186708604384208, + "language_loss": 0.76476866, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78834033, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12609863, + "step": 14191, + "time_per_iteration": 2.7690606117248535 + }, + { + "auxiliary_loss_clip": 0.01323528, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.21696889, + "balance_loss_mlp": 1.01776755, + "epoch": 0.8532692018638208, + "flos": 19833729455520.0, + "grad_norm": 1.7478490512523481, + "language_loss": 0.69763947, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.72118235, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13006592, + "step": 14192, + "time_per_iteration": 2.7112274169921875 + }, + { + "auxiliary_loss_clip": 0.013391, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.22451878, + "balance_loss_mlp": 1.02535367, + "epoch": 0.8533293251164888, + "flos": 21001813441200.0, + "grad_norm": 2.4159167752803365, + "language_loss": 0.63526535, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.6590575, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14788818, + "step": 14193, + "time_per_iteration": 2.814161777496338 + }, + { + "auxiliary_loss_clip": 0.01325008, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.21726871, + "balance_loss_mlp": 1.01614583, + "epoch": 0.8533894483691568, + "flos": 22424799059280.0, + "grad_norm": 1.9377191334469137, + "language_loss": 0.77114534, + "learning_rate": 2.211894078044365e-07, + "loss": 0.79468328, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12646484, + "step": 14194, + "time_per_iteration": 2.8172993659973145 + }, + { + "auxiliary_loss_clip": 0.01326805, + "auxiliary_loss_mlp": 0.0102593, + "balance_loss_clip": 1.21802747, + "balance_loss_mlp": 1.01380634, + "epoch": 0.8534495716218248, + "flos": 21621573809640.0, + "grad_norm": 2.0277722163549847, + "language_loss": 0.70345092, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.7269783, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12127686, + "step": 14195, + "time_per_iteration": 2.7465336322784424 + }, + { + "auxiliary_loss_clip": 0.0132317, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.21486855, + "balance_loss_mlp": 1.01645482, + "epoch": 0.8535096948744927, + "flos": 22351413106800.0, + "grad_norm": 2.091855425062232, + "language_loss": 0.86164731, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88517487, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13134766, + "step": 14196, + "time_per_iteration": 2.9688379764556885 + }, + { + "auxiliary_loss_clip": 0.01145887, + "auxiliary_loss_mlp": 0.01005534, + "balance_loss_clip": 1.10177898, + "balance_loss_mlp": 1.00256562, + "epoch": 0.8535698181271607, + "flos": 52774695488160.0, + "grad_norm": 0.764104987259113, + "language_loss": 0.55089474, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57240891, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02966309, + "step": 14197, + "time_per_iteration": 3.2817838191986084 + }, + { + "auxiliary_loss_clip": 0.01317492, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.21233284, + "balance_loss_mlp": 1.01590145, + "epoch": 0.8536299413798286, + "flos": 19067643874080.0, + "grad_norm": 1.5567881744723795, + "language_loss": 0.81756276, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.84102046, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.12371826, + "step": 14198, + "time_per_iteration": 2.899721622467041 + }, + { + "auxiliary_loss_clip": 0.01322492, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.21646452, + "balance_loss_mlp": 1.01722789, + "epoch": 0.8536900646324966, + "flos": 49353274250280.0, + "grad_norm": 1.354861902496623, + "language_loss": 0.68549186, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70900375, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11462402, + "step": 14199, + "time_per_iteration": 3.0591251850128174 + }, + { + "auxiliary_loss_clip": 0.01315811, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.21170723, + "balance_loss_mlp": 1.01671445, + "epoch": 0.8537501878851645, + "flos": 21767371113960.0, + "grad_norm": 1.5569307605938065, + "language_loss": 0.86526155, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88871264, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.12579346, + "step": 14200, + "time_per_iteration": 2.828129291534424 + }, + { + "auxiliary_loss_clip": 0.01320836, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.21307194, + "balance_loss_mlp": 1.01798904, + "epoch": 0.8538103111378326, + "flos": 22273763276520.0, + "grad_norm": 1.6561311384521036, + "language_loss": 0.77696776, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.80048019, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12432861, + "step": 14201, + "time_per_iteration": 2.787907600402832 + }, + { + "auxiliary_loss_clip": 0.01318707, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.21292102, + "balance_loss_mlp": 1.01563907, + "epoch": 0.8538704343905005, + "flos": 20308910770440.0, + "grad_norm": 1.754023296900446, + "language_loss": 0.69326693, + "learning_rate": 2.19767322694256e-07, + "loss": 0.71672916, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11871338, + "step": 14202, + "time_per_iteration": 2.72868013381958 + }, + { + "auxiliary_loss_clip": 0.0132588, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.21739841, + "balance_loss_mlp": 1.0214299, + "epoch": 0.8539305576431685, + "flos": 24760723380480.0, + "grad_norm": 3.080229291831981, + "language_loss": 0.80578512, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82938409, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12591553, + "step": 14203, + "time_per_iteration": 2.826697587966919 + }, + { + "auxiliary_loss_clip": 0.01327967, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.21786284, + "balance_loss_mlp": 1.0191617, + "epoch": 0.8539906808958365, + "flos": 26693268613200.0, + "grad_norm": 2.063916831771316, + "language_loss": 0.66448307, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.68809426, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13995361, + "step": 14204, + "time_per_iteration": 2.8196446895599365 + }, + { + "auxiliary_loss_clip": 0.0133364, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.22338092, + "balance_loss_mlp": 1.01680064, + "epoch": 0.8540508041485044, + "flos": 13368797980560.0, + "grad_norm": 2.849027346345406, + "language_loss": 0.60570627, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.62934226, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13153076, + "step": 14205, + "time_per_iteration": 2.8092668056488037 + }, + { + "auxiliary_loss_clip": 0.01325657, + "auxiliary_loss_mlp": 0.01026389, + "balance_loss_clip": 1.21865344, + "balance_loss_mlp": 1.01384258, + "epoch": 0.8541109274011724, + "flos": 32787331314840.0, + "grad_norm": 2.300273862852779, + "language_loss": 0.72038984, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74391031, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12548828, + "step": 14206, + "time_per_iteration": 2.8342363834381104 + }, + { + "auxiliary_loss_clip": 0.01328866, + "auxiliary_loss_mlp": 0.01026094, + "balance_loss_clip": 1.21904588, + "balance_loss_mlp": 1.01342165, + "epoch": 0.8541710506538404, + "flos": 17643764872080.0, + "grad_norm": 2.90595393932951, + "language_loss": 0.75729024, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78083992, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12677002, + "step": 14207, + "time_per_iteration": 2.7273905277252197 + }, + { + "auxiliary_loss_clip": 0.0132878, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.2196629, + "balance_loss_mlp": 1.01691222, + "epoch": 0.8542311739065084, + "flos": 20267466616080.0, + "grad_norm": 1.8008308979752303, + "language_loss": 0.84661722, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.8702026, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12854004, + "step": 14208, + "time_per_iteration": 2.8653457164764404 + }, + { + "auxiliary_loss_clip": 0.01325639, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.21652293, + "balance_loss_mlp": 1.02187324, + "epoch": 0.8542912971591763, + "flos": 17790211910160.0, + "grad_norm": 2.652246689213529, + "language_loss": 0.66650867, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.69010735, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12359619, + "step": 14209, + "time_per_iteration": 2.7620325088500977 + }, + { + "auxiliary_loss_clip": 0.01320928, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.2144289, + "balance_loss_mlp": 1.01340461, + "epoch": 0.8543514204118443, + "flos": 26985188088720.0, + "grad_norm": 1.8626736356312488, + "language_loss": 0.70638788, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72985244, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12115479, + "step": 14210, + "time_per_iteration": 2.8686130046844482 + }, + { + "auxiliary_loss_clip": 0.01322637, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.21423006, + "balance_loss_mlp": 1.01400018, + "epoch": 0.8544115436645122, + "flos": 24029950091040.0, + "grad_norm": 1.3519210723575976, + "language_loss": 0.70321357, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72670782, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12786865, + "step": 14211, + "time_per_iteration": 2.765535831451416 + }, + { + "auxiliary_loss_clip": 0.01324512, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.21460772, + "balance_loss_mlp": 1.0226841, + "epoch": 0.8544716669171802, + "flos": 16622371574640.0, + "grad_norm": 2.0490982790066674, + "language_loss": 0.81222081, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.8358258, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13299561, + "step": 14212, + "time_per_iteration": 4.169594764709473 + }, + { + "auxiliary_loss_clip": 0.01323715, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.21464443, + "balance_loss_mlp": 1.02300847, + "epoch": 0.8545317901698481, + "flos": 40013272326240.0, + "grad_norm": 1.812322354542591, + "language_loss": 0.66662085, + "learning_rate": 2.178190108088105e-07, + "loss": 0.69023311, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.14508057, + "step": 14213, + "time_per_iteration": 2.936251640319824 + }, + { + "auxiliary_loss_clip": 0.01322749, + "auxiliary_loss_mlp": 0.01026096, + "balance_loss_clip": 1.21613121, + "balance_loss_mlp": 1.01360273, + "epoch": 0.8545919134225162, + "flos": 19907440274880.0, + "grad_norm": 1.6679869830916112, + "language_loss": 0.78393656, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80742502, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12475586, + "step": 14214, + "time_per_iteration": 4.254047870635986 + }, + { + "auxiliary_loss_clip": 0.01334446, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.22173309, + "balance_loss_mlp": 1.01594806, + "epoch": 0.8546520366751841, + "flos": 18957646162080.0, + "grad_norm": 2.745884510671984, + "language_loss": 0.66388178, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.6875242, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13842773, + "step": 14215, + "time_per_iteration": 4.203487873077393 + }, + { + "auxiliary_loss_clip": 0.01319897, + "auxiliary_loss_mlp": 0.0102771, + "balance_loss_clip": 1.21258187, + "balance_loss_mlp": 1.01553833, + "epoch": 0.8547121599278521, + "flos": 35627576772240.0, + "grad_norm": 1.5713772550186873, + "language_loss": 0.63103366, + "learning_rate": 2.172890718362279e-07, + "loss": 0.65450966, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1217041, + "step": 14216, + "time_per_iteration": 2.9050612449645996 + }, + { + "auxiliary_loss_clip": 0.0132673, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.21757877, + "balance_loss_mlp": 1.02291775, + "epoch": 0.8547722831805201, + "flos": 16914494091960.0, + "grad_norm": 2.032764272563803, + "language_loss": 0.65494883, + "learning_rate": 2.17112560704259e-07, + "loss": 0.6785695, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12414551, + "step": 14217, + "time_per_iteration": 2.7506508827209473 + }, + { + "auxiliary_loss_clip": 0.01322847, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.21717489, + "balance_loss_mlp": 1.01814127, + "epoch": 0.854832406433188, + "flos": 23007785234760.0, + "grad_norm": 2.6441993358303693, + "language_loss": 0.6500662, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67359614, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12005615, + "step": 14218, + "time_per_iteration": 2.8228321075439453 + }, + { + "auxiliary_loss_clip": 0.01328197, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.21796763, + "balance_loss_mlp": 1.0155704, + "epoch": 0.854892529685856, + "flos": 20417487189840.0, + "grad_norm": 1.7145364133963914, + "language_loss": 0.70051473, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72407472, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12237549, + "step": 14219, + "time_per_iteration": 2.932950019836426 + }, + { + "auxiliary_loss_clip": 0.01330736, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.21901369, + "balance_loss_mlp": 1.02326632, + "epoch": 0.854952652938524, + "flos": 16403025884400.0, + "grad_norm": 2.264828376392632, + "language_loss": 0.6769647, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.70063937, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13464355, + "step": 14220, + "time_per_iteration": 2.743272066116333 + }, + { + "auxiliary_loss_clip": 0.01320006, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.21477056, + "balance_loss_mlp": 1.01375222, + "epoch": 0.855012776191192, + "flos": 21183653988000.0, + "grad_norm": 2.120267721236362, + "language_loss": 0.71660137, + "learning_rate": 2.164071923159827e-07, + "loss": 0.74005765, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.11877441, + "step": 14221, + "time_per_iteration": 2.8309926986694336 + }, + { + "auxiliary_loss_clip": 0.01325595, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.21688342, + "balance_loss_mlp": 1.01902103, + "epoch": 0.8550728994438599, + "flos": 26146650547080.0, + "grad_norm": 1.8188285176105212, + "language_loss": 0.60007417, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62365103, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13079834, + "step": 14222, + "time_per_iteration": 4.424030303955078 + }, + { + "auxiliary_loss_clip": 0.01317022, + "auxiliary_loss_mlp": 0.01030549, + "balance_loss_clip": 1.21150875, + "balance_loss_mlp": 1.01847327, + "epoch": 0.8551330226965279, + "flos": 22792337947080.0, + "grad_norm": 1.7934844167843846, + "language_loss": 0.84458011, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86805582, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12072754, + "step": 14223, + "time_per_iteration": 2.872037649154663 + }, + { + "auxiliary_loss_clip": 0.01323874, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.21740925, + "balance_loss_mlp": 1.01867366, + "epoch": 0.8551931459491958, + "flos": 22424149325520.0, + "grad_norm": 1.5263158831672423, + "language_loss": 0.73882759, + "learning_rate": 2.158788761585515e-07, + "loss": 0.7623741, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12091064, + "step": 14224, + "time_per_iteration": 2.7865307331085205 + }, + { + "auxiliary_loss_clip": 0.01323496, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.21627915, + "balance_loss_mlp": 1.0150373, + "epoch": 0.8552532692018638, + "flos": 19577934439200.0, + "grad_norm": 1.8209900419267422, + "language_loss": 0.75848937, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.78199387, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.11914062, + "step": 14225, + "time_per_iteration": 2.728567600250244 + }, + { + "auxiliary_loss_clip": 0.01319226, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.2139287, + "balance_loss_mlp": 1.02063084, + "epoch": 0.8553133924545318, + "flos": 26438448197520.0, + "grad_norm": 1.635546926086847, + "language_loss": 0.77308464, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79660177, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.11859131, + "step": 14226, + "time_per_iteration": 2.7801475524902344 + }, + { + "auxiliary_loss_clip": 0.01333978, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.22210193, + "balance_loss_mlp": 1.01694143, + "epoch": 0.8553735157071998, + "flos": 16366779600120.0, + "grad_norm": 1.8580726398427034, + "language_loss": 0.54976773, + "learning_rate": 2.153511688875702e-07, + "loss": 0.57341385, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13720703, + "step": 14227, + "time_per_iteration": 2.7105705738067627 + }, + { + "auxiliary_loss_clip": 0.01318876, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.21305132, + "balance_loss_mlp": 1.01745665, + "epoch": 0.8554336389598677, + "flos": 20892343637880.0, + "grad_norm": 2.2930278703408864, + "language_loss": 0.6595152, + "learning_rate": 2.151754018031442e-07, + "loss": 0.68300593, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12744141, + "step": 14228, + "time_per_iteration": 2.792144775390625 + }, + { + "auxiliary_loss_clip": 0.01331898, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.2220757, + "balance_loss_mlp": 1.0207721, + "epoch": 0.8554937622125357, + "flos": 21289265997120.0, + "grad_norm": 2.173438423399288, + "language_loss": 0.74359429, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76725221, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13122559, + "step": 14229, + "time_per_iteration": 2.785323143005371 + }, + { + "auxiliary_loss_clip": 0.013178, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.21218121, + "balance_loss_mlp": 1.01637816, + "epoch": 0.8555538854652037, + "flos": 22417530162840.0, + "grad_norm": 2.6486329049596575, + "language_loss": 0.7324639, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.75592303, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11743164, + "step": 14230, + "time_per_iteration": 2.7311525344848633 + }, + { + "auxiliary_loss_clip": 0.01324556, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.21826005, + "balance_loss_mlp": 1.01846337, + "epoch": 0.8556140087178716, + "flos": 20198588191560.0, + "grad_norm": 1.728862109719041, + "language_loss": 0.83179975, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.85535777, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12786865, + "step": 14231, + "time_per_iteration": 2.784654378890991 + }, + { + "auxiliary_loss_clip": 0.01327289, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.21788645, + "balance_loss_mlp": 1.02058971, + "epoch": 0.8556741319705397, + "flos": 22643291973960.0, + "grad_norm": 2.013540754743552, + "language_loss": 0.68226039, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.70587498, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13568115, + "step": 14232, + "time_per_iteration": 2.7422289848327637 + }, + { + "auxiliary_loss_clip": 0.0132775, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.21794558, + "balance_loss_mlp": 1.01715338, + "epoch": 0.8557342552232076, + "flos": 23554484517600.0, + "grad_norm": 1.3652713318142495, + "language_loss": 0.67349827, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69707787, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13061523, + "step": 14233, + "time_per_iteration": 2.8170223236083984 + }, + { + "auxiliary_loss_clip": 0.01318636, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.21260667, + "balance_loss_mlp": 1.01807809, + "epoch": 0.8557943784758756, + "flos": 19614668023800.0, + "grad_norm": 1.6838908513454285, + "language_loss": 0.7708205, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.79431367, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.1260376, + "step": 14234, + "time_per_iteration": 2.8268017768859863 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01005886, + "balance_loss_clip": 1.10326648, + "balance_loss_mlp": 1.00319183, + "epoch": 0.8558545017285435, + "flos": 70656590739960.0, + "grad_norm": 0.7874664702964093, + "language_loss": 0.58038121, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.6019119, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.02697754, + "step": 14235, + "time_per_iteration": 3.221994400024414 + }, + { + "auxiliary_loss_clip": 0.01145757, + "auxiliary_loss_mlp": 0.01007893, + "balance_loss_clip": 1.10093045, + "balance_loss_mlp": 1.00499654, + "epoch": 0.8559146249812115, + "flos": 56665288003680.0, + "grad_norm": 0.7827213023091899, + "language_loss": 0.5668062, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58834273, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.02893066, + "step": 14236, + "time_per_iteration": 3.159198760986328 + }, + { + "auxiliary_loss_clip": 0.01322096, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.21408606, + "balance_loss_mlp": 1.01601601, + "epoch": 0.8559747482338794, + "flos": 22892752086120.0, + "grad_norm": 1.725958691108665, + "language_loss": 0.70336521, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72686684, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.1204834, + "step": 14237, + "time_per_iteration": 2.8798677921295166 + }, + { + "auxiliary_loss_clip": 0.01322914, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.21492851, + "balance_loss_mlp": 1.0172472, + "epoch": 0.8560348714865474, + "flos": 22607126906400.0, + "grad_norm": 1.9633556395425, + "language_loss": 0.63929594, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.66281891, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12139893, + "step": 14238, + "time_per_iteration": 2.8065733909606934 + }, + { + "auxiliary_loss_clip": 0.01314951, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.21009719, + "balance_loss_mlp": 1.01907289, + "epoch": 0.8560949947392154, + "flos": 17936009214480.0, + "grad_norm": 1.7130043782359536, + "language_loss": 0.69519931, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.7186507, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11108398, + "step": 14239, + "time_per_iteration": 2.7866950035095215 + }, + { + "auxiliary_loss_clip": 0.01329574, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.21892786, + "balance_loss_mlp": 1.01879525, + "epoch": 0.8561551179918834, + "flos": 31031997275880.0, + "grad_norm": 1.994374572584637, + "language_loss": 0.66756392, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.69117427, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12670898, + "step": 14240, + "time_per_iteration": 2.9667654037475586 + }, + { + "auxiliary_loss_clip": 0.01329536, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.21907508, + "balance_loss_mlp": 1.02067947, + "epoch": 0.8562152412445513, + "flos": 30671443026000.0, + "grad_norm": 2.0535558418417987, + "language_loss": 0.62355983, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64719909, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13708496, + "step": 14241, + "time_per_iteration": 2.821479082107544 + }, + { + "auxiliary_loss_clip": 0.0133438, + "auxiliary_loss_mlp": 0.01037038, + "balance_loss_clip": 1.220402, + "balance_loss_mlp": 1.02318633, + "epoch": 0.8562753644972193, + "flos": 31582351311120.0, + "grad_norm": 1.495950477601022, + "language_loss": 0.74734133, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.77105546, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.1385498, + "step": 14242, + "time_per_iteration": 2.894826889038086 + }, + { + "auxiliary_loss_clip": 0.01331644, + "auxiliary_loss_mlp": 0.01035476, + "balance_loss_clip": 1.22149503, + "balance_loss_mlp": 1.02242887, + "epoch": 0.8563354877498872, + "flos": 26219264940720.0, + "grad_norm": 2.827307032631257, + "language_loss": 0.77038795, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.79405916, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13067627, + "step": 14243, + "time_per_iteration": 2.883089303970337 + }, + { + "auxiliary_loss_clip": 0.01329066, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.22021532, + "balance_loss_mlp": 1.01712298, + "epoch": 0.8563956110025552, + "flos": 24139744761240.0, + "grad_norm": 1.70525462906968, + "language_loss": 0.68202055, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70560652, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12402344, + "step": 14244, + "time_per_iteration": 2.8263721466064453 + }, + { + "auxiliary_loss_clip": 0.01147548, + "auxiliary_loss_mlp": 0.01005838, + "balance_loss_clip": 1.10372913, + "balance_loss_mlp": 1.00309587, + "epoch": 0.8564557342552233, + "flos": 56285525999520.0, + "grad_norm": 0.7575383077971369, + "language_loss": 0.58535624, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60689008, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02746582, + "step": 14245, + "time_per_iteration": 3.184925079345703 + }, + { + "auxiliary_loss_clip": 0.01336219, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.22434473, + "balance_loss_mlp": 1.01826644, + "epoch": 0.8565158575078912, + "flos": 23445867489840.0, + "grad_norm": 1.5837105423073534, + "language_loss": 0.77755445, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.80123448, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13525391, + "step": 14246, + "time_per_iteration": 2.8077149391174316 + }, + { + "auxiliary_loss_clip": 0.01321523, + "auxiliary_loss_mlp": 0.01024222, + "balance_loss_clip": 1.21461165, + "balance_loss_mlp": 1.01203918, + "epoch": 0.8565759807605592, + "flos": 20380794213600.0, + "grad_norm": 1.7086513881260228, + "language_loss": 0.81709981, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.84055722, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12194824, + "step": 14247, + "time_per_iteration": 2.793107748031616 + }, + { + "auxiliary_loss_clip": 0.01323649, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.21538615, + "balance_loss_mlp": 1.01510942, + "epoch": 0.8566361040132271, + "flos": 18811767641040.0, + "grad_norm": 1.7822093369623384, + "language_loss": 0.77766252, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.80117589, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12573242, + "step": 14248, + "time_per_iteration": 2.7225911617279053 + }, + { + "auxiliary_loss_clip": 0.01325192, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.21614909, + "balance_loss_mlp": 1.01827276, + "epoch": 0.8566962272658951, + "flos": 24540768564840.0, + "grad_norm": 1.7500002394703875, + "language_loss": 0.78397214, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80753136, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12469482, + "step": 14249, + "time_per_iteration": 2.770702838897705 + }, + { + "auxiliary_loss_clip": 0.01320466, + "auxiliary_loss_mlp": 0.01031488, + "balance_loss_clip": 1.2142837, + "balance_loss_mlp": 1.01911402, + "epoch": 0.856756350518563, + "flos": 23182844185440.0, + "grad_norm": 1.7742230563631296, + "language_loss": 0.78668773, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.81020725, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12371826, + "step": 14250, + "time_per_iteration": 2.760040521621704 + }, + { + "auxiliary_loss_clip": 0.01314456, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.21043682, + "balance_loss_mlp": 1.01706243, + "epoch": 0.856816473771231, + "flos": 20812825823040.0, + "grad_norm": 1.8564726647005072, + "language_loss": 0.79844356, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.82187557, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11694336, + "step": 14251, + "time_per_iteration": 4.328023672103882 + }, + { + "auxiliary_loss_clip": 0.01321828, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.21523976, + "balance_loss_mlp": 1.01547515, + "epoch": 0.856876597023899, + "flos": 20232763449480.0, + "grad_norm": 1.8632688997030638, + "language_loss": 0.60890305, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63240057, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12432861, + "step": 14252, + "time_per_iteration": 2.7357141971588135 + }, + { + "auxiliary_loss_clip": 0.01331996, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.22200179, + "balance_loss_mlp": 1.01903951, + "epoch": 0.856936720276567, + "flos": 18300705517080.0, + "grad_norm": 1.7377565477197094, + "language_loss": 0.70310706, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.72674811, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13079834, + "step": 14253, + "time_per_iteration": 4.3028404712677 + }, + { + "auxiliary_loss_clip": 0.01145686, + "auxiliary_loss_mlp": 0.01007015, + "balance_loss_clip": 1.10157096, + "balance_loss_mlp": 1.00422597, + "epoch": 0.8569968435292349, + "flos": 69893347743720.0, + "grad_norm": 0.7999578613048294, + "language_loss": 0.5926367, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61416376, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.0279541, + "step": 14254, + "time_per_iteration": 4.749504089355469 + }, + { + "auxiliary_loss_clip": 0.01319418, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.21280599, + "balance_loss_mlp": 1.01793957, + "epoch": 0.8570569667819029, + "flos": 25854121946160.0, + "grad_norm": 1.7190576640727055, + "language_loss": 0.81405091, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83756697, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.14239502, + "step": 14255, + "time_per_iteration": 2.784898281097412 + }, + { + "auxiliary_loss_clip": 0.01315672, + "auxiliary_loss_mlp": 0.01025888, + "balance_loss_clip": 1.21135235, + "balance_loss_mlp": 1.01414561, + "epoch": 0.8571170900345708, + "flos": 23262199566840.0, + "grad_norm": 2.047300708044796, + "language_loss": 0.67726839, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.70068401, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.11743164, + "step": 14256, + "time_per_iteration": 2.892643690109253 + }, + { + "auxiliary_loss_clip": 0.01330001, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.22127867, + "balance_loss_mlp": 1.01995647, + "epoch": 0.8571772132872388, + "flos": 18922496303520.0, + "grad_norm": 1.413705268279644, + "language_loss": 0.70165849, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72528195, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12390137, + "step": 14257, + "time_per_iteration": 2.8505489826202393 + }, + { + "auxiliary_loss_clip": 0.01320834, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.21506476, + "balance_loss_mlp": 1.01666069, + "epoch": 0.8572373365399069, + "flos": 33255974683800.0, + "grad_norm": 12.946238483033266, + "language_loss": 0.77370858, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79721236, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12872314, + "step": 14258, + "time_per_iteration": 2.8827619552612305 + }, + { + "auxiliary_loss_clip": 0.01318302, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.21289051, + "balance_loss_mlp": 1.01462936, + "epoch": 0.8572974597925748, + "flos": 23332296242160.0, + "grad_norm": 1.8496756244103416, + "language_loss": 0.67859972, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.70205247, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12341309, + "step": 14259, + "time_per_iteration": 2.7874832153320312 + }, + { + "auxiliary_loss_clip": 0.01323568, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.21539068, + "balance_loss_mlp": 1.01702976, + "epoch": 0.8573575830452428, + "flos": 24541337081880.0, + "grad_norm": 1.84324056898676, + "language_loss": 0.77474457, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79827952, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12896729, + "step": 14260, + "time_per_iteration": 2.9177005290985107 + }, + { + "auxiliary_loss_clip": 0.01325875, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.21698523, + "balance_loss_mlp": 1.01702571, + "epoch": 0.8574177062979107, + "flos": 24170102833320.0, + "grad_norm": 2.510657304301521, + "language_loss": 0.74102545, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76458442, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13000488, + "step": 14261, + "time_per_iteration": 4.388509035110474 + }, + { + "auxiliary_loss_clip": 0.01327029, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.21624911, + "balance_loss_mlp": 1.02058482, + "epoch": 0.8574778295505787, + "flos": 17935197047280.0, + "grad_norm": 1.8557835045228306, + "language_loss": 0.79250884, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81612259, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13775635, + "step": 14262, + "time_per_iteration": 2.7984976768493652 + }, + { + "auxiliary_loss_clip": 0.01320752, + "auxiliary_loss_mlp": 0.01027156, + "balance_loss_clip": 1.21514559, + "balance_loss_mlp": 1.0151515, + "epoch": 0.8575379528032466, + "flos": 21585814825680.0, + "grad_norm": 1.4512622697392579, + "language_loss": 0.67895204, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70243108, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.11993408, + "step": 14263, + "time_per_iteration": 2.748950719833374 + }, + { + "auxiliary_loss_clip": 0.01326905, + "auxiliary_loss_mlp": 0.01033714, + "balance_loss_clip": 1.2183547, + "balance_loss_mlp": 1.0208807, + "epoch": 0.8575980760559146, + "flos": 21766518338400.0, + "grad_norm": 1.410127892839345, + "language_loss": 0.79500145, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81860769, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1282959, + "step": 14264, + "time_per_iteration": 2.7750518321990967 + }, + { + "auxiliary_loss_clip": 0.01320799, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.214468, + "balance_loss_mlp": 1.01620126, + "epoch": 0.8576581993085826, + "flos": 34393132080360.0, + "grad_norm": 1.4542433398134391, + "language_loss": 0.6980511, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.72153807, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.11700439, + "step": 14265, + "time_per_iteration": 2.917900800704956 + }, + { + "auxiliary_loss_clip": 0.01317697, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.21331012, + "balance_loss_mlp": 1.01432002, + "epoch": 0.8577183225612506, + "flos": 23227821267120.0, + "grad_norm": 1.6896088589869753, + "language_loss": 0.66372609, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68716419, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11804199, + "step": 14266, + "time_per_iteration": 2.874018430709839 + }, + { + "auxiliary_loss_clip": 0.01320127, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.21440172, + "balance_loss_mlp": 1.02043867, + "epoch": 0.8577784458139185, + "flos": 28810131502680.0, + "grad_norm": 1.5569761749904738, + "language_loss": 0.75484008, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77837539, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12982178, + "step": 14267, + "time_per_iteration": 2.875241756439209 + }, + { + "auxiliary_loss_clip": 0.01319703, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.21332693, + "balance_loss_mlp": 1.02055216, + "epoch": 0.8578385690665865, + "flos": 19760343503040.0, + "grad_norm": 1.7686921478218978, + "language_loss": 0.88104415, + "learning_rate": 2.082002873852946e-07, + "loss": 0.90456438, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11779785, + "step": 14268, + "time_per_iteration": 2.8137049674987793 + }, + { + "auxiliary_loss_clip": 0.01329801, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.22022009, + "balance_loss_mlp": 1.01904047, + "epoch": 0.8578986923192544, + "flos": 20708959973400.0, + "grad_norm": 1.8271488706808585, + "language_loss": 0.73321724, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75683761, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13195801, + "step": 14269, + "time_per_iteration": 2.8521344661712646 + }, + { + "auxiliary_loss_clip": 0.01330643, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.22147655, + "balance_loss_mlp": 1.01849103, + "epoch": 0.8579588155719224, + "flos": 36108686907720.0, + "grad_norm": 1.4966108446981197, + "language_loss": 0.66606259, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68968523, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13134766, + "step": 14270, + "time_per_iteration": 2.9106791019439697 + }, + { + "auxiliary_loss_clip": 0.01317329, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.21129167, + "balance_loss_mlp": 1.01486969, + "epoch": 0.8580189388245905, + "flos": 22858211352960.0, + "grad_norm": 1.6296670809493612, + "language_loss": 0.73991966, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76336485, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12316895, + "step": 14271, + "time_per_iteration": 2.7695322036743164 + }, + { + "auxiliary_loss_clip": 0.01145607, + "auxiliary_loss_mlp": 0.01005909, + "balance_loss_clip": 1.10176444, + "balance_loss_mlp": 1.00315571, + "epoch": 0.8580790620772584, + "flos": 69659521850160.0, + "grad_norm": 0.7845697483292621, + "language_loss": 0.59467721, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61619234, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02758789, + "step": 14272, + "time_per_iteration": 3.2739717960357666 + }, + { + "auxiliary_loss_clip": 0.01334652, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.22251856, + "balance_loss_mlp": 1.01802588, + "epoch": 0.8581391853299264, + "flos": 13338196258320.0, + "grad_norm": 1.8556899281345647, + "language_loss": 0.75791889, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.78157938, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13378906, + "step": 14273, + "time_per_iteration": 2.818485736846924 + }, + { + "auxiliary_loss_clip": 0.01320376, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.21172118, + "balance_loss_mlp": 1.01648498, + "epoch": 0.8581993085825943, + "flos": 19650345791040.0, + "grad_norm": 1.700241684610164, + "language_loss": 0.82216388, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84565622, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1237793, + "step": 14274, + "time_per_iteration": 2.8026342391967773 + }, + { + "auxiliary_loss_clip": 0.01145098, + "auxiliary_loss_mlp": 0.01001398, + "balance_loss_clip": 1.10133696, + "balance_loss_mlp": 0.99845308, + "epoch": 0.8582594318352623, + "flos": 55837372871160.0, + "grad_norm": 0.8001822436691773, + "language_loss": 0.60907316, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.63053817, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.02941895, + "step": 14275, + "time_per_iteration": 3.2624661922454834 + }, + { + "auxiliary_loss_clip": 0.01327835, + "auxiliary_loss_mlp": 0.01023404, + "balance_loss_clip": 1.21827221, + "balance_loss_mlp": 1.01074982, + "epoch": 0.8583195550879302, + "flos": 24284892331800.0, + "grad_norm": 1.99104169121516, + "language_loss": 0.59105122, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61456358, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12664795, + "step": 14276, + "time_per_iteration": 2.805943250656128 + }, + { + "auxiliary_loss_clip": 0.01321034, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.21285963, + "balance_loss_mlp": 1.01738286, + "epoch": 0.8583796783405983, + "flos": 13448153361960.0, + "grad_norm": 1.9820477186246959, + "language_loss": 0.76442903, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78794158, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12835693, + "step": 14277, + "time_per_iteration": 2.846984624862671 + }, + { + "auxiliary_loss_clip": 0.01326764, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.21889114, + "balance_loss_mlp": 1.01878238, + "epoch": 0.8584398015932662, + "flos": 16184614186440.0, + "grad_norm": 1.5388492903822042, + "language_loss": 0.83900219, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.86258912, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13122559, + "step": 14278, + "time_per_iteration": 2.7792129516601562 + }, + { + "auxiliary_loss_clip": 0.01338467, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.22697389, + "balance_loss_mlp": 1.02034402, + "epoch": 0.8584999248459342, + "flos": 17454046303440.0, + "grad_norm": 2.360250436949918, + "language_loss": 0.7489273, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.77264774, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13244629, + "step": 14279, + "time_per_iteration": 2.730910062789917 + }, + { + "auxiliary_loss_clip": 0.01319491, + "auxiliary_loss_mlp": 0.01028867, + "balance_loss_clip": 1.21346748, + "balance_loss_mlp": 1.0167017, + "epoch": 0.8585600480986021, + "flos": 23446314181800.0, + "grad_norm": 2.2808330185343073, + "language_loss": 0.66909844, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.69258201, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12158203, + "step": 14280, + "time_per_iteration": 2.9005653858184814 + }, + { + "auxiliary_loss_clip": 0.01322021, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.21589923, + "balance_loss_mlp": 1.01957464, + "epoch": 0.8586201713512701, + "flos": 19942711958520.0, + "grad_norm": 1.7416593890629404, + "language_loss": 0.63021231, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.65374929, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12097168, + "step": 14281, + "time_per_iteration": 2.9989829063415527 + }, + { + "auxiliary_loss_clip": 0.01327462, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.21959138, + "balance_loss_mlp": 1.01636934, + "epoch": 0.858680294603938, + "flos": 15309018193320.0, + "grad_norm": 1.6403023240646442, + "language_loss": 0.73504353, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75860828, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12646484, + "step": 14282, + "time_per_iteration": 2.9062204360961914 + }, + { + "auxiliary_loss_clip": 0.0132171, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.21423435, + "balance_loss_mlp": 1.01504123, + "epoch": 0.858740417856606, + "flos": 22716434276280.0, + "grad_norm": 1.746665308826668, + "language_loss": 0.75843745, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.78192353, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.11853027, + "step": 14283, + "time_per_iteration": 2.8052282333374023 + }, + { + "auxiliary_loss_clip": 0.01324355, + "auxiliary_loss_mlp": 0.01026731, + "balance_loss_clip": 1.21622014, + "balance_loss_mlp": 1.01398778, + "epoch": 0.8588005411092741, + "flos": 34060458792600.0, + "grad_norm": 1.6705844927689484, + "language_loss": 0.60242081, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62593174, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12750244, + "step": 14284, + "time_per_iteration": 2.85086727142334 + }, + { + "auxiliary_loss_clip": 0.01317973, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.21297979, + "balance_loss_mlp": 1.01591253, + "epoch": 0.858860664361942, + "flos": 28919560697640.0, + "grad_norm": 1.7156353994029943, + "language_loss": 0.75769985, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.78115916, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12042236, + "step": 14285, + "time_per_iteration": 2.8045594692230225 + }, + { + "auxiliary_loss_clip": 0.01330103, + "auxiliary_loss_mlp": 0.01038887, + "balance_loss_clip": 1.22111595, + "balance_loss_mlp": 1.02579808, + "epoch": 0.85892078761461, + "flos": 19797361346160.0, + "grad_norm": 1.8993121103761335, + "language_loss": 0.74418378, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.7678737, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13079834, + "step": 14286, + "time_per_iteration": 2.728306770324707 + }, + { + "auxiliary_loss_clip": 0.01146013, + "auxiliary_loss_mlp": 0.01002249, + "balance_loss_clip": 1.10269666, + "balance_loss_mlp": 0.99966222, + "epoch": 0.8589809108672779, + "flos": 67121371585440.0, + "grad_norm": 0.8099129012393571, + "language_loss": 0.49512815, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51661074, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02587891, + "step": 14287, + "time_per_iteration": 3.2078309059143066 + }, + { + "auxiliary_loss_clip": 0.01323458, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.2159574, + "balance_loss_mlp": 1.01780593, + "epoch": 0.8590410341199459, + "flos": 29722339255320.0, + "grad_norm": 1.7713360152927564, + "language_loss": 0.79664958, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.82018226, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12011719, + "step": 14288, + "time_per_iteration": 2.932330369949341 + }, + { + "auxiliary_loss_clip": 0.01329402, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.21963704, + "balance_loss_mlp": 1.01851523, + "epoch": 0.8591011573726138, + "flos": 23992688597760.0, + "grad_norm": 2.2018855687535908, + "language_loss": 0.81461692, + "learning_rate": 2.045818444528553e-07, + "loss": 0.83822864, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13238525, + "step": 14289, + "time_per_iteration": 2.769434690475464 + }, + { + "auxiliary_loss_clip": 0.01327743, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.22006381, + "balance_loss_mlp": 1.01655459, + "epoch": 0.8591612806252819, + "flos": 14432731858080.0, + "grad_norm": 1.7313868049226242, + "language_loss": 0.65086889, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67442971, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.11791992, + "step": 14290, + "time_per_iteration": 4.168808698654175 + }, + { + "auxiliary_loss_clip": 0.01331086, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.22043765, + "balance_loss_mlp": 1.01615202, + "epoch": 0.8592214038779498, + "flos": 31583488345200.0, + "grad_norm": 1.809179237489748, + "language_loss": 0.56137037, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.58497429, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13153076, + "step": 14291, + "time_per_iteration": 2.8673722743988037 + }, + { + "auxiliary_loss_clip": 0.01325042, + "auxiliary_loss_mlp": 0.01027086, + "balance_loss_clip": 1.2168088, + "balance_loss_mlp": 1.01461649, + "epoch": 0.8592815271306178, + "flos": 17461802500200.0, + "grad_norm": 1.875000638554104, + "language_loss": 0.71624726, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73976851, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12487793, + "step": 14292, + "time_per_iteration": 4.318502426147461 + }, + { + "auxiliary_loss_clip": 0.01326141, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.21767402, + "balance_loss_mlp": 1.01661325, + "epoch": 0.8593416503832857, + "flos": 25416933075000.0, + "grad_norm": 1.360333835220202, + "language_loss": 0.71642983, + "learning_rate": 2.038960195018542e-07, + "loss": 0.7399807, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12341309, + "step": 14293, + "time_per_iteration": 4.277920961380005 + }, + { + "auxiliary_loss_clip": 0.01320443, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.2143867, + "balance_loss_mlp": 1.0200187, + "epoch": 0.8594017736359537, + "flos": 21001651007760.0, + "grad_norm": 1.4206718397554094, + "language_loss": 0.69045043, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.71397245, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.11743164, + "step": 14294, + "time_per_iteration": 2.802602767944336 + }, + { + "auxiliary_loss_clip": 0.01315413, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.21032882, + "balance_loss_mlp": 1.01882446, + "epoch": 0.8594618968886216, + "flos": 22096186607520.0, + "grad_norm": 2.0222097474102596, + "language_loss": 0.77839613, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80185831, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11981201, + "step": 14295, + "time_per_iteration": 2.7727482318878174 + }, + { + "auxiliary_loss_clip": 0.01331656, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.21939182, + "balance_loss_mlp": 1.02203155, + "epoch": 0.8595220201412896, + "flos": 11660024749320.0, + "grad_norm": 18.898827024878024, + "language_loss": 0.68481064, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.70848864, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14111328, + "step": 14296, + "time_per_iteration": 2.792173147201538 + }, + { + "auxiliary_loss_clip": 0.01328239, + "auxiliary_loss_mlp": 0.01026045, + "balance_loss_clip": 1.22095513, + "balance_loss_mlp": 1.01357555, + "epoch": 0.8595821433939577, + "flos": 25045089701040.0, + "grad_norm": 1.9334660404786217, + "language_loss": 0.79085243, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81439531, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12457275, + "step": 14297, + "time_per_iteration": 2.861081600189209 + }, + { + "auxiliary_loss_clip": 0.01317965, + "auxiliary_loss_mlp": 0.0102636, + "balance_loss_clip": 1.21180427, + "balance_loss_mlp": 1.01490974, + "epoch": 0.8596422666466256, + "flos": 28517359251600.0, + "grad_norm": 1.5986558273492957, + "language_loss": 0.68345892, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70690215, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11456299, + "step": 14298, + "time_per_iteration": 2.85617995262146 + }, + { + "auxiliary_loss_clip": 0.01325367, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.22035933, + "balance_loss_mlp": 1.0202899, + "epoch": 0.8597023898992936, + "flos": 13593057282360.0, + "grad_norm": 2.146003392377455, + "language_loss": 0.68841529, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.71199501, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12310791, + "step": 14299, + "time_per_iteration": 2.867035150527954 + }, + { + "auxiliary_loss_clip": 0.01326921, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.21794462, + "balance_loss_mlp": 1.022645, + "epoch": 0.8597625131519615, + "flos": 32306464829520.0, + "grad_norm": 2.25611273677526, + "language_loss": 0.72047311, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.74409562, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12689209, + "step": 14300, + "time_per_iteration": 2.8285744190216064 + }, + { + "auxiliary_loss_clip": 0.01317817, + "auxiliary_loss_mlp": 0.01026415, + "balance_loss_clip": 1.2118175, + "balance_loss_mlp": 1.01452982, + "epoch": 0.8598226364046295, + "flos": 28736217641520.0, + "grad_norm": 1.4392310128098684, + "language_loss": 0.6937362, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71717858, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11883545, + "step": 14301, + "time_per_iteration": 4.378427028656006 + }, + { + "auxiliary_loss_clip": 0.0132678, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.21877766, + "balance_loss_mlp": 1.02068686, + "epoch": 0.8598827596572974, + "flos": 21876881525640.0, + "grad_norm": 1.6285014684655839, + "language_loss": 0.74475002, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76835012, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.12524414, + "step": 14302, + "time_per_iteration": 2.823359251022339 + }, + { + "auxiliary_loss_clip": 0.0131474, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.20997918, + "balance_loss_mlp": 1.0142498, + "epoch": 0.8599428829099655, + "flos": 23772368306880.0, + "grad_norm": 1.8401374966680257, + "language_loss": 0.83668435, + "learning_rate": 2.02186225623733e-07, + "loss": 0.86008787, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.11364746, + "step": 14303, + "time_per_iteration": 2.7482528686523438 + }, + { + "auxiliary_loss_clip": 0.0132787, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.21908593, + "balance_loss_mlp": 1.02002788, + "epoch": 0.8600030061626334, + "flos": 16216596592920.0, + "grad_norm": 1.97583344615061, + "language_loss": 0.77509916, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79871625, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13806152, + "step": 14304, + "time_per_iteration": 2.7681233882904053 + }, + { + "auxiliary_loss_clip": 0.01325978, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.21765268, + "balance_loss_mlp": 1.01687384, + "epoch": 0.8600631294153014, + "flos": 15673633279200.0, + "grad_norm": 1.9765680481614132, + "language_loss": 0.53427267, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.55783355, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.13232422, + "step": 14305, + "time_per_iteration": 2.74055814743042 + }, + { + "auxiliary_loss_clip": 0.01316042, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.20996284, + "balance_loss_mlp": 1.01799297, + "epoch": 0.8601232526679693, + "flos": 17497520875800.0, + "grad_norm": 2.0023236341253763, + "language_loss": 0.84006912, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86353666, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12713623, + "step": 14306, + "time_per_iteration": 2.7245545387268066 + }, + { + "auxiliary_loss_clip": 0.01318223, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.21321452, + "balance_loss_mlp": 1.01836014, + "epoch": 0.8601833759206373, + "flos": 26992456985160.0, + "grad_norm": 1.4384783510294126, + "language_loss": 0.71920842, + "learning_rate": 2.01504216561474e-07, + "loss": 0.74269342, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11914062, + "step": 14307, + "time_per_iteration": 2.860914945602417 + }, + { + "auxiliary_loss_clip": 0.01331724, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.22115886, + "balance_loss_mlp": 1.02240109, + "epoch": 0.8602434991733052, + "flos": 25235417395080.0, + "grad_norm": 1.6504363624196914, + "language_loss": 0.63852173, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.66220057, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13763428, + "step": 14308, + "time_per_iteration": 2.812047004699707 + }, + { + "auxiliary_loss_clip": 0.01143139, + "auxiliary_loss_mlp": 0.01004544, + "balance_loss_clip": 1.10041559, + "balance_loss_mlp": 1.00177848, + "epoch": 0.8603036224259732, + "flos": 71031236090760.0, + "grad_norm": 0.6291213474183527, + "language_loss": 0.48521101, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50668788, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02770996, + "step": 14309, + "time_per_iteration": 3.342259168624878 + }, + { + "auxiliary_loss_clip": 0.01328914, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.21887231, + "balance_loss_mlp": 1.02203882, + "epoch": 0.8603637456786413, + "flos": 20305174801320.0, + "grad_norm": 1.7019626433190278, + "language_loss": 0.674752, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69840074, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13922119, + "step": 14310, + "time_per_iteration": 2.859032392501831 + }, + { + "auxiliary_loss_clip": 0.01323896, + "auxiliary_loss_mlp": 0.01026456, + "balance_loss_clip": 1.21517825, + "balance_loss_mlp": 1.01392126, + "epoch": 0.8604238689313092, + "flos": 21840960108240.0, + "grad_norm": 1.9680081642388318, + "language_loss": 0.78921986, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.8127234, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12554932, + "step": 14311, + "time_per_iteration": 2.9373421669006348 + }, + { + "auxiliary_loss_clip": 0.01324766, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.2178607, + "balance_loss_mlp": 1.01524925, + "epoch": 0.8604839921839772, + "flos": 18008745433200.0, + "grad_norm": 2.0709197862936426, + "language_loss": 0.72323287, + "learning_rate": 2.006532397626639e-07, + "loss": 0.74675333, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12030029, + "step": 14312, + "time_per_iteration": 2.7532992362976074 + }, + { + "auxiliary_loss_clip": 0.01320144, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.21267056, + "balance_loss_mlp": 1.01939952, + "epoch": 0.8605441154366451, + "flos": 16256375804520.0, + "grad_norm": 1.706170060761748, + "language_loss": 0.77839255, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80191588, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12799072, + "step": 14313, + "time_per_iteration": 2.8248674869537354 + }, + { + "auxiliary_loss_clip": 0.01317819, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.2138505, + "balance_loss_mlp": 1.01673555, + "epoch": 0.8606042386893131, + "flos": 32273061130440.0, + "grad_norm": 2.1591749142894074, + "language_loss": 0.73045754, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75393409, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.13104248, + "step": 14314, + "time_per_iteration": 2.887131690979004 + }, + { + "auxiliary_loss_clip": 0.0132237, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.21466553, + "balance_loss_mlp": 1.02206063, + "epoch": 0.860664361941981, + "flos": 20234590825680.0, + "grad_norm": 1.7812624522656455, + "language_loss": 0.69464821, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71821737, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.125, + "step": 14315, + "time_per_iteration": 2.7815825939178467 + }, + { + "auxiliary_loss_clip": 0.01323795, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.21759439, + "balance_loss_mlp": 1.01546121, + "epoch": 0.8607244851946491, + "flos": 25197059476080.0, + "grad_norm": 1.724742560473906, + "language_loss": 0.72292489, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74643922, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12182617, + "step": 14316, + "time_per_iteration": 2.9836740493774414 + }, + { + "auxiliary_loss_clip": 0.01332014, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.22297406, + "balance_loss_mlp": 1.01248908, + "epoch": 0.860784608447317, + "flos": 20486528047800.0, + "grad_norm": 1.8783649071770796, + "language_loss": 0.83597565, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.8595469, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1262207, + "step": 14317, + "time_per_iteration": 2.836665391921997 + }, + { + "auxiliary_loss_clip": 0.01321874, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.2171762, + "balance_loss_mlp": 1.01579678, + "epoch": 0.860844731699985, + "flos": 50484746476440.0, + "grad_norm": 1.9882358274473875, + "language_loss": 0.67475855, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69825739, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12225342, + "step": 14318, + "time_per_iteration": 3.035949230194092 + }, + { + "auxiliary_loss_clip": 0.01319696, + "auxiliary_loss_mlp": 0.01025432, + "balance_loss_clip": 1.21440983, + "balance_loss_mlp": 1.01352262, + "epoch": 0.8609048549526529, + "flos": 41180341102920.0, + "grad_norm": 1.4402462505246496, + "language_loss": 0.714526, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73797733, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.11907959, + "step": 14319, + "time_per_iteration": 3.024419069290161 + }, + { + "auxiliary_loss_clip": 0.01328817, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.21926796, + "balance_loss_mlp": 1.0212394, + "epoch": 0.8609649782053209, + "flos": 23956482921840.0, + "grad_norm": 1.7780765206295495, + "language_loss": 0.67459482, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69822443, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12915039, + "step": 14320, + "time_per_iteration": 2.8427653312683105 + }, + { + "auxiliary_loss_clip": 0.0132711, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.21762788, + "balance_loss_mlp": 1.01694524, + "epoch": 0.8610251014579888, + "flos": 23117823555120.0, + "grad_norm": 1.9606413289095226, + "language_loss": 0.80383694, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82741141, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13397217, + "step": 14321, + "time_per_iteration": 2.8344297409057617 + }, + { + "auxiliary_loss_clip": 0.01312895, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.2090677, + "balance_loss_mlp": 1.01349497, + "epoch": 0.8610852247106568, + "flos": 19431487401120.0, + "grad_norm": 1.8018855735624193, + "language_loss": 0.71931565, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.74271333, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.13391113, + "step": 14322, + "time_per_iteration": 2.7511720657348633 + }, + { + "auxiliary_loss_clip": 0.01332707, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.22026992, + "balance_loss_mlp": 1.02212, + "epoch": 0.8611453479633249, + "flos": 19316048168880.0, + "grad_norm": 1.8316368402202814, + "language_loss": 0.56535131, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.5890373, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13775635, + "step": 14323, + "time_per_iteration": 2.8697917461395264 + }, + { + "auxiliary_loss_clip": 0.01320198, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.21403837, + "balance_loss_mlp": 1.01771331, + "epoch": 0.8612054712159928, + "flos": 23258301164280.0, + "grad_norm": 1.7524092734107095, + "language_loss": 0.75856566, + "learning_rate": 1.986178565813801e-07, + "loss": 0.7820664, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.1217041, + "step": 14324, + "time_per_iteration": 2.8137989044189453 + }, + { + "auxiliary_loss_clip": 0.01324662, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.21712518, + "balance_loss_mlp": 1.01974392, + "epoch": 0.8612655944686608, + "flos": 16031994677640.0, + "grad_norm": 2.104379683956233, + "language_loss": 0.67448938, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.69807255, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13909912, + "step": 14325, + "time_per_iteration": 2.767629384994507 + }, + { + "auxiliary_loss_clip": 0.01323846, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.21527052, + "balance_loss_mlp": 1.0173955, + "epoch": 0.8613257177213287, + "flos": 22497941361600.0, + "grad_norm": 1.64749303547213, + "language_loss": 0.64814425, + "learning_rate": 1.982795820716472e-07, + "loss": 0.67169344, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13671875, + "step": 14326, + "time_per_iteration": 2.7546563148498535 + }, + { + "auxiliary_loss_clip": 0.01329101, + "auxiliary_loss_mlp": 0.01027911, + "balance_loss_clip": 1.22027564, + "balance_loss_mlp": 1.01500666, + "epoch": 0.8613858409739967, + "flos": 17242497418320.0, + "grad_norm": 2.2859321170404807, + "language_loss": 0.84907329, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.87264347, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12902832, + "step": 14327, + "time_per_iteration": 2.703979969024658 + }, + { + "auxiliary_loss_clip": 0.01319429, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.21235037, + "balance_loss_mlp": 1.01955211, + "epoch": 0.8614459642266646, + "flos": 22826391379920.0, + "grad_norm": 2.044576556803191, + "language_loss": 0.75262356, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77613282, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.1194458, + "step": 14328, + "time_per_iteration": 2.8041653633117676 + }, + { + "auxiliary_loss_clip": 0.01317237, + "auxiliary_loss_mlp": 0.01024061, + "balance_loss_clip": 1.21187162, + "balance_loss_mlp": 1.01188397, + "epoch": 0.8615060874793327, + "flos": 26509803732000.0, + "grad_norm": 1.593836824508356, + "language_loss": 0.79999167, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82340467, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1217041, + "step": 14329, + "time_per_iteration": 4.366103410720825 + }, + { + "auxiliary_loss_clip": 0.01320882, + "auxiliary_loss_mlp": 0.01025466, + "balance_loss_clip": 1.21334565, + "balance_loss_mlp": 1.01275253, + "epoch": 0.8615662107320006, + "flos": 24066358808760.0, + "grad_norm": 2.2831290024829394, + "language_loss": 0.77405381, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.7975173, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12713623, + "step": 14330, + "time_per_iteration": 4.394453525543213 + }, + { + "auxiliary_loss_clip": 0.01317995, + "auxiliary_loss_mlp": 0.01026715, + "balance_loss_clip": 1.21128774, + "balance_loss_mlp": 1.01374507, + "epoch": 0.8616263339846686, + "flos": 24169696749720.0, + "grad_norm": 4.110663628767054, + "language_loss": 0.65328407, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67673117, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12976074, + "step": 14331, + "time_per_iteration": 2.880317449569702 + }, + { + "auxiliary_loss_clip": 0.0132167, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.21643639, + "balance_loss_mlp": 1.02092624, + "epoch": 0.8616864572373365, + "flos": 21729175628400.0, + "grad_norm": 1.6385998514305808, + "language_loss": 0.76356989, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.7871151, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11920166, + "step": 14332, + "time_per_iteration": 4.254471778869629 + }, + { + "auxiliary_loss_clip": 0.01325894, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.21564341, + "balance_loss_mlp": 1.01419818, + "epoch": 0.8617465804900045, + "flos": 23771637356400.0, + "grad_norm": 1.7218919483212998, + "language_loss": 0.67105234, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69458717, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.1338501, + "step": 14333, + "time_per_iteration": 2.8584635257720947 + }, + { + "auxiliary_loss_clip": 0.01333978, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.22101486, + "balance_loss_mlp": 1.01730156, + "epoch": 0.8618067037426724, + "flos": 37710711095760.0, + "grad_norm": 1.6280365877457132, + "language_loss": 0.62548149, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64913678, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14233398, + "step": 14334, + "time_per_iteration": 3.034520149230957 + }, + { + "auxiliary_loss_clip": 0.01332988, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.22279048, + "balance_loss_mlp": 1.02138567, + "epoch": 0.8618668269953405, + "flos": 21476101372200.0, + "grad_norm": 1.9224165259553814, + "language_loss": 0.69638598, + "learning_rate": 1.967607294278577e-07, + "loss": 0.72005832, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.128479, + "step": 14335, + "time_per_iteration": 2.9065463542938232 + }, + { + "auxiliary_loss_clip": 0.01326653, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.21760988, + "balance_loss_mlp": 1.01813126, + "epoch": 0.8619269502480085, + "flos": 22237273342080.0, + "grad_norm": 1.3984689747216585, + "language_loss": 0.82892156, + "learning_rate": 1.965923098328135e-07, + "loss": 0.8524909, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.121521, + "step": 14336, + "time_per_iteration": 2.883517265319824 + }, + { + "auxiliary_loss_clip": 0.01334663, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.22157252, + "balance_loss_mlp": 1.01415586, + "epoch": 0.8619870735006764, + "flos": 22715743934160.0, + "grad_norm": 1.876767151650853, + "language_loss": 0.67415547, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69777423, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13061523, + "step": 14337, + "time_per_iteration": 2.8928656578063965 + }, + { + "auxiliary_loss_clip": 0.01321367, + "auxiliary_loss_mlp": 0.01025396, + "balance_loss_clip": 1.2138741, + "balance_loss_mlp": 1.01296866, + "epoch": 0.8620471967533444, + "flos": 37526880739320.0, + "grad_norm": 1.5906952743018594, + "language_loss": 0.67297971, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69644731, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12432861, + "step": 14338, + "time_per_iteration": 2.9932942390441895 + }, + { + "auxiliary_loss_clip": 0.01325551, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.21735907, + "balance_loss_mlp": 1.02019083, + "epoch": 0.8621073200060123, + "flos": 19687201200720.0, + "grad_norm": 2.7490447997430683, + "language_loss": 0.62600744, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64958262, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.11779785, + "step": 14339, + "time_per_iteration": 4.4654541015625 + }, + { + "auxiliary_loss_clip": 0.01315152, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.2087214, + "balance_loss_mlp": 1.01712799, + "epoch": 0.8621674432586803, + "flos": 14540780368800.0, + "grad_norm": 2.3026267076420623, + "language_loss": 0.62584543, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64929169, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12335205, + "step": 14340, + "time_per_iteration": 2.843529224395752 + }, + { + "auxiliary_loss_clip": 0.01305177, + "auxiliary_loss_mlp": 0.01025061, + "balance_loss_clip": 1.20502174, + "balance_loss_mlp": 1.01335466, + "epoch": 0.8622275665113482, + "flos": 20744921999160.0, + "grad_norm": 1.6939908143726723, + "language_loss": 0.80314088, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82644331, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.11712646, + "step": 14341, + "time_per_iteration": 2.8050522804260254 + }, + { + "auxiliary_loss_clip": 0.01319578, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.21393514, + "balance_loss_mlp": 1.01800835, + "epoch": 0.8622876897640163, + "flos": 24721106602320.0, + "grad_norm": 1.7633137251193591, + "language_loss": 0.74684811, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.77034539, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12145996, + "step": 14342, + "time_per_iteration": 2.869194269180298 + }, + { + "auxiliary_loss_clip": 0.01325551, + "auxiliary_loss_mlp": 0.01027648, + "balance_loss_clip": 1.21633959, + "balance_loss_mlp": 1.01453543, + "epoch": 0.8623478130166842, + "flos": 17461883716920.0, + "grad_norm": 1.8320572495901863, + "language_loss": 0.69240755, + "learning_rate": 1.95415287816028e-07, + "loss": 0.71593958, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13098145, + "step": 14343, + "time_per_iteration": 2.739137649536133 + }, + { + "auxiliary_loss_clip": 0.01321521, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.21326756, + "balance_loss_mlp": 1.02405894, + "epoch": 0.8624079362693522, + "flos": 18113139191520.0, + "grad_norm": 2.0704665012579495, + "language_loss": 0.68056524, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.70414925, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12823486, + "step": 14344, + "time_per_iteration": 2.7892985343933105 + }, + { + "auxiliary_loss_clip": 0.01325842, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.21595144, + "balance_loss_mlp": 1.02237582, + "epoch": 0.8624680595220201, + "flos": 30672864318600.0, + "grad_norm": 1.4061551983793208, + "language_loss": 0.81692863, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.84053415, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12335205, + "step": 14345, + "time_per_iteration": 2.992070198059082 + }, + { + "auxiliary_loss_clip": 0.01328187, + "auxiliary_loss_mlp": 0.01026022, + "balance_loss_clip": 1.21933985, + "balance_loss_mlp": 1.01321316, + "epoch": 0.8625281827746881, + "flos": 38005757415000.0, + "grad_norm": 2.2169700131218812, + "language_loss": 0.51513696, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.538679, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12811279, + "step": 14346, + "time_per_iteration": 2.8814382553100586 + }, + { + "auxiliary_loss_clip": 0.01319627, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.21237373, + "balance_loss_mlp": 1.0136919, + "epoch": 0.862588306027356, + "flos": 26255308183200.0, + "grad_norm": 1.4523193759101947, + "language_loss": 0.75454199, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77799976, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12457275, + "step": 14347, + "time_per_iteration": 2.830087900161743 + }, + { + "auxiliary_loss_clip": 0.01328084, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.22043252, + "balance_loss_mlp": 1.01702905, + "epoch": 0.862648429280024, + "flos": 25883546025960.0, + "grad_norm": 1.7763204103538244, + "language_loss": 0.80966312, + "learning_rate": 1.945766105774449e-07, + "loss": 0.83324403, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12969971, + "step": 14348, + "time_per_iteration": 2.8027167320251465 + }, + { + "auxiliary_loss_clip": 0.01315607, + "auxiliary_loss_mlp": 0.01024811, + "balance_loss_clip": 1.21241426, + "balance_loss_mlp": 1.01268172, + "epoch": 0.862708552532692, + "flos": 37823592001320.0, + "grad_norm": 1.5632223235688665, + "language_loss": 0.66410309, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68750733, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.12139893, + "step": 14349, + "time_per_iteration": 2.920379400253296 + }, + { + "auxiliary_loss_clip": 0.01318775, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.21224785, + "balance_loss_mlp": 1.01808143, + "epoch": 0.86276867578536, + "flos": 19095565444560.0, + "grad_norm": 2.5504210544915398, + "language_loss": 0.70067638, + "learning_rate": 1.942416188703573e-07, + "loss": 0.72417498, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12994385, + "step": 14350, + "time_per_iteration": 2.774592399597168 + }, + { + "auxiliary_loss_clip": 0.01324307, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.2167511, + "balance_loss_mlp": 1.01875782, + "epoch": 0.862828799038028, + "flos": 22169450734920.0, + "grad_norm": 1.7685432130344665, + "language_loss": 0.77402788, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79758662, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12811279, + "step": 14351, + "time_per_iteration": 2.945923089981079 + }, + { + "auxiliary_loss_clip": 0.01321544, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.21508503, + "balance_loss_mlp": 1.01600444, + "epoch": 0.8628889222906959, + "flos": 23150374478640.0, + "grad_norm": 1.9703686073967408, + "language_loss": 0.84797347, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87147349, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12457275, + "step": 14352, + "time_per_iteration": 2.987852096557617 + }, + { + "auxiliary_loss_clip": 0.01143793, + "auxiliary_loss_mlp": 0.01008798, + "balance_loss_clip": 1.10064292, + "balance_loss_mlp": 1.00584173, + "epoch": 0.8629490455433639, + "flos": 57833250164280.0, + "grad_norm": 0.785482064867318, + "language_loss": 0.62004828, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.6415742, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02954102, + "step": 14353, + "time_per_iteration": 3.360572338104248 + }, + { + "auxiliary_loss_clip": 0.01321318, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.21649742, + "balance_loss_mlp": 1.01371264, + "epoch": 0.8630091687960318, + "flos": 15923296433160.0, + "grad_norm": 1.7402131896916524, + "language_loss": 0.81558114, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.83904332, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.11187744, + "step": 14354, + "time_per_iteration": 3.0756232738494873 + }, + { + "auxiliary_loss_clip": 0.01319242, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.2133069, + "balance_loss_mlp": 1.0161854, + "epoch": 0.8630692920486999, + "flos": 17965433294280.0, + "grad_norm": 1.8769024268717107, + "language_loss": 0.85755229, + "learning_rate": 1.934053380181031e-07, + "loss": 0.88103414, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12744141, + "step": 14355, + "time_per_iteration": 2.859043598175049 + }, + { + "auxiliary_loss_clip": 0.0132921, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.22066319, + "balance_loss_mlp": 1.01697242, + "epoch": 0.8631294153013678, + "flos": 22460233176360.0, + "grad_norm": 1.8835037237296972, + "language_loss": 0.59017432, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.61376798, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1317749, + "step": 14356, + "time_per_iteration": 2.979706287384033 + }, + { + "auxiliary_loss_clip": 0.01333603, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.22191811, + "balance_loss_mlp": 1.01940227, + "epoch": 0.8631895385540358, + "flos": 16841433006360.0, + "grad_norm": 1.6533368916056046, + "language_loss": 0.77320325, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79687113, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.13769531, + "step": 14357, + "time_per_iteration": 2.8033905029296875 + }, + { + "auxiliary_loss_clip": 0.01324942, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.21713591, + "balance_loss_mlp": 1.01859772, + "epoch": 0.8632496618067037, + "flos": 18701810537400.0, + "grad_norm": 2.4816595792901244, + "language_loss": 0.78191704, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.80548179, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12927246, + "step": 14358, + "time_per_iteration": 2.8151354789733887 + }, + { + "auxiliary_loss_clip": 0.01327679, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.21949673, + "balance_loss_mlp": 1.01619554, + "epoch": 0.8633097850593717, + "flos": 24285298415400.0, + "grad_norm": 1.280505632023559, + "language_loss": 0.75301683, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77658105, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12536621, + "step": 14359, + "time_per_iteration": 2.936593532562256 + }, + { + "auxiliary_loss_clip": 0.01317103, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.21404111, + "balance_loss_mlp": 1.0142051, + "epoch": 0.8633699083120396, + "flos": 21183532162920.0, + "grad_norm": 1.7414742255079276, + "language_loss": 0.7055006, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72893691, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.12310791, + "step": 14360, + "time_per_iteration": 2.893219232559204 + }, + { + "auxiliary_loss_clip": 0.01329309, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.2190026, + "balance_loss_mlp": 1.01869583, + "epoch": 0.8634300315647077, + "flos": 19249606245960.0, + "grad_norm": 1.6857992525524828, + "language_loss": 0.77072662, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.79434144, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13476562, + "step": 14361, + "time_per_iteration": 2.8081743717193604 + }, + { + "auxiliary_loss_clip": 0.01144744, + "auxiliary_loss_mlp": 0.01005997, + "balance_loss_clip": 1.10032856, + "balance_loss_mlp": 1.00295722, + "epoch": 0.8634901548173756, + "flos": 66210422691960.0, + "grad_norm": 0.9787830706031813, + "language_loss": 0.5887686, + "learning_rate": 1.922374222645329e-07, + "loss": 0.61027598, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.03039551, + "step": 14362, + "time_per_iteration": 3.320911169052124 + }, + { + "auxiliary_loss_clip": 0.01330105, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.21990561, + "balance_loss_mlp": 1.02033544, + "epoch": 0.8635502780700436, + "flos": 24794817421680.0, + "grad_norm": 1.8307309316368292, + "language_loss": 0.80508369, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.8287276, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 1.10205078, + "router_z_loss_mlp": 0.13964844, + "step": 14363, + "time_per_iteration": 2.8562965393066406 + }, + { + "auxiliary_loss_clip": 0.01324733, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.21564829, + "balance_loss_mlp": 1.02512789, + "epoch": 0.8636104013227116, + "flos": 25195191491520.0, + "grad_norm": 3.546968181429468, + "language_loss": 0.72815877, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.75179368, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13635254, + "step": 14364, + "time_per_iteration": 2.9201672077178955 + }, + { + "auxiliary_loss_clip": 0.01325448, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.21560085, + "balance_loss_mlp": 1.01729178, + "epoch": 0.8636705245753795, + "flos": 23883746703120.0, + "grad_norm": 1.5358655386575029, + "language_loss": 0.71776831, + "learning_rate": 1.917379150731755e-07, + "loss": 0.74132144, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12591553, + "step": 14365, + "time_per_iteration": 2.818718910217285 + }, + { + "auxiliary_loss_clip": 0.01330688, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.22032356, + "balance_loss_mlp": 1.01797271, + "epoch": 0.8637306478280475, + "flos": 23115427661880.0, + "grad_norm": 2.4768909559941084, + "language_loss": 0.70885408, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73247504, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13433838, + "step": 14366, + "time_per_iteration": 2.829233169555664 + }, + { + "auxiliary_loss_clip": 0.01317697, + "auxiliary_loss_mlp": 0.01026042, + "balance_loss_clip": 1.21275878, + "balance_loss_mlp": 1.01431751, + "epoch": 0.8637907710807154, + "flos": 21911665908960.0, + "grad_norm": 1.5626498150952401, + "language_loss": 0.81874317, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.84218055, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11730957, + "step": 14367, + "time_per_iteration": 4.183480501174927 + }, + { + "auxiliary_loss_clip": 0.01323452, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.21581578, + "balance_loss_mlp": 1.01733208, + "epoch": 0.8638508943333835, + "flos": 23585005023120.0, + "grad_norm": 1.979314300382956, + "language_loss": 0.61548817, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63903368, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13757324, + "step": 14368, + "time_per_iteration": 2.821061849594116 + }, + { + "auxiliary_loss_clip": 0.01322142, + "auxiliary_loss_mlp": 0.01032664, + "balance_loss_clip": 1.21568727, + "balance_loss_mlp": 1.02024198, + "epoch": 0.8639110175860514, + "flos": 25781426335800.0, + "grad_norm": 2.2804219870262674, + "language_loss": 0.76554096, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78908908, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12426758, + "step": 14369, + "time_per_iteration": 4.276330232620239 + }, + { + "auxiliary_loss_clip": 0.01330955, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.22060907, + "balance_loss_mlp": 1.02399993, + "epoch": 0.8639711408387194, + "flos": 23374918038960.0, + "grad_norm": 1.8435751135535683, + "language_loss": 0.64410716, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66778898, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13214111, + "step": 14370, + "time_per_iteration": 2.875978469848633 + }, + { + "auxiliary_loss_clip": 0.01321892, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.21393931, + "balance_loss_mlp": 1.02003014, + "epoch": 0.8640312640913873, + "flos": 22132311066720.0, + "grad_norm": 1.636618177175711, + "language_loss": 0.66344345, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68699157, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12890625, + "step": 14371, + "time_per_iteration": 4.350209951400757 + }, + { + "auxiliary_loss_clip": 0.01144851, + "auxiliary_loss_mlp": 0.01006803, + "balance_loss_clip": 1.10092473, + "balance_loss_mlp": 1.00368011, + "epoch": 0.8640913873440553, + "flos": 57583651245840.0, + "grad_norm": 0.8928405929391736, + "language_loss": 0.56986111, + "learning_rate": 1.905747985193107e-07, + "loss": 0.59137762, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.03125, + "step": 14372, + "time_per_iteration": 3.1265382766723633 + }, + { + "auxiliary_loss_clip": 0.0131761, + "auxiliary_loss_mlp": 0.01028985, + "balance_loss_clip": 1.21329808, + "balance_loss_mlp": 1.01603913, + "epoch": 0.8641515105967232, + "flos": 23992688597760.0, + "grad_norm": 1.8906244855486647, + "language_loss": 0.79676455, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.8202306, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12939453, + "step": 14373, + "time_per_iteration": 2.8143150806427 + }, + { + "auxiliary_loss_clip": 0.01325984, + "auxiliary_loss_mlp": 0.01028662, + "balance_loss_clip": 1.2165755, + "balance_loss_mlp": 1.01581168, + "epoch": 0.8642116338493913, + "flos": 19067481440640.0, + "grad_norm": 2.0531958150619345, + "language_loss": 0.64065647, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.66420293, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12854004, + "step": 14374, + "time_per_iteration": 2.7888972759246826 + }, + { + "auxiliary_loss_clip": 0.01325696, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.22076249, + "balance_loss_mlp": 1.0172534, + "epoch": 0.8642717571020592, + "flos": 18258002503560.0, + "grad_norm": 1.700140783453174, + "language_loss": 0.77168953, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79523385, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.11486816, + "step": 14375, + "time_per_iteration": 2.881274700164795 + }, + { + "auxiliary_loss_clip": 0.01324741, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.21662319, + "balance_loss_mlp": 1.01744151, + "epoch": 0.8643318803547272, + "flos": 57672857477520.0, + "grad_norm": 1.6077855327580202, + "language_loss": 0.61013877, + "learning_rate": 1.899116698488117e-07, + "loss": 0.63368821, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12768555, + "step": 14376, + "time_per_iteration": 3.1690585613250732 + }, + { + "auxiliary_loss_clip": 0.01324372, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.2176671, + "balance_loss_mlp": 1.02284479, + "epoch": 0.8643920036073952, + "flos": 19614261940200.0, + "grad_norm": 1.3258525670868384, + "language_loss": 0.66375595, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68734902, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12103271, + "step": 14377, + "time_per_iteration": 2.816058397293091 + }, + { + "auxiliary_loss_clip": 0.01327411, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.21773469, + "balance_loss_mlp": 1.01986396, + "epoch": 0.8644521268600631, + "flos": 20855285186400.0, + "grad_norm": 1.5620858985959702, + "language_loss": 0.70421016, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72780764, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12481689, + "step": 14378, + "time_per_iteration": 3.03112530708313 + }, + { + "auxiliary_loss_clip": 0.01145465, + "auxiliary_loss_mlp": 0.01000173, + "balance_loss_clip": 1.10137963, + "balance_loss_mlp": 0.99727577, + "epoch": 0.8645122501127311, + "flos": 66736103825520.0, + "grad_norm": 0.8056762381796999, + "language_loss": 0.60274863, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62420499, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.02893066, + "step": 14379, + "time_per_iteration": 4.9128382205963135 + }, + { + "auxiliary_loss_clip": 0.01319059, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.21338153, + "balance_loss_mlp": 1.0181663, + "epoch": 0.864572373365399, + "flos": 21695122195560.0, + "grad_norm": 1.5526304761719838, + "language_loss": 0.74556887, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76906788, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12683105, + "step": 14380, + "time_per_iteration": 2.817802667617798 + }, + { + "auxiliary_loss_clip": 0.01334678, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.2231251, + "balance_loss_mlp": 1.01677012, + "epoch": 0.8646324966180671, + "flos": 20271243193560.0, + "grad_norm": 3.2964414411122926, + "language_loss": 0.75614607, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77978492, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12445068, + "step": 14381, + "time_per_iteration": 2.903428316116333 + }, + { + "auxiliary_loss_clip": 0.01319975, + "auxiliary_loss_mlp": 0.01026632, + "balance_loss_clip": 1.2143048, + "balance_loss_mlp": 1.01464546, + "epoch": 0.864692619870735, + "flos": 11949751373400.0, + "grad_norm": 2.170799232620287, + "language_loss": 0.84810382, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.87156987, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.11993408, + "step": 14382, + "time_per_iteration": 2.7331552505493164 + }, + { + "auxiliary_loss_clip": 0.01328282, + "auxiliary_loss_mlp": 0.01035597, + "balance_loss_clip": 1.2205615, + "balance_loss_mlp": 1.02226937, + "epoch": 0.864752743123403, + "flos": 21475735896960.0, + "grad_norm": 1.716103730779364, + "language_loss": 0.75948608, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.78312492, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13342285, + "step": 14383, + "time_per_iteration": 2.777020215988159 + }, + { + "auxiliary_loss_clip": 0.01321553, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.21649241, + "balance_loss_mlp": 1.01808429, + "epoch": 0.8648128663760709, + "flos": 19533891349800.0, + "grad_norm": 1.8030207598874914, + "language_loss": 0.85187685, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87539548, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12225342, + "step": 14384, + "time_per_iteration": 2.784670829772949 + }, + { + "auxiliary_loss_clip": 0.01318906, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.213027, + "balance_loss_mlp": 1.01771879, + "epoch": 0.8648729896287389, + "flos": 21292920749520.0, + "grad_norm": 1.830542913290963, + "language_loss": 0.80949664, + "learning_rate": 1.884236463176072e-07, + "loss": 0.83298194, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11901855, + "step": 14385, + "time_per_iteration": 2.803596019744873 + }, + { + "auxiliary_loss_clip": 0.01334363, + "auxiliary_loss_mlp": 0.01030994, + "balance_loss_clip": 1.22429264, + "balance_loss_mlp": 1.01845336, + "epoch": 0.8649331128814068, + "flos": 24609403339200.0, + "grad_norm": 3.5520344634779555, + "language_loss": 0.72951651, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.75317001, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12548828, + "step": 14386, + "time_per_iteration": 2.9325098991394043 + }, + { + "auxiliary_loss_clip": 0.01320145, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.21443319, + "balance_loss_mlp": 1.02005661, + "epoch": 0.8649932361340749, + "flos": 15382079278920.0, + "grad_norm": 1.81866281591499, + "language_loss": 0.82260746, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84613287, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12341309, + "step": 14387, + "time_per_iteration": 2.832875967025757 + }, + { + "auxiliary_loss_clip": 0.01319442, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.21383512, + "balance_loss_mlp": 1.0172168, + "epoch": 0.8650533593867428, + "flos": 19905978373920.0, + "grad_norm": 1.7614030584328195, + "language_loss": 0.68976247, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.7132538, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12469482, + "step": 14388, + "time_per_iteration": 2.781572103500366 + }, + { + "auxiliary_loss_clip": 0.01316227, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.21318865, + "balance_loss_mlp": 1.02343106, + "epoch": 0.8651134826394108, + "flos": 25632096104160.0, + "grad_norm": 1.890084635001618, + "language_loss": 0.90990734, + "learning_rate": 1.877640883285283e-07, + "loss": 0.93341827, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.11437988, + "step": 14389, + "time_per_iteration": 2.832383394241333 + }, + { + "auxiliary_loss_clip": 0.01320859, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.21521628, + "balance_loss_mlp": 1.01578569, + "epoch": 0.8651736058920788, + "flos": 18739234464120.0, + "grad_norm": 1.60866050866069, + "language_loss": 0.71011662, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.73360765, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12463379, + "step": 14390, + "time_per_iteration": 2.8885750770568848 + }, + { + "auxiliary_loss_clip": 0.01325699, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.21641803, + "balance_loss_mlp": 1.02471983, + "epoch": 0.8652337291447467, + "flos": 20781614975400.0, + "grad_norm": 1.5189411606410956, + "language_loss": 0.82132518, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84495676, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.12756348, + "step": 14391, + "time_per_iteration": 2.905553102493286 + }, + { + "auxiliary_loss_clip": 0.01144337, + "auxiliary_loss_mlp": 0.01006812, + "balance_loss_clip": 1.10085475, + "balance_loss_mlp": 1.00399864, + "epoch": 0.8652938523974147, + "flos": 64242727600680.0, + "grad_norm": 0.7967762549901507, + "language_loss": 0.68046796, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.7019794, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02807617, + "step": 14392, + "time_per_iteration": 3.252577066421509 + }, + { + "auxiliary_loss_clip": 0.01336562, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.22414815, + "balance_loss_mlp": 1.02003622, + "epoch": 0.8653539756500827, + "flos": 18045113542560.0, + "grad_norm": 1.8159997519035769, + "language_loss": 0.75440913, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77811193, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13702393, + "step": 14393, + "time_per_iteration": 2.876574993133545 + }, + { + "auxiliary_loss_clip": 0.01327636, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.21746755, + "balance_loss_mlp": 1.01938748, + "epoch": 0.8654140989027507, + "flos": 17386670388240.0, + "grad_norm": 1.8567191836631685, + "language_loss": 0.73826522, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.76186788, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13250732, + "step": 14394, + "time_per_iteration": 2.8154993057250977 + }, + { + "auxiliary_loss_clip": 0.01324607, + "auxiliary_loss_mlp": 0.01026124, + "balance_loss_clip": 1.21646416, + "balance_loss_mlp": 1.01274323, + "epoch": 0.8654742221554186, + "flos": 53293862302920.0, + "grad_norm": 1.8465326516049645, + "language_loss": 0.66102183, + "learning_rate": 1.867768130747036e-07, + "loss": 0.68452913, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1338501, + "step": 14395, + "time_per_iteration": 3.1672191619873047 + }, + { + "auxiliary_loss_clip": 0.0132671, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.22085178, + "balance_loss_mlp": 1.01830792, + "epoch": 0.8655343454080866, + "flos": 23919992987400.0, + "grad_norm": 1.654045795020728, + "language_loss": 0.67933923, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.70291281, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12347412, + "step": 14396, + "time_per_iteration": 2.8614580631256104 + }, + { + "auxiliary_loss_clip": 0.0132983, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.21979499, + "balance_loss_mlp": 1.01862359, + "epoch": 0.8655944686607545, + "flos": 24102523876320.0, + "grad_norm": 2.281146958879032, + "language_loss": 0.69774282, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.72135526, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12786865, + "step": 14397, + "time_per_iteration": 2.880281448364258 + }, + { + "auxiliary_loss_clip": 0.0132266, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.21457386, + "balance_loss_mlp": 1.01975691, + "epoch": 0.8656545919134225, + "flos": 23117985988560.0, + "grad_norm": 1.8700566973310806, + "language_loss": 0.63864899, + "learning_rate": 1.86284103591253e-07, + "loss": 0.6621964, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12329102, + "step": 14398, + "time_per_iteration": 3.0430526733398438 + }, + { + "auxiliary_loss_clip": 0.01321137, + "auxiliary_loss_mlp": 0.01027818, + "balance_loss_clip": 1.21439731, + "balance_loss_mlp": 1.01559901, + "epoch": 0.8657147151660904, + "flos": 21146676753240.0, + "grad_norm": 1.8172224110375241, + "language_loss": 0.7609973, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78448677, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12213135, + "step": 14399, + "time_per_iteration": 2.9894659519195557 + }, + { + "auxiliary_loss_clip": 0.01316278, + "auxiliary_loss_mlp": 0.01026098, + "balance_loss_clip": 1.21120822, + "balance_loss_mlp": 1.01431465, + "epoch": 0.8657748384187585, + "flos": 16293759122880.0, + "grad_norm": 2.0025072955916627, + "language_loss": 0.9346838, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95810765, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11785889, + "step": 14400, + "time_per_iteration": 2.8348476886749268 + }, + { + "auxiliary_loss_clip": 0.01325545, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.21799088, + "balance_loss_mlp": 1.01682949, + "epoch": 0.8658349616714264, + "flos": 30849669428760.0, + "grad_norm": 1.7066999811204102, + "language_loss": 0.6726135, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.6961602, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12286377, + "step": 14401, + "time_per_iteration": 2.917940139770508 + }, + { + "auxiliary_loss_clip": 0.01333787, + "auxiliary_loss_mlp": 0.01028282, + "balance_loss_clip": 1.22312784, + "balance_loss_mlp": 1.01516318, + "epoch": 0.8658950849240944, + "flos": 18957524337000.0, + "grad_norm": 2.089972277319074, + "language_loss": 0.74487478, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.76849544, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13128662, + "step": 14402, + "time_per_iteration": 2.891890287399292 + }, + { + "auxiliary_loss_clip": 0.01315125, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.2099762, + "balance_loss_mlp": 1.01946104, + "epoch": 0.8659552081767624, + "flos": 23369151651840.0, + "grad_norm": 1.660835740825635, + "language_loss": 0.74990582, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77337301, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12133789, + "step": 14403, + "time_per_iteration": 2.8494033813476562 + }, + { + "auxiliary_loss_clip": 0.01330984, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.22301149, + "balance_loss_mlp": 1.01740026, + "epoch": 0.8660153314294303, + "flos": 23847297377040.0, + "grad_norm": 1.622921205271948, + "language_loss": 0.73016357, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75378478, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13739014, + "step": 14404, + "time_per_iteration": 2.8164994716644287 + }, + { + "auxiliary_loss_clip": 0.01317901, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.21288395, + "balance_loss_mlp": 1.01646805, + "epoch": 0.8660754546820983, + "flos": 23117701730040.0, + "grad_norm": 1.7283241225457067, + "language_loss": 0.70793712, + "learning_rate": 1.851368555901447e-07, + "loss": 0.73140842, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12774658, + "step": 14405, + "time_per_iteration": 2.8121306896209717 + }, + { + "auxiliary_loss_clip": 0.01332776, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.22244394, + "balance_loss_mlp": 1.0199461, + "epoch": 0.8661355779347663, + "flos": 14396038881840.0, + "grad_norm": 1.787280430422554, + "language_loss": 0.66694838, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.69060099, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12542725, + "step": 14406, + "time_per_iteration": 2.764296293258667 + }, + { + "auxiliary_loss_clip": 0.01322893, + "auxiliary_loss_mlp": 0.01025157, + "balance_loss_clip": 1.21570706, + "balance_loss_mlp": 1.01353383, + "epoch": 0.8661957011874343, + "flos": 21874972932720.0, + "grad_norm": 1.5732717059808456, + "language_loss": 0.83309811, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85657859, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11621094, + "step": 14407, + "time_per_iteration": 4.3072190284729 + }, + { + "auxiliary_loss_clip": 0.01322147, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.21596122, + "balance_loss_mlp": 1.02219844, + "epoch": 0.8662558244401022, + "flos": 21840188549400.0, + "grad_norm": 1.7569534089313308, + "language_loss": 0.69909322, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72266358, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12701416, + "step": 14408, + "time_per_iteration": 5.890579462051392 + }, + { + "auxiliary_loss_clip": 0.01317463, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.21406353, + "balance_loss_mlp": 1.01872087, + "epoch": 0.8663159476927702, + "flos": 17388578981160.0, + "grad_norm": 1.7417998770577403, + "language_loss": 0.7749635, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79844195, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.11657715, + "step": 14409, + "time_per_iteration": 2.949415445327759 + }, + { + "auxiliary_loss_clip": 0.01333523, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.22293699, + "balance_loss_mlp": 1.01677823, + "epoch": 0.8663760709454381, + "flos": 22752924210720.0, + "grad_norm": 2.0831322156258025, + "language_loss": 0.77110791, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79475176, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14086914, + "step": 14410, + "time_per_iteration": 2.9763617515563965 + }, + { + "auxiliary_loss_clip": 0.01323412, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.21606493, + "balance_loss_mlp": 1.01883554, + "epoch": 0.8664361941981061, + "flos": 17380376092440.0, + "grad_norm": 2.018837455527054, + "language_loss": 0.77518082, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79872906, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12573242, + "step": 14411, + "time_per_iteration": 3.1239161491394043 + }, + { + "auxiliary_loss_clip": 0.01318514, + "auxiliary_loss_mlp": 0.01028699, + "balance_loss_clip": 1.21265137, + "balance_loss_mlp": 1.01689184, + "epoch": 0.866496317450774, + "flos": 16038776273760.0, + "grad_norm": 1.687361590936576, + "language_loss": 0.73970342, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.76317555, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11810303, + "step": 14412, + "time_per_iteration": 2.9405345916748047 + }, + { + "auxiliary_loss_clip": 0.01314651, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.21109176, + "balance_loss_mlp": 1.01683044, + "epoch": 0.8665564407034421, + "flos": 20819891677680.0, + "grad_norm": 1.7942464182445468, + "language_loss": 0.69911587, + "learning_rate": 1.83829844328371e-07, + "loss": 0.72254264, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11199951, + "step": 14413, + "time_per_iteration": 2.8587968349456787 + }, + { + "auxiliary_loss_clip": 0.01326765, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.21866453, + "balance_loss_mlp": 1.01826048, + "epoch": 0.86661656395611, + "flos": 15819430583520.0, + "grad_norm": 1.8692230355642372, + "language_loss": 0.63552213, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.6591, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12768555, + "step": 14414, + "time_per_iteration": 2.935417890548706 + }, + { + "auxiliary_loss_clip": 0.01324745, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.21736038, + "balance_loss_mlp": 1.01855695, + "epoch": 0.866676687208778, + "flos": 23041798059240.0, + "grad_norm": 1.7053341613858515, + "language_loss": 0.64001691, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.66357636, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12640381, + "step": 14415, + "time_per_iteration": 2.953300952911377 + }, + { + "auxiliary_loss_clip": 0.01145267, + "auxiliary_loss_mlp": 0.01008099, + "balance_loss_clip": 1.10194182, + "balance_loss_mlp": 1.00533295, + "epoch": 0.866736810461446, + "flos": 63818655229800.0, + "grad_norm": 0.8095286089555415, + "language_loss": 0.60491788, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62645155, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02770996, + "step": 14416, + "time_per_iteration": 5.057419061660767 + }, + { + "auxiliary_loss_clip": 0.01333527, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.22240353, + "balance_loss_mlp": 1.01905358, + "epoch": 0.8667969337141139, + "flos": 20454301991160.0, + "grad_norm": 1.6093410787222295, + "language_loss": 0.74464953, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76830649, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13116455, + "step": 14417, + "time_per_iteration": 2.9128780364990234 + }, + { + "auxiliary_loss_clip": 0.01322098, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.2149992, + "balance_loss_mlp": 1.01841617, + "epoch": 0.866857056966782, + "flos": 21658835302920.0, + "grad_norm": 2.2621924901302375, + "language_loss": 0.75201118, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77554989, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13354492, + "step": 14418, + "time_per_iteration": 2.812913656234741 + }, + { + "auxiliary_loss_clip": 0.01319157, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.21252131, + "balance_loss_mlp": 1.01998377, + "epoch": 0.8669171802194499, + "flos": 22857480402480.0, + "grad_norm": 1.5608396159403166, + "language_loss": 0.68292749, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70645094, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13220215, + "step": 14419, + "time_per_iteration": 2.8062331676483154 + }, + { + "auxiliary_loss_clip": 0.01328592, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.21973705, + "balance_loss_mlp": 1.01925981, + "epoch": 0.8669773034721179, + "flos": 18738828380520.0, + "grad_norm": 1.6201978776413364, + "language_loss": 0.78994054, + "learning_rate": 1.826898250065465e-07, + "loss": 0.8135401, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12103271, + "step": 14420, + "time_per_iteration": 2.815037488937378 + }, + { + "auxiliary_loss_clip": 0.01329036, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.22085524, + "balance_loss_mlp": 1.01814103, + "epoch": 0.8670374267247858, + "flos": 18920628318960.0, + "grad_norm": 1.4315519113729447, + "language_loss": 0.83462834, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.8582263, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.12609863, + "step": 14421, + "time_per_iteration": 2.9277350902557373 + }, + { + "auxiliary_loss_clip": 0.01143486, + "auxiliary_loss_mlp": 0.01001741, + "balance_loss_clip": 1.10041654, + "balance_loss_mlp": 0.99892807, + "epoch": 0.8670975499774538, + "flos": 48829982364000.0, + "grad_norm": 0.7044293824484608, + "language_loss": 0.49156511, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51301742, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02807617, + "step": 14422, + "time_per_iteration": 3.378817558288574 + }, + { + "auxiliary_loss_clip": 0.01320263, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.2135514, + "balance_loss_mlp": 1.01552594, + "epoch": 0.8671576732301217, + "flos": 26141168418480.0, + "grad_norm": 1.6695672838445883, + "language_loss": 0.73818684, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.76166523, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.1204834, + "step": 14423, + "time_per_iteration": 2.891845941543579 + }, + { + "auxiliary_loss_clip": 0.01307089, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.20528793, + "balance_loss_mlp": 1.01617277, + "epoch": 0.8672177964827897, + "flos": 18371817401400.0, + "grad_norm": 1.5838289657109603, + "language_loss": 0.77112156, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.79446459, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.11047363, + "step": 14424, + "time_per_iteration": 2.7865684032440186 + }, + { + "auxiliary_loss_clip": 0.01314709, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.21245909, + "balance_loss_mlp": 1.01630044, + "epoch": 0.8672779197354576, + "flos": 28550844167400.0, + "grad_norm": 1.788052383161739, + "language_loss": 0.71841347, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.74183863, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.1151123, + "step": 14425, + "time_per_iteration": 2.892289161682129 + }, + { + "auxiliary_loss_clip": 0.01331397, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.22166204, + "balance_loss_mlp": 1.01281428, + "epoch": 0.8673380429881257, + "flos": 22387578174360.0, + "grad_norm": 1.5358348844219059, + "language_loss": 0.68537819, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70896137, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.14105225, + "step": 14426, + "time_per_iteration": 2.817718029022217 + }, + { + "auxiliary_loss_clip": 0.0133069, + "auxiliary_loss_mlp": 0.01025899, + "balance_loss_clip": 1.22060895, + "balance_loss_mlp": 1.01344144, + "epoch": 0.8673981662407936, + "flos": 21001407357600.0, + "grad_norm": 1.713033598390192, + "language_loss": 0.70687032, + "learning_rate": 1.815531824008234e-07, + "loss": 0.73043621, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12445068, + "step": 14427, + "time_per_iteration": 2.876786231994629 + }, + { + "auxiliary_loss_clip": 0.01324753, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.21876311, + "balance_loss_mlp": 1.01806784, + "epoch": 0.8674582894934616, + "flos": 24432313970520.0, + "grad_norm": 1.8459458505078177, + "language_loss": 0.68433785, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.7078917, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12561035, + "step": 14428, + "time_per_iteration": 2.8625190258026123 + }, + { + "auxiliary_loss_clip": 0.01317078, + "auxiliary_loss_mlp": 0.01024922, + "balance_loss_clip": 1.20987391, + "balance_loss_mlp": 1.01306677, + "epoch": 0.8675184127461296, + "flos": 20741998197240.0, + "grad_norm": 1.7364566101912127, + "language_loss": 0.70862043, + "learning_rate": 1.812290478794889e-07, + "loss": 0.73204041, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.11865234, + "step": 14429, + "time_per_iteration": 2.899000644683838 + }, + { + "auxiliary_loss_clip": 0.01320308, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.21391702, + "balance_loss_mlp": 1.0149734, + "epoch": 0.8675785359987975, + "flos": 19140380092800.0, + "grad_norm": 1.8866546935830946, + "language_loss": 0.67104763, + "learning_rate": 1.810670840677151e-07, + "loss": 0.69452798, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12750244, + "step": 14430, + "time_per_iteration": 2.8951895236968994 + }, + { + "auxiliary_loss_clip": 0.01324897, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_clip": 1.21504831, + "balance_loss_mlp": 1.02358007, + "epoch": 0.8676386592514655, + "flos": 22715784542520.0, + "grad_norm": 2.064544040678846, + "language_loss": 0.6916151, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71523333, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13348389, + "step": 14431, + "time_per_iteration": 2.8770954608917236 + }, + { + "auxiliary_loss_clip": 0.01324306, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.21636295, + "balance_loss_mlp": 1.02057362, + "epoch": 0.8676987825041335, + "flos": 14213914076520.0, + "grad_norm": 2.330649145966404, + "language_loss": 0.64500785, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.66858363, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12695312, + "step": 14432, + "time_per_iteration": 2.890866279602051 + }, + { + "auxiliary_loss_clip": 0.01323765, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.21683311, + "balance_loss_mlp": 1.02074397, + "epoch": 0.8677589057568015, + "flos": 13593950666280.0, + "grad_norm": 1.9717072604832, + "language_loss": 0.78333652, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80690265, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12097168, + "step": 14433, + "time_per_iteration": 2.8395235538482666 + }, + { + "auxiliary_loss_clip": 0.01145567, + "auxiliary_loss_mlp": 0.01002221, + "balance_loss_clip": 1.10210371, + "balance_loss_mlp": 0.99928856, + "epoch": 0.8678190290094694, + "flos": 68948995151160.0, + "grad_norm": 0.7098132861828148, + "language_loss": 0.58552587, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60700375, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02929688, + "step": 14434, + "time_per_iteration": 3.4918181896209717 + }, + { + "auxiliary_loss_clip": 0.01313137, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.20920491, + "balance_loss_mlp": 1.01648712, + "epoch": 0.8678791522621374, + "flos": 32563599921720.0, + "grad_norm": 1.8731864563804563, + "language_loss": 0.80171144, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82512355, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.11584473, + "step": 14435, + "time_per_iteration": 3.0753400325775146 + }, + { + "auxiliary_loss_clip": 0.01326317, + "auxiliary_loss_mlp": 0.01027507, + "balance_loss_clip": 1.21629047, + "balance_loss_mlp": 1.01490617, + "epoch": 0.8679392755148053, + "flos": 35049341774880.0, + "grad_norm": 1.8191741084337687, + "language_loss": 0.61964226, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.64318049, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12597656, + "step": 14436, + "time_per_iteration": 2.9426767826080322 + }, + { + "auxiliary_loss_clip": 0.01328309, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.22013712, + "balance_loss_mlp": 1.01865149, + "epoch": 0.8679993987674733, + "flos": 18557231483880.0, + "grad_norm": 4.16058188058342, + "language_loss": 0.70712662, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.73072767, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13134766, + "step": 14437, + "time_per_iteration": 2.987300395965576 + }, + { + "auxiliary_loss_clip": 0.01324334, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.21730673, + "balance_loss_mlp": 1.01622343, + "epoch": 0.8680595220201412, + "flos": 27460206970200.0, + "grad_norm": 4.716279358931639, + "language_loss": 0.80341506, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82694513, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12457275, + "step": 14438, + "time_per_iteration": 2.8234736919403076 + }, + { + "auxiliary_loss_clip": 0.01316706, + "auxiliary_loss_mlp": 0.01027424, + "balance_loss_clip": 1.21260917, + "balance_loss_mlp": 1.01484776, + "epoch": 0.8681196452728093, + "flos": 19213563003480.0, + "grad_norm": 1.8638621926309755, + "language_loss": 0.67479533, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69823664, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12579346, + "step": 14439, + "time_per_iteration": 2.8141045570373535 + }, + { + "auxiliary_loss_clip": 0.01322579, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.21611643, + "balance_loss_mlp": 1.02080429, + "epoch": 0.8681797685254772, + "flos": 37567512726480.0, + "grad_norm": 1.4607466042270119, + "language_loss": 0.64034677, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.66390061, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11999512, + "step": 14440, + "time_per_iteration": 2.9642622470855713 + }, + { + "auxiliary_loss_clip": 0.01314615, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.21130359, + "balance_loss_mlp": 1.01788783, + "epoch": 0.8682398917781452, + "flos": 23294547448560.0, + "grad_norm": 1.6024767447432828, + "language_loss": 0.65841651, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68186259, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.12103271, + "step": 14441, + "time_per_iteration": 2.8328258991241455 + }, + { + "auxiliary_loss_clip": 0.01314815, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.21185517, + "balance_loss_mlp": 1.01700258, + "epoch": 0.8683000150308132, + "flos": 21877977951360.0, + "grad_norm": 1.4580213808658704, + "language_loss": 0.66565168, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68908376, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.11383057, + "step": 14442, + "time_per_iteration": 2.9364702701568604 + }, + { + "auxiliary_loss_clip": 0.01333137, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.22284746, + "balance_loss_mlp": 1.01808298, + "epoch": 0.8683601382834811, + "flos": 14651265381120.0, + "grad_norm": 1.7199916450914179, + "language_loss": 0.72183013, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74548066, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13842773, + "step": 14443, + "time_per_iteration": 2.75608229637146 + }, + { + "auxiliary_loss_clip": 0.01323882, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.21560478, + "balance_loss_mlp": 1.0148412, + "epoch": 0.8684202615361492, + "flos": 26365590153720.0, + "grad_norm": 1.619014828397966, + "language_loss": 0.83242536, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85593873, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1262207, + "step": 14444, + "time_per_iteration": 2.886415719985962 + }, + { + "auxiliary_loss_clip": 0.01320151, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.2127707, + "balance_loss_mlp": 1.01719439, + "epoch": 0.8684803847888171, + "flos": 20708878756680.0, + "grad_norm": 1.9710881171813328, + "language_loss": 0.77328372, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79678375, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12640381, + "step": 14445, + "time_per_iteration": 2.8126962184906006 + }, + { + "auxiliary_loss_clip": 0.01324374, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.21803629, + "balance_loss_mlp": 1.01632142, + "epoch": 0.8685405080414851, + "flos": 22643291973960.0, + "grad_norm": 1.6783185867018084, + "language_loss": 0.68110907, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.704638, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12200928, + "step": 14446, + "time_per_iteration": 4.294429063796997 + }, + { + "auxiliary_loss_clip": 0.01325841, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.21868932, + "balance_loss_mlp": 1.01481366, + "epoch": 0.868600631294153, + "flos": 24826840436520.0, + "grad_norm": 1.6397964697083665, + "language_loss": 0.83011049, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85364538, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.128479, + "step": 14447, + "time_per_iteration": 4.328519821166992 + }, + { + "auxiliary_loss_clip": 0.0132081, + "auxiliary_loss_mlp": 0.0102453, + "balance_loss_clip": 1.21305537, + "balance_loss_mlp": 1.01246047, + "epoch": 0.868660754546821, + "flos": 25118800520400.0, + "grad_norm": 1.693279134110324, + "language_loss": 0.74080104, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76425451, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12072754, + "step": 14448, + "time_per_iteration": 4.409008502960205 + }, + { + "auxiliary_loss_clip": 0.01323132, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.21606779, + "balance_loss_mlp": 1.01552415, + "epoch": 0.8687208777994889, + "flos": 12681742913640.0, + "grad_norm": 1.9754265925294707, + "language_loss": 0.80956829, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.83308315, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12823486, + "step": 14449, + "time_per_iteration": 2.8941359519958496 + }, + { + "auxiliary_loss_clip": 0.01142987, + "auxiliary_loss_mlp": 0.01003986, + "balance_loss_clip": 1.0998261, + "balance_loss_mlp": 1.00127983, + "epoch": 0.8687810010521569, + "flos": 65632634386560.0, + "grad_norm": 0.8084097708982341, + "language_loss": 0.60654461, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62801433, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02709961, + "step": 14450, + "time_per_iteration": 3.23079514503479 + }, + { + "auxiliary_loss_clip": 0.01332269, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.22339094, + "balance_loss_mlp": 1.01968312, + "epoch": 0.8688411243048249, + "flos": 24249458214720.0, + "grad_norm": 1.656452052508681, + "language_loss": 0.76546305, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78910673, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.1239624, + "step": 14451, + "time_per_iteration": 2.984105348587036 + }, + { + "auxiliary_loss_clip": 0.01320622, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.21419263, + "balance_loss_mlp": 1.01427603, + "epoch": 0.8689012475574929, + "flos": 18226223138880.0, + "grad_norm": 3.1999437568733193, + "language_loss": 0.73016638, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.75363779, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12231445, + "step": 14452, + "time_per_iteration": 2.8121225833892822 + }, + { + "auxiliary_loss_clip": 0.01329017, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.21988916, + "balance_loss_mlp": 1.01593971, + "epoch": 0.8689613708101608, + "flos": 19651564041840.0, + "grad_norm": 1.658941879202209, + "language_loss": 0.72686613, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.75044912, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13336182, + "step": 14453, + "time_per_iteration": 2.8824667930603027 + }, + { + "auxiliary_loss_clip": 0.0132144, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.21579421, + "balance_loss_mlp": 1.01904678, + "epoch": 0.8690214940628288, + "flos": 11732476709520.0, + "grad_norm": 1.9387355955947478, + "language_loss": 0.74041939, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.76394665, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12231445, + "step": 14454, + "time_per_iteration": 2.917248487472534 + }, + { + "auxiliary_loss_clip": 0.0132336, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.21741748, + "balance_loss_mlp": 1.01945877, + "epoch": 0.8690816173154968, + "flos": 34943567332320.0, + "grad_norm": 1.832868086308145, + "language_loss": 0.59383953, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61739284, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12512207, + "step": 14455, + "time_per_iteration": 4.492396593093872 + }, + { + "auxiliary_loss_clip": 0.01324219, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.21632791, + "balance_loss_mlp": 1.01637006, + "epoch": 0.8691417405681647, + "flos": 11617930861200.0, + "grad_norm": 6.379371036789355, + "language_loss": 0.80509663, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82862693, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12438965, + "step": 14456, + "time_per_iteration": 2.987705945968628 + }, + { + "auxiliary_loss_clip": 0.01332191, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.22258663, + "balance_loss_mlp": 1.02106118, + "epoch": 0.8692018638208328, + "flos": 24613423566840.0, + "grad_norm": 2.8781178533656937, + "language_loss": 0.75663304, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.7803064, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.14074707, + "step": 14457, + "time_per_iteration": 2.945564031600952 + }, + { + "auxiliary_loss_clip": 0.01315612, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.21199536, + "balance_loss_mlp": 1.01914954, + "epoch": 0.8692619870735007, + "flos": 26000772026040.0, + "grad_norm": 1.384621994728871, + "language_loss": 0.78610742, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80957091, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11578369, + "step": 14458, + "time_per_iteration": 2.856109380722046 + }, + { + "auxiliary_loss_clip": 0.01324281, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.21842241, + "balance_loss_mlp": 1.02142048, + "epoch": 0.8693221103261687, + "flos": 18046494226800.0, + "grad_norm": 1.5785798766864088, + "language_loss": 0.71276903, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73636138, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13525391, + "step": 14459, + "time_per_iteration": 2.8362090587615967 + }, + { + "auxiliary_loss_clip": 0.0131167, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.20993459, + "balance_loss_mlp": 1.01457024, + "epoch": 0.8693822335788366, + "flos": 27498361847400.0, + "grad_norm": 1.3260154078013977, + "language_loss": 0.73707968, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76045346, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.11138916, + "step": 14460, + "time_per_iteration": 2.8456809520721436 + }, + { + "auxiliary_loss_clip": 0.01329311, + "auxiliary_loss_mlp": 0.01032939, + "balance_loss_clip": 1.21792316, + "balance_loss_mlp": 1.02085114, + "epoch": 0.8694423568315046, + "flos": 24103092393360.0, + "grad_norm": 1.907037715417302, + "language_loss": 0.65250814, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67613065, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.12084961, + "step": 14461, + "time_per_iteration": 2.9430952072143555 + }, + { + "auxiliary_loss_clip": 0.01322243, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.21425509, + "balance_loss_mlp": 1.02258646, + "epoch": 0.8695024800841725, + "flos": 18365807364120.0, + "grad_norm": 2.129286463391473, + "language_loss": 0.82505155, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84863102, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13134766, + "step": 14462, + "time_per_iteration": 2.7954766750335693 + }, + { + "auxiliary_loss_clip": 0.01329394, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.2213316, + "balance_loss_mlp": 1.02312958, + "epoch": 0.8695626033368405, + "flos": 14031829879560.0, + "grad_norm": 1.7657252519786415, + "language_loss": 0.65278184, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67643511, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12811279, + "step": 14463, + "time_per_iteration": 2.782506227493286 + }, + { + "auxiliary_loss_clip": 0.01333854, + "auxiliary_loss_mlp": 0.01037882, + "balance_loss_clip": 1.22312808, + "balance_loss_mlp": 1.02453005, + "epoch": 0.8696227265895085, + "flos": 16841270572920.0, + "grad_norm": 1.8320959278519522, + "language_loss": 0.67105961, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.69477701, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 1.10693359, + "router_z_loss_mlp": 0.13348389, + "step": 14464, + "time_per_iteration": 2.8367366790771484 + }, + { + "auxiliary_loss_clip": 0.01329981, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.21947503, + "balance_loss_mlp": 1.02025914, + "epoch": 0.8696828498421765, + "flos": 21804145306920.0, + "grad_norm": 2.154204471871581, + "language_loss": 0.6317184, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.65535021, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.12927246, + "step": 14465, + "time_per_iteration": 2.813434600830078 + }, + { + "auxiliary_loss_clip": 0.01314387, + "auxiliary_loss_mlp": 0.01036909, + "balance_loss_clip": 1.21146619, + "balance_loss_mlp": 1.02525043, + "epoch": 0.8697429730948444, + "flos": 22900183416000.0, + "grad_norm": 1.6004133607984954, + "language_loss": 0.8482725, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.87178552, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.11669922, + "step": 14466, + "time_per_iteration": 2.8804996013641357 + }, + { + "auxiliary_loss_clip": 0.0133347, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.22290444, + "balance_loss_mlp": 1.02225018, + "epoch": 0.8698030963475124, + "flos": 24722852761800.0, + "grad_norm": 2.7166404795105024, + "language_loss": 0.62159348, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.6452893, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13873291, + "step": 14467, + "time_per_iteration": 2.8411736488342285 + }, + { + "auxiliary_loss_clip": 0.01314549, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.21065104, + "balance_loss_mlp": 1.01636529, + "epoch": 0.8698632196001803, + "flos": 28448521435440.0, + "grad_norm": 1.3319101081565354, + "language_loss": 0.69018626, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71361285, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11755371, + "step": 14468, + "time_per_iteration": 3.0544700622558594 + }, + { + "auxiliary_loss_clip": 0.01318306, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.21278048, + "balance_loss_mlp": 1.02105474, + "epoch": 0.8699233428528483, + "flos": 27642250558800.0, + "grad_norm": 1.4541492260830369, + "language_loss": 0.71041298, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73393017, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12353516, + "step": 14469, + "time_per_iteration": 2.979015588760376 + }, + { + "auxiliary_loss_clip": 0.01312418, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.2106185, + "balance_loss_mlp": 1.01523793, + "epoch": 0.8699834661055164, + "flos": 20050232560560.0, + "grad_norm": 2.0994345521577675, + "language_loss": 0.84427661, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86767036, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.11712646, + "step": 14470, + "time_per_iteration": 2.9337613582611084 + }, + { + "auxiliary_loss_clip": 0.01322229, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.21656966, + "balance_loss_mlp": 1.01891041, + "epoch": 0.8700435893581843, + "flos": 23738111832240.0, + "grad_norm": 1.8609603187743373, + "language_loss": 0.72911966, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.7526468, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11578369, + "step": 14471, + "time_per_iteration": 2.848050594329834 + }, + { + "auxiliary_loss_clip": 0.01319333, + "auxiliary_loss_mlp": 0.01028847, + "balance_loss_clip": 1.21432066, + "balance_loss_mlp": 1.01739693, + "epoch": 0.8701037126108523, + "flos": 23553469308600.0, + "grad_norm": 1.3686274641526843, + "language_loss": 0.79122841, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.8147102, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.11450195, + "step": 14472, + "time_per_iteration": 3.0090456008911133 + }, + { + "auxiliary_loss_clip": 0.01318047, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.21192598, + "balance_loss_mlp": 1.01581049, + "epoch": 0.8701638358635202, + "flos": 18848095142040.0, + "grad_norm": 1.756040882544767, + "language_loss": 0.7355299, + "learning_rate": 1.741679706279644e-07, + "loss": 0.75899208, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12341309, + "step": 14473, + "time_per_iteration": 2.8879551887512207 + }, + { + "auxiliary_loss_clip": 0.01326509, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.21739233, + "balance_loss_mlp": 1.01614606, + "epoch": 0.8702239591161882, + "flos": 27934170034320.0, + "grad_norm": 1.491356891228684, + "language_loss": 0.7233972, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74695492, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13110352, + "step": 14474, + "time_per_iteration": 2.9258923530578613 + }, + { + "auxiliary_loss_clip": 0.01321726, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.21561146, + "balance_loss_mlp": 1.0211823, + "epoch": 0.8702840823688561, + "flos": 17237908673640.0, + "grad_norm": 2.056475374185798, + "language_loss": 0.66861606, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69218373, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13848877, + "step": 14475, + "time_per_iteration": 2.854637622833252 + }, + { + "auxiliary_loss_clip": 0.01325936, + "auxiliary_loss_mlp": 0.01024666, + "balance_loss_clip": 1.2172128, + "balance_loss_mlp": 1.012429, + "epoch": 0.8703442056215241, + "flos": 19432340176680.0, + "grad_norm": 1.6995732161817438, + "language_loss": 0.77995539, + "learning_rate": 1.736914088262349e-07, + "loss": 0.80346137, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12243652, + "step": 14476, + "time_per_iteration": 2.824509382247925 + }, + { + "auxiliary_loss_clip": 0.01319231, + "auxiliary_loss_mlp": 0.01027878, + "balance_loss_clip": 1.21425462, + "balance_loss_mlp": 1.0160588, + "epoch": 0.8704043288741921, + "flos": 22279042363320.0, + "grad_norm": 1.7756822653971915, + "language_loss": 0.725721, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74919206, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11810303, + "step": 14477, + "time_per_iteration": 2.8434324264526367 + }, + { + "auxiliary_loss_clip": 0.01324774, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.21728611, + "balance_loss_mlp": 1.01401424, + "epoch": 0.8704644521268601, + "flos": 16652364171480.0, + "grad_norm": 2.998320891723899, + "language_loss": 0.59542048, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61892968, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12139893, + "step": 14478, + "time_per_iteration": 2.77911114692688 + }, + { + "auxiliary_loss_clip": 0.01322013, + "auxiliary_loss_mlp": 0.01023646, + "balance_loss_clip": 1.21904778, + "balance_loss_mlp": 1.01249361, + "epoch": 0.870524575379528, + "flos": 24285623282280.0, + "grad_norm": 1.540205202603195, + "language_loss": 0.71858257, + "learning_rate": 1.732154703087323e-07, + "loss": 0.74203914, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.11151123, + "step": 14479, + "time_per_iteration": 2.853050470352173 + }, + { + "auxiliary_loss_clip": 0.01320712, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.21467721, + "balance_loss_mlp": 1.02113497, + "epoch": 0.870584698632196, + "flos": 28774778602320.0, + "grad_norm": 1.3935803810593403, + "language_loss": 0.71327329, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73681933, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12762451, + "step": 14480, + "time_per_iteration": 2.980701446533203 + }, + { + "auxiliary_loss_clip": 0.01327838, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.21939325, + "balance_loss_mlp": 1.02174985, + "epoch": 0.8706448218848639, + "flos": 32455673236080.0, + "grad_norm": 1.6223971455131156, + "language_loss": 0.69869101, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72231048, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12359619, + "step": 14481, + "time_per_iteration": 2.9045281410217285 + }, + { + "auxiliary_loss_clip": 0.01323297, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.21701837, + "balance_loss_mlp": 1.01440513, + "epoch": 0.8707049451375319, + "flos": 22753249077600.0, + "grad_norm": 1.6132505605306797, + "language_loss": 0.77206624, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79556251, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1192627, + "step": 14482, + "time_per_iteration": 2.8425588607788086 + }, + { + "auxiliary_loss_clip": 0.01324697, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.21909773, + "balance_loss_mlp": 1.01543999, + "epoch": 0.8707650683902, + "flos": 15856285993200.0, + "grad_norm": 1.7918082227694383, + "language_loss": 0.76977503, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.79330021, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12384033, + "step": 14483, + "time_per_iteration": 2.805699110031128 + }, + { + "auxiliary_loss_clip": 0.01334976, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.22367644, + "balance_loss_mlp": 1.02177882, + "epoch": 0.8708251916428679, + "flos": 16471985525640.0, + "grad_norm": 1.9937009819080422, + "language_loss": 0.62515354, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64886022, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13891602, + "step": 14484, + "time_per_iteration": 4.214725494384766 + }, + { + "auxiliary_loss_clip": 0.01321814, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.2155503, + "balance_loss_mlp": 1.01576662, + "epoch": 0.8708853148955359, + "flos": 15382119887280.0, + "grad_norm": 1.679694683431328, + "language_loss": 0.68271548, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70621067, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11920166, + "step": 14485, + "time_per_iteration": 2.951723575592041 + }, + { + "auxiliary_loss_clip": 0.01319574, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.21315026, + "balance_loss_mlp": 1.01954818, + "epoch": 0.8709454381482038, + "flos": 30556978394400.0, + "grad_norm": 1.6820665334492393, + "language_loss": 0.63229102, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.6558131, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13098145, + "step": 14486, + "time_per_iteration": 4.239302158355713 + }, + { + "auxiliary_loss_clip": 0.01329328, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.21905684, + "balance_loss_mlp": 1.02256346, + "epoch": 0.8710055614008718, + "flos": 22606558389360.0, + "grad_norm": 2.4677156505580795, + "language_loss": 0.61873472, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.64239347, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13989258, + "step": 14487, + "time_per_iteration": 4.352093696594238 + }, + { + "auxiliary_loss_clip": 0.01320133, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.21428728, + "balance_loss_mlp": 1.01910174, + "epoch": 0.8710656846535397, + "flos": 18447883505640.0, + "grad_norm": 2.039694267346176, + "language_loss": 0.68038821, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.70389247, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11193848, + "step": 14488, + "time_per_iteration": 2.793405771255493 + }, + { + "auxiliary_loss_clip": 0.01329959, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.22153306, + "balance_loss_mlp": 1.01974726, + "epoch": 0.8711258079062077, + "flos": 16507622684520.0, + "grad_norm": 1.8270121711352492, + "language_loss": 0.85486257, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87848794, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12823486, + "step": 14489, + "time_per_iteration": 2.8752005100250244 + }, + { + "auxiliary_loss_clip": 0.01335658, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.22325349, + "balance_loss_mlp": 1.01811934, + "epoch": 0.8711859311588757, + "flos": 15666811074720.0, + "grad_norm": 3.2614322570941208, + "language_loss": 0.7568624, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.78053963, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.13928223, + "step": 14490, + "time_per_iteration": 2.7541048526763916 + }, + { + "auxiliary_loss_clip": 0.01326376, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.21715593, + "balance_loss_mlp": 1.01941764, + "epoch": 0.8712460544115437, + "flos": 15561483324120.0, + "grad_norm": 2.0830726327288676, + "language_loss": 0.761356, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78494811, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13433838, + "step": 14491, + "time_per_iteration": 2.9480032920837402 + }, + { + "auxiliary_loss_clip": 0.0132483, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.2200247, + "balance_loss_mlp": 1.01856279, + "epoch": 0.8713061776642116, + "flos": 16767925228800.0, + "grad_norm": 1.6147694678537114, + "language_loss": 0.6722222, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69577807, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.12207031, + "step": 14492, + "time_per_iteration": 2.9074082374572754 + }, + { + "auxiliary_loss_clip": 0.01314676, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.21068954, + "balance_loss_mlp": 1.01753831, + "epoch": 0.8713663009168796, + "flos": 24285257807040.0, + "grad_norm": 2.173800535734039, + "language_loss": 0.6969893, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.72042775, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11639404, + "step": 14493, + "time_per_iteration": 2.9610917568206787 + }, + { + "auxiliary_loss_clip": 0.01329646, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.22241592, + "balance_loss_mlp": 1.01903057, + "epoch": 0.8714264241695475, + "flos": 23799599535240.0, + "grad_norm": 2.2991583763934655, + "language_loss": 0.89558738, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.919209, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13494873, + "step": 14494, + "time_per_iteration": 4.220580101013184 + }, + { + "auxiliary_loss_clip": 0.0131959, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.21547675, + "balance_loss_mlp": 1.01795852, + "epoch": 0.8714865474222155, + "flos": 38005026464520.0, + "grad_norm": 1.6250175541633258, + "language_loss": 0.59540337, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61889648, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.11755371, + "step": 14495, + "time_per_iteration": 2.919487237930298 + }, + { + "auxiliary_loss_clip": 0.01326955, + "auxiliary_loss_mlp": 0.01030958, + "balance_loss_clip": 1.2197448, + "balance_loss_mlp": 1.01739788, + "epoch": 0.8715466706748836, + "flos": 22460842301760.0, + "grad_norm": 1.9524979702091436, + "language_loss": 0.80396193, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82754099, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13574219, + "step": 14496, + "time_per_iteration": 2.7683231830596924 + }, + { + "auxiliary_loss_clip": 0.01324511, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.2167778, + "balance_loss_mlp": 1.01697946, + "epoch": 0.8716067939275515, + "flos": 21220103314080.0, + "grad_norm": 1.968889585910419, + "language_loss": 0.79094309, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.81449062, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13262939, + "step": 14497, + "time_per_iteration": 2.766420841217041 + }, + { + "auxiliary_loss_clip": 0.01333241, + "auxiliary_loss_mlp": 0.01031664, + "balance_loss_clip": 1.22411668, + "balance_loss_mlp": 1.01753736, + "epoch": 0.8716669171802195, + "flos": 23002262497800.0, + "grad_norm": 2.1375202379605067, + "language_loss": 0.67186904, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69551814, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.14111328, + "step": 14498, + "time_per_iteration": 2.715297222137451 + }, + { + "auxiliary_loss_clip": 0.01326928, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.21967518, + "balance_loss_mlp": 1.01879478, + "epoch": 0.8717270404328874, + "flos": 29789471520360.0, + "grad_norm": 1.4947969669525183, + "language_loss": 0.572519, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59609926, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12304688, + "step": 14499, + "time_per_iteration": 2.8490729331970215 + }, + { + "auxiliary_loss_clip": 0.01330312, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.221102, + "balance_loss_mlp": 1.01702213, + "epoch": 0.8717871636855554, + "flos": 22023247347000.0, + "grad_norm": 1.699588637506981, + "language_loss": 0.8039127, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.82751524, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.12927246, + "step": 14500, + "time_per_iteration": 2.7178189754486084 + }, + { + "auxiliary_loss_clip": 0.01324211, + "auxiliary_loss_mlp": 0.01025607, + "balance_loss_clip": 1.21803939, + "balance_loss_mlp": 1.01323354, + "epoch": 0.8718472869382233, + "flos": 16658617858920.0, + "grad_norm": 1.7956821195299206, + "language_loss": 0.73490912, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.75840735, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12371826, + "step": 14501, + "time_per_iteration": 2.7380831241607666 + }, + { + "auxiliary_loss_clip": 0.01335526, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.22432721, + "balance_loss_mlp": 1.01867604, + "epoch": 0.8719074101908914, + "flos": 19499431833360.0, + "grad_norm": 1.715568666993589, + "language_loss": 0.64876181, + "learning_rate": 1.695873325782482e-07, + "loss": 0.67243189, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12799072, + "step": 14502, + "time_per_iteration": 2.8246231079101562 + }, + { + "auxiliary_loss_clip": 0.01325751, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.21780729, + "balance_loss_mlp": 1.01611423, + "epoch": 0.8719675334435593, + "flos": 33077139155640.0, + "grad_norm": 1.5710290717339994, + "language_loss": 0.69102395, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71456707, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12451172, + "step": 14503, + "time_per_iteration": 2.926677942276001 + }, + { + "auxiliary_loss_clip": 0.01322838, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.21565866, + "balance_loss_mlp": 1.01721632, + "epoch": 0.8720276566962273, + "flos": 13630278167280.0, + "grad_norm": 3.3944040196952674, + "language_loss": 0.69682777, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.72035372, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12548828, + "step": 14504, + "time_per_iteration": 2.850828170776367 + }, + { + "auxiliary_loss_clip": 0.01330292, + "auxiliary_loss_mlp": 0.01027407, + "balance_loss_clip": 1.22252953, + "balance_loss_mlp": 1.01466393, + "epoch": 0.8720877799488952, + "flos": 23519497092480.0, + "grad_norm": 2.3570815380485177, + "language_loss": 0.70361686, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72719383, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12750244, + "step": 14505, + "time_per_iteration": 2.8871452808380127 + }, + { + "auxiliary_loss_clip": 0.01323844, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.21813047, + "balance_loss_mlp": 1.01794088, + "epoch": 0.8721479032015632, + "flos": 20819120118840.0, + "grad_norm": 1.4233825866679035, + "language_loss": 0.7872175, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.81074965, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.11437988, + "step": 14506, + "time_per_iteration": 2.8045029640197754 + }, + { + "auxiliary_loss_clip": 0.0133214, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.22141814, + "balance_loss_mlp": 1.01373339, + "epoch": 0.8722080264542311, + "flos": 19468383419160.0, + "grad_norm": 2.5658185195436873, + "language_loss": 0.74613094, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76971477, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12524414, + "step": 14507, + "time_per_iteration": 2.763462543487549 + }, + { + "auxiliary_loss_clip": 0.01331738, + "auxiliary_loss_mlp": 0.0103138, + "balance_loss_clip": 1.22111142, + "balance_loss_mlp": 1.01842189, + "epoch": 0.8722681497068991, + "flos": 21766843205280.0, + "grad_norm": 1.9008295842510552, + "language_loss": 0.729168, + "learning_rate": 1.686468975443156e-07, + "loss": 0.75279915, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12957764, + "step": 14508, + "time_per_iteration": 2.7487008571624756 + }, + { + "auxiliary_loss_clip": 0.01334794, + "auxiliary_loss_mlp": 0.01028827, + "balance_loss_clip": 1.22384596, + "balance_loss_mlp": 1.01546991, + "epoch": 0.8723282729595672, + "flos": 28883111371560.0, + "grad_norm": 1.6272596136444006, + "language_loss": 0.69111824, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.71475446, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13360596, + "step": 14509, + "time_per_iteration": 2.817781686782837 + }, + { + "auxiliary_loss_clip": 0.01321438, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.21519876, + "balance_loss_mlp": 1.02089834, + "epoch": 0.8723883962122351, + "flos": 26474816306880.0, + "grad_norm": 2.238305824111148, + "language_loss": 0.58486897, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60841602, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12365723, + "step": 14510, + "time_per_iteration": 2.8478589057922363 + }, + { + "auxiliary_loss_clip": 0.01342243, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.22836471, + "balance_loss_mlp": 1.0215404, + "epoch": 0.8724485194649031, + "flos": 20526307259400.0, + "grad_norm": 2.6693966582302022, + "language_loss": 0.6809181, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.70469403, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13812256, + "step": 14511, + "time_per_iteration": 2.802187919616699 + }, + { + "auxiliary_loss_clip": 0.01330733, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.22066236, + "balance_loss_mlp": 1.01796794, + "epoch": 0.872508642717571, + "flos": 24358806192960.0, + "grad_norm": 1.5806310387338276, + "language_loss": 0.82032192, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.84393704, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1282959, + "step": 14512, + "time_per_iteration": 2.7928106784820557 + }, + { + "auxiliary_loss_clip": 0.01143849, + "auxiliary_loss_mlp": 0.01002249, + "balance_loss_clip": 1.10144556, + "balance_loss_mlp": 0.9995783, + "epoch": 0.872568765970239, + "flos": 61423378467480.0, + "grad_norm": 0.7955817356209485, + "language_loss": 0.58651841, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60797942, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0267334, + "step": 14513, + "time_per_iteration": 3.211189031600952 + }, + { + "auxiliary_loss_clip": 0.01327773, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.22040713, + "balance_loss_mlp": 1.01576293, + "epoch": 0.8726288892229069, + "flos": 22602741203520.0, + "grad_norm": 1.9377496209256593, + "language_loss": 0.76886916, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.79243338, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12884521, + "step": 14514, + "time_per_iteration": 2.7677879333496094 + }, + { + "auxiliary_loss_clip": 0.01331814, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.22135043, + "balance_loss_mlp": 1.01885211, + "epoch": 0.872689012475575, + "flos": 25890774314040.0, + "grad_norm": 2.0789740655388176, + "language_loss": 0.65642732, + "learning_rate": 1.675528831794055e-07, + "loss": 0.68005228, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.1184082, + "step": 14515, + "time_per_iteration": 2.8904294967651367 + }, + { + "auxiliary_loss_clip": 0.01323636, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.21497202, + "balance_loss_mlp": 1.01651239, + "epoch": 0.8727491357282429, + "flos": 21511697922720.0, + "grad_norm": 1.887763434640856, + "language_loss": 0.78853214, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81206548, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13189697, + "step": 14516, + "time_per_iteration": 2.8760769367218018 + }, + { + "auxiliary_loss_clip": 0.01329404, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.21900475, + "balance_loss_mlp": 1.01745832, + "epoch": 0.8728092589809109, + "flos": 19212101102520.0, + "grad_norm": 1.8179086730230218, + "language_loss": 0.72370362, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74730563, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13342285, + "step": 14517, + "time_per_iteration": 2.90063738822937 + }, + { + "auxiliary_loss_clip": 0.01318168, + "auxiliary_loss_mlp": 0.01022497, + "balance_loss_clip": 1.21247816, + "balance_loss_mlp": 1.01114774, + "epoch": 0.8728693822335788, + "flos": 20600261728920.0, + "grad_norm": 1.8397047344345063, + "language_loss": 0.73304021, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.75644684, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11358643, + "step": 14518, + "time_per_iteration": 2.7883095741271973 + }, + { + "auxiliary_loss_clip": 0.01321481, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.21674621, + "balance_loss_mlp": 1.01930797, + "epoch": 0.8729295054862468, + "flos": 21734008023240.0, + "grad_norm": 1.2881691005103846, + "language_loss": 0.74203008, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76555562, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.11755371, + "step": 14519, + "time_per_iteration": 2.785489320755005 + }, + { + "auxiliary_loss_clip": 0.01334633, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.22426462, + "balance_loss_mlp": 1.01385164, + "epoch": 0.8729896287389147, + "flos": 17677615263120.0, + "grad_norm": 2.4969016187139754, + "language_loss": 0.7705791, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.79420125, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1373291, + "step": 14520, + "time_per_iteration": 2.7267980575561523 + }, + { + "auxiliary_loss_clip": 0.01330356, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.22027397, + "balance_loss_mlp": 1.02524662, + "epoch": 0.8730497519915827, + "flos": 24577217890920.0, + "grad_norm": 1.744202413072566, + "language_loss": 0.82254779, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84623891, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1350708, + "step": 14521, + "time_per_iteration": 2.7428410053253174 + }, + { + "auxiliary_loss_clip": 0.01328106, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.21923864, + "balance_loss_mlp": 1.01530886, + "epoch": 0.8731098752442508, + "flos": 13447706670000.0, + "grad_norm": 1.8823768496861577, + "language_loss": 0.76414323, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78771836, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.14099121, + "step": 14522, + "time_per_iteration": 2.7622601985931396 + }, + { + "auxiliary_loss_clip": 0.01322564, + "auxiliary_loss_mlp": 0.01026033, + "balance_loss_clip": 1.21721387, + "balance_loss_mlp": 1.01426709, + "epoch": 0.8731699984969187, + "flos": 23479352405640.0, + "grad_norm": 1.605714614815845, + "language_loss": 0.75125802, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77474391, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11773682, + "step": 14523, + "time_per_iteration": 4.256255149841309 + }, + { + "auxiliary_loss_clip": 0.01323289, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.21730888, + "balance_loss_mlp": 1.01674557, + "epoch": 0.8732301217495867, + "flos": 17717191432920.0, + "grad_norm": 6.800447583110695, + "language_loss": 0.78590596, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80942738, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12103271, + "step": 14524, + "time_per_iteration": 4.195713043212891 + }, + { + "auxiliary_loss_clip": 0.01316956, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.2130847, + "balance_loss_mlp": 1.0153017, + "epoch": 0.8732902450022546, + "flos": 22059453022920.0, + "grad_norm": 2.0469254293531245, + "language_loss": 0.78717995, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.81061769, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.1151123, + "step": 14525, + "time_per_iteration": 4.349140882492065 + }, + { + "auxiliary_loss_clip": 0.013287, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.21898103, + "balance_loss_mlp": 1.01781511, + "epoch": 0.8733503682549226, + "flos": 22278067762680.0, + "grad_norm": 1.6256915289067961, + "language_loss": 0.69311059, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71670121, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.12554932, + "step": 14526, + "time_per_iteration": 2.875211715698242 + }, + { + "auxiliary_loss_clip": 0.01340178, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.2277925, + "balance_loss_mlp": 1.02258849, + "epoch": 0.8734104915075905, + "flos": 23369151651840.0, + "grad_norm": 1.9220057124122318, + "language_loss": 0.6150111, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63877594, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.1373291, + "step": 14527, + "time_per_iteration": 2.8152737617492676 + }, + { + "auxiliary_loss_clip": 0.01341482, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.22612941, + "balance_loss_mlp": 1.01970434, + "epoch": 0.8734706147602586, + "flos": 17716785349320.0, + "grad_norm": 1.7409616875696499, + "language_loss": 0.65927064, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.68303835, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.15576172, + "step": 14528, + "time_per_iteration": 2.8288450241088867 + }, + { + "auxiliary_loss_clip": 0.01320349, + "auxiliary_loss_mlp": 0.01025985, + "balance_loss_clip": 1.21621609, + "balance_loss_mlp": 1.01359892, + "epoch": 0.8735307380129265, + "flos": 22053808460880.0, + "grad_norm": 1.745830030976801, + "language_loss": 0.89989543, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.9233588, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12384033, + "step": 14529, + "time_per_iteration": 2.924203634262085 + }, + { + "auxiliary_loss_clip": 0.01318515, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.21401656, + "balance_loss_mlp": 1.01897097, + "epoch": 0.8735908612655945, + "flos": 25343993814480.0, + "grad_norm": 1.6843535655814383, + "language_loss": 0.8421334, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.86564147, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.13323975, + "step": 14530, + "time_per_iteration": 2.8565709590911865 + }, + { + "auxiliary_loss_clip": 0.01326577, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.21800685, + "balance_loss_mlp": 1.01662064, + "epoch": 0.8736509845182624, + "flos": 21545588922120.0, + "grad_norm": 1.4677722263009496, + "language_loss": 0.74363685, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76718581, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.11688232, + "step": 14531, + "time_per_iteration": 2.908574342727661 + }, + { + "auxiliary_loss_clip": 0.01315811, + "auxiliary_loss_mlp": 0.0102632, + "balance_loss_clip": 1.21138442, + "balance_loss_mlp": 1.01407743, + "epoch": 0.8737111077709304, + "flos": 22022232138000.0, + "grad_norm": 1.6925708377562507, + "language_loss": 0.62042272, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.64384401, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12231445, + "step": 14532, + "time_per_iteration": 2.8342225551605225 + }, + { + "auxiliary_loss_clip": 0.01143868, + "auxiliary_loss_mlp": 0.01000874, + "balance_loss_clip": 1.10091925, + "balance_loss_mlp": 0.99790591, + "epoch": 0.8737712310235983, + "flos": 70082010495000.0, + "grad_norm": 0.8200837213606278, + "language_loss": 0.58842093, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60986835, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02966309, + "step": 14533, + "time_per_iteration": 4.940213441848755 + }, + { + "auxiliary_loss_clip": 0.01323135, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.21726775, + "balance_loss_mlp": 1.01638007, + "epoch": 0.8738313542762663, + "flos": 28664293590000.0, + "grad_norm": 1.5851516293916574, + "language_loss": 0.76775086, + "learning_rate": 1.646005846335954e-07, + "loss": 0.79126155, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11547852, + "step": 14534, + "time_per_iteration": 2.8389554023742676 + }, + { + "auxiliary_loss_clip": 0.01325489, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.2179029, + "balance_loss_mlp": 1.0197072, + "epoch": 0.8738914775289344, + "flos": 22351534931880.0, + "grad_norm": 1.9013477254565898, + "language_loss": 0.75312281, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77669466, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.11993408, + "step": 14535, + "time_per_iteration": 2.7785136699676514 + }, + { + "auxiliary_loss_clip": 0.01322098, + "auxiliary_loss_mlp": 0.01028944, + "balance_loss_clip": 1.21492422, + "balance_loss_mlp": 1.01646304, + "epoch": 0.8739516007816023, + "flos": 31766019234120.0, + "grad_norm": 1.6926728158331348, + "language_loss": 0.74544734, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76895773, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12475586, + "step": 14536, + "time_per_iteration": 2.8499672412872314 + }, + { + "auxiliary_loss_clip": 0.01328944, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.21960402, + "balance_loss_mlp": 1.02065206, + "epoch": 0.8740117240342703, + "flos": 21216245519880.0, + "grad_norm": 1.5572209991181682, + "language_loss": 0.64216721, + "learning_rate": 1.641367279482304e-07, + "loss": 0.6657846, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12158203, + "step": 14537, + "time_per_iteration": 2.8443355560302734 + }, + { + "auxiliary_loss_clip": 0.01322394, + "auxiliary_loss_mlp": 0.01023128, + "balance_loss_clip": 1.21588135, + "balance_loss_mlp": 1.01034331, + "epoch": 0.8740718472869382, + "flos": 25191455522400.0, + "grad_norm": 2.100896609672162, + "language_loss": 0.59137207, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.61482728, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12786865, + "step": 14538, + "time_per_iteration": 2.7457191944122314 + }, + { + "auxiliary_loss_clip": 0.01314318, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.21118283, + "balance_loss_mlp": 1.01633096, + "epoch": 0.8741319705396062, + "flos": 19505766737520.0, + "grad_norm": 1.6857416796803801, + "language_loss": 0.68534112, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70877397, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.12640381, + "step": 14539, + "time_per_iteration": 2.79118013381958 + }, + { + "auxiliary_loss_clip": 0.01330953, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.21873069, + "balance_loss_mlp": 1.02357292, + "epoch": 0.8741920937922741, + "flos": 14105012790240.0, + "grad_norm": 1.7590192679498993, + "language_loss": 0.74728274, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.77095866, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 1.12158203, + "router_z_loss_mlp": 0.13079834, + "step": 14540, + "time_per_iteration": 2.7241928577423096 + }, + { + "auxiliary_loss_clip": 0.01320712, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.21422112, + "balance_loss_mlp": 1.01946306, + "epoch": 0.8742522170449422, + "flos": 27715717728000.0, + "grad_norm": 1.7156234519333997, + "language_loss": 0.79161286, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81514025, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12554932, + "step": 14541, + "time_per_iteration": 2.780458450317383 + }, + { + "auxiliary_loss_clip": 0.01335642, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.22374678, + "balance_loss_mlp": 1.01586592, + "epoch": 0.8743123402976101, + "flos": 21147610745520.0, + "grad_norm": 1.8535298762221366, + "language_loss": 0.67096466, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.69462103, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.14141846, + "step": 14542, + "time_per_iteration": 2.7293336391448975 + }, + { + "auxiliary_loss_clip": 0.01144156, + "auxiliary_loss_mlp": 0.01005687, + "balance_loss_clip": 1.10179889, + "balance_loss_mlp": 1.00302875, + "epoch": 0.8743724635502781, + "flos": 60884272947960.0, + "grad_norm": 0.7794667904085711, + "language_loss": 0.54512775, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56662619, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02661133, + "step": 14543, + "time_per_iteration": 3.100924491882324 + }, + { + "auxiliary_loss_clip": 0.0133045, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.22202826, + "balance_loss_mlp": 1.01717567, + "epoch": 0.874432586802946, + "flos": 28114954763760.0, + "grad_norm": 1.8599098187818157, + "language_loss": 0.69678724, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.72039729, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13391113, + "step": 14544, + "time_per_iteration": 2.862421751022339 + }, + { + "auxiliary_loss_clip": 0.01313717, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.21156311, + "balance_loss_mlp": 1.0207057, + "epoch": 0.874492710055614, + "flos": 23555580943320.0, + "grad_norm": 1.4852698243126066, + "language_loss": 0.7622987, + "learning_rate": 1.62902840325714e-07, + "loss": 0.78575426, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.11138916, + "step": 14545, + "time_per_iteration": 2.8135926723480225 + }, + { + "auxiliary_loss_clip": 0.0132072, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.21292543, + "balance_loss_mlp": 1.01749146, + "epoch": 0.8745528333082819, + "flos": 40922962360560.0, + "grad_norm": 1.6067570043459536, + "language_loss": 0.66276944, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68629456, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.14312744, + "step": 14546, + "time_per_iteration": 2.992417812347412 + }, + { + "auxiliary_loss_clip": 0.0132487, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.21781802, + "balance_loss_mlp": 1.01415801, + "epoch": 0.87461295656095, + "flos": 23628154728600.0, + "grad_norm": 1.5612514580412415, + "language_loss": 0.73238087, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75589132, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12017822, + "step": 14547, + "time_per_iteration": 2.86445951461792 + }, + { + "auxiliary_loss_clip": 0.01341766, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.22783923, + "balance_loss_mlp": 1.02107143, + "epoch": 0.874673079813618, + "flos": 38800454909040.0, + "grad_norm": 2.7862623837021325, + "language_loss": 0.69211179, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.7158801, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.13989258, + "step": 14548, + "time_per_iteration": 2.9877912998199463 + }, + { + "auxiliary_loss_clip": 0.01333466, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.22299576, + "balance_loss_mlp": 1.01881146, + "epoch": 0.8747332030662859, + "flos": 23701459464360.0, + "grad_norm": 1.8547489910629469, + "language_loss": 0.70949721, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.73314941, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.1293335, + "step": 14549, + "time_per_iteration": 3.0108323097229004 + }, + { + "auxiliary_loss_clip": 0.01338925, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.22599185, + "balance_loss_mlp": 1.02133, + "epoch": 0.8747933263189539, + "flos": 24467869912680.0, + "grad_norm": 2.248317338700789, + "language_loss": 0.83846474, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.86220622, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13909912, + "step": 14550, + "time_per_iteration": 2.8729348182678223 + }, + { + "auxiliary_loss_clip": 0.01331136, + "auxiliary_loss_mlp": 0.01031109, + "balance_loss_clip": 1.22085392, + "balance_loss_mlp": 1.01886666, + "epoch": 0.8748534495716218, + "flos": 13813255748160.0, + "grad_norm": 1.5370282915515017, + "language_loss": 0.71645117, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.74007368, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12243652, + "step": 14551, + "time_per_iteration": 2.8762803077697754 + }, + { + "auxiliary_loss_clip": 0.01318015, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.21357012, + "balance_loss_mlp": 1.01388121, + "epoch": 0.8749135728242898, + "flos": 29868177168000.0, + "grad_norm": 2.072959303185829, + "language_loss": 0.64268744, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.66613197, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.12573242, + "step": 14552, + "time_per_iteration": 3.0257859230041504 + }, + { + "auxiliary_loss_clip": 0.01331541, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.22161853, + "balance_loss_mlp": 1.01480889, + "epoch": 0.8749736960769577, + "flos": 24138770160600.0, + "grad_norm": 1.7625763751182804, + "language_loss": 0.7987709, + "learning_rate": 1.616734111284479e-07, + "loss": 0.82237792, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14361572, + "step": 14553, + "time_per_iteration": 2.8526811599731445 + }, + { + "auxiliary_loss_clip": 0.01332892, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.22217679, + "balance_loss_mlp": 1.02047992, + "epoch": 0.8750338193296258, + "flos": 17207103909600.0, + "grad_norm": 1.8209774301845634, + "language_loss": 0.70296037, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72662008, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12591553, + "step": 14554, + "time_per_iteration": 2.8563385009765625 + }, + { + "auxiliary_loss_clip": 0.01323867, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.21684158, + "balance_loss_mlp": 1.01442814, + "epoch": 0.8750939425822937, + "flos": 23738599132560.0, + "grad_norm": 1.4852042370253173, + "language_loss": 0.83641392, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85991699, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11999512, + "step": 14555, + "time_per_iteration": 2.8311269283294678 + }, + { + "auxiliary_loss_clip": 0.01323774, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.21663415, + "balance_loss_mlp": 1.01753771, + "epoch": 0.8751540658349617, + "flos": 26547308875440.0, + "grad_norm": 2.829350542279566, + "language_loss": 0.71090829, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73445833, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.13665771, + "step": 14556, + "time_per_iteration": 2.8875234127044678 + }, + { + "auxiliary_loss_clip": 0.01330621, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.22019112, + "balance_loss_mlp": 1.02073216, + "epoch": 0.8752141890876296, + "flos": 19391017847400.0, + "grad_norm": 1.6248761620575918, + "language_loss": 0.76830721, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.7919575, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13671875, + "step": 14557, + "time_per_iteration": 2.8850996494293213 + }, + { + "auxiliary_loss_clip": 0.01330446, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.22236443, + "balance_loss_mlp": 1.0200386, + "epoch": 0.8752743123402976, + "flos": 25379915231880.0, + "grad_norm": 1.7910807443485752, + "language_loss": 0.82863718, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85228133, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.1394043, + "step": 14558, + "time_per_iteration": 2.854412078857422 + }, + { + "auxiliary_loss_clip": 0.01141685, + "auxiliary_loss_mlp": 0.01005001, + "balance_loss_clip": 1.09992218, + "balance_loss_mlp": 1.0020808, + "epoch": 0.8753344355929655, + "flos": 59965811507880.0, + "grad_norm": 0.9057195090202367, + "language_loss": 0.56121165, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.5826785, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.0291748, + "step": 14559, + "time_per_iteration": 3.275209665298462 + }, + { + "auxiliary_loss_clip": 0.01324212, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.21714973, + "balance_loss_mlp": 1.01810169, + "epoch": 0.8753945588456336, + "flos": 17899153804800.0, + "grad_norm": 1.6451135799388348, + "language_loss": 0.66019404, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68374121, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12402344, + "step": 14560, + "time_per_iteration": 2.8822860717773438 + }, + { + "auxiliary_loss_clip": 0.01323232, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.21663034, + "balance_loss_mlp": 1.01497841, + "epoch": 0.8754546820983016, + "flos": 30920415837840.0, + "grad_norm": 1.7981288910948892, + "language_loss": 0.79502195, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.81852746, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12341309, + "step": 14561, + "time_per_iteration": 4.313122272491455 + }, + { + "auxiliary_loss_clip": 0.01331797, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.22092748, + "balance_loss_mlp": 1.01717281, + "epoch": 0.8755148053509695, + "flos": 20635939496160.0, + "grad_norm": 2.131804451836663, + "language_loss": 0.77728415, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.80090952, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13580322, + "step": 14562, + "time_per_iteration": 2.8462111949920654 + }, + { + "auxiliary_loss_clip": 0.01309322, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.20650935, + "balance_loss_mlp": 1.01435351, + "epoch": 0.8755749286036375, + "flos": 34976443122720.0, + "grad_norm": 1.4073487845603971, + "language_loss": 0.71959472, + "learning_rate": 1.601428988367981e-07, + "loss": 0.7429477, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.11621094, + "step": 14563, + "time_per_iteration": 4.494205951690674 + }, + { + "auxiliary_loss_clip": 0.01335015, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.22406626, + "balance_loss_mlp": 1.01873469, + "epoch": 0.8756350518563054, + "flos": 18190992063600.0, + "grad_norm": 2.209787015955267, + "language_loss": 0.66363591, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.68730056, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12713623, + "step": 14564, + "time_per_iteration": 4.293198585510254 + }, + { + "auxiliary_loss_clip": 0.01325348, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.21696901, + "balance_loss_mlp": 1.01871014, + "epoch": 0.8756951751089734, + "flos": 20088834129720.0, + "grad_norm": 1.5263223856989436, + "language_loss": 0.71292609, + "learning_rate": 1.598376334037408e-07, + "loss": 0.73649132, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12469482, + "step": 14565, + "time_per_iteration": 2.8447086811065674 + }, + { + "auxiliary_loss_clip": 0.01343488, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.22951293, + "balance_loss_mlp": 1.01847696, + "epoch": 0.8757552983616413, + "flos": 27530790945840.0, + "grad_norm": 1.5212939276730508, + "language_loss": 0.77391148, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79767859, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.14752197, + "step": 14566, + "time_per_iteration": 2.9106156826019287 + }, + { + "auxiliary_loss_clip": 0.01332681, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.22613645, + "balance_loss_mlp": 1.01801252, + "epoch": 0.8758154216143094, + "flos": 18077014732320.0, + "grad_norm": 1.521816830126626, + "language_loss": 0.71330005, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.7369346, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12768555, + "step": 14567, + "time_per_iteration": 2.8361756801605225 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.21498251, + "balance_loss_mlp": 1.01908374, + "epoch": 0.8758755448669773, + "flos": 25051099738320.0, + "grad_norm": 1.6824259525099061, + "language_loss": 0.7444731, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76799941, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.13146973, + "step": 14568, + "time_per_iteration": 2.8765153884887695 + }, + { + "auxiliary_loss_clip": 0.01322271, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.21657419, + "balance_loss_mlp": 1.02242124, + "epoch": 0.8759356681196453, + "flos": 22861987930440.0, + "grad_norm": 8.506081430825136, + "language_loss": 0.86962366, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.89319313, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12243652, + "step": 14569, + "time_per_iteration": 2.840855360031128 + }, + { + "auxiliary_loss_clip": 0.01325004, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.21671736, + "balance_loss_mlp": 1.01587486, + "epoch": 0.8759957913723132, + "flos": 21037450600080.0, + "grad_norm": 4.489308138919339, + "language_loss": 0.74117386, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76470804, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12548828, + "step": 14570, + "time_per_iteration": 2.8229267597198486 + }, + { + "auxiliary_loss_clip": 0.01330035, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.21891212, + "balance_loss_mlp": 1.01662779, + "epoch": 0.8760559146249812, + "flos": 20015204527080.0, + "grad_norm": 1.5316345837752168, + "language_loss": 0.67752433, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.70112371, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13275146, + "step": 14571, + "time_per_iteration": 2.8528265953063965 + }, + { + "auxiliary_loss_clip": 0.01320118, + "auxiliary_loss_mlp": 0.01027781, + "balance_loss_clip": 1.2151171, + "balance_loss_mlp": 1.01556838, + "epoch": 0.8761160378776491, + "flos": 19978917634440.0, + "grad_norm": 1.867605750864588, + "language_loss": 0.63045412, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.65393305, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12207031, + "step": 14572, + "time_per_iteration": 4.552991151809692 + }, + { + "auxiliary_loss_clip": 0.01318881, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.21483994, + "balance_loss_mlp": 1.01572502, + "epoch": 0.8761761611303172, + "flos": 28810415761200.0, + "grad_norm": 1.6440940405452091, + "language_loss": 0.73660231, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.76006329, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.11499023, + "step": 14573, + "time_per_iteration": 3.024174451828003 + }, + { + "auxiliary_loss_clip": 0.01315508, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.2117157, + "balance_loss_mlp": 1.01866746, + "epoch": 0.8762362843829851, + "flos": 18337073626440.0, + "grad_norm": 2.2477263458979686, + "language_loss": 0.72922122, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.75267911, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.1161499, + "step": 14574, + "time_per_iteration": 2.8257217407226562 + }, + { + "auxiliary_loss_clip": 0.01324137, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.21721125, + "balance_loss_mlp": 1.01447606, + "epoch": 0.8762964076356531, + "flos": 15783468557760.0, + "grad_norm": 1.6576095146080052, + "language_loss": 0.76500034, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.7885083, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12182617, + "step": 14575, + "time_per_iteration": 2.8823907375335693 + }, + { + "auxiliary_loss_clip": 0.01319285, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.21479464, + "balance_loss_mlp": 1.01939058, + "epoch": 0.8763565308883211, + "flos": 33182548122960.0, + "grad_norm": 1.7578575457866634, + "language_loss": 0.66982961, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69333827, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12207031, + "step": 14576, + "time_per_iteration": 2.9127602577209473 + }, + { + "auxiliary_loss_clip": 0.01323921, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.21697021, + "balance_loss_mlp": 1.01585603, + "epoch": 0.876416654140989, + "flos": 15892369844040.0, + "grad_norm": 1.6266840298540666, + "language_loss": 0.67222595, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.69574797, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12426758, + "step": 14577, + "time_per_iteration": 2.7983973026275635 + }, + { + "auxiliary_loss_clip": 0.0133203, + "auxiliary_loss_mlp": 0.0103524, + "balance_loss_clip": 1.22237957, + "balance_loss_mlp": 1.02257943, + "epoch": 0.876476777393657, + "flos": 25890814922400.0, + "grad_norm": 2.0308789978419233, + "language_loss": 0.71331131, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73698401, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12670898, + "step": 14578, + "time_per_iteration": 2.8567540645599365 + }, + { + "auxiliary_loss_clip": 0.01331989, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.22183156, + "balance_loss_mlp": 1.02080631, + "epoch": 0.876536900646325, + "flos": 13593666407760.0, + "grad_norm": 5.840679997343336, + "language_loss": 0.71051937, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73418009, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13275146, + "step": 14579, + "time_per_iteration": 2.772062063217163 + }, + { + "auxiliary_loss_clip": 0.01320568, + "auxiliary_loss_mlp": 0.01026425, + "balance_loss_clip": 1.21671247, + "balance_loss_mlp": 1.01435471, + "epoch": 0.876597023898993, + "flos": 12207170724120.0, + "grad_norm": 1.8426707929419324, + "language_loss": 0.70623946, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72970939, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.12084961, + "step": 14580, + "time_per_iteration": 2.7477807998657227 + }, + { + "auxiliary_loss_clip": 0.01321833, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.21755254, + "balance_loss_mlp": 1.01913404, + "epoch": 0.8766571471516609, + "flos": 25342125829920.0, + "grad_norm": 1.8234817645433272, + "language_loss": 0.65840149, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.68193495, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12384033, + "step": 14581, + "time_per_iteration": 2.869704484939575 + }, + { + "auxiliary_loss_clip": 0.01321884, + "auxiliary_loss_mlp": 0.01024982, + "balance_loss_clip": 1.21655416, + "balance_loss_mlp": 1.01325762, + "epoch": 0.8767172704043289, + "flos": 30118855530960.0, + "grad_norm": 1.4827080018909722, + "language_loss": 0.73771298, + "learning_rate": 1.572541512164416e-07, + "loss": 0.76118171, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11724854, + "step": 14582, + "time_per_iteration": 2.9003984928131104 + }, + { + "auxiliary_loss_clip": 0.01324999, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.21768308, + "balance_loss_mlp": 1.01587749, + "epoch": 0.8767773936569968, + "flos": 19285852530240.0, + "grad_norm": 1.8639315692131389, + "language_loss": 0.6728152, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.6963582, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13415527, + "step": 14583, + "time_per_iteration": 2.889678716659546 + }, + { + "auxiliary_loss_clip": 0.01331035, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.22153735, + "balance_loss_mlp": 1.01338255, + "epoch": 0.8768375169096648, + "flos": 21251639028600.0, + "grad_norm": 1.5559833959916476, + "language_loss": 0.79385203, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81742233, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12609863, + "step": 14584, + "time_per_iteration": 2.8922159671783447 + }, + { + "auxiliary_loss_clip": 0.01332026, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.2230109, + "balance_loss_mlp": 1.01548016, + "epoch": 0.8768976401623327, + "flos": 23300882352720.0, + "grad_norm": 1.5295961230004995, + "language_loss": 0.72752082, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.75112069, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12493896, + "step": 14585, + "time_per_iteration": 2.8281726837158203 + }, + { + "auxiliary_loss_clip": 0.01320406, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.21527326, + "balance_loss_mlp": 1.01818466, + "epoch": 0.8769577634150008, + "flos": 21366266093640.0, + "grad_norm": 1.8293079164584354, + "language_loss": 0.75039756, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.77391315, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12982178, + "step": 14586, + "time_per_iteration": 2.9077281951904297 + }, + { + "auxiliary_loss_clip": 0.01326519, + "auxiliary_loss_mlp": 0.01035158, + "balance_loss_clip": 1.21939206, + "balance_loss_mlp": 1.02200913, + "epoch": 0.8770178866676687, + "flos": 23519375267400.0, + "grad_norm": 1.7727104406693648, + "language_loss": 0.79316515, + "learning_rate": 1.564981454895844e-07, + "loss": 0.81678188, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.13140869, + "step": 14587, + "time_per_iteration": 2.9128975868225098 + }, + { + "auxiliary_loss_clip": 0.01324022, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.21605802, + "balance_loss_mlp": 1.01562738, + "epoch": 0.8770780099203367, + "flos": 19723812960240.0, + "grad_norm": 1.5591931250151647, + "language_loss": 0.73937631, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76290905, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1362915, + "step": 14588, + "time_per_iteration": 2.8542895317077637 + }, + { + "auxiliary_loss_clip": 0.01321471, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.21538353, + "balance_loss_mlp": 1.01667058, + "epoch": 0.8771381331730047, + "flos": 21400685001720.0, + "grad_norm": 1.7104448441670652, + "language_loss": 0.66865051, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.69215107, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11914062, + "step": 14589, + "time_per_iteration": 2.8155853748321533 + }, + { + "auxiliary_loss_clip": 0.0132257, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.21573353, + "balance_loss_mlp": 1.02098763, + "epoch": 0.8771982564256726, + "flos": 20266004715120.0, + "grad_norm": 4.625649559965567, + "language_loss": 0.71599209, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.7395497, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12200928, + "step": 14590, + "time_per_iteration": 2.7760114669799805 + }, + { + "auxiliary_loss_clip": 0.0134099, + "auxiliary_loss_mlp": 0.01040619, + "balance_loss_clip": 1.22695446, + "balance_loss_mlp": 1.02707648, + "epoch": 0.8772583796783406, + "flos": 12492471036960.0, + "grad_norm": 2.2589498426332937, + "language_loss": 0.75330681, + "learning_rate": 1.558945991776086e-07, + "loss": 0.77712291, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 1.14111328, + "router_z_loss_mlp": 0.13543701, + "step": 14591, + "time_per_iteration": 2.7302284240722656 + }, + { + "auxiliary_loss_clip": 0.01311837, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_clip": 1.2099731, + "balance_loss_mlp": 1.0168134, + "epoch": 0.8773185029310085, + "flos": 15924920767560.0, + "grad_norm": 1.6067607292119501, + "language_loss": 0.80000824, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82340759, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.112854, + "step": 14592, + "time_per_iteration": 2.7120606899261475 + }, + { + "auxiliary_loss_clip": 0.01315696, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.21238935, + "balance_loss_mlp": 1.01747823, + "epoch": 0.8773786261836766, + "flos": 21509220812760.0, + "grad_norm": 1.6023903670668485, + "language_loss": 0.83141863, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.85486162, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.11132812, + "step": 14593, + "time_per_iteration": 2.7341620922088623 + }, + { + "auxiliary_loss_clip": 0.01319614, + "auxiliary_loss_mlp": 0.01024893, + "balance_loss_clip": 1.21426678, + "balance_loss_mlp": 1.01278114, + "epoch": 0.8774387494363445, + "flos": 26767263691080.0, + "grad_norm": 1.4595984666758859, + "language_loss": 0.76392734, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78737247, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12109375, + "step": 14594, + "time_per_iteration": 2.7937331199645996 + }, + { + "auxiliary_loss_clip": 0.01324789, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.2171061, + "balance_loss_mlp": 1.0225637, + "epoch": 0.8774988726890125, + "flos": 18483480056160.0, + "grad_norm": 2.016359451805482, + "language_loss": 0.77878332, + "learning_rate": 1.552921717241651e-07, + "loss": 0.80238557, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12866211, + "step": 14595, + "time_per_iteration": 2.78134822845459 + }, + { + "auxiliary_loss_clip": 0.01325117, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.21849585, + "balance_loss_mlp": 1.02054393, + "epoch": 0.8775589959416804, + "flos": 24431826670200.0, + "grad_norm": 1.4556513330667142, + "language_loss": 0.70745891, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.73104715, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13165283, + "step": 14596, + "time_per_iteration": 2.894094228744507 + }, + { + "auxiliary_loss_clip": 0.01321008, + "auxiliary_loss_mlp": 0.01023485, + "balance_loss_clip": 1.21599197, + "balance_loss_mlp": 1.01172519, + "epoch": 0.8776191191943484, + "flos": 23445339581160.0, + "grad_norm": 1.6336623709059377, + "language_loss": 0.86104226, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88448715, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11773682, + "step": 14597, + "time_per_iteration": 2.7655444145202637 + }, + { + "auxiliary_loss_clip": 0.01325231, + "auxiliary_loss_mlp": 0.01026211, + "balance_loss_clip": 1.21888554, + "balance_loss_mlp": 1.01423681, + "epoch": 0.8776792424470163, + "flos": 26836182723960.0, + "grad_norm": 1.587537225803559, + "language_loss": 0.73074543, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.75425988, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11981201, + "step": 14598, + "time_per_iteration": 2.818833589553833 + }, + { + "auxiliary_loss_clip": 0.01323996, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.21637177, + "balance_loss_mlp": 1.0191195, + "epoch": 0.8777393656996844, + "flos": 15629549581440.0, + "grad_norm": 2.0435190704467865, + "language_loss": 0.77270043, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79626107, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1295166, + "step": 14599, + "time_per_iteration": 2.766826629638672 + }, + { + "auxiliary_loss_clip": 0.01325344, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.2172848, + "balance_loss_mlp": 1.01495993, + "epoch": 0.8777994889523523, + "flos": 18884666293200.0, + "grad_norm": 2.193139457668107, + "language_loss": 0.68053275, + "learning_rate": 1.545407113589332e-07, + "loss": 0.70405835, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12261963, + "step": 14600, + "time_per_iteration": 4.118253231048584 + }, + { + "auxiliary_loss_clip": 0.01324305, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.2169075, + "balance_loss_mlp": 1.02273643, + "epoch": 0.8778596122050203, + "flos": 48834009237600.0, + "grad_norm": 1.9432065118296133, + "language_loss": 0.69646561, + "learning_rate": 1.543906292031072e-07, + "loss": 0.72006392, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12792969, + "step": 14601, + "time_per_iteration": 4.427490711212158 + }, + { + "auxiliary_loss_clip": 0.01338634, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.22683501, + "balance_loss_mlp": 1.01577711, + "epoch": 0.8779197354576883, + "flos": 25664931286200.0, + "grad_norm": 1.7056325600988633, + "language_loss": 0.73086429, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75453877, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13043213, + "step": 14602, + "time_per_iteration": 4.415170192718506 + }, + { + "auxiliary_loss_clip": 0.01319477, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.21354532, + "balance_loss_mlp": 1.01703954, + "epoch": 0.8779798587103562, + "flos": 18847851491880.0, + "grad_norm": 1.762498249372106, + "language_loss": 0.71139866, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73488641, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12261963, + "step": 14603, + "time_per_iteration": 2.8110544681549072 + }, + { + "auxiliary_loss_clip": 0.01143194, + "auxiliary_loss_mlp": 0.0100746, + "balance_loss_clip": 1.10050035, + "balance_loss_mlp": 1.00473011, + "epoch": 0.8780399819630242, + "flos": 68629438363680.0, + "grad_norm": 0.745155982756519, + "language_loss": 0.5417285, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56323504, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02734375, + "step": 14604, + "time_per_iteration": 3.331864833831787 + }, + { + "auxiliary_loss_clip": 0.01143013, + "auxiliary_loss_mlp": 0.01006346, + "balance_loss_clip": 1.09991479, + "balance_loss_mlp": 1.00353241, + "epoch": 0.8781001052156922, + "flos": 65751281679240.0, + "grad_norm": 0.6962220714334405, + "language_loss": 0.5926162, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61410975, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02807617, + "step": 14605, + "time_per_iteration": 3.2521936893463135 + }, + { + "auxiliary_loss_clip": 0.01332856, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.22314191, + "balance_loss_mlp": 1.01883078, + "epoch": 0.8781602284683602, + "flos": 22054133327760.0, + "grad_norm": 2.1883550223591923, + "language_loss": 0.85451007, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87815875, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13171387, + "step": 14606, + "time_per_iteration": 2.9325716495513916 + }, + { + "auxiliary_loss_clip": 0.01332365, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.22290516, + "balance_loss_mlp": 1.01635301, + "epoch": 0.8782203517210281, + "flos": 17566967817360.0, + "grad_norm": 1.7880164148977933, + "language_loss": 0.70952284, + "learning_rate": 1.534916061666931e-07, + "loss": 0.73314136, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13134766, + "step": 14607, + "time_per_iteration": 2.8593690395355225 + }, + { + "auxiliary_loss_clip": 0.01322612, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.21759701, + "balance_loss_mlp": 1.02166402, + "epoch": 0.8782804749736961, + "flos": 25525956186360.0, + "grad_norm": 1.8072777417780603, + "language_loss": 0.72708619, + "learning_rate": 1.533420140300785e-07, + "loss": 0.750651, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12219238, + "step": 14608, + "time_per_iteration": 2.8718278408050537 + }, + { + "auxiliary_loss_clip": 0.0133101, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.22011054, + "balance_loss_mlp": 1.01741147, + "epoch": 0.878340598226364, + "flos": 21803779831680.0, + "grad_norm": 3.548631886008135, + "language_loss": 0.87932503, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.90293944, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13000488, + "step": 14609, + "time_per_iteration": 2.8262264728546143 + }, + { + "auxiliary_loss_clip": 0.01325139, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.21797502, + "balance_loss_mlp": 1.01636589, + "epoch": 0.878400721479032, + "flos": 21106938150000.0, + "grad_norm": 1.5254994374123465, + "language_loss": 0.70452696, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72807026, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12823486, + "step": 14610, + "time_per_iteration": 2.8663318157196045 + }, + { + "auxiliary_loss_clip": 0.01317647, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.21367502, + "balance_loss_mlp": 1.01429272, + "epoch": 0.8784608447316999, + "flos": 20928508705440.0, + "grad_norm": 2.2825909548415653, + "language_loss": 0.8128317, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.83626997, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11883545, + "step": 14611, + "time_per_iteration": 4.3027708530426025 + }, + { + "auxiliary_loss_clip": 0.01326635, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.21862233, + "balance_loss_mlp": 1.01864624, + "epoch": 0.878520967984368, + "flos": 23335829169480.0, + "grad_norm": 1.4996016556146863, + "language_loss": 0.77010292, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.79367793, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12237549, + "step": 14612, + "time_per_iteration": 2.736201524734497 + }, + { + "auxiliary_loss_clip": 0.01323061, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.21650219, + "balance_loss_mlp": 1.01596689, + "epoch": 0.8785810912370359, + "flos": 25524331851960.0, + "grad_norm": 1.724167720651846, + "language_loss": 0.72427076, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74778414, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12310791, + "step": 14613, + "time_per_iteration": 2.836029052734375 + }, + { + "auxiliary_loss_clip": 0.01143052, + "auxiliary_loss_mlp": 0.01009427, + "balance_loss_clip": 1.10040808, + "balance_loss_mlp": 1.0067929, + "epoch": 0.8786412144897039, + "flos": 61853582700720.0, + "grad_norm": 1.0438212750952167, + "language_loss": 0.64695716, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66848195, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 14614, + "time_per_iteration": 3.0237796306610107 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01000405, + "balance_loss_clip": 1.0991286, + "balance_loss_mlp": 0.99791336, + "epoch": 0.8787013377423719, + "flos": 71006603797440.0, + "grad_norm": 0.6668306442023274, + "language_loss": 0.58630586, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60773003, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02490234, + "step": 14615, + "time_per_iteration": 3.3204376697540283 + }, + { + "auxiliary_loss_clip": 0.0132586, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.21760511, + "balance_loss_mlp": 1.01852536, + "epoch": 0.8787614609950398, + "flos": 17351845396560.0, + "grad_norm": 2.3895766790093553, + "language_loss": 0.73384559, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.75741047, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12103271, + "step": 14616, + "time_per_iteration": 2.826218366622925 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01004391, + "balance_loss_clip": 1.09748042, + "balance_loss_mlp": 1.00173247, + "epoch": 0.8788215842477078, + "flos": 72527485836240.0, + "grad_norm": 0.8258495929829254, + "language_loss": 0.58022201, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.601668, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02661133, + "step": 14617, + "time_per_iteration": 3.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01318128, + "auxiliary_loss_mlp": 0.01027083, + "balance_loss_clip": 1.21354961, + "balance_loss_mlp": 1.01493526, + "epoch": 0.8788817075003758, + "flos": 24832931690520.0, + "grad_norm": 1.8481869154112063, + "language_loss": 0.83687186, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.86032403, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.121521, + "step": 14618, + "time_per_iteration": 2.8105697631835938 + }, + { + "auxiliary_loss_clip": 0.01312841, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.21141481, + "balance_loss_mlp": 1.01625013, + "epoch": 0.8789418307530438, + "flos": 22644022924440.0, + "grad_norm": 1.562445207794197, + "language_loss": 0.69513869, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71854734, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.11767578, + "step": 14619, + "time_per_iteration": 2.723930597305298 + }, + { + "auxiliary_loss_clip": 0.01335478, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.22555041, + "balance_loss_mlp": 1.02035403, + "epoch": 0.8790019540057117, + "flos": 19789686366120.0, + "grad_norm": 1.6789403912302008, + "language_loss": 0.77430189, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79798484, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12457275, + "step": 14620, + "time_per_iteration": 2.7877416610717773 + }, + { + "auxiliary_loss_clip": 0.0132953, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.22081494, + "balance_loss_mlp": 1.01599526, + "epoch": 0.8790620772583797, + "flos": 20234590825680.0, + "grad_norm": 1.6450008225137025, + "language_loss": 0.79217565, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81576145, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1305542, + "step": 14621, + "time_per_iteration": 2.8897171020507812 + }, + { + "auxiliary_loss_clip": 0.01332516, + "auxiliary_loss_mlp": 0.01031621, + "balance_loss_clip": 1.22159338, + "balance_loss_mlp": 1.01885343, + "epoch": 0.8791222005110476, + "flos": 24135724533600.0, + "grad_norm": 1.6729216590360072, + "language_loss": 0.67097056, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.69461191, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12768555, + "step": 14622, + "time_per_iteration": 2.8385369777679443 + }, + { + "auxiliary_loss_clip": 0.01321759, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.21619642, + "balance_loss_mlp": 1.01886177, + "epoch": 0.8791823237637156, + "flos": 21618934266240.0, + "grad_norm": 1.8217599128432715, + "language_loss": 0.73402202, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75755501, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12670898, + "step": 14623, + "time_per_iteration": 2.7747249603271484 + }, + { + "auxiliary_loss_clip": 0.01321167, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.21480918, + "balance_loss_mlp": 1.01787508, + "epoch": 0.8792424470163835, + "flos": 24248767872600.0, + "grad_norm": 1.6833022737957242, + "language_loss": 0.78704178, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.81055886, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12658691, + "step": 14624, + "time_per_iteration": 2.8585364818573 + }, + { + "auxiliary_loss_clip": 0.01326096, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.21786892, + "balance_loss_mlp": 1.01729107, + "epoch": 0.8793025702690516, + "flos": 24897749279040.0, + "grad_norm": 1.6318519693478388, + "language_loss": 0.7986967, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.8222667, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1361084, + "step": 14625, + "time_per_iteration": 2.8782074451446533 + }, + { + "auxiliary_loss_clip": 0.01315375, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.21129751, + "balance_loss_mlp": 1.02095318, + "epoch": 0.8793626935217195, + "flos": 25378453330920.0, + "grad_norm": 1.510200139667029, + "language_loss": 0.74380279, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76728797, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.12194824, + "step": 14626, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.0133169, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.22183323, + "balance_loss_mlp": 1.0209372, + "epoch": 0.8794228167743875, + "flos": 34684686080640.0, + "grad_norm": 1.4673334297050322, + "language_loss": 0.71348274, + "learning_rate": 1.505130747218246e-07, + "loss": 0.7371335, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12438965, + "step": 14627, + "time_per_iteration": 2.8919568061828613 + }, + { + "auxiliary_loss_clip": 0.01323187, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.21579301, + "balance_loss_mlp": 1.01689196, + "epoch": 0.8794829400270555, + "flos": 19468789502760.0, + "grad_norm": 7.987339859775848, + "language_loss": 0.72455037, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.7480818, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13079834, + "step": 14628, + "time_per_iteration": 2.790776252746582 + }, + { + "auxiliary_loss_clip": 0.01329237, + "auxiliary_loss_mlp": 0.01035246, + "balance_loss_clip": 1.22099483, + "balance_loss_mlp": 1.02213275, + "epoch": 0.8795430632797234, + "flos": 15235591632480.0, + "grad_norm": 2.8655345925865094, + "language_loss": 0.68875295, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71239775, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.13110352, + "step": 14629, + "time_per_iteration": 2.7753210067749023 + }, + { + "auxiliary_loss_clip": 0.01317348, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.21266282, + "balance_loss_mlp": 1.01799786, + "epoch": 0.8796031865323914, + "flos": 27750380286240.0, + "grad_norm": 1.477180404831194, + "language_loss": 0.68916142, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.71263719, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12243652, + "step": 14630, + "time_per_iteration": 2.9806900024414062 + }, + { + "auxiliary_loss_clip": 0.0131322, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.21016741, + "balance_loss_mlp": 1.01391935, + "epoch": 0.8796633097850594, + "flos": 31291731303120.0, + "grad_norm": 1.402643206384646, + "language_loss": 0.74232495, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76572382, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.12762451, + "step": 14631, + "time_per_iteration": 2.82729172706604 + }, + { + "auxiliary_loss_clip": 0.01313068, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.2098223, + "balance_loss_mlp": 1.01899338, + "epoch": 0.8797234330377274, + "flos": 24248483614080.0, + "grad_norm": 2.4597409576722558, + "language_loss": 0.69564772, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.7190913, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.1229248, + "step": 14632, + "time_per_iteration": 2.788167953491211 + }, + { + "auxiliary_loss_clip": 0.01324785, + "auxiliary_loss_mlp": 0.01026832, + "balance_loss_clip": 1.21910334, + "balance_loss_mlp": 1.01427913, + "epoch": 0.8797835562903953, + "flos": 24172376901480.0, + "grad_norm": 1.91928336812919, + "language_loss": 0.65438807, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67790425, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12554932, + "step": 14633, + "time_per_iteration": 2.799211263656616 + }, + { + "auxiliary_loss_clip": 0.01321415, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.21591413, + "balance_loss_mlp": 1.01764178, + "epoch": 0.8798436795430633, + "flos": 19289832149520.0, + "grad_norm": 1.411419298007318, + "language_loss": 0.84454584, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86805618, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.11975098, + "step": 14634, + "time_per_iteration": 2.837083339691162 + }, + { + "auxiliary_loss_clip": 0.01325431, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.21687543, + "balance_loss_mlp": 1.02216351, + "epoch": 0.8799038027957312, + "flos": 28184685963840.0, + "grad_norm": 1.6209807158088136, + "language_loss": 0.80633992, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.82995307, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13708496, + "step": 14635, + "time_per_iteration": 2.8257980346679688 + }, + { + "auxiliary_loss_clip": 0.01322077, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.21484733, + "balance_loss_mlp": 1.01882541, + "epoch": 0.8799639260483992, + "flos": 24650116543080.0, + "grad_norm": 1.5707837282338835, + "language_loss": 0.65063179, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67416954, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12866211, + "step": 14636, + "time_per_iteration": 2.7934465408325195 + }, + { + "auxiliary_loss_clip": 0.0132424, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.21657872, + "balance_loss_mlp": 1.01921344, + "epoch": 0.8800240493010671, + "flos": 22205615802480.0, + "grad_norm": 1.5411977010482076, + "language_loss": 0.70438975, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.7279554, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13116455, + "step": 14637, + "time_per_iteration": 4.162978172302246 + }, + { + "auxiliary_loss_clip": 0.01324714, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.21852183, + "balance_loss_mlp": 1.01628709, + "epoch": 0.8800841725537352, + "flos": 14250119752440.0, + "grad_norm": 1.8353959428195774, + "language_loss": 0.66432333, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68785644, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12304688, + "step": 14638, + "time_per_iteration": 2.6923537254333496 + }, + { + "auxiliary_loss_clip": 0.01328807, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.22081614, + "balance_loss_mlp": 1.01973009, + "epoch": 0.8801442958064031, + "flos": 37423014889680.0, + "grad_norm": 1.6581705905585604, + "language_loss": 0.58299834, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60660899, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12518311, + "step": 14639, + "time_per_iteration": 2.9050650596618652 + }, + { + "auxiliary_loss_clip": 0.01328541, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.21951926, + "balance_loss_mlp": 1.01830471, + "epoch": 0.8802044190590711, + "flos": 25052967722880.0, + "grad_norm": 1.5284894213775568, + "language_loss": 0.74661541, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.77021611, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13238525, + "step": 14640, + "time_per_iteration": 4.233582496643066 + }, + { + "auxiliary_loss_clip": 0.01323556, + "auxiliary_loss_mlp": 0.01033703, + "balance_loss_clip": 1.21529722, + "balance_loss_mlp": 1.02008963, + "epoch": 0.8802645423117391, + "flos": 24139216852560.0, + "grad_norm": 1.786220911647112, + "language_loss": 0.70019472, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.72376728, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1361084, + "step": 14641, + "time_per_iteration": 4.39409065246582 + }, + { + "auxiliary_loss_clip": 0.01328781, + "auxiliary_loss_mlp": 0.01030042, + "balance_loss_clip": 1.21883023, + "balance_loss_mlp": 1.01617837, + "epoch": 0.880324665564407, + "flos": 17935724955960.0, + "grad_norm": 1.8950810088810044, + "language_loss": 0.85268092, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.8762691, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13867188, + "step": 14642, + "time_per_iteration": 2.7845089435577393 + }, + { + "auxiliary_loss_clip": 0.01323969, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.21798515, + "balance_loss_mlp": 1.02079988, + "epoch": 0.880384788817075, + "flos": 21292555274280.0, + "grad_norm": 2.5950774753218746, + "language_loss": 0.79281235, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81639433, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13421631, + "step": 14643, + "time_per_iteration": 2.793227434158325 + }, + { + "auxiliary_loss_clip": 0.01312802, + "auxiliary_loss_mlp": 0.01025311, + "balance_loss_clip": 1.20931077, + "balance_loss_mlp": 1.01370573, + "epoch": 0.880444912069743, + "flos": 12462316006680.0, + "grad_norm": 1.6002456753666106, + "language_loss": 0.7308259, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75420702, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11608887, + "step": 14644, + "time_per_iteration": 2.823610544204712 + }, + { + "auxiliary_loss_clip": 0.01339076, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.22765374, + "balance_loss_mlp": 1.02088892, + "epoch": 0.880505035322411, + "flos": 13629912692040.0, + "grad_norm": 2.142031445350334, + "language_loss": 0.79448295, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81821191, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 1.11376953, + "router_z_loss_mlp": 0.12921143, + "step": 14645, + "time_per_iteration": 2.8781964778900146 + }, + { + "auxiliary_loss_clip": 0.01314648, + "auxiliary_loss_mlp": 0.01026258, + "balance_loss_clip": 1.21116292, + "balance_loss_mlp": 1.01360965, + "epoch": 0.8805651585750789, + "flos": 23188001447160.0, + "grad_norm": 1.6227151736200478, + "language_loss": 0.64102548, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66443455, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.12640381, + "step": 14646, + "time_per_iteration": 2.8519504070281982 + }, + { + "auxiliary_loss_clip": 0.01335036, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.22238815, + "balance_loss_mlp": 1.01895869, + "epoch": 0.8806252818277469, + "flos": 14907141614160.0, + "grad_norm": 2.3450699646121813, + "language_loss": 0.7817682, + "learning_rate": 1.475625963334055e-07, + "loss": 0.80543935, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13122559, + "step": 14647, + "time_per_iteration": 2.773400068283081 + }, + { + "auxiliary_loss_clip": 0.01324203, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.21941721, + "balance_loss_mlp": 1.01328683, + "epoch": 0.8806854050804148, + "flos": 17643643047000.0, + "grad_norm": 1.9401343074393282, + "language_loss": 0.75244951, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77594346, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11901855, + "step": 14648, + "time_per_iteration": 4.257475137710571 + }, + { + "auxiliary_loss_clip": 0.0133085, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.22154331, + "balance_loss_mlp": 1.01947665, + "epoch": 0.8807455283330828, + "flos": 25337171610000.0, + "grad_norm": 33.966165608082555, + "language_loss": 0.6579842, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.68160909, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.121521, + "step": 14649, + "time_per_iteration": 2.82956862449646 + }, + { + "auxiliary_loss_clip": 0.01327396, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.22060907, + "balance_loss_mlp": 1.01765192, + "epoch": 0.8808056515857507, + "flos": 25270689078720.0, + "grad_norm": 1.2867945179511153, + "language_loss": 0.62425447, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64783573, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13092041, + "step": 14650, + "time_per_iteration": 2.8051583766937256 + }, + { + "auxiliary_loss_clip": 0.01318662, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.2133863, + "balance_loss_mlp": 1.0169332, + "epoch": 0.8808657748384188, + "flos": 26584570368720.0, + "grad_norm": 1.3589040618264423, + "language_loss": 0.73037881, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.75385821, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12335205, + "step": 14651, + "time_per_iteration": 2.774789333343506 + }, + { + "auxiliary_loss_clip": 0.01331497, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.22194123, + "balance_loss_mlp": 1.01702178, + "epoch": 0.8809258980910867, + "flos": 18665889120000.0, + "grad_norm": 1.9835559012973005, + "language_loss": 0.71796679, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.7415877, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13580322, + "step": 14652, + "time_per_iteration": 2.8323512077331543 + }, + { + "auxiliary_loss_clip": 0.01320218, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.21426344, + "balance_loss_mlp": 1.01745844, + "epoch": 0.8809860213437547, + "flos": 19797077087640.0, + "grad_norm": 1.8156718402034646, + "language_loss": 0.75218946, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.77569056, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12432861, + "step": 14653, + "time_per_iteration": 2.7964537143707275 + }, + { + "auxiliary_loss_clip": 0.0133533, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.22402203, + "balance_loss_mlp": 1.01977015, + "epoch": 0.8810461445964227, + "flos": 17898950763000.0, + "grad_norm": 1.7822958047936974, + "language_loss": 0.71347558, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73716098, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13439941, + "step": 14654, + "time_per_iteration": 2.6872575283050537 + }, + { + "auxiliary_loss_clip": 0.01327869, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.21959114, + "balance_loss_mlp": 1.02062988, + "epoch": 0.8811062678490906, + "flos": 29169264459960.0, + "grad_norm": 1.5565754981256212, + "language_loss": 0.71516156, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73878723, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.14080811, + "step": 14655, + "time_per_iteration": 2.882401704788208 + }, + { + "auxiliary_loss_clip": 0.01321555, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.2157203, + "balance_loss_mlp": 1.01838589, + "epoch": 0.8811663911017587, + "flos": 20343613937040.0, + "grad_norm": 3.5327228526779826, + "language_loss": 0.81697541, + "learning_rate": 1.462440453077449e-07, + "loss": 0.84049827, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12353516, + "step": 14656, + "time_per_iteration": 2.819749116897583 + }, + { + "auxiliary_loss_clip": 0.01328655, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.22013712, + "balance_loss_mlp": 1.015692, + "epoch": 0.8812265143544266, + "flos": 25891342831080.0, + "grad_norm": 1.5831496925696882, + "language_loss": 0.68896282, + "learning_rate": 1.460978910372914e-07, + "loss": 0.71252608, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.11975098, + "step": 14657, + "time_per_iteration": 2.872175455093384 + }, + { + "auxiliary_loss_clip": 0.01327518, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.21961474, + "balance_loss_mlp": 1.02327943, + "epoch": 0.8812866376070946, + "flos": 27200838418200.0, + "grad_norm": 1.9971733121002648, + "language_loss": 0.84537095, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86900067, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1217041, + "step": 14658, + "time_per_iteration": 2.882450580596924 + }, + { + "auxiliary_loss_clip": 0.01341295, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.22798204, + "balance_loss_mlp": 1.01778555, + "epoch": 0.8813467608597625, + "flos": 23812716035520.0, + "grad_norm": 1.6795244973265613, + "language_loss": 0.77604306, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79976922, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.13537598, + "step": 14659, + "time_per_iteration": 2.7825992107391357 + }, + { + "auxiliary_loss_clip": 0.01325096, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.21817255, + "balance_loss_mlp": 1.02082694, + "epoch": 0.8814068841124305, + "flos": 21110389860600.0, + "grad_norm": 2.4288274951495903, + "language_loss": 0.60816061, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.63174808, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.1282959, + "step": 14660, + "time_per_iteration": 2.7354047298431396 + }, + { + "auxiliary_loss_clip": 0.01320108, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.21262121, + "balance_loss_mlp": 1.01983511, + "epoch": 0.8814670073650984, + "flos": 24722324853120.0, + "grad_norm": 1.8507087887218125, + "language_loss": 0.78107715, + "learning_rate": 1.455139770123972e-07, + "loss": 0.80461198, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13555908, + "step": 14661, + "time_per_iteration": 2.88519549369812 + }, + { + "auxiliary_loss_clip": 0.0133319, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.22360265, + "balance_loss_mlp": 1.02605748, + "epoch": 0.8815271306177664, + "flos": 22971457733760.0, + "grad_norm": 1.7886979694044243, + "language_loss": 0.77017975, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79390544, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13330078, + "step": 14662, + "time_per_iteration": 2.8437771797180176 + }, + { + "auxiliary_loss_clip": 0.01315881, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.21269298, + "balance_loss_mlp": 1.01641166, + "epoch": 0.8815872538704344, + "flos": 19464322583160.0, + "grad_norm": 2.3392795634321386, + "language_loss": 0.73984146, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76327741, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.11309814, + "step": 14663, + "time_per_iteration": 2.750786542892456 + }, + { + "auxiliary_loss_clip": 0.01321333, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.21571374, + "balance_loss_mlp": 1.02050805, + "epoch": 0.8816473771231024, + "flos": 32162454293040.0, + "grad_norm": 1.5322224619155969, + "language_loss": 0.69933695, + "learning_rate": 1.450767798584489e-07, + "loss": 0.72287732, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12200928, + "step": 14664, + "time_per_iteration": 2.8601901531219482 + }, + { + "auxiliary_loss_clip": 0.01317472, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.21310973, + "balance_loss_mlp": 1.02057743, + "epoch": 0.8817075003757703, + "flos": 19686998158920.0, + "grad_norm": 1.6988957034072167, + "language_loss": 0.81724012, + "learning_rate": 1.449311881441828e-07, + "loss": 0.84073997, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11938477, + "step": 14665, + "time_per_iteration": 2.9164791107177734 + }, + { + "auxiliary_loss_clip": 0.01326068, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.21878672, + "balance_loss_mlp": 1.02291, + "epoch": 0.8817676236284383, + "flos": 15672780503640.0, + "grad_norm": 1.90786233133172, + "language_loss": 0.58784378, + "learning_rate": 1.447856667743117e-07, + "loss": 0.61145771, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12420654, + "step": 14666, + "time_per_iteration": 2.8186185359954834 + }, + { + "auxiliary_loss_clip": 0.01326376, + "auxiliary_loss_mlp": 0.01029944, + "balance_loss_clip": 1.21839225, + "balance_loss_mlp": 1.01641989, + "epoch": 0.8818277468811063, + "flos": 17899762930200.0, + "grad_norm": 2.6336799022021022, + "language_loss": 0.8407433, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.86430657, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13525391, + "step": 14667, + "time_per_iteration": 2.776118278503418 + }, + { + "auxiliary_loss_clip": 0.01323043, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.21648455, + "balance_loss_mlp": 1.01951146, + "epoch": 0.8818878701337742, + "flos": 18775155881520.0, + "grad_norm": 1.73070113155098, + "language_loss": 0.62371314, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64727032, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.13165283, + "step": 14668, + "time_per_iteration": 2.7660531997680664 + }, + { + "auxiliary_loss_clip": 0.01321563, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.21716094, + "balance_loss_mlp": 1.01859534, + "epoch": 0.8819479933864423, + "flos": 17716785349320.0, + "grad_norm": 2.0427521071500183, + "language_loss": 0.57119256, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59470683, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11273193, + "step": 14669, + "time_per_iteration": 2.865354061126709 + }, + { + "auxiliary_loss_clip": 0.01324936, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.21790051, + "balance_loss_mlp": 1.01716781, + "epoch": 0.8820081166391102, + "flos": 11732476709520.0, + "grad_norm": 5.1642638630026205, + "language_loss": 0.72002351, + "learning_rate": 1.442042848491043e-07, + "loss": 0.7435689, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12432861, + "step": 14670, + "time_per_iteration": 2.8480165004730225 + }, + { + "auxiliary_loss_clip": 0.0132082, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.21357489, + "balance_loss_mlp": 1.01706922, + "epoch": 0.8820682398917782, + "flos": 27496128387600.0, + "grad_norm": 1.8416563191500304, + "language_loss": 0.73630416, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75981069, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12774658, + "step": 14671, + "time_per_iteration": 2.836764335632324 + }, + { + "auxiliary_loss_clip": 0.01326676, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.21716452, + "balance_loss_mlp": 1.0185256, + "epoch": 0.8821283631444461, + "flos": 16878329024400.0, + "grad_norm": 1.8376925309636591, + "language_loss": 0.85184991, + "learning_rate": 1.43914016096218e-07, + "loss": 0.87543893, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13696289, + "step": 14672, + "time_per_iteration": 2.718590497970581 + }, + { + "auxiliary_loss_clip": 0.01319139, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.21463513, + "balance_loss_mlp": 1.0188725, + "epoch": 0.8821884863971141, + "flos": 24286476057840.0, + "grad_norm": 1.6776039251548762, + "language_loss": 0.72624397, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74974835, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12414551, + "step": 14673, + "time_per_iteration": 2.8042638301849365 + }, + { + "auxiliary_loss_clip": 0.01140314, + "auxiliary_loss_mlp": 0.01008735, + "balance_loss_clip": 1.09711492, + "balance_loss_mlp": 1.00619566, + "epoch": 0.882248609649782, + "flos": 59449243628160.0, + "grad_norm": 0.8215069340355263, + "language_loss": 0.4948352, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51632571, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02539062, + "step": 14674, + "time_per_iteration": 3.338721990585327 + }, + { + "auxiliary_loss_clip": 0.01322599, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.21417904, + "balance_loss_mlp": 1.02204287, + "epoch": 0.88230873290245, + "flos": 19942549525080.0, + "grad_norm": 1.875712546080711, + "language_loss": 0.7671724, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.79074693, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12823486, + "step": 14675, + "time_per_iteration": 2.7104411125183105 + }, + { + "auxiliary_loss_clip": 0.01317842, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.21244526, + "balance_loss_mlp": 1.01769614, + "epoch": 0.882368856155118, + "flos": 16367145075360.0, + "grad_norm": 1.8540764150671558, + "language_loss": 0.79712033, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.82060421, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12866211, + "step": 14676, + "time_per_iteration": 4.37330961227417 + }, + { + "auxiliary_loss_clip": 0.01144219, + "auxiliary_loss_mlp": 0.0100369, + "balance_loss_clip": 1.09976935, + "balance_loss_mlp": 1.00101995, + "epoch": 0.882428979407786, + "flos": 70612605240120.0, + "grad_norm": 0.6982629609554026, + "language_loss": 0.54824352, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56972259, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.0267334, + "step": 14677, + "time_per_iteration": 3.292390823364258 + }, + { + "auxiliary_loss_clip": 0.013237, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.21691489, + "balance_loss_mlp": 1.01465213, + "epoch": 0.8824891026604539, + "flos": 18154867604400.0, + "grad_norm": 2.219747716547886, + "language_loss": 0.65467769, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67817944, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11828613, + "step": 14678, + "time_per_iteration": 2.694679021835327 + }, + { + "auxiliary_loss_clip": 0.01331757, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.22181928, + "balance_loss_mlp": 1.02121043, + "epoch": 0.8825492259131219, + "flos": 27238140519840.0, + "grad_norm": 1.842680942726604, + "language_loss": 0.71505529, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73871219, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12731934, + "step": 14679, + "time_per_iteration": 4.220904588699341 + }, + { + "auxiliary_loss_clip": 0.01322036, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.21642756, + "balance_loss_mlp": 1.02264071, + "epoch": 0.8826093491657898, + "flos": 22279692097080.0, + "grad_norm": 1.5995048648152264, + "language_loss": 0.64081442, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.6643759, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.1149292, + "step": 14680, + "time_per_iteration": 2.8161821365356445 + }, + { + "auxiliary_loss_clip": 0.01318667, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.21311772, + "balance_loss_mlp": 1.01721382, + "epoch": 0.8826694724184578, + "flos": 14208147689400.0, + "grad_norm": 2.31179104500724, + "language_loss": 0.77444279, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79793316, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.13171387, + "step": 14681, + "time_per_iteration": 4.346393585205078 + }, + { + "auxiliary_loss_clip": 0.01327947, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.21925688, + "balance_loss_mlp": 1.01378775, + "epoch": 0.8827295956711259, + "flos": 20636629838280.0, + "grad_norm": 1.8313718673364023, + "language_loss": 0.72684705, + "learning_rate": 1.424668961888047e-07, + "loss": 0.7503925, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12799072, + "step": 14682, + "time_per_iteration": 2.783134937286377 + }, + { + "auxiliary_loss_clip": 0.01332615, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.22178924, + "balance_loss_mlp": 1.01900578, + "epoch": 0.8827897189237938, + "flos": 18517411663920.0, + "grad_norm": 2.04052761657639, + "language_loss": 0.7493608, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.77302086, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14398193, + "step": 14683, + "time_per_iteration": 2.774339199066162 + }, + { + "auxiliary_loss_clip": 0.01329036, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.22065067, + "balance_loss_mlp": 1.01680875, + "epoch": 0.8828498421764618, + "flos": 22752558735480.0, + "grad_norm": 1.7462964409227664, + "language_loss": 0.65777099, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.68136227, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13293457, + "step": 14684, + "time_per_iteration": 2.771674633026123 + }, + { + "auxiliary_loss_clip": 0.01323298, + "auxiliary_loss_mlp": 0.01027403, + "balance_loss_clip": 1.21638429, + "balance_loss_mlp": 1.01561344, + "epoch": 0.8829099654291297, + "flos": 15016530200760.0, + "grad_norm": 2.118587212792293, + "language_loss": 0.6988126, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.7223196, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.11785889, + "step": 14685, + "time_per_iteration": 2.7857162952423096 + }, + { + "auxiliary_loss_clip": 0.01331672, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.22238231, + "balance_loss_mlp": 1.02089119, + "epoch": 0.8829700886817977, + "flos": 16724044572840.0, + "grad_norm": 6.120432894247139, + "language_loss": 0.75026441, + "learning_rate": 1.418900201783806e-07, + "loss": 0.77391934, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1293335, + "step": 14686, + "time_per_iteration": 2.826200246810913 + }, + { + "auxiliary_loss_clip": 0.01322996, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.21790493, + "balance_loss_mlp": 1.01285863, + "epoch": 0.8830302119344656, + "flos": 15266843088480.0, + "grad_norm": 1.7848187602975252, + "language_loss": 0.63610345, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65958464, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12268066, + "step": 14687, + "time_per_iteration": 4.460963249206543 + }, + { + "auxiliary_loss_clip": 0.01328414, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.21889997, + "balance_loss_mlp": 1.01843452, + "epoch": 0.8830903351871336, + "flos": 28623174302520.0, + "grad_norm": 1.8774094087133864, + "language_loss": 0.69281864, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.7164163, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12921143, + "step": 14688, + "time_per_iteration": 2.9239394664764404 + }, + { + "auxiliary_loss_clip": 0.01317055, + "auxiliary_loss_mlp": 0.01025015, + "balance_loss_clip": 1.21372843, + "balance_loss_mlp": 1.01244402, + "epoch": 0.8831504584398016, + "flos": 28007515378440.0, + "grad_norm": 1.6059055549221477, + "language_loss": 0.67076594, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69418669, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12573242, + "step": 14689, + "time_per_iteration": 2.850937843322754 + }, + { + "auxiliary_loss_clip": 0.01327511, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.22255826, + "balance_loss_mlp": 1.02302074, + "epoch": 0.8832105816924696, + "flos": 26585463752640.0, + "grad_norm": 2.210278770746045, + "language_loss": 0.74450648, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76812971, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.11785889, + "step": 14690, + "time_per_iteration": 2.8871490955352783 + }, + { + "auxiliary_loss_clip": 0.01324455, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.21728826, + "balance_loss_mlp": 1.01895785, + "epoch": 0.8832707049451375, + "flos": 24904043574840.0, + "grad_norm": 1.3804990798498984, + "language_loss": 0.73120278, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.75477028, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13330078, + "step": 14691, + "time_per_iteration": 2.788067102432251 + }, + { + "auxiliary_loss_clip": 0.01341673, + "auxiliary_loss_mlp": 0.01032042, + "balance_loss_clip": 1.22949827, + "balance_loss_mlp": 1.01781476, + "epoch": 0.8833308281978055, + "flos": 15455465231400.0, + "grad_norm": 1.7259745912194109, + "language_loss": 0.52296507, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54670227, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14227295, + "step": 14692, + "time_per_iteration": 2.758636713027954 + }, + { + "auxiliary_loss_clip": 0.01327582, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.21837771, + "balance_loss_mlp": 1.0172087, + "epoch": 0.8833909514504734, + "flos": 20306190010320.0, + "grad_norm": 1.8597399722018328, + "language_loss": 0.60936278, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.63293827, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12756348, + "step": 14693, + "time_per_iteration": 2.8219826221466064 + }, + { + "auxiliary_loss_clip": 0.01318291, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.21574962, + "balance_loss_mlp": 1.01546812, + "epoch": 0.8834510747031414, + "flos": 20378641970520.0, + "grad_norm": 1.4452605485283212, + "language_loss": 0.75407964, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77752852, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.11132812, + "step": 14694, + "time_per_iteration": 2.711451292037964 + }, + { + "auxiliary_loss_clip": 0.01331451, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.22065592, + "balance_loss_mlp": 1.02087438, + "epoch": 0.8835111979558095, + "flos": 29758017022560.0, + "grad_norm": 1.88273632099662, + "language_loss": 0.73194468, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.75558782, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.11999512, + "step": 14695, + "time_per_iteration": 2.8828067779541016 + }, + { + "auxiliary_loss_clip": 0.01317813, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.21542132, + "balance_loss_mlp": 1.01765871, + "epoch": 0.8835713212084774, + "flos": 24139988411400.0, + "grad_norm": 1.5997062930033923, + "language_loss": 0.8067261, + "learning_rate": 1.404527630961998e-07, + "loss": 0.83020151, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.1206665, + "step": 14696, + "time_per_iteration": 2.8454580307006836 + }, + { + "auxiliary_loss_clip": 0.01328239, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.22044098, + "balance_loss_mlp": 1.01900125, + "epoch": 0.8836314444611454, + "flos": 27678050151120.0, + "grad_norm": 1.431984372159192, + "language_loss": 0.75079232, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.77438784, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12322998, + "step": 14697, + "time_per_iteration": 2.7832376956939697 + }, + { + "auxiliary_loss_clip": 0.0132143, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.21559978, + "balance_loss_mlp": 1.01607275, + "epoch": 0.8836915677138133, + "flos": 16841392398000.0, + "grad_norm": 2.3636393567399514, + "language_loss": 0.71984005, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74334025, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12506104, + "step": 14698, + "time_per_iteration": 2.7345023155212402 + }, + { + "auxiliary_loss_clip": 0.01141594, + "auxiliary_loss_mlp": 0.01001902, + "balance_loss_clip": 1.09868193, + "balance_loss_mlp": 0.99926776, + "epoch": 0.8837516909664813, + "flos": 69327498296160.0, + "grad_norm": 0.8300873851473262, + "language_loss": 0.53745073, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55888563, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02636719, + "step": 14699, + "time_per_iteration": 3.3098132610321045 + }, + { + "auxiliary_loss_clip": 0.01335761, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.2237395, + "balance_loss_mlp": 1.01751256, + "epoch": 0.8838118142191492, + "flos": 21329735550840.0, + "grad_norm": 2.626193171764483, + "language_loss": 0.77440572, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.79807055, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13195801, + "step": 14700, + "time_per_iteration": 2.877737283706665 + }, + { + "auxiliary_loss_clip": 0.01324162, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.21858203, + "balance_loss_mlp": 1.01785493, + "epoch": 0.8838719374718172, + "flos": 21475573463520.0, + "grad_norm": 1.7774358947100553, + "language_loss": 0.73291582, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75645936, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12341309, + "step": 14701, + "time_per_iteration": 2.865215301513672 + }, + { + "auxiliary_loss_clip": 0.01336855, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.22584724, + "balance_loss_mlp": 1.01651204, + "epoch": 0.8839320607244852, + "flos": 26474572656720.0, + "grad_norm": 2.1112762164439873, + "language_loss": 0.71410298, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.7377696, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13299561, + "step": 14702, + "time_per_iteration": 2.877365827560425 + }, + { + "auxiliary_loss_clip": 0.013324, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.22450233, + "balance_loss_mlp": 1.01896882, + "epoch": 0.8839921839771532, + "flos": 45230317742160.0, + "grad_norm": 1.4971695726844823, + "language_loss": 0.71815068, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.74179196, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12774658, + "step": 14703, + "time_per_iteration": 2.966339111328125 + }, + { + "auxiliary_loss_clip": 0.01318807, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.21453869, + "balance_loss_mlp": 1.01986742, + "epoch": 0.8840523072298211, + "flos": 20011143691080.0, + "grad_norm": 1.9686730647434993, + "language_loss": 0.66849178, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.69199318, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.11468506, + "step": 14704, + "time_per_iteration": 2.829619884490967 + }, + { + "auxiliary_loss_clip": 0.01317552, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.21388197, + "balance_loss_mlp": 1.01780868, + "epoch": 0.8841124304824891, + "flos": 24431420586600.0, + "grad_norm": 1.550646758290645, + "language_loss": 0.70410991, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72758269, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11920166, + "step": 14705, + "time_per_iteration": 2.902949333190918 + }, + { + "auxiliary_loss_clip": 0.01328575, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.22317469, + "balance_loss_mlp": 1.01812816, + "epoch": 0.884172553735157, + "flos": 31290837919200.0, + "grad_norm": 1.6010404384566304, + "language_loss": 0.71011353, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73369545, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11499023, + "step": 14706, + "time_per_iteration": 2.988013505935669 + }, + { + "auxiliary_loss_clip": 0.01324015, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.2166853, + "balance_loss_mlp": 1.02286148, + "epoch": 0.884232676987825, + "flos": 21395324698200.0, + "grad_norm": 1.599570045527009, + "language_loss": 0.7508359, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.77442932, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12475586, + "step": 14707, + "time_per_iteration": 2.8635385036468506 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01005663, + "balance_loss_clip": 1.10097682, + "balance_loss_mlp": 1.00256395, + "epoch": 0.8842928002404931, + "flos": 57924892897560.0, + "grad_norm": 0.8173043094978061, + "language_loss": 0.6043157, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62581623, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.03088379, + "step": 14708, + "time_per_iteration": 3.107513904571533 + }, + { + "auxiliary_loss_clip": 0.01314127, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.21151912, + "balance_loss_mlp": 1.01727641, + "epoch": 0.884352923493161, + "flos": 41471407802880.0, + "grad_norm": 1.645439486418474, + "language_loss": 0.67593145, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69936097, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.11547852, + "step": 14709, + "time_per_iteration": 2.9250104427337646 + }, + { + "auxiliary_loss_clip": 0.01336674, + "auxiliary_loss_mlp": 0.01035468, + "balance_loss_clip": 1.22482347, + "balance_loss_mlp": 1.02060235, + "epoch": 0.884413046745829, + "flos": 46550168461080.0, + "grad_norm": 1.5413483500591096, + "language_loss": 0.62488168, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64860308, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14855957, + "step": 14710, + "time_per_iteration": 3.019495964050293 + }, + { + "auxiliary_loss_clip": 0.01323136, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.21906841, + "balance_loss_mlp": 1.01640821, + "epoch": 0.8844731699984969, + "flos": 19140014617560.0, + "grad_norm": 2.209474757139745, + "language_loss": 0.64108425, + "learning_rate": 1.38310100580431e-07, + "loss": 0.66459429, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.11462402, + "step": 14711, + "time_per_iteration": 2.8509700298309326 + }, + { + "auxiliary_loss_clip": 0.01338042, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.22608948, + "balance_loss_mlp": 1.01905298, + "epoch": 0.8845332932511649, + "flos": 23266179186120.0, + "grad_norm": 1.7055147805969932, + "language_loss": 0.76058549, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78428453, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12811279, + "step": 14712, + "time_per_iteration": 2.7910923957824707 + }, + { + "auxiliary_loss_clip": 0.01328078, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.21929884, + "balance_loss_mlp": 1.01659012, + "epoch": 0.8845934165038328, + "flos": 17568835801920.0, + "grad_norm": 2.4127592307203174, + "language_loss": 0.80947286, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83305413, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13464355, + "step": 14713, + "time_per_iteration": 2.694147825241089 + }, + { + "auxiliary_loss_clip": 0.01321362, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.21471441, + "balance_loss_mlp": 1.0145812, + "epoch": 0.8846535397565009, + "flos": 27490889909160.0, + "grad_norm": 1.4366962958930205, + "language_loss": 0.55954188, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.58303005, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12878418, + "step": 14714, + "time_per_iteration": 2.7983996868133545 + }, + { + "auxiliary_loss_clip": 0.01325043, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.21816027, + "balance_loss_mlp": 1.02109504, + "epoch": 0.8847136630091688, + "flos": 28766575713600.0, + "grad_norm": 1.9701763720864256, + "language_loss": 0.74453157, + "learning_rate": 1.377414057838755e-07, + "loss": 0.76812184, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12884521, + "step": 14715, + "time_per_iteration": 4.277295351028442 + }, + { + "auxiliary_loss_clip": 0.01326056, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.21847224, + "balance_loss_mlp": 1.02076471, + "epoch": 0.8847737862618368, + "flos": 23482154382480.0, + "grad_norm": 2.0609391408398676, + "language_loss": 0.75694525, + "learning_rate": 1.375994086138461e-07, + "loss": 0.78053236, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.11889648, + "step": 14716, + "time_per_iteration": 2.812962293624878 + }, + { + "auxiliary_loss_clip": 0.01323256, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.21763897, + "balance_loss_mlp": 1.02208579, + "epoch": 0.8848339095145047, + "flos": 18665483036400.0, + "grad_norm": 1.8683367321821092, + "language_loss": 0.71370769, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.7372849, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12384033, + "step": 14717, + "time_per_iteration": 4.240095615386963 + }, + { + "auxiliary_loss_clip": 0.0131838, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.2172761, + "balance_loss_mlp": 1.01714373, + "epoch": 0.8848940327671727, + "flos": 32276715882840.0, + "grad_norm": 2.103764485815103, + "language_loss": 0.74355257, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76702517, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.1171875, + "step": 14718, + "time_per_iteration": 2.8758630752563477 + }, + { + "auxiliary_loss_clip": 0.01332775, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.22182202, + "balance_loss_mlp": 1.01842785, + "epoch": 0.8849541560198406, + "flos": 24026823247320.0, + "grad_norm": 1.540934365029196, + "language_loss": 0.78615326, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80979842, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 1.11083984, + "router_z_loss_mlp": 0.13323975, + "step": 14719, + "time_per_iteration": 4.307582139968872 + }, + { + "auxiliary_loss_clip": 0.0132804, + "auxiliary_loss_mlp": 0.01026606, + "balance_loss_clip": 1.21855998, + "balance_loss_mlp": 1.01454806, + "epoch": 0.8850142792725086, + "flos": 16877435640480.0, + "grad_norm": 1.7006781231559236, + "language_loss": 0.71871901, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74226546, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.1206665, + "step": 14720, + "time_per_iteration": 2.775930404663086 + }, + { + "auxiliary_loss_clip": 0.01339174, + "auxiliary_loss_mlp": 0.01036088, + "balance_loss_clip": 1.22747529, + "balance_loss_mlp": 1.02287972, + "epoch": 0.8850744025251767, + "flos": 24029340965640.0, + "grad_norm": 4.92753335627302, + "language_loss": 0.8266288, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.85038149, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13214111, + "step": 14721, + "time_per_iteration": 2.8164825439453125 + }, + { + "auxiliary_loss_clip": 0.01330457, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.22170866, + "balance_loss_mlp": 1.01593995, + "epoch": 0.8851345257778446, + "flos": 47963570506200.0, + "grad_norm": 1.8085476081930043, + "language_loss": 0.62578768, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.649382, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13031006, + "step": 14722, + "time_per_iteration": 3.0897834300994873 + }, + { + "auxiliary_loss_clip": 0.01330698, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.22099543, + "balance_loss_mlp": 1.01631141, + "epoch": 0.8851946490305126, + "flos": 36617840438760.0, + "grad_norm": 1.84717863563426, + "language_loss": 0.68849838, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.71209729, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12890625, + "step": 14723, + "time_per_iteration": 2.905139207839966 + }, + { + "auxiliary_loss_clip": 0.01329622, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.22153986, + "balance_loss_mlp": 1.01851153, + "epoch": 0.8852547722831805, + "flos": 21549365499600.0, + "grad_norm": 1.719548050353965, + "language_loss": 0.78327405, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80688739, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13201904, + "step": 14724, + "time_per_iteration": 2.771920680999756 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.0100816, + "balance_loss_clip": 1.10088301, + "balance_loss_mlp": 1.00516748, + "epoch": 0.8853148955358485, + "flos": 63073790839440.0, + "grad_norm": 0.8102745401614441, + "language_loss": 0.59006649, + "learning_rate": 1.363246127376143e-07, + "loss": 0.61159199, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.02990723, + "step": 14725, + "time_per_iteration": 3.1703684329986572 + }, + { + "auxiliary_loss_clip": 0.01342015, + "auxiliary_loss_mlp": 0.01036922, + "balance_loss_clip": 1.22751021, + "balance_loss_mlp": 1.02348065, + "epoch": 0.8853750187885164, + "flos": 18154339695720.0, + "grad_norm": 1.919059469583135, + "language_loss": 0.68915582, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.71294522, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13464355, + "step": 14726, + "time_per_iteration": 4.276120662689209 + }, + { + "auxiliary_loss_clip": 0.01324425, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.21814585, + "balance_loss_mlp": 1.01796281, + "epoch": 0.8854351420411845, + "flos": 39578316914880.0, + "grad_norm": 1.2538647961982892, + "language_loss": 0.69752157, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.7210722, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12670898, + "step": 14727, + "time_per_iteration": 2.990054130554199 + }, + { + "auxiliary_loss_clip": 0.01325691, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.2190932, + "balance_loss_mlp": 1.01872373, + "epoch": 0.8854952652938524, + "flos": 23774642375040.0, + "grad_norm": 1.5492311477692646, + "language_loss": 0.70488369, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.7284503, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12231445, + "step": 14728, + "time_per_iteration": 2.795267105102539 + }, + { + "auxiliary_loss_clip": 0.01330912, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.22268629, + "balance_loss_mlp": 1.01899719, + "epoch": 0.8855553885465204, + "flos": 18294126962760.0, + "grad_norm": 2.24659519768029, + "language_loss": 0.66773278, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.69135374, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.1217041, + "step": 14729, + "time_per_iteration": 2.7263312339782715 + }, + { + "auxiliary_loss_clip": 0.01321836, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.21660519, + "balance_loss_mlp": 1.02129507, + "epoch": 0.8856155117991883, + "flos": 36874407013920.0, + "grad_norm": 1.5101694749693728, + "language_loss": 0.63234216, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65589017, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11669922, + "step": 14730, + "time_per_iteration": 2.8609583377838135 + }, + { + "auxiliary_loss_clip": 0.01320401, + "auxiliary_loss_mlp": 0.01030394, + "balance_loss_clip": 1.21614218, + "balance_loss_mlp": 1.01794291, + "epoch": 0.8856756350518563, + "flos": 22169004042960.0, + "grad_norm": 1.4980838758226196, + "language_loss": 0.79531723, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81882513, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12445068, + "step": 14731, + "time_per_iteration": 2.8061912059783936 + }, + { + "auxiliary_loss_clip": 0.01328749, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.21939778, + "balance_loss_mlp": 1.02223015, + "epoch": 0.8857357583045242, + "flos": 20745896599800.0, + "grad_norm": 1.5997949524472066, + "language_loss": 0.83230579, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85594058, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12506104, + "step": 14732, + "time_per_iteration": 2.7340197563171387 + }, + { + "auxiliary_loss_clip": 0.01148246, + "auxiliary_loss_mlp": 0.01017993, + "balance_loss_clip": 1.10475159, + "balance_loss_mlp": 1.01482189, + "epoch": 0.8857958815571922, + "flos": 69909103787400.0, + "grad_norm": 0.8936666093156689, + "language_loss": 0.59950066, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.62116307, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.03173828, + "step": 14733, + "time_per_iteration": 3.330533504486084 + }, + { + "auxiliary_loss_clip": 0.01330467, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.22317624, + "balance_loss_mlp": 1.0179534, + "epoch": 0.8858560048098603, + "flos": 15126121829160.0, + "grad_norm": 1.6843370304217293, + "language_loss": 0.66796482, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.69157636, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12738037, + "step": 14734, + "time_per_iteration": 2.7313101291656494 + }, + { + "auxiliary_loss_clip": 0.01321683, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.21765423, + "balance_loss_mlp": 1.02417111, + "epoch": 0.8859161280625282, + "flos": 16614493552800.0, + "grad_norm": 1.8825842616201527, + "language_loss": 0.7549029, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77847898, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.11767578, + "step": 14735, + "time_per_iteration": 2.7253215312957764 + }, + { + "auxiliary_loss_clip": 0.01327305, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.21921539, + "balance_loss_mlp": 1.01901436, + "epoch": 0.8859762513151962, + "flos": 18698724302040.0, + "grad_norm": 1.8393373714060857, + "language_loss": 0.70618021, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72976887, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12554932, + "step": 14736, + "time_per_iteration": 2.7223381996154785 + }, + { + "auxiliary_loss_clip": 0.01329623, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.22087193, + "balance_loss_mlp": 1.01704478, + "epoch": 0.8860363745678641, + "flos": 19541647546560.0, + "grad_norm": 1.725602584014836, + "language_loss": 0.84764373, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.87123549, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12506104, + "step": 14737, + "time_per_iteration": 2.815294027328491 + }, + { + "auxiliary_loss_clip": 0.01344221, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.23167324, + "balance_loss_mlp": 1.01914525, + "epoch": 0.8860964978205321, + "flos": 35961752569320.0, + "grad_norm": 2.0758080830600076, + "language_loss": 0.68291998, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70669079, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13708496, + "step": 14738, + "time_per_iteration": 2.9555699825286865 + }, + { + "auxiliary_loss_clip": 0.01338288, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.22528064, + "balance_loss_mlp": 1.0157795, + "epoch": 0.8861566210732, + "flos": 21217220120520.0, + "grad_norm": 1.7150576658679588, + "language_loss": 0.75373542, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77740884, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13299561, + "step": 14739, + "time_per_iteration": 2.81486177444458 + }, + { + "auxiliary_loss_clip": 0.01325556, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.22027874, + "balance_loss_mlp": 1.01971936, + "epoch": 0.886216744325868, + "flos": 14612907462120.0, + "grad_norm": 1.7330599312161346, + "language_loss": 0.87346834, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89703584, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11462402, + "step": 14740, + "time_per_iteration": 2.9162049293518066 + }, + { + "auxiliary_loss_clip": 0.01329734, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.22305405, + "balance_loss_mlp": 1.01833546, + "epoch": 0.886276867578536, + "flos": 26657347195800.0, + "grad_norm": 1.8226985876924464, + "language_loss": 0.63458002, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65818828, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12750244, + "step": 14741, + "time_per_iteration": 2.9099245071411133 + }, + { + "auxiliary_loss_clip": 0.01332462, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.22585368, + "balance_loss_mlp": 1.01877558, + "epoch": 0.886336990831204, + "flos": 16768493745840.0, + "grad_norm": 1.7473305139620592, + "language_loss": 0.72989702, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75353247, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12316895, + "step": 14742, + "time_per_iteration": 2.8615386486053467 + }, + { + "auxiliary_loss_clip": 0.0132497, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.21832919, + "balance_loss_mlp": 1.02141082, + "epoch": 0.8863971140838719, + "flos": 25270810903800.0, + "grad_norm": 1.9918191011647182, + "language_loss": 0.59244043, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61602539, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12103271, + "step": 14743, + "time_per_iteration": 2.924469232559204 + }, + { + "auxiliary_loss_clip": 0.013357, + "auxiliary_loss_mlp": 0.01037829, + "balance_loss_clip": 1.22569382, + "balance_loss_mlp": 1.02342844, + "epoch": 0.8864572373365399, + "flos": 23409580597200.0, + "grad_norm": 1.7220967629124473, + "language_loss": 0.60008919, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62382448, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.14404297, + "step": 14744, + "time_per_iteration": 2.7830066680908203 + }, + { + "auxiliary_loss_clip": 0.01328131, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.22055864, + "balance_loss_mlp": 1.02039409, + "epoch": 0.8865173605892078, + "flos": 18553008214440.0, + "grad_norm": 1.5354375818349288, + "language_loss": 0.76809895, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.79172045, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13635254, + "step": 14745, + "time_per_iteration": 2.897160768508911 + }, + { + "auxiliary_loss_clip": 0.0132705, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.21949077, + "balance_loss_mlp": 1.01948595, + "epoch": 0.8865774838418758, + "flos": 19030544814240.0, + "grad_norm": 1.748114779859554, + "language_loss": 0.77572513, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79931527, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12493896, + "step": 14746, + "time_per_iteration": 2.884488105773926 + }, + { + "auxiliary_loss_clip": 0.01328532, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.21930718, + "balance_loss_mlp": 1.01742244, + "epoch": 0.8866376070945439, + "flos": 22168191875760.0, + "grad_norm": 1.6314779490894922, + "language_loss": 0.77201843, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.79561126, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13342285, + "step": 14747, + "time_per_iteration": 2.8529703617095947 + }, + { + "auxiliary_loss_clip": 0.01316202, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.2114048, + "balance_loss_mlp": 1.01603794, + "epoch": 0.8866977303472118, + "flos": 20709163015200.0, + "grad_norm": 1.7024518523163823, + "language_loss": 0.82935929, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.85280442, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.1227417, + "step": 14748, + "time_per_iteration": 2.7499704360961914 + }, + { + "auxiliary_loss_clip": 0.01324735, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.21787715, + "balance_loss_mlp": 1.0167172, + "epoch": 0.8867578535998798, + "flos": 48802514131440.0, + "grad_norm": 1.78870328134626, + "language_loss": 0.77688754, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.80043077, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12878418, + "step": 14749, + "time_per_iteration": 2.955061435699463 + }, + { + "auxiliary_loss_clip": 0.01338562, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.2271564, + "balance_loss_mlp": 1.0210762, + "epoch": 0.8868179768525477, + "flos": 21110227427160.0, + "grad_norm": 1.825952914808632, + "language_loss": 0.70136178, + "learning_rate": 1.328135602550451e-07, + "loss": 0.72509658, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13818359, + "step": 14750, + "time_per_iteration": 2.859792470932007 + }, + { + "auxiliary_loss_clip": 0.01325197, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.21818542, + "balance_loss_mlp": 1.02202463, + "epoch": 0.8868781001052157, + "flos": 21835396762920.0, + "grad_norm": 1.9770560767703718, + "language_loss": 0.58930629, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61290562, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12713623, + "step": 14751, + "time_per_iteration": 2.7631287574768066 + }, + { + "auxiliary_loss_clip": 0.01324008, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.21689701, + "balance_loss_mlp": 1.01687634, + "epoch": 0.8869382233578836, + "flos": 13520361672000.0, + "grad_norm": 2.1469851677359273, + "language_loss": 0.81939292, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.84293246, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13049316, + "step": 14752, + "time_per_iteration": 2.7952167987823486 + }, + { + "auxiliary_loss_clip": 0.01338134, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.22462165, + "balance_loss_mlp": 1.01794755, + "epoch": 0.8869983466105517, + "flos": 22709368421640.0, + "grad_norm": 1.6332998236991443, + "language_loss": 0.8054359, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82913005, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13342285, + "step": 14753, + "time_per_iteration": 2.7888269424438477 + }, + { + "auxiliary_loss_clip": 0.01324512, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.21800256, + "balance_loss_mlp": 1.01817346, + "epoch": 0.8870584698632196, + "flos": 15344939610720.0, + "grad_norm": 1.6569665787490269, + "language_loss": 0.65260375, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67615384, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12335205, + "step": 14754, + "time_per_iteration": 4.237329483032227 + }, + { + "auxiliary_loss_clip": 0.01332857, + "auxiliary_loss_mlp": 0.01035367, + "balance_loss_clip": 1.22404528, + "balance_loss_mlp": 1.02273655, + "epoch": 0.8871185931158876, + "flos": 26621385170040.0, + "grad_norm": 2.264623777233914, + "language_loss": 0.74595237, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.7696346, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12628174, + "step": 14755, + "time_per_iteration": 2.8545725345611572 + }, + { + "auxiliary_loss_clip": 0.01329099, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.22059584, + "balance_loss_mlp": 1.02059793, + "epoch": 0.8871787163685555, + "flos": 21804104698560.0, + "grad_norm": 1.4960536452809237, + "language_loss": 0.77964973, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.80328393, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1373291, + "step": 14756, + "time_per_iteration": 4.243851900100708 + }, + { + "auxiliary_loss_clip": 0.0132962, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.22041023, + "balance_loss_mlp": 1.01954782, + "epoch": 0.8872388396212235, + "flos": 14907101005800.0, + "grad_norm": 1.9803075411660285, + "language_loss": 0.77098012, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.79460353, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13165283, + "step": 14757, + "time_per_iteration": 2.9274401664733887 + }, + { + "auxiliary_loss_clip": 0.01318181, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.21485066, + "balance_loss_mlp": 1.0193882, + "epoch": 0.8872989628738914, + "flos": 26437757855400.0, + "grad_norm": 1.7886811459836298, + "language_loss": 0.67723602, + "learning_rate": 1.316993656021632e-07, + "loss": 0.7007314, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.11962891, + "step": 14758, + "time_per_iteration": 2.830406427383423 + }, + { + "auxiliary_loss_clip": 0.01330441, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.22318816, + "balance_loss_mlp": 1.02161658, + "epoch": 0.8873590861265594, + "flos": 48151177440120.0, + "grad_norm": 1.5796743659539647, + "language_loss": 0.68890238, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71256196, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13897705, + "step": 14759, + "time_per_iteration": 4.5220489501953125 + }, + { + "auxiliary_loss_clip": 0.01322147, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.2154007, + "balance_loss_mlp": 1.01856804, + "epoch": 0.8874192093792275, + "flos": 18337601535120.0, + "grad_norm": 1.7913529146846414, + "language_loss": 0.74586064, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76938808, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12030029, + "step": 14760, + "time_per_iteration": 2.759256362915039 + }, + { + "auxiliary_loss_clip": 0.0133455, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.22417784, + "balance_loss_mlp": 1.01900017, + "epoch": 0.8874793326318954, + "flos": 17898666504480.0, + "grad_norm": 2.2271026438084354, + "language_loss": 0.76173913, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78541136, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13671875, + "step": 14761, + "time_per_iteration": 2.7578587532043457 + }, + { + "auxiliary_loss_clip": 0.01323841, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.21576548, + "balance_loss_mlp": 1.01976466, + "epoch": 0.8875394558845634, + "flos": 31108428855360.0, + "grad_norm": 1.6513608063398117, + "language_loss": 0.61524987, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63881826, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.13226318, + "step": 14762, + "time_per_iteration": 2.9268593788146973 + }, + { + "auxiliary_loss_clip": 0.01324148, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.21629333, + "balance_loss_mlp": 1.01841664, + "epoch": 0.8875995791372313, + "flos": 21147001620120.0, + "grad_norm": 1.7697877898870873, + "language_loss": 0.64157546, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66513747, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13635254, + "step": 14763, + "time_per_iteration": 2.8254575729370117 + }, + { + "auxiliary_loss_clip": 0.01328507, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.21977401, + "balance_loss_mlp": 1.01685798, + "epoch": 0.8876597023898993, + "flos": 17459365998600.0, + "grad_norm": 2.4274068930859487, + "language_loss": 0.71123505, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73482305, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13427734, + "step": 14764, + "time_per_iteration": 4.193055629730225 + }, + { + "auxiliary_loss_clip": 0.01332476, + "auxiliary_loss_mlp": 0.010335, + "balance_loss_clip": 1.2203021, + "balance_loss_mlp": 1.02064288, + "epoch": 0.8877198256425672, + "flos": 22712698307160.0, + "grad_norm": 2.864376212927636, + "language_loss": 0.66416705, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68782675, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12854004, + "step": 14765, + "time_per_iteration": 2.7122838497161865 + }, + { + "auxiliary_loss_clip": 0.01320688, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.21638656, + "balance_loss_mlp": 1.01897585, + "epoch": 0.8877799488952353, + "flos": 24540484306320.0, + "grad_norm": 1.7486806917075572, + "language_loss": 0.76255035, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78605992, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11291504, + "step": 14766, + "time_per_iteration": 2.7801055908203125 + }, + { + "auxiliary_loss_clip": 0.01321447, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.21680856, + "balance_loss_mlp": 1.0170027, + "epoch": 0.8878400721479032, + "flos": 20964064647600.0, + "grad_norm": 1.6915247103362256, + "language_loss": 0.7327081, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75621665, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12402344, + "step": 14767, + "time_per_iteration": 2.7563655376434326 + }, + { + "auxiliary_loss_clip": 0.01315779, + "auxiliary_loss_mlp": 0.01027639, + "balance_loss_clip": 1.21240973, + "balance_loss_mlp": 1.01615334, + "epoch": 0.8879001954005712, + "flos": 25299950725080.0, + "grad_norm": 1.6112680930007321, + "language_loss": 0.7102651, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73369926, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.11499023, + "step": 14768, + "time_per_iteration": 2.877516508102417 + }, + { + "auxiliary_loss_clip": 0.0132785, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.22133815, + "balance_loss_mlp": 1.01817977, + "epoch": 0.8879603186532391, + "flos": 23190559773840.0, + "grad_norm": 1.7650937636042567, + "language_loss": 0.70406032, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.7276479, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12744141, + "step": 14769, + "time_per_iteration": 2.7480149269104004 + }, + { + "auxiliary_loss_clip": 0.01321493, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.21629477, + "balance_loss_mlp": 1.01589465, + "epoch": 0.8880204419059071, + "flos": 13658483996280.0, + "grad_norm": 2.272612815993177, + "language_loss": 0.6764918, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69998544, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11975098, + "step": 14770, + "time_per_iteration": 2.717669725418091 + }, + { + "auxiliary_loss_clip": 0.01318004, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.21528316, + "balance_loss_mlp": 1.02158093, + "epoch": 0.888080565158575, + "flos": 20636832880080.0, + "grad_norm": 2.0919752729763865, + "language_loss": 0.65515327, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.6786741, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.125, + "step": 14771, + "time_per_iteration": 2.802760362625122 + }, + { + "auxiliary_loss_clip": 0.0132583, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.21914124, + "balance_loss_mlp": 1.0150404, + "epoch": 0.888140688411243, + "flos": 28625732629200.0, + "grad_norm": 1.3935493295004593, + "language_loss": 0.82493526, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84847391, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13012695, + "step": 14772, + "time_per_iteration": 2.820805311203003 + }, + { + "auxiliary_loss_clip": 0.01313856, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.2110976, + "balance_loss_mlp": 1.01680493, + "epoch": 0.8882008116639111, + "flos": 25525753144560.0, + "grad_norm": 1.4192101167352584, + "language_loss": 0.76614845, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78957182, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.11669922, + "step": 14773, + "time_per_iteration": 2.7887563705444336 + }, + { + "auxiliary_loss_clip": 0.0132091, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.21692955, + "balance_loss_mlp": 1.01581979, + "epoch": 0.888260934916579, + "flos": 27679593268800.0, + "grad_norm": 1.7204590846826944, + "language_loss": 0.75157964, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77506173, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.1149292, + "step": 14774, + "time_per_iteration": 2.7852368354797363 + }, + { + "auxiliary_loss_clip": 0.01329581, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.2206955, + "balance_loss_mlp": 1.02267671, + "epoch": 0.888321058169247, + "flos": 21615401338920.0, + "grad_norm": 3.9021374325505302, + "language_loss": 0.72299731, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.74664962, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12957764, + "step": 14775, + "time_per_iteration": 2.874760627746582 + }, + { + "auxiliary_loss_clip": 0.01321692, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.21597672, + "balance_loss_mlp": 1.01725006, + "epoch": 0.8883811814219149, + "flos": 18153527528520.0, + "grad_norm": 1.7082731043862873, + "language_loss": 0.80811751, + "learning_rate": 1.292090097299432e-07, + "loss": 0.83163488, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12799072, + "step": 14776, + "time_per_iteration": 2.72772216796875 + }, + { + "auxiliary_loss_clip": 0.01334886, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.22319651, + "balance_loss_mlp": 1.01898313, + "epoch": 0.8884413046745829, + "flos": 28329874142760.0, + "grad_norm": 2.418773692047083, + "language_loss": 0.69424605, + "learning_rate": 1.290713302796802e-07, + "loss": 0.7179153, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.1305542, + "step": 14777, + "time_per_iteration": 2.8700342178344727 + }, + { + "auxiliary_loss_clip": 0.01321715, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.21499872, + "balance_loss_mlp": 1.02424741, + "epoch": 0.8885014279272508, + "flos": 15162977238840.0, + "grad_norm": 1.6977198518655414, + "language_loss": 0.70549357, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72908139, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.1282959, + "step": 14778, + "time_per_iteration": 2.789577007293701 + }, + { + "auxiliary_loss_clip": 0.01326073, + "auxiliary_loss_mlp": 0.010301, + "balance_loss_clip": 1.21811509, + "balance_loss_mlp": 1.01794064, + "epoch": 0.8885615511799189, + "flos": 19104377458680.0, + "grad_norm": 1.7619889240902191, + "language_loss": 0.77755296, + "learning_rate": 1.287961842217804e-07, + "loss": 0.80111468, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12164307, + "step": 14779, + "time_per_iteration": 2.736132860183716 + }, + { + "auxiliary_loss_clip": 0.01141213, + "auxiliary_loss_mlp": 0.0100016, + "balance_loss_clip": 1.09816861, + "balance_loss_mlp": 0.99745387, + "epoch": 0.8886216744325868, + "flos": 51197628460320.0, + "grad_norm": 0.8749611033359073, + "language_loss": 0.56867635, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.59009016, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02709961, + "step": 14780, + "time_per_iteration": 3.0734028816223145 + }, + { + "auxiliary_loss_clip": 0.01141309, + "auxiliary_loss_mlp": 0.0100075, + "balance_loss_clip": 1.09848106, + "balance_loss_mlp": 0.99787694, + "epoch": 0.8886817976852548, + "flos": 61629056121600.0, + "grad_norm": 0.785302980243725, + "language_loss": 0.62452102, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64594162, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02868652, + "step": 14781, + "time_per_iteration": 3.326338052749634 + }, + { + "auxiliary_loss_clip": 0.01140121, + "auxiliary_loss_mlp": 0.01001572, + "balance_loss_clip": 1.09704494, + "balance_loss_mlp": 0.9988541, + "epoch": 0.8887419209379227, + "flos": 60660622771560.0, + "grad_norm": 0.8413268107207618, + "language_loss": 0.58174062, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60315758, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02722168, + "step": 14782, + "time_per_iteration": 3.0528833866119385 + }, + { + "auxiliary_loss_clip": 0.01321043, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.21649969, + "balance_loss_mlp": 1.01869512, + "epoch": 0.8888020441905907, + "flos": 29211926865120.0, + "grad_norm": 1.6211326525477185, + "language_loss": 0.65770543, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68122315, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.1204834, + "step": 14783, + "time_per_iteration": 2.8350775241851807 + }, + { + "auxiliary_loss_clip": 0.01332103, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.22247326, + "balance_loss_mlp": 1.01533425, + "epoch": 0.8888621674432586, + "flos": 22167663967080.0, + "grad_norm": 1.4936022165773344, + "language_loss": 0.77915221, + "learning_rate": 1.281095609023415e-07, + "loss": 0.8027609, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13433838, + "step": 14784, + "time_per_iteration": 2.8079512119293213 + }, + { + "auxiliary_loss_clip": 0.01327415, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.21906388, + "balance_loss_mlp": 1.01884079, + "epoch": 0.8889222906959267, + "flos": 27678618668160.0, + "grad_norm": 2.306436804496663, + "language_loss": 0.60897118, + "learning_rate": 1.279724491644565e-07, + "loss": 0.63256013, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12640381, + "step": 14785, + "time_per_iteration": 2.802284002304077 + }, + { + "auxiliary_loss_clip": 0.01327449, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.2206645, + "balance_loss_mlp": 1.01659226, + "epoch": 0.8889824139485947, + "flos": 14172063838560.0, + "grad_norm": 1.771417434934807, + "language_loss": 0.65012395, + "learning_rate": 1.278354084140445e-07, + "loss": 0.6736986, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.13427734, + "step": 14786, + "time_per_iteration": 2.691690444946289 + }, + { + "auxiliary_loss_clip": 0.01337256, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.22484863, + "balance_loss_mlp": 1.01811254, + "epoch": 0.8890425372012626, + "flos": 12855989697120.0, + "grad_norm": 2.218911681077272, + "language_loss": 0.85472339, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87841177, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13476562, + "step": 14787, + "time_per_iteration": 2.772167682647705 + }, + { + "auxiliary_loss_clip": 0.01323586, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.21595252, + "balance_loss_mlp": 1.01814783, + "epoch": 0.8891026604539306, + "flos": 21694310028360.0, + "grad_norm": 2.3832075135752393, + "language_loss": 0.71355999, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73709983, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12261963, + "step": 14788, + "time_per_iteration": 2.7142560482025146 + }, + { + "auxiliary_loss_clip": 0.01315357, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.21133542, + "balance_loss_mlp": 1.0143795, + "epoch": 0.8891627837065985, + "flos": 21876434833680.0, + "grad_norm": 1.6000415793064682, + "language_loss": 0.70008373, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72349823, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.11706543, + "step": 14789, + "time_per_iteration": 2.7779009342193604 + }, + { + "auxiliary_loss_clip": 0.01321846, + "auxiliary_loss_mlp": 0.01025221, + "balance_loss_clip": 1.21638131, + "balance_loss_mlp": 1.01317477, + "epoch": 0.8892229069592665, + "flos": 21585286917000.0, + "grad_norm": 1.465456843945107, + "language_loss": 0.7082333, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.731704, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.1204834, + "step": 14790, + "time_per_iteration": 2.717958927154541 + }, + { + "auxiliary_loss_clip": 0.01327392, + "auxiliary_loss_mlp": 0.0102564, + "balance_loss_clip": 1.22018194, + "balance_loss_mlp": 1.01367164, + "epoch": 0.8892830302119344, + "flos": 23081211795600.0, + "grad_norm": 1.6712722463780056, + "language_loss": 0.72774398, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75127435, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.11962891, + "step": 14791, + "time_per_iteration": 2.7297492027282715 + }, + { + "auxiliary_loss_clip": 0.01318607, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.214818, + "balance_loss_mlp": 1.0203774, + "epoch": 0.8893431534646025, + "flos": 23076866701080.0, + "grad_norm": 1.6247109677655087, + "language_loss": 0.74255323, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76606655, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.12347412, + "step": 14792, + "time_per_iteration": 4.339231967926025 + }, + { + "auxiliary_loss_clip": 0.01332786, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.22246873, + "balance_loss_mlp": 1.01768255, + "epoch": 0.8894032767172704, + "flos": 22460030134560.0, + "grad_norm": 1.9484599110413843, + "language_loss": 0.66574311, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68937969, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13195801, + "step": 14793, + "time_per_iteration": 2.695652961730957 + }, + { + "auxiliary_loss_clip": 0.01327233, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.21746421, + "balance_loss_mlp": 1.01798987, + "epoch": 0.8894633999699384, + "flos": 25343790772680.0, + "grad_norm": 1.657704099125937, + "language_loss": 0.71867263, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.74225771, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1328125, + "step": 14794, + "time_per_iteration": 2.8038554191589355 + }, + { + "auxiliary_loss_clip": 0.01336882, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.22449255, + "balance_loss_mlp": 1.01690388, + "epoch": 0.8895235232226063, + "flos": 20998767814200.0, + "grad_norm": 1.5013646165829035, + "language_loss": 0.75300753, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77667785, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13250732, + "step": 14795, + "time_per_iteration": 2.822307825088501 + }, + { + "auxiliary_loss_clip": 0.01142124, + "auxiliary_loss_mlp": 0.01004468, + "balance_loss_clip": 1.0987066, + "balance_loss_mlp": 1.00191677, + "epoch": 0.8895836464752743, + "flos": 69747956523000.0, + "grad_norm": 0.7709214501499652, + "language_loss": 0.56111014, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58257604, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.0255127, + "step": 14796, + "time_per_iteration": 6.305203199386597 + }, + { + "auxiliary_loss_clip": 0.01331495, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.22191167, + "balance_loss_mlp": 1.01778579, + "epoch": 0.8896437697279422, + "flos": 23227049708280.0, + "grad_norm": 1.774220347718084, + "language_loss": 0.7023679, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72600377, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14306641, + "step": 14797, + "time_per_iteration": 2.81613826751709 + }, + { + "auxiliary_loss_clip": 0.0114322, + "auxiliary_loss_mlp": 0.01002473, + "balance_loss_clip": 1.10054684, + "balance_loss_mlp": 0.99983835, + "epoch": 0.8897038929806103, + "flos": 70767603660960.0, + "grad_norm": 0.7561444883337195, + "language_loss": 0.58054745, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60200441, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 14798, + "time_per_iteration": 3.327530860900879 + }, + { + "auxiliary_loss_clip": 0.01327216, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.21950078, + "balance_loss_mlp": 1.0150075, + "epoch": 0.8897640162332782, + "flos": 19249849896120.0, + "grad_norm": 1.5671828833730708, + "language_loss": 0.79329526, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81684935, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1317749, + "step": 14799, + "time_per_iteration": 2.813843011856079 + }, + { + "auxiliary_loss_clip": 0.01143344, + "auxiliary_loss_mlp": 0.01002548, + "balance_loss_clip": 1.1005857, + "balance_loss_mlp": 0.99999678, + "epoch": 0.8898241394859462, + "flos": 41368777185240.0, + "grad_norm": 0.889448029049235, + "language_loss": 0.58148062, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60293955, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.0255127, + "step": 14800, + "time_per_iteration": 3.123106002807617 + }, + { + "auxiliary_loss_clip": 0.01324814, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.21730077, + "balance_loss_mlp": 1.018067, + "epoch": 0.8898842627386142, + "flos": 18990562560840.0, + "grad_norm": 1.532996594211493, + "language_loss": 0.6622324, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68578267, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12145996, + "step": 14801, + "time_per_iteration": 2.805025100708008 + }, + { + "auxiliary_loss_clip": 0.01330228, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.22077811, + "balance_loss_mlp": 1.02222204, + "epoch": 0.8899443859912821, + "flos": 13220117482680.0, + "grad_norm": 2.2058490132184803, + "language_loss": 0.75685024, + "learning_rate": 1.256524149358682e-07, + "loss": 0.78051454, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13970947, + "step": 14802, + "time_per_iteration": 2.7722995281219482 + }, + { + "auxiliary_loss_clip": 0.01318628, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.21408987, + "balance_loss_mlp": 1.019526, + "epoch": 0.8900045092439501, + "flos": 22679863125120.0, + "grad_norm": 1.8819794413299014, + "language_loss": 0.73686653, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.76037741, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1293335, + "step": 14803, + "time_per_iteration": 4.45866584777832 + }, + { + "auxiliary_loss_clip": 0.01324627, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.2182492, + "balance_loss_mlp": 1.01729584, + "epoch": 0.890064632496618, + "flos": 21146514319800.0, + "grad_norm": 1.8808267143931479, + "language_loss": 0.72598386, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.74953431, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13134766, + "step": 14804, + "time_per_iteration": 2.767772674560547 + }, + { + "auxiliary_loss_clip": 0.01327522, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.21960878, + "balance_loss_mlp": 1.02112794, + "epoch": 0.8901247557492861, + "flos": 23401377708480.0, + "grad_norm": 1.7605004749765327, + "language_loss": 0.81433928, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83795977, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13415527, + "step": 14805, + "time_per_iteration": 2.7216134071350098 + }, + { + "auxiliary_loss_clip": 0.01327311, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.21763098, + "balance_loss_mlp": 1.0169152, + "epoch": 0.890184879001954, + "flos": 29175680580840.0, + "grad_norm": 1.8208009315988627, + "language_loss": 0.67483121, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69841063, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13708496, + "step": 14806, + "time_per_iteration": 2.807373046875 + }, + { + "auxiliary_loss_clip": 0.0132486, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.21702266, + "balance_loss_mlp": 1.01966453, + "epoch": 0.890245002254622, + "flos": 14431797865800.0, + "grad_norm": 1.6733198426975275, + "language_loss": 0.67708457, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.70066357, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13378906, + "step": 14807, + "time_per_iteration": 2.6760246753692627 + }, + { + "auxiliary_loss_clip": 0.01324035, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.21799695, + "balance_loss_mlp": 1.01986372, + "epoch": 0.8903051255072899, + "flos": 22387253307480.0, + "grad_norm": 1.8309101333796145, + "language_loss": 0.75431931, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77788287, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12469482, + "step": 14808, + "time_per_iteration": 2.779465675354004 + }, + { + "auxiliary_loss_clip": 0.01324607, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.21792686, + "balance_loss_mlp": 1.0171113, + "epoch": 0.8903652487599579, + "flos": 20782305317520.0, + "grad_norm": 1.8281097428153252, + "language_loss": 0.81654632, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.84008151, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11816406, + "step": 14809, + "time_per_iteration": 2.75864315032959 + }, + { + "auxiliary_loss_clip": 0.01324466, + "auxiliary_loss_mlp": 0.01026181, + "balance_loss_clip": 1.2168225, + "balance_loss_mlp": 1.0140692, + "epoch": 0.8904253720126258, + "flos": 24429715035480.0, + "grad_norm": 1.6388142895049833, + "language_loss": 0.68837214, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.71187854, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12103271, + "step": 14810, + "time_per_iteration": 2.8038837909698486 + }, + { + "auxiliary_loss_clip": 0.01331943, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.22090209, + "balance_loss_mlp": 1.01543903, + "epoch": 0.8904854952652939, + "flos": 19468627069320.0, + "grad_norm": 2.79924628375956, + "language_loss": 0.70216548, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72577274, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13360596, + "step": 14811, + "time_per_iteration": 2.7893569469451904 + }, + { + "auxiliary_loss_clip": 0.01336145, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.22682881, + "balance_loss_mlp": 1.01898384, + "epoch": 0.8905456185179618, + "flos": 50809013833680.0, + "grad_norm": 1.8536878335986322, + "language_loss": 0.65875626, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.68243372, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.1262207, + "step": 14812, + "time_per_iteration": 3.0366618633270264 + }, + { + "auxiliary_loss_clip": 0.01323377, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.21710277, + "balance_loss_mlp": 1.02048588, + "epoch": 0.8906057417706298, + "flos": 17789318526240.0, + "grad_norm": 1.66583271361259, + "language_loss": 0.68838978, + "learning_rate": 1.24162160341861e-07, + "loss": 0.71195018, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12176514, + "step": 14813, + "time_per_iteration": 2.737992525100708 + }, + { + "auxiliary_loss_clip": 0.0134306, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.22896254, + "balance_loss_mlp": 1.02041698, + "epoch": 0.8906658650232978, + "flos": 21949942611240.0, + "grad_norm": 2.0367237395793936, + "language_loss": 0.75226152, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77604365, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.14727783, + "step": 14814, + "time_per_iteration": 2.7256789207458496 + }, + { + "auxiliary_loss_clip": 0.01334714, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.22400844, + "balance_loss_mlp": 1.01893914, + "epoch": 0.8907259882759657, + "flos": 21292758316080.0, + "grad_norm": 2.4944077141310523, + "language_loss": 0.74896228, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.77263474, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13592529, + "step": 14815, + "time_per_iteration": 2.952955961227417 + }, + { + "auxiliary_loss_clip": 0.01318231, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.21325445, + "balance_loss_mlp": 1.01985264, + "epoch": 0.8907861115286337, + "flos": 20125242847440.0, + "grad_norm": 1.9403099710930074, + "language_loss": 0.75275755, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77626359, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12524414, + "step": 14816, + "time_per_iteration": 2.760789155960083 + }, + { + "auxiliary_loss_clip": 0.0132669, + "auxiliary_loss_mlp": 0.01024794, + "balance_loss_clip": 1.21928549, + "balance_loss_mlp": 1.01246822, + "epoch": 0.8908462347813016, + "flos": 20088996563160.0, + "grad_norm": 1.8627235038832919, + "language_loss": 0.77664876, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.80016363, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12310791, + "step": 14817, + "time_per_iteration": 2.751101493835449 + }, + { + "auxiliary_loss_clip": 0.01141815, + "auxiliary_loss_mlp": 0.01004961, + "balance_loss_clip": 1.09871101, + "balance_loss_mlp": 1.00229061, + "epoch": 0.8909063580339697, + "flos": 65518494621840.0, + "grad_norm": 0.7630515030098878, + "language_loss": 0.56553829, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58700609, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.0267334, + "step": 14818, + "time_per_iteration": 3.296273708343506 + }, + { + "auxiliary_loss_clip": 0.01324731, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.2176553, + "balance_loss_mlp": 1.02395248, + "epoch": 0.8909664812866376, + "flos": 29869273593720.0, + "grad_norm": 2.0002381338187307, + "language_loss": 0.6441285, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66774035, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.125, + "step": 14819, + "time_per_iteration": 2.882826089859009 + }, + { + "auxiliary_loss_clip": 0.01325856, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.21731734, + "balance_loss_mlp": 1.01891708, + "epoch": 0.8910266045393056, + "flos": 25452610842240.0, + "grad_norm": 1.7643534930740108, + "language_loss": 0.79048133, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.81406415, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.13525391, + "step": 14820, + "time_per_iteration": 2.8773105144500732 + }, + { + "auxiliary_loss_clip": 0.01323437, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.21585274, + "balance_loss_mlp": 1.01851368, + "epoch": 0.8910867277919735, + "flos": 24504562888920.0, + "grad_norm": 1.8989959508229337, + "language_loss": 0.76274264, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78628492, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.1227417, + "step": 14821, + "time_per_iteration": 2.9085335731506348 + }, + { + "auxiliary_loss_clip": 0.0113991, + "auxiliary_loss_mlp": 0.00999212, + "balance_loss_clip": 1.09642327, + "balance_loss_mlp": 0.99643463, + "epoch": 0.8911468510446415, + "flos": 60702432401160.0, + "grad_norm": 0.7807925769524985, + "language_loss": 0.59304857, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61443979, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.02783203, + "step": 14822, + "time_per_iteration": 3.198141574859619 + }, + { + "auxiliary_loss_clip": 0.01327847, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.21957326, + "balance_loss_mlp": 1.01710176, + "epoch": 0.8912069742973094, + "flos": 25342937997120.0, + "grad_norm": 2.0319236184436207, + "language_loss": 0.69296527, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71654707, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13220215, + "step": 14823, + "time_per_iteration": 2.8510916233062744 + }, + { + "auxiliary_loss_clip": 0.01318566, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.21378875, + "balance_loss_mlp": 1.01808369, + "epoch": 0.8912670975499775, + "flos": 18228131731800.0, + "grad_norm": 1.7145183849080785, + "language_loss": 0.69401753, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71750778, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.1237793, + "step": 14824, + "time_per_iteration": 2.704954147338867 + }, + { + "auxiliary_loss_clip": 0.01331828, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.22110224, + "balance_loss_mlp": 1.01937819, + "epoch": 0.8913272208026454, + "flos": 26510006773800.0, + "grad_norm": 1.9089951467293151, + "language_loss": 0.70472521, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72837394, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13677979, + "step": 14825, + "time_per_iteration": 2.7924766540527344 + }, + { + "auxiliary_loss_clip": 0.01325693, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.22011781, + "balance_loss_mlp": 1.01706123, + "epoch": 0.8913873440553134, + "flos": 18806366729160.0, + "grad_norm": 1.8174983609120803, + "language_loss": 0.71737623, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.74093473, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.13104248, + "step": 14826, + "time_per_iteration": 2.75101375579834 + }, + { + "auxiliary_loss_clip": 0.01327301, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.22090507, + "balance_loss_mlp": 1.01531148, + "epoch": 0.8914474673079814, + "flos": 20889582269400.0, + "grad_norm": 2.063430848145267, + "language_loss": 0.75647849, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.78002906, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12426758, + "step": 14827, + "time_per_iteration": 2.9281744956970215 + }, + { + "auxiliary_loss_clip": 0.0132664, + "auxiliary_loss_mlp": 0.01025129, + "balance_loss_clip": 1.21899176, + "balance_loss_mlp": 1.01241541, + "epoch": 0.8915075905606493, + "flos": 20956430275920.0, + "grad_norm": 1.798144262777873, + "language_loss": 0.78551841, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80903614, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12719727, + "step": 14828, + "time_per_iteration": 2.7211573123931885 + }, + { + "auxiliary_loss_clip": 0.01320612, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.21423149, + "balance_loss_mlp": 1.01699114, + "epoch": 0.8915677138133173, + "flos": 23081577270840.0, + "grad_norm": 1.6583041912125396, + "language_loss": 0.75321853, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77672029, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12561035, + "step": 14829, + "time_per_iteration": 2.7508866786956787 + }, + { + "auxiliary_loss_clip": 0.01327862, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.21957767, + "balance_loss_mlp": 1.01827919, + "epoch": 0.8916278370659853, + "flos": 23445461406240.0, + "grad_norm": 1.5393889615460379, + "language_loss": 0.84749687, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.87108099, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12268066, + "step": 14830, + "time_per_iteration": 2.843562602996826 + }, + { + "auxiliary_loss_clip": 0.01319643, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.21549082, + "balance_loss_mlp": 1.01970863, + "epoch": 0.8916879603186533, + "flos": 25166620187280.0, + "grad_norm": 1.3758287641016356, + "language_loss": 0.74714649, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.77065611, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.11602783, + "step": 14831, + "time_per_iteration": 4.301207542419434 + }, + { + "auxiliary_loss_clip": 0.01332854, + "auxiliary_loss_mlp": 0.01024202, + "balance_loss_clip": 1.22203481, + "balance_loss_mlp": 1.01082063, + "epoch": 0.8917480835713212, + "flos": 20234672042400.0, + "grad_norm": 1.6539309126639137, + "language_loss": 0.73397893, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75754946, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13397217, + "step": 14832, + "time_per_iteration": 2.770679235458374 + }, + { + "auxiliary_loss_clip": 0.01329152, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.21983457, + "balance_loss_mlp": 1.01907611, + "epoch": 0.8918082068239892, + "flos": 26107236810720.0, + "grad_norm": 1.8997287435893992, + "language_loss": 0.67003989, + "learning_rate": 1.214746621848355e-07, + "loss": 0.69365156, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12957764, + "step": 14833, + "time_per_iteration": 2.865262508392334 + }, + { + "auxiliary_loss_clip": 0.01333515, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.22354436, + "balance_loss_mlp": 1.02032888, + "epoch": 0.8918683300766571, + "flos": 24837439218480.0, + "grad_norm": 1.5706813790933267, + "language_loss": 0.74013817, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76381451, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.13781738, + "step": 14834, + "time_per_iteration": 5.915663719177246 + }, + { + "auxiliary_loss_clip": 0.01325773, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.2182374, + "balance_loss_mlp": 1.023803, + "epoch": 0.8919284533293251, + "flos": 22309684693920.0, + "grad_norm": 2.027085214678612, + "language_loss": 0.79229522, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81591618, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12524414, + "step": 14835, + "time_per_iteration": 2.798912286758423 + }, + { + "auxiliary_loss_clip": 0.01317115, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.21208858, + "balance_loss_mlp": 1.01688409, + "epoch": 0.891988576581993, + "flos": 30379767200640.0, + "grad_norm": 1.9042225270936837, + "language_loss": 0.7411347, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76459825, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12347412, + "step": 14836, + "time_per_iteration": 2.928532600402832 + }, + { + "auxiliary_loss_clip": 0.01326846, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.2192595, + "balance_loss_mlp": 1.01666927, + "epoch": 0.8920486998346611, + "flos": 15556285454040.0, + "grad_norm": 2.2907441761434484, + "language_loss": 0.69013929, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.71369839, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1239624, + "step": 14837, + "time_per_iteration": 2.77144193649292 + }, + { + "auxiliary_loss_clip": 0.01333014, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.22079265, + "balance_loss_mlp": 1.0160594, + "epoch": 0.892108823087329, + "flos": 21219940880640.0, + "grad_norm": 1.708886069440914, + "language_loss": 0.68140483, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.70503092, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13537598, + "step": 14838, + "time_per_iteration": 3.0618791580200195 + }, + { + "auxiliary_loss_clip": 0.01324356, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.21625078, + "balance_loss_mlp": 1.0161891, + "epoch": 0.892168946339997, + "flos": 21983752393920.0, + "grad_norm": 1.8122660529333265, + "language_loss": 0.76297641, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78651494, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13299561, + "step": 14839, + "time_per_iteration": 2.943690776824951 + }, + { + "auxiliary_loss_clip": 0.01139969, + "auxiliary_loss_mlp": 0.01003054, + "balance_loss_clip": 1.09747386, + "balance_loss_mlp": 1.00041926, + "epoch": 0.892229069592665, + "flos": 67490575416000.0, + "grad_norm": 0.67768643360175, + "language_loss": 0.49496228, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51639247, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 14840, + "time_per_iteration": 3.241164445877075 + }, + { + "auxiliary_loss_clip": 0.01338589, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.22568631, + "balance_loss_mlp": 1.01678491, + "epoch": 0.8922891928453329, + "flos": 19463754066120.0, + "grad_norm": 2.213269048302415, + "language_loss": 0.64529514, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66899216, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14318848, + "step": 14841, + "time_per_iteration": 2.8261306285858154 + }, + { + "auxiliary_loss_clip": 0.0131985, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.21644688, + "balance_loss_mlp": 1.02600956, + "epoch": 0.8923493160980009, + "flos": 23372603362440.0, + "grad_norm": 1.476437420878721, + "language_loss": 0.68592227, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70949823, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.11737061, + "step": 14842, + "time_per_iteration": 4.340276002883911 + }, + { + "auxiliary_loss_clip": 0.01321129, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.21674013, + "balance_loss_mlp": 1.01702714, + "epoch": 0.8924094393506689, + "flos": 26182653181200.0, + "grad_norm": 1.8954013385779842, + "language_loss": 0.80391693, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.82741678, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.1184082, + "step": 14843, + "time_per_iteration": 2.7691471576690674 + }, + { + "auxiliary_loss_clip": 0.01332717, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.22176933, + "balance_loss_mlp": 1.01763153, + "epoch": 0.8924695626033369, + "flos": 22023694038960.0, + "grad_norm": 1.9347388070970597, + "language_loss": 0.69162488, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.71526295, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13476562, + "step": 14844, + "time_per_iteration": 2.8032641410827637 + }, + { + "auxiliary_loss_clip": 0.01327772, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.21935546, + "balance_loss_mlp": 1.01904988, + "epoch": 0.8925296858560048, + "flos": 14798118502800.0, + "grad_norm": 2.0122431527095577, + "language_loss": 0.91824782, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.94183981, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12384033, + "step": 14845, + "time_per_iteration": 2.76098370552063 + }, + { + "auxiliary_loss_clip": 0.01322196, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.21759677, + "balance_loss_mlp": 1.01344872, + "epoch": 0.8925898091086728, + "flos": 22351819190400.0, + "grad_norm": 1.9209318882238235, + "language_loss": 0.72974139, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.75321853, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1206665, + "step": 14846, + "time_per_iteration": 2.8251428604125977 + }, + { + "auxiliary_loss_clip": 0.01328081, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.21947229, + "balance_loss_mlp": 1.02002323, + "epoch": 0.8926499323613407, + "flos": 45814481560080.0, + "grad_norm": 1.7693437525906845, + "language_loss": 0.57215554, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59576303, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12640381, + "step": 14847, + "time_per_iteration": 2.9473612308502197 + }, + { + "auxiliary_loss_clip": 0.01327231, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.21899486, + "balance_loss_mlp": 1.02002633, + "epoch": 0.8927100556140087, + "flos": 22132108024920.0, + "grad_norm": 2.153531560113274, + "language_loss": 0.77126014, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.79485303, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12030029, + "step": 14848, + "time_per_iteration": 2.741849184036255 + }, + { + "auxiliary_loss_clip": 0.01318586, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.21283495, + "balance_loss_mlp": 1.01715708, + "epoch": 0.8927701788666766, + "flos": 28336087221840.0, + "grad_norm": 1.6673423839586021, + "language_loss": 0.69323915, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71671999, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12341309, + "step": 14849, + "time_per_iteration": 2.9597461223602295 + }, + { + "auxiliary_loss_clip": 0.01332411, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.22428179, + "balance_loss_mlp": 1.01878238, + "epoch": 0.8928303021193447, + "flos": 25299625858200.0, + "grad_norm": 1.5215240841176494, + "language_loss": 0.80836356, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83199537, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.11999512, + "step": 14850, + "time_per_iteration": 2.760108709335327 + }, + { + "auxiliary_loss_clip": 0.01321332, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.21680284, + "balance_loss_mlp": 1.0176121, + "epoch": 0.8928904253720126, + "flos": 22241821478400.0, + "grad_norm": 1.8212917647405598, + "language_loss": 0.74875158, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77226472, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12371826, + "step": 14851, + "time_per_iteration": 2.8006434440612793 + }, + { + "auxiliary_loss_clip": 0.0132171, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.21570182, + "balance_loss_mlp": 1.02043509, + "epoch": 0.8929505486246806, + "flos": 27098515686240.0, + "grad_norm": 1.5797254335459519, + "language_loss": 0.78858387, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.81213129, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.1260376, + "step": 14852, + "time_per_iteration": 2.8307409286499023 + }, + { + "auxiliary_loss_clip": 0.01317414, + "auxiliary_loss_mlp": 0.01026982, + "balance_loss_clip": 1.21302438, + "balance_loss_mlp": 1.01487041, + "epoch": 0.8930106718773486, + "flos": 23044559427720.0, + "grad_norm": 1.3585002002662836, + "language_loss": 0.69343704, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71688098, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12109375, + "step": 14853, + "time_per_iteration": 2.8246941566467285 + }, + { + "auxiliary_loss_clip": 0.01324864, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.21778584, + "balance_loss_mlp": 1.01938939, + "epoch": 0.8930707951300165, + "flos": 35633464984440.0, + "grad_norm": 1.5374547255251538, + "language_loss": 0.6754092, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69897628, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12463379, + "step": 14854, + "time_per_iteration": 2.8963139057159424 + }, + { + "auxiliary_loss_clip": 0.01317444, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.21358824, + "balance_loss_mlp": 1.01807404, + "epoch": 0.8931309183826845, + "flos": 23045168553120.0, + "grad_norm": 1.427996803917915, + "language_loss": 0.74741793, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.77089328, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.12023926, + "step": 14855, + "time_per_iteration": 2.8552963733673096 + }, + { + "auxiliary_loss_clip": 0.0132031, + "auxiliary_loss_mlp": 0.01025222, + "balance_loss_clip": 1.21511245, + "balance_loss_mlp": 1.0135572, + "epoch": 0.8931910416353525, + "flos": 26510169207240.0, + "grad_norm": 1.8857283346768299, + "language_loss": 0.64642358, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.6698789, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11657715, + "step": 14856, + "time_per_iteration": 2.856170177459717 + }, + { + "auxiliary_loss_clip": 0.01321211, + "auxiliary_loss_mlp": 0.01027617, + "balance_loss_clip": 1.21382999, + "balance_loss_mlp": 1.0148139, + "epoch": 0.8932511648880205, + "flos": 24979135078440.0, + "grad_norm": 1.4865788274180516, + "language_loss": 0.66496897, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68845719, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12823486, + "step": 14857, + "time_per_iteration": 2.905881881713867 + }, + { + "auxiliary_loss_clip": 0.01330717, + "auxiliary_loss_mlp": 0.01029536, + "balance_loss_clip": 1.22262478, + "balance_loss_mlp": 1.01687026, + "epoch": 0.8933112881406884, + "flos": 24465920711400.0, + "grad_norm": 2.1465948260138434, + "language_loss": 0.75461662, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77821922, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12652588, + "step": 14858, + "time_per_iteration": 2.8496673107147217 + }, + { + "auxiliary_loss_clip": 0.01326356, + "auxiliary_loss_mlp": 0.01028846, + "balance_loss_clip": 1.21910942, + "balance_loss_mlp": 1.01635933, + "epoch": 0.8933714113933564, + "flos": 28298947553640.0, + "grad_norm": 1.9131190713664181, + "language_loss": 0.69491291, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71846485, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12487793, + "step": 14859, + "time_per_iteration": 2.8372561931610107 + }, + { + "auxiliary_loss_clip": 0.01313417, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.21120691, + "balance_loss_mlp": 1.01529312, + "epoch": 0.8934315346460243, + "flos": 21439976913000.0, + "grad_norm": 1.618833420691861, + "language_loss": 0.75736856, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.78077757, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.12182617, + "step": 14860, + "time_per_iteration": 2.8438782691955566 + }, + { + "auxiliary_loss_clip": 0.01331118, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.22210836, + "balance_loss_mlp": 1.01895761, + "epoch": 0.8934916578986923, + "flos": 23774967241920.0, + "grad_norm": 2.07346719357954, + "language_loss": 0.57964909, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.60328615, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13635254, + "step": 14861, + "time_per_iteration": 2.8470683097839355 + }, + { + "auxiliary_loss_clip": 0.01320119, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.21522999, + "balance_loss_mlp": 1.01568985, + "epoch": 0.8935517811513602, + "flos": 18920790752400.0, + "grad_norm": 1.8053818085098887, + "language_loss": 0.63833356, + "learning_rate": 1.176284122190685e-07, + "loss": 0.66182023, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.12860107, + "step": 14862, + "time_per_iteration": 2.780184745788574 + }, + { + "auxiliary_loss_clip": 0.01318215, + "auxiliary_loss_mlp": 0.01026993, + "balance_loss_clip": 1.21356761, + "balance_loss_mlp": 1.01515603, + "epoch": 0.8936119044040283, + "flos": 24066764892360.0, + "grad_norm": 1.6419419058217204, + "language_loss": 0.78023362, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.80368572, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11846924, + "step": 14863, + "time_per_iteration": 2.804579973220825 + }, + { + "auxiliary_loss_clip": 0.01315141, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.21106207, + "balance_loss_mlp": 1.0179224, + "epoch": 0.8936720276566962, + "flos": 21329004600360.0, + "grad_norm": 1.6895310596926616, + "language_loss": 0.70774722, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.7311933, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.11560059, + "step": 14864, + "time_per_iteration": 2.756373405456543 + }, + { + "auxiliary_loss_clip": 0.01347858, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.23422432, + "balance_loss_mlp": 1.01811934, + "epoch": 0.8937321509093642, + "flos": 18410784445800.0, + "grad_norm": 2.6449305532454668, + "language_loss": 0.7642324, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.78802747, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13543701, + "step": 14865, + "time_per_iteration": 2.783977508544922 + }, + { + "auxiliary_loss_clip": 0.01320043, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.21530735, + "balance_loss_mlp": 1.01930261, + "epoch": 0.8937922741620322, + "flos": 22059899714880.0, + "grad_norm": 1.43338056772022, + "language_loss": 0.71849918, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.74201035, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11767578, + "step": 14866, + "time_per_iteration": 2.7776331901550293 + }, + { + "auxiliary_loss_clip": 0.01337952, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.22699749, + "balance_loss_mlp": 1.01774502, + "epoch": 0.8938523974147001, + "flos": 25669682464320.0, + "grad_norm": 1.7479326853624557, + "language_loss": 0.83939636, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86309636, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14300537, + "step": 14867, + "time_per_iteration": 2.811556339263916 + }, + { + "auxiliary_loss_clip": 0.01323775, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.21644938, + "balance_loss_mlp": 1.01972532, + "epoch": 0.8939125206673681, + "flos": 25748875412280.0, + "grad_norm": 1.5650540884977409, + "language_loss": 0.80932164, + "learning_rate": 1.168401272009567e-07, + "loss": 0.83287013, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.11346436, + "step": 14868, + "time_per_iteration": 2.851405382156372 + }, + { + "auxiliary_loss_clip": 0.01325949, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.21792269, + "balance_loss_mlp": 1.01861942, + "epoch": 0.8939726439200361, + "flos": 27350249866560.0, + "grad_norm": 1.5578547941744165, + "language_loss": 0.77328199, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79686713, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13952637, + "step": 14869, + "time_per_iteration": 2.8695850372314453 + }, + { + "auxiliary_loss_clip": 0.01320915, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.21490717, + "balance_loss_mlp": 1.01616907, + "epoch": 0.8940327671727041, + "flos": 20343613937040.0, + "grad_norm": 1.3800546885634568, + "language_loss": 0.65498745, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67848659, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12823486, + "step": 14870, + "time_per_iteration": 2.7447643280029297 + }, + { + "auxiliary_loss_clip": 0.01141428, + "auxiliary_loss_mlp": 0.01003935, + "balance_loss_clip": 1.09879947, + "balance_loss_mlp": 1.00107431, + "epoch": 0.894092890425372, + "flos": 58422554262720.0, + "grad_norm": 0.8013138543682947, + "language_loss": 0.56003433, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.58148795, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02856445, + "step": 14871, + "time_per_iteration": 4.74564790725708 + }, + { + "auxiliary_loss_clip": 0.01323196, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.21813226, + "balance_loss_mlp": 1.02136385, + "epoch": 0.89415301367804, + "flos": 19834582231080.0, + "grad_norm": 1.6788497735585914, + "language_loss": 0.76607549, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78964186, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1206665, + "step": 14872, + "time_per_iteration": 2.81660532951355 + }, + { + "auxiliary_loss_clip": 0.01320589, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.2170639, + "balance_loss_mlp": 1.01658785, + "epoch": 0.8942131369307079, + "flos": 25781345119080.0, + "grad_norm": 1.5636157757931006, + "language_loss": 0.67278618, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69627106, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11297607, + "step": 14873, + "time_per_iteration": 5.807668447494507 + }, + { + "auxiliary_loss_clip": 0.01322003, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.21661305, + "balance_loss_mlp": 1.02190948, + "epoch": 0.8942732601833759, + "flos": 23154272881200.0, + "grad_norm": 1.4567556135037294, + "language_loss": 0.5934633, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61703295, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.13061523, + "step": 14874, + "time_per_iteration": 2.807157516479492 + }, + { + "auxiliary_loss_clip": 0.01330487, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.22166348, + "balance_loss_mlp": 1.01876211, + "epoch": 0.8943333834360438, + "flos": 27861393207240.0, + "grad_norm": 1.7187090308331785, + "language_loss": 0.75975788, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.78337926, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12902832, + "step": 14875, + "time_per_iteration": 2.8267903327941895 + }, + { + "auxiliary_loss_clip": 0.013406, + "auxiliary_loss_mlp": 0.01032497, + "balance_loss_clip": 1.22799063, + "balance_loss_mlp": 1.01750612, + "epoch": 0.8943935066887119, + "flos": 22169085259680.0, + "grad_norm": 1.8913764642997168, + "language_loss": 0.77531689, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79904783, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.15002441, + "step": 14876, + "time_per_iteration": 2.816660165786743 + }, + { + "auxiliary_loss_clip": 0.01324968, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.21920276, + "balance_loss_mlp": 1.01568735, + "epoch": 0.8944536299413798, + "flos": 21475004946480.0, + "grad_norm": 1.6123689082470454, + "language_loss": 0.79177761, + "learning_rate": 1.156625201573287e-07, + "loss": 0.81530547, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.121521, + "step": 14877, + "time_per_iteration": 2.8124141693115234 + }, + { + "auxiliary_loss_clip": 0.01324742, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.21699834, + "balance_loss_mlp": 1.0165, + "epoch": 0.8945137531940478, + "flos": 17753275283760.0, + "grad_norm": 2.2712517339011096, + "language_loss": 0.75316775, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77670836, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12823486, + "step": 14878, + "time_per_iteration": 2.773348093032837 + }, + { + "auxiliary_loss_clip": 0.01331413, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.22192931, + "balance_loss_mlp": 1.01471806, + "epoch": 0.8945738764467158, + "flos": 21147245270280.0, + "grad_norm": 1.5715765397693013, + "language_loss": 0.76168364, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78527963, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13470459, + "step": 14879, + "time_per_iteration": 2.966019868850708 + }, + { + "auxiliary_loss_clip": 0.01330148, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.22325063, + "balance_loss_mlp": 1.01715863, + "epoch": 0.8946339996993837, + "flos": 14907141614160.0, + "grad_norm": 1.8159769345359953, + "language_loss": 0.75002527, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.77362496, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12652588, + "step": 14880, + "time_per_iteration": 4.40045952796936 + }, + { + "auxiliary_loss_clip": 0.01323761, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.21695209, + "balance_loss_mlp": 1.01650691, + "epoch": 0.8946941229520518, + "flos": 27388079876880.0, + "grad_norm": 1.4845646738100684, + "language_loss": 0.83001268, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.85354269, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12744141, + "step": 14881, + "time_per_iteration": 2.760688066482544 + }, + { + "auxiliary_loss_clip": 0.01322184, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.21725166, + "balance_loss_mlp": 1.01731277, + "epoch": 0.8947542462047197, + "flos": 31802671602000.0, + "grad_norm": 1.527967826540328, + "language_loss": 0.68155664, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.70507753, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12579346, + "step": 14882, + "time_per_iteration": 3.0202722549438477 + }, + { + "auxiliary_loss_clip": 0.01335177, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.22385716, + "balance_loss_mlp": 1.01271129, + "epoch": 0.8948143694573877, + "flos": 20888120368440.0, + "grad_norm": 2.0268909295749737, + "language_loss": 0.75654793, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.78016657, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.13983154, + "step": 14883, + "time_per_iteration": 2.69075870513916 + }, + { + "auxiliary_loss_clip": 0.01316803, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.21295369, + "balance_loss_mlp": 1.01876199, + "epoch": 0.8948744927100556, + "flos": 28220810423040.0, + "grad_norm": 1.5307826657509056, + "language_loss": 0.72568631, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74916577, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.12384033, + "step": 14884, + "time_per_iteration": 2.9196252822875977 + }, + { + "auxiliary_loss_clip": 0.0131892, + "auxiliary_loss_mlp": 0.01023611, + "balance_loss_clip": 1.21436381, + "balance_loss_mlp": 1.01172626, + "epoch": 0.8949346159627236, + "flos": 21906995947560.0, + "grad_norm": 1.6880648144635442, + "language_loss": 0.75737762, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.78080297, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.11889648, + "step": 14885, + "time_per_iteration": 2.701566696166992 + }, + { + "auxiliary_loss_clip": 0.01338061, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.22657239, + "balance_loss_mlp": 1.01838708, + "epoch": 0.8949947392153915, + "flos": 21363870200400.0, + "grad_norm": 2.400575290421205, + "language_loss": 0.82159495, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.84529316, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13360596, + "step": 14886, + "time_per_iteration": 2.747898578643799 + }, + { + "auxiliary_loss_clip": 0.0132201, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.21493316, + "balance_loss_mlp": 1.01406109, + "epoch": 0.8950548624680595, + "flos": 52454187727200.0, + "grad_norm": 1.48715980207194, + "language_loss": 0.63779968, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.66128337, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12298584, + "step": 14887, + "time_per_iteration": 2.976731538772583 + }, + { + "auxiliary_loss_clip": 0.0133418, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.22366798, + "balance_loss_mlp": 1.02336907, + "epoch": 0.8951149857207275, + "flos": 20126258056440.0, + "grad_norm": 2.136723172704931, + "language_loss": 0.61325121, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63695246, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12573242, + "step": 14888, + "time_per_iteration": 2.734819173812866 + }, + { + "auxiliary_loss_clip": 0.01331281, + "auxiliary_loss_mlp": 0.01024803, + "balance_loss_clip": 1.2221998, + "balance_loss_mlp": 1.01227391, + "epoch": 0.8951751089733955, + "flos": 29868786293400.0, + "grad_norm": 1.8611765402878255, + "language_loss": 0.70574421, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.72930503, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12524414, + "step": 14889, + "time_per_iteration": 2.8082668781280518 + }, + { + "auxiliary_loss_clip": 0.01323425, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.21559215, + "balance_loss_mlp": 1.01305652, + "epoch": 0.8952352322260634, + "flos": 15266315179800.0, + "grad_norm": 2.3569333974529605, + "language_loss": 0.70876813, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73226553, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13269043, + "step": 14890, + "time_per_iteration": 2.823275089263916 + }, + { + "auxiliary_loss_clip": 0.01329725, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.22142518, + "balance_loss_mlp": 1.01622796, + "epoch": 0.8952953554787314, + "flos": 26804159709120.0, + "grad_norm": 1.53872099264154, + "language_loss": 0.75820535, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.78179359, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12878418, + "step": 14891, + "time_per_iteration": 2.848987102508545 + }, + { + "auxiliary_loss_clip": 0.01330688, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.21893644, + "balance_loss_mlp": 1.02062917, + "epoch": 0.8953554787313994, + "flos": 14141299682880.0, + "grad_norm": 3.9579003751284376, + "language_loss": 0.76887518, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.7925154, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12695312, + "step": 14892, + "time_per_iteration": 2.8636586666107178 + }, + { + "auxiliary_loss_clip": 0.01327458, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.21994269, + "balance_loss_mlp": 1.01906431, + "epoch": 0.8954156019840673, + "flos": 25708933767240.0, + "grad_norm": 1.6314335287844133, + "language_loss": 0.81699294, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.84058088, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12268066, + "step": 14893, + "time_per_iteration": 2.8505773544311523 + }, + { + "auxiliary_loss_clip": 0.01320272, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.21527839, + "balance_loss_mlp": 1.01816797, + "epoch": 0.8954757252367354, + "flos": 21913005984840.0, + "grad_norm": 1.7945488328984918, + "language_loss": 0.75434047, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.77784348, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11865234, + "step": 14894, + "time_per_iteration": 2.7870895862579346 + }, + { + "auxiliary_loss_clip": 0.01331999, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.22254252, + "balance_loss_mlp": 1.01824641, + "epoch": 0.8955358484894033, + "flos": 12974433948000.0, + "grad_norm": 1.6122045423507363, + "language_loss": 0.67147845, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.6951164, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13537598, + "step": 14895, + "time_per_iteration": 2.7946550846099854 + }, + { + "auxiliary_loss_clip": 0.01333345, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.22229028, + "balance_loss_mlp": 1.01558137, + "epoch": 0.8955959717420713, + "flos": 17278337619000.0, + "grad_norm": 1.6112601868913385, + "language_loss": 0.6736812, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.6973058, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13549805, + "step": 14896, + "time_per_iteration": 2.8863298892974854 + }, + { + "auxiliary_loss_clip": 0.01324846, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.21841264, + "balance_loss_mlp": 1.0138787, + "epoch": 0.8956560949947392, + "flos": 14798037286080.0, + "grad_norm": 2.0098340338021576, + "language_loss": 0.75961125, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.78312534, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12689209, + "step": 14897, + "time_per_iteration": 2.7694284915924072 + }, + { + "auxiliary_loss_clip": 0.01142787, + "auxiliary_loss_mlp": 0.01006409, + "balance_loss_clip": 1.10016906, + "balance_loss_mlp": 1.00384569, + "epoch": 0.8957162182474072, + "flos": 63622601757000.0, + "grad_norm": 0.7503715652529526, + "language_loss": 0.55345041, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57494235, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02563477, + "step": 14898, + "time_per_iteration": 3.2890591621398926 + }, + { + "auxiliary_loss_clip": 0.01327944, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.22031426, + "balance_loss_mlp": 1.02045548, + "epoch": 0.8957763415000751, + "flos": 25015340754360.0, + "grad_norm": 1.4811323969106442, + "language_loss": 0.70635986, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72997284, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12896729, + "step": 14899, + "time_per_iteration": 2.7626564502716064 + }, + { + "auxiliary_loss_clip": 0.01332036, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.22311211, + "balance_loss_mlp": 1.01690364, + "epoch": 0.8958364647527431, + "flos": 19833404588640.0, + "grad_norm": 1.6367687970132694, + "language_loss": 0.73558939, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.75921798, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13922119, + "step": 14900, + "time_per_iteration": 2.7403507232666016 + }, + { + "auxiliary_loss_clip": 0.01141788, + "auxiliary_loss_mlp": 0.01006995, + "balance_loss_clip": 1.09926474, + "balance_loss_mlp": 1.00409842, + "epoch": 0.895896588005411, + "flos": 65551654670760.0, + "grad_norm": 0.819629733445513, + "language_loss": 0.61946183, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.64094967, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02893066, + "step": 14901, + "time_per_iteration": 3.2021138668060303 + }, + { + "auxiliary_loss_clip": 0.01327051, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.21733737, + "balance_loss_mlp": 1.01347375, + "epoch": 0.8959567112580791, + "flos": 25596255903480.0, + "grad_norm": 1.725252478417675, + "language_loss": 0.70625967, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72979212, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12731934, + "step": 14902, + "time_per_iteration": 2.7946016788482666 + }, + { + "auxiliary_loss_clip": 0.01320245, + "auxiliary_loss_mlp": 0.01028078, + "balance_loss_clip": 1.21556079, + "balance_loss_mlp": 1.01608562, + "epoch": 0.896016834510747, + "flos": 24206470942680.0, + "grad_norm": 1.6728834203096248, + "language_loss": 0.78728426, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.81076753, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12005615, + "step": 14903, + "time_per_iteration": 2.8159680366516113 + }, + { + "auxiliary_loss_clip": 0.01330622, + "auxiliary_loss_mlp": 0.01026572, + "balance_loss_clip": 1.21950054, + "balance_loss_mlp": 1.01234448, + "epoch": 0.896076957763415, + "flos": 23081171187240.0, + "grad_norm": 1.6036678369586475, + "language_loss": 0.73006642, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75363833, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14221191, + "step": 14904, + "time_per_iteration": 2.8588881492614746 + }, + { + "auxiliary_loss_clip": 0.01329223, + "auxiliary_loss_mlp": 0.01027971, + "balance_loss_clip": 1.21955144, + "balance_loss_mlp": 1.01330853, + "epoch": 0.8961370810160829, + "flos": 22241699653320.0, + "grad_norm": 2.168735888453677, + "language_loss": 0.7516824, + "learning_rate": 1.12035883275166e-07, + "loss": 0.77525437, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.14672852, + "step": 14905, + "time_per_iteration": 2.8014328479766846 + }, + { + "auxiliary_loss_clip": 0.01320651, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.2148968, + "balance_loss_mlp": 1.01675558, + "epoch": 0.8961972042687509, + "flos": 23077069742880.0, + "grad_norm": 1.7453612986004867, + "language_loss": 0.76888847, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.7923885, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.1260376, + "step": 14906, + "time_per_iteration": 2.7311007976531982 + }, + { + "auxiliary_loss_clip": 0.01326562, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.21781909, + "balance_loss_mlp": 1.01941776, + "epoch": 0.896257327521419, + "flos": 18190342329840.0, + "grad_norm": 1.612906880254614, + "language_loss": 0.74175131, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76533842, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12731934, + "step": 14907, + "time_per_iteration": 2.725860118865967 + }, + { + "auxiliary_loss_clip": 0.01323644, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.21821892, + "balance_loss_mlp": 1.02265668, + "epoch": 0.8963174507740869, + "flos": 17900087797080.0, + "grad_norm": 1.8146239181450217, + "language_loss": 0.82686734, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85045522, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12493896, + "step": 14908, + "time_per_iteration": 2.8784313201904297 + }, + { + "auxiliary_loss_clip": 0.01328398, + "auxiliary_loss_mlp": 0.01031285, + "balance_loss_clip": 1.21854663, + "balance_loss_mlp": 1.01761162, + "epoch": 0.8963775740267549, + "flos": 21037166341560.0, + "grad_norm": 1.6377746240983269, + "language_loss": 0.71277821, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.73637497, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13677979, + "step": 14909, + "time_per_iteration": 2.8548378944396973 + }, + { + "auxiliary_loss_clip": 0.01330307, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.22230875, + "balance_loss_mlp": 1.01911533, + "epoch": 0.8964376972794228, + "flos": 23184103044600.0, + "grad_norm": 3.5780549876600425, + "language_loss": 0.72721112, + "learning_rate": 1.113941727737877e-07, + "loss": 0.75083995, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13452148, + "step": 14910, + "time_per_iteration": 4.253382205963135 + }, + { + "auxiliary_loss_clip": 0.01325773, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.21883833, + "balance_loss_mlp": 1.01779234, + "epoch": 0.8964978205320908, + "flos": 24978444736320.0, + "grad_norm": 1.9191214042733702, + "language_loss": 0.63277221, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65633166, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.1239624, + "step": 14911, + "time_per_iteration": 4.438300132751465 + }, + { + "auxiliary_loss_clip": 0.01329768, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.22137809, + "balance_loss_mlp": 1.02158201, + "epoch": 0.8965579437847587, + "flos": 19176748202160.0, + "grad_norm": 1.8766371233578076, + "language_loss": 0.75427175, + "learning_rate": 1.111379898520437e-07, + "loss": 0.77791512, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13000488, + "step": 14912, + "time_per_iteration": 4.256327152252197 + }, + { + "auxiliary_loss_clip": 0.01322339, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.21516073, + "balance_loss_mlp": 1.0201838, + "epoch": 0.8966180670374267, + "flos": 24281643663000.0, + "grad_norm": 1.8750542272127577, + "language_loss": 0.82133794, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84488904, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12579346, + "step": 14913, + "time_per_iteration": 2.8181445598602295 + }, + { + "auxiliary_loss_clip": 0.01329343, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.21946812, + "balance_loss_mlp": 1.02043176, + "epoch": 0.8966781902900947, + "flos": 13557176473320.0, + "grad_norm": 2.085957670146128, + "language_loss": 0.61158705, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63522422, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1395874, + "step": 14914, + "time_per_iteration": 2.7923712730407715 + }, + { + "auxiliary_loss_clip": 0.01141685, + "auxiliary_loss_mlp": 0.01007275, + "balance_loss_clip": 1.09907317, + "balance_loss_mlp": 1.00468779, + "epoch": 0.8967383135427627, + "flos": 65080940275440.0, + "grad_norm": 0.72058133219072, + "language_loss": 0.55112481, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57261443, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02587891, + "step": 14915, + "time_per_iteration": 3.3580880165100098 + }, + { + "auxiliary_loss_clip": 0.01319119, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.21361387, + "balance_loss_mlp": 1.01966906, + "epoch": 0.8967984367954306, + "flos": 29718684502920.0, + "grad_norm": 1.4207816804607754, + "language_loss": 0.71855056, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.74205518, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.11669922, + "step": 14916, + "time_per_iteration": 2.9435253143310547 + }, + { + "auxiliary_loss_clip": 0.01328367, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.22127903, + "balance_loss_mlp": 1.02168119, + "epoch": 0.8968585600480986, + "flos": 25707999774960.0, + "grad_norm": 1.6313488391487383, + "language_loss": 0.78161502, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80523616, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12072754, + "step": 14917, + "time_per_iteration": 2.7675445079803467 + }, + { + "auxiliary_loss_clip": 0.01333954, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.22272837, + "balance_loss_mlp": 1.02821052, + "epoch": 0.8969186833007665, + "flos": 30050829882000.0, + "grad_norm": 2.749476417337264, + "language_loss": 0.68896431, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.71271861, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13250732, + "step": 14918, + "time_per_iteration": 2.792513132095337 + }, + { + "auxiliary_loss_clip": 0.0133114, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.22180665, + "balance_loss_mlp": 1.01642084, + "epoch": 0.8969788065534345, + "flos": 22823020886040.0, + "grad_norm": 1.6083416141070683, + "language_loss": 0.83369744, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85729474, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12176514, + "step": 14919, + "time_per_iteration": 4.375135660171509 + }, + { + "auxiliary_loss_clip": 0.01331416, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.22182822, + "balance_loss_mlp": 1.01499414, + "epoch": 0.8970389298061026, + "flos": 13265460039600.0, + "grad_norm": 2.4193779222489287, + "language_loss": 0.72334898, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74694431, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13116455, + "step": 14920, + "time_per_iteration": 2.7596545219421387 + }, + { + "auxiliary_loss_clip": 0.01327022, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.21896791, + "balance_loss_mlp": 1.02291048, + "epoch": 0.8970990530587705, + "flos": 10269183971160.0, + "grad_norm": 2.3672466768948954, + "language_loss": 0.91541922, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93905652, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13775635, + "step": 14921, + "time_per_iteration": 2.6994214057922363 + }, + { + "auxiliary_loss_clip": 0.01334974, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.22343087, + "balance_loss_mlp": 1.01565218, + "epoch": 0.8971591763114385, + "flos": 20307530086200.0, + "grad_norm": 1.7878318433972105, + "language_loss": 0.73810983, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.76175135, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13531494, + "step": 14922, + "time_per_iteration": 2.7625339031219482 + }, + { + "auxiliary_loss_clip": 0.01322795, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.21622717, + "balance_loss_mlp": 1.01551056, + "epoch": 0.8972192995641064, + "flos": 23262118350120.0, + "grad_norm": 1.5691463178394762, + "language_loss": 0.70721877, + "learning_rate": 1.097341060694219e-07, + "loss": 0.73073065, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12878418, + "step": 14923, + "time_per_iteration": 2.739698886871338 + }, + { + "auxiliary_loss_clip": 0.0132968, + "auxiliary_loss_mlp": 0.01025476, + "balance_loss_clip": 1.21984816, + "balance_loss_mlp": 1.01257777, + "epoch": 0.8972794228167744, + "flos": 18374253903000.0, + "grad_norm": 2.5091609515978455, + "language_loss": 0.71337038, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.73692191, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12902832, + "step": 14924, + "time_per_iteration": 2.78200364112854 + }, + { + "auxiliary_loss_clip": 0.01326303, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.21866441, + "balance_loss_mlp": 1.01791763, + "epoch": 0.8973395460694423, + "flos": 23957457522480.0, + "grad_norm": 1.5813032240738418, + "language_loss": 0.7214815, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74504489, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12109375, + "step": 14925, + "time_per_iteration": 2.757676839828491 + }, + { + "auxiliary_loss_clip": 0.01330606, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.22196937, + "balance_loss_mlp": 1.01519799, + "epoch": 0.8973996693221103, + "flos": 24976008234720.0, + "grad_norm": 1.6432218413931496, + "language_loss": 0.82945848, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.85305452, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13812256, + "step": 14926, + "time_per_iteration": 2.827030897140503 + }, + { + "auxiliary_loss_clip": 0.01330479, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.22289062, + "balance_loss_mlp": 1.0198698, + "epoch": 0.8974597925747783, + "flos": 25744449101040.0, + "grad_norm": 1.6980137413221534, + "language_loss": 0.79276454, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81638753, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.11950684, + "step": 14927, + "time_per_iteration": 2.9585530757904053 + }, + { + "auxiliary_loss_clip": 0.01323114, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.21639693, + "balance_loss_mlp": 1.0171926, + "epoch": 0.8975199158274463, + "flos": 38079143367480.0, + "grad_norm": 1.4794312925946085, + "language_loss": 0.66713607, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.69066072, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12158203, + "step": 14928, + "time_per_iteration": 2.9373679161071777 + }, + { + "auxiliary_loss_clip": 0.01327221, + "auxiliary_loss_mlp": 0.01042962, + "balance_loss_clip": 1.21658182, + "balance_loss_mlp": 1.02782786, + "epoch": 0.8975800390801142, + "flos": 25417298550240.0, + "grad_norm": 1.693401413576974, + "language_loss": 0.71270907, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.73641086, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.15130615, + "step": 14929, + "time_per_iteration": 2.8262853622436523 + }, + { + "auxiliary_loss_clip": 0.01328452, + "auxiliary_loss_mlp": 0.01027848, + "balance_loss_clip": 1.22053719, + "balance_loss_mlp": 1.01570082, + "epoch": 0.8976401623327822, + "flos": 21764284878600.0, + "grad_norm": 1.6824789839424574, + "language_loss": 0.67731047, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.70087343, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.121521, + "step": 14930, + "time_per_iteration": 2.8359169960021973 + }, + { + "auxiliary_loss_clip": 0.01322237, + "auxiliary_loss_mlp": 0.01030018, + "balance_loss_clip": 1.21624386, + "balance_loss_mlp": 1.01706564, + "epoch": 0.8977002855854501, + "flos": 13849380207360.0, + "grad_norm": 1.8257542247826333, + "language_loss": 0.75424314, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77776569, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12945557, + "step": 14931, + "time_per_iteration": 2.9460301399230957 + }, + { + "auxiliary_loss_clip": 0.01324317, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.21772099, + "balance_loss_mlp": 1.01735997, + "epoch": 0.8977604088381181, + "flos": 19432299568320.0, + "grad_norm": 1.5381030875866766, + "language_loss": 0.63135934, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65489471, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.11865234, + "step": 14932, + "time_per_iteration": 2.7229056358337402 + }, + { + "auxiliary_loss_clip": 0.0131325, + "auxiliary_loss_mlp": 0.01031009, + "balance_loss_clip": 1.21140325, + "balance_loss_mlp": 1.01930916, + "epoch": 0.8978205320907862, + "flos": 22746548698200.0, + "grad_norm": 1.7196304304533236, + "language_loss": 0.71735001, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.74079257, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.11700439, + "step": 14933, + "time_per_iteration": 2.8013830184936523 + }, + { + "auxiliary_loss_clip": 0.01332915, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.22219992, + "balance_loss_mlp": 1.02221358, + "epoch": 0.8978806553434541, + "flos": 21365453926440.0, + "grad_norm": 1.4975802522022534, + "language_loss": 0.74710059, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.77078795, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13616943, + "step": 14934, + "time_per_iteration": 2.78642201423645 + }, + { + "auxiliary_loss_clip": 0.01318884, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.21254885, + "balance_loss_mlp": 1.02331996, + "epoch": 0.8979407785961221, + "flos": 20929239655920.0, + "grad_norm": 1.8742440323363165, + "language_loss": 0.60544658, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62899697, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12817383, + "step": 14935, + "time_per_iteration": 2.805391788482666 + }, + { + "auxiliary_loss_clip": 0.01321216, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.21621823, + "balance_loss_mlp": 1.0184319, + "epoch": 0.89800090184879, + "flos": 25234483402800.0, + "grad_norm": 1.8275431917260607, + "language_loss": 0.76529938, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78882241, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12658691, + "step": 14936, + "time_per_iteration": 2.8386917114257812 + }, + { + "auxiliary_loss_clip": 0.01324625, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.2184993, + "balance_loss_mlp": 1.02116966, + "epoch": 0.898061025101458, + "flos": 22567510128240.0, + "grad_norm": 1.5822888090945781, + "language_loss": 0.74060166, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76418495, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12524414, + "step": 14937, + "time_per_iteration": 2.811342716217041 + }, + { + "auxiliary_loss_clip": 0.01142315, + "auxiliary_loss_mlp": 0.01005059, + "balance_loss_clip": 1.09925842, + "balance_loss_mlp": 1.00234151, + "epoch": 0.8981211483541259, + "flos": 56205236625840.0, + "grad_norm": 1.0685407099230682, + "language_loss": 0.63478518, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65625894, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02722168, + "step": 14938, + "time_per_iteration": 3.126317262649536 + }, + { + "auxiliary_loss_clip": 0.01317006, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.21288323, + "balance_loss_mlp": 1.01576316, + "epoch": 0.898181271606794, + "flos": 16396244288280.0, + "grad_norm": 2.62374156336293, + "language_loss": 0.8053658, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82882202, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12854004, + "step": 14939, + "time_per_iteration": 2.7417361736297607 + }, + { + "auxiliary_loss_clip": 0.01141584, + "auxiliary_loss_mlp": 0.01008727, + "balance_loss_clip": 1.0991708, + "balance_loss_mlp": 1.00599694, + "epoch": 0.8982413948594619, + "flos": 63456963945840.0, + "grad_norm": 0.7181105143013784, + "language_loss": 0.529567, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.55107009, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02734375, + "step": 14940, + "time_per_iteration": 3.3851492404937744 + }, + { + "auxiliary_loss_clip": 0.0132416, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.21583343, + "balance_loss_mlp": 1.01504743, + "epoch": 0.8983015181121299, + "flos": 21840391591200.0, + "grad_norm": 1.741561535814165, + "language_loss": 0.77779084, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80132037, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13745117, + "step": 14941, + "time_per_iteration": 2.7624049186706543 + }, + { + "auxiliary_loss_clip": 0.01331422, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.2236079, + "balance_loss_mlp": 1.01658762, + "epoch": 0.8983616413647978, + "flos": 28955928807000.0, + "grad_norm": 2.005572826900437, + "language_loss": 0.7324307, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75604129, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13049316, + "step": 14942, + "time_per_iteration": 2.8625316619873047 + }, + { + "auxiliary_loss_clip": 0.01329046, + "auxiliary_loss_mlp": 0.0103639, + "balance_loss_clip": 1.22051787, + "balance_loss_mlp": 1.02324772, + "epoch": 0.8984217646174658, + "flos": 17789562176400.0, + "grad_norm": 2.6879424248926145, + "language_loss": 0.80421627, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.82787061, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13134766, + "step": 14943, + "time_per_iteration": 2.8307087421417236 + }, + { + "auxiliary_loss_clip": 0.01328042, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.21900988, + "balance_loss_mlp": 1.01694775, + "epoch": 0.8984818878701337, + "flos": 23409824247360.0, + "grad_norm": 3.802563437237527, + "language_loss": 0.71369934, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73729146, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.14227295, + "step": 14944, + "time_per_iteration": 2.727778673171997 + }, + { + "auxiliary_loss_clip": 0.01336532, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.22605371, + "balance_loss_mlp": 1.01614416, + "epoch": 0.8985420111228017, + "flos": 22351169456640.0, + "grad_norm": 2.13314057666924, + "language_loss": 0.76303744, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.78670096, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13677979, + "step": 14945, + "time_per_iteration": 2.7757208347320557 + }, + { + "auxiliary_loss_clip": 0.0134777, + "auxiliary_loss_mlp": 0.01035401, + "balance_loss_clip": 1.23304236, + "balance_loss_mlp": 1.02093482, + "epoch": 0.8986021343754698, + "flos": 21397152074400.0, + "grad_norm": 2.1948023114034, + "language_loss": 0.73820293, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.76203465, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.14459229, + "step": 14946, + "time_per_iteration": 2.692626476287842 + }, + { + "auxiliary_loss_clip": 0.01326942, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.21880579, + "balance_loss_mlp": 1.0150876, + "epoch": 0.8986622576281377, + "flos": 21330466501320.0, + "grad_norm": 2.00193146881889, + "language_loss": 0.64517063, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66872931, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13848877, + "step": 14947, + "time_per_iteration": 2.7476298809051514 + }, + { + "auxiliary_loss_clip": 0.01322477, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.21590996, + "balance_loss_mlp": 1.01793861, + "epoch": 0.8987223808808057, + "flos": 23993663198400.0, + "grad_norm": 1.6618401876063027, + "language_loss": 0.70282131, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72635561, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13024902, + "step": 14948, + "time_per_iteration": 4.218341827392578 + }, + { + "auxiliary_loss_clip": 0.01324634, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.21683407, + "balance_loss_mlp": 1.01723599, + "epoch": 0.8987825041334736, + "flos": 41508547471080.0, + "grad_norm": 1.7857828202011539, + "language_loss": 0.74513596, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76869059, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13598633, + "step": 14949, + "time_per_iteration": 2.911076545715332 + }, + { + "auxiliary_loss_clip": 0.01333136, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.22386336, + "balance_loss_mlp": 1.0165906, + "epoch": 0.8988426273861416, + "flos": 27570164073840.0, + "grad_norm": 1.608070468596629, + "language_loss": 0.75946635, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.78310525, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.14160156, + "step": 14950, + "time_per_iteration": 2.8844165802001953 + }, + { + "auxiliary_loss_clip": 0.01327343, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.22107482, + "balance_loss_mlp": 1.01602829, + "epoch": 0.8989027506388095, + "flos": 17097146805960.0, + "grad_norm": 1.4475189466826874, + "language_loss": 0.66652381, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.69007194, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.11444092, + "step": 14951, + "time_per_iteration": 4.469545602798462 + }, + { + "auxiliary_loss_clip": 0.01337964, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.22518802, + "balance_loss_mlp": 1.01969051, + "epoch": 0.8989628738914776, + "flos": 20559589133400.0, + "grad_norm": 1.8862628280494491, + "language_loss": 0.74081492, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.7645185, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.12689209, + "step": 14952, + "time_per_iteration": 4.40645170211792 + }, + { + "auxiliary_loss_clip": 0.01327395, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.22040272, + "balance_loss_mlp": 1.02109528, + "epoch": 0.8990229971441455, + "flos": 16255157553720.0, + "grad_norm": 2.108437340372712, + "language_loss": 0.57277489, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.59639025, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13049316, + "step": 14953, + "time_per_iteration": 2.8034613132476807 + }, + { + "auxiliary_loss_clip": 0.01324176, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.2170763, + "balance_loss_mlp": 1.01985264, + "epoch": 0.8990831203968135, + "flos": 21986391937320.0, + "grad_norm": 1.7777115447307816, + "language_loss": 0.82645941, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.85002661, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12683105, + "step": 14954, + "time_per_iteration": 2.872654438018799 + }, + { + "auxiliary_loss_clip": 0.01321336, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.21654749, + "balance_loss_mlp": 1.01592278, + "epoch": 0.8991432436494814, + "flos": 27452044689840.0, + "grad_norm": 2.3008192877278995, + "language_loss": 0.60431433, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62780929, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12231445, + "step": 14955, + "time_per_iteration": 2.8880653381347656 + }, + { + "auxiliary_loss_clip": 0.01320444, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.21647835, + "balance_loss_mlp": 1.01627111, + "epoch": 0.8992033669021494, + "flos": 21584840225040.0, + "grad_norm": 1.9052534462093205, + "language_loss": 0.54813564, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57162523, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.12237549, + "step": 14956, + "time_per_iteration": 2.7614965438842773 + }, + { + "auxiliary_loss_clip": 0.0132988, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.22240865, + "balance_loss_mlp": 1.01968789, + "epoch": 0.8992634901548173, + "flos": 28590014253600.0, + "grad_norm": 1.606298936112363, + "language_loss": 0.80282092, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82644439, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12786865, + "step": 14957, + "time_per_iteration": 2.7566606998443604 + }, + { + "auxiliary_loss_clip": 0.01333126, + "auxiliary_loss_mlp": 0.01023618, + "balance_loss_clip": 1.22182703, + "balance_loss_mlp": 1.01095212, + "epoch": 0.8993236134074853, + "flos": 19872534066480.0, + "grad_norm": 1.6768796608306606, + "language_loss": 0.78775257, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.81132001, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12670898, + "step": 14958, + "time_per_iteration": 4.182622671127319 + }, + { + "auxiliary_loss_clip": 0.01328627, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.22359324, + "balance_loss_mlp": 1.02133155, + "epoch": 0.8993837366601534, + "flos": 19395362941920.0, + "grad_norm": 4.3558779886951084, + "language_loss": 0.74983197, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.7734499, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.1184082, + "step": 14959, + "time_per_iteration": 2.7492103576660156 + }, + { + "auxiliary_loss_clip": 0.01319784, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.21518064, + "balance_loss_mlp": 1.01467752, + "epoch": 0.8994438599128213, + "flos": 18556378708320.0, + "grad_norm": 1.8788331067615969, + "language_loss": 0.68977714, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.71324611, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.12432861, + "step": 14960, + "time_per_iteration": 2.6986238956451416 + }, + { + "auxiliary_loss_clip": 0.01322364, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.21609557, + "balance_loss_mlp": 1.01770413, + "epoch": 0.8995039831654893, + "flos": 24433207354440.0, + "grad_norm": 1.4068396235433933, + "language_loss": 0.65917522, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68269795, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12194824, + "step": 14961, + "time_per_iteration": 2.7858755588531494 + }, + { + "auxiliary_loss_clip": 0.01317307, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.21256208, + "balance_loss_mlp": 1.01641369, + "epoch": 0.8995641064181572, + "flos": 21256268381640.0, + "grad_norm": 1.437796047405872, + "language_loss": 0.82953149, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.8529858, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1171875, + "step": 14962, + "time_per_iteration": 2.8480491638183594 + }, + { + "auxiliary_loss_clip": 0.01333399, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.22284865, + "balance_loss_mlp": 1.01714313, + "epoch": 0.8996242296708252, + "flos": 23518928575440.0, + "grad_norm": 2.0278854120601637, + "language_loss": 0.76739478, + "learning_rate": 1.047022340612298e-07, + "loss": 0.79103124, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13116455, + "step": 14963, + "time_per_iteration": 2.7963500022888184 + }, + { + "auxiliary_loss_clip": 0.0114281, + "auxiliary_loss_mlp": 0.01010229, + "balance_loss_clip": 1.09995818, + "balance_loss_mlp": 1.00741565, + "epoch": 0.8996843529234931, + "flos": 62417784186720.0, + "grad_norm": 0.9397743924003391, + "language_loss": 0.57622874, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59775913, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02807617, + "step": 14964, + "time_per_iteration": 3.0839812755584717 + }, + { + "auxiliary_loss_clip": 0.01340776, + "auxiliary_loss_mlp": 0.01029975, + "balance_loss_clip": 1.22821975, + "balance_loss_mlp": 1.01631343, + "epoch": 0.8997444761761612, + "flos": 24241336542720.0, + "grad_norm": 3.64976653261318, + "language_loss": 0.67355037, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69725788, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13665771, + "step": 14965, + "time_per_iteration": 2.873387336730957 + }, + { + "auxiliary_loss_clip": 0.01326405, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.21803093, + "balance_loss_mlp": 1.01720142, + "epoch": 0.8998045994288291, + "flos": 21366103660200.0, + "grad_norm": 1.9998425667167632, + "language_loss": 0.71325636, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.73681569, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12322998, + "step": 14966, + "time_per_iteration": 2.801957368850708 + }, + { + "auxiliary_loss_clip": 0.013257, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.21881509, + "balance_loss_mlp": 1.01844144, + "epoch": 0.8998647226814971, + "flos": 28992093874560.0, + "grad_norm": 1.669508713191355, + "language_loss": 0.73550487, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75907874, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13262939, + "step": 14967, + "time_per_iteration": 2.785146474838257 + }, + { + "auxiliary_loss_clip": 0.01324837, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.21715057, + "balance_loss_mlp": 1.01730514, + "epoch": 0.899924845934165, + "flos": 13629872083680.0, + "grad_norm": 1.8635312599660407, + "language_loss": 0.7265147, + "learning_rate": 1.040813291960323e-07, + "loss": 0.75006485, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12866211, + "step": 14968, + "time_per_iteration": 2.716548442840576 + }, + { + "auxiliary_loss_clip": 0.0133183, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.22400856, + "balance_loss_mlp": 1.0201906, + "epoch": 0.899984969186833, + "flos": 20887023942720.0, + "grad_norm": 1.808057110146503, + "language_loss": 0.71024597, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73389316, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12713623, + "step": 14969, + "time_per_iteration": 2.7361865043640137 + }, + { + "auxiliary_loss_clip": 0.01334163, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.22437191, + "balance_loss_mlp": 1.01490498, + "epoch": 0.9000450924395009, + "flos": 20926559504160.0, + "grad_norm": 1.9192099114864227, + "language_loss": 0.76368833, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78730512, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1260376, + "step": 14970, + "time_per_iteration": 2.7527074813842773 + }, + { + "auxiliary_loss_clip": 0.01330141, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.2206707, + "balance_loss_mlp": 1.01638341, + "epoch": 0.900105215692169, + "flos": 17169476941080.0, + "grad_norm": 1.7376346342321722, + "language_loss": 0.73540211, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75898635, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.11901855, + "step": 14971, + "time_per_iteration": 2.8494389057159424 + }, + { + "auxiliary_loss_clip": 0.01324582, + "auxiliary_loss_mlp": 0.01027531, + "balance_loss_clip": 1.21593261, + "balance_loss_mlp": 1.01427448, + "epoch": 0.900165338944837, + "flos": 19935930362400.0, + "grad_norm": 2.127661976678534, + "language_loss": 0.82245624, + "learning_rate": 1.035858993572476e-07, + "loss": 0.84597737, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13250732, + "step": 14972, + "time_per_iteration": 2.8901562690734863 + }, + { + "auxiliary_loss_clip": 0.01338717, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.22638261, + "balance_loss_mlp": 1.02188993, + "epoch": 0.9002254621975049, + "flos": 16111390667400.0, + "grad_norm": 1.9693231303686347, + "language_loss": 0.81715286, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.84088707, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.12811279, + "step": 14973, + "time_per_iteration": 2.7707273960113525 + }, + { + "auxiliary_loss_clip": 0.01325607, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.21791172, + "balance_loss_mlp": 1.01761627, + "epoch": 0.9002855854501729, + "flos": 28482087567960.0, + "grad_norm": 1.8432544280795295, + "language_loss": 0.58504111, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60860944, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.1361084, + "step": 14974, + "time_per_iteration": 2.904561996459961 + }, + { + "auxiliary_loss_clip": 0.01333487, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.22447181, + "balance_loss_mlp": 1.02495956, + "epoch": 0.9003457087028408, + "flos": 25635913290000.0, + "grad_norm": 1.7123937028275273, + "language_loss": 0.63400847, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65771925, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.12652588, + "step": 14975, + "time_per_iteration": 2.910393714904785 + }, + { + "auxiliary_loss_clip": 0.0133262, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.22327232, + "balance_loss_mlp": 1.02016675, + "epoch": 0.9004058319555088, + "flos": 24394808827080.0, + "grad_norm": 1.5292202693465557, + "language_loss": 0.73477072, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75842714, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12860107, + "step": 14976, + "time_per_iteration": 2.781270980834961 + }, + { + "auxiliary_loss_clip": 0.01334794, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.2275207, + "balance_loss_mlp": 1.02235568, + "epoch": 0.9004659552081767, + "flos": 29065317393600.0, + "grad_norm": 1.5405912441480705, + "language_loss": 0.69612098, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71982086, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12835693, + "step": 14977, + "time_per_iteration": 2.7862305641174316 + }, + { + "auxiliary_loss_clip": 0.01332922, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.22305739, + "balance_loss_mlp": 1.02331722, + "epoch": 0.9005260784608448, + "flos": 16768250095680.0, + "grad_norm": 2.85963046948476, + "language_loss": 0.65422606, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67792857, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.14019775, + "step": 14978, + "time_per_iteration": 2.821887731552124 + }, + { + "auxiliary_loss_clip": 0.01336, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.22405457, + "balance_loss_mlp": 1.02134454, + "epoch": 0.9005862017135127, + "flos": 20380875430320.0, + "grad_norm": 1.713218107455767, + "language_loss": 0.78737652, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81108922, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13916016, + "step": 14979, + "time_per_iteration": 2.7720558643341064 + }, + { + "auxiliary_loss_clip": 0.01139937, + "auxiliary_loss_mlp": 0.01004867, + "balance_loss_clip": 1.09779, + "balance_loss_mlp": 1.00256598, + "epoch": 0.9006463249661807, + "flos": 67594969174320.0, + "grad_norm": 0.7247759571247615, + "language_loss": 0.53689712, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.5583452, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02294922, + "step": 14980, + "time_per_iteration": 3.2758028507232666 + }, + { + "auxiliary_loss_clip": 0.01338777, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.22725677, + "balance_loss_mlp": 1.02580738, + "epoch": 0.9007064482188486, + "flos": 28299231812160.0, + "grad_norm": 1.7526003621137547, + "language_loss": 0.82723051, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.85101116, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13482666, + "step": 14981, + "time_per_iteration": 2.7941787242889404 + }, + { + "auxiliary_loss_clip": 0.01316902, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.2116909, + "balance_loss_mlp": 1.0176208, + "epoch": 0.9007665714715166, + "flos": 21621330159480.0, + "grad_norm": 1.4452570253125379, + "language_loss": 0.81425756, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83772999, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1272583, + "step": 14982, + "time_per_iteration": 2.7595536708831787 + }, + { + "auxiliary_loss_clip": 0.01314447, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.21237135, + "balance_loss_mlp": 1.01552272, + "epoch": 0.9008266947241845, + "flos": 26547511917240.0, + "grad_norm": 1.74346184875735, + "language_loss": 0.7190218, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.74243778, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.11639404, + "step": 14983, + "time_per_iteration": 2.8094334602355957 + }, + { + "auxiliary_loss_clip": 0.0132007, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.21507072, + "balance_loss_mlp": 1.01531959, + "epoch": 0.9008868179768525, + "flos": 23115508878600.0, + "grad_norm": 1.426225889395319, + "language_loss": 0.75248218, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77594888, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11297607, + "step": 14984, + "time_per_iteration": 2.7973849773406982 + }, + { + "auxiliary_loss_clip": 0.0132255, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.21789908, + "balance_loss_mlp": 1.01875055, + "epoch": 0.9009469412295206, + "flos": 19065288589200.0, + "grad_norm": 1.4569591562873518, + "language_loss": 0.70576608, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.7293117, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.13256836, + "step": 14985, + "time_per_iteration": 2.7439870834350586 + }, + { + "auxiliary_loss_clip": 0.01328582, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.21804404, + "balance_loss_mlp": 1.01771927, + "epoch": 0.9010070644821885, + "flos": 23227577616960.0, + "grad_norm": 1.8711460101471853, + "language_loss": 0.70839584, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.7319864, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.12756348, + "step": 14986, + "time_per_iteration": 2.855426549911499 + }, + { + "auxiliary_loss_clip": 0.01332087, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.22180986, + "balance_loss_mlp": 1.0209589, + "epoch": 0.9010671877348565, + "flos": 17389391148360.0, + "grad_norm": 3.7460423760302466, + "language_loss": 0.77082586, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.79448748, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13122559, + "step": 14987, + "time_per_iteration": 4.296716928482056 + }, + { + "auxiliary_loss_clip": 0.01322491, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.21536446, + "balance_loss_mlp": 1.01804757, + "epoch": 0.9011273109875244, + "flos": 21913209026640.0, + "grad_norm": 1.6869570281211221, + "language_loss": 0.73780918, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.76134104, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12646484, + "step": 14988, + "time_per_iteration": 4.348497152328491 + }, + { + "auxiliary_loss_clip": 0.01335527, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.22465241, + "balance_loss_mlp": 1.01907396, + "epoch": 0.9011874342401924, + "flos": 24066155766960.0, + "grad_norm": 1.7484188145925808, + "language_loss": 0.69149828, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71518707, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14263916, + "step": 14989, + "time_per_iteration": 2.9098050594329834 + }, + { + "auxiliary_loss_clip": 0.01328929, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.21996665, + "balance_loss_mlp": 1.01617789, + "epoch": 0.9012475574928603, + "flos": 16762524316920.0, + "grad_norm": 1.90040186008725, + "language_loss": 0.80084407, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.8244195, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.12432861, + "step": 14990, + "time_per_iteration": 2.757427215576172 + }, + { + "auxiliary_loss_clip": 0.01338419, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.22735524, + "balance_loss_mlp": 1.01668215, + "epoch": 0.9013076807455284, + "flos": 19975465923840.0, + "grad_norm": 1.962738718180208, + "language_loss": 0.77936065, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.8030408, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12902832, + "step": 14991, + "time_per_iteration": 4.252264022827148 + }, + { + "auxiliary_loss_clip": 0.01141229, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 1.09865546, + "balance_loss_mlp": 1.0012964, + "epoch": 0.9013678039981963, + "flos": 65195729773920.0, + "grad_norm": 0.8188983506684931, + "language_loss": 0.60218191, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62363374, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02661133, + "step": 14992, + "time_per_iteration": 3.203972578048706 + }, + { + "auxiliary_loss_clip": 0.01318858, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.2125597, + "balance_loss_mlp": 1.01554036, + "epoch": 0.9014279272508643, + "flos": 20526023000880.0, + "grad_norm": 2.104872731793744, + "language_loss": 0.83302414, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.85649407, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12585449, + "step": 14993, + "time_per_iteration": 2.889072895050049 + }, + { + "auxiliary_loss_clip": 0.01329855, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.22145391, + "balance_loss_mlp": 1.0187006, + "epoch": 0.9014880505035322, + "flos": 17313243827400.0, + "grad_norm": 2.309822575667418, + "language_loss": 0.73821068, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.76182508, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12902832, + "step": 14994, + "time_per_iteration": 2.7521209716796875 + }, + { + "auxiliary_loss_clip": 0.01318726, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.21396065, + "balance_loss_mlp": 1.0180788, + "epoch": 0.9015481737562002, + "flos": 28408782832200.0, + "grad_norm": 1.7111539595788272, + "language_loss": 0.65144438, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.67493206, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.11968994, + "step": 14995, + "time_per_iteration": 2.87014102935791 + }, + { + "auxiliary_loss_clip": 0.0132985, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.22001636, + "balance_loss_mlp": 1.01698971, + "epoch": 0.9016082970088681, + "flos": 29758747973040.0, + "grad_norm": 1.6226173183341053, + "language_loss": 0.67238915, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.69598794, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13043213, + "step": 14996, + "time_per_iteration": 4.304367542266846 + }, + { + "auxiliary_loss_clip": 0.01324611, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.21781111, + "balance_loss_mlp": 1.01694977, + "epoch": 0.9016684202615362, + "flos": 23518441275120.0, + "grad_norm": 1.6540507225334538, + "language_loss": 0.66041493, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68395579, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12524414, + "step": 14997, + "time_per_iteration": 2.8695945739746094 + }, + { + "auxiliary_loss_clip": 0.01322588, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.2163651, + "balance_loss_mlp": 1.02196002, + "epoch": 0.9017285435142042, + "flos": 16983210083040.0, + "grad_norm": 2.1989879800145102, + "language_loss": 0.78351712, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.80708539, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12298584, + "step": 14998, + "time_per_iteration": 2.797215223312378 + }, + { + "auxiliary_loss_clip": 0.01328725, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.21950531, + "balance_loss_mlp": 1.01571512, + "epoch": 0.9017886667668721, + "flos": 21397801808160.0, + "grad_norm": 1.7092797738123815, + "language_loss": 0.75077456, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77434516, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.12634277, + "step": 14999, + "time_per_iteration": 2.779778242111206 + }, + { + "auxiliary_loss_clip": 0.01323124, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.2167064, + "balance_loss_mlp": 1.01525927, + "epoch": 0.9018487900195401, + "flos": 21001569791040.0, + "grad_norm": 2.7700024626379878, + "language_loss": 0.76363337, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.7871502, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13299561, + "step": 15000, + "time_per_iteration": 2.8199174404144287 + }, + { + "auxiliary_loss_clip": 0.01322028, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.21635926, + "balance_loss_mlp": 1.01838183, + "epoch": 0.901908913272208, + "flos": 53367532513920.0, + "grad_norm": 2.0024445089942478, + "language_loss": 0.81131047, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.83483291, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11816406, + "step": 15001, + "time_per_iteration": 3.082547187805176 + }, + { + "auxiliary_loss_clip": 0.01320934, + "auxiliary_loss_mlp": 0.01023462, + "balance_loss_clip": 1.21548676, + "balance_loss_mlp": 1.01176131, + "epoch": 0.901969036524876, + "flos": 22094805923280.0, + "grad_norm": 1.3861599518826193, + "language_loss": 0.78372169, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80716562, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.11694336, + "step": 15002, + "time_per_iteration": 2.7282345294952393 + }, + { + "auxiliary_loss_clip": 0.0132539, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.21719408, + "balance_loss_mlp": 1.01839364, + "epoch": 0.9020291597775439, + "flos": 23839419355200.0, + "grad_norm": 6.823508883050238, + "language_loss": 0.68543899, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70901257, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13574219, + "step": 15003, + "time_per_iteration": 2.819133996963501 + }, + { + "auxiliary_loss_clip": 0.01331931, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.22168994, + "balance_loss_mlp": 1.01608562, + "epoch": 0.902089283030212, + "flos": 18329723513280.0, + "grad_norm": 1.8137655264288868, + "language_loss": 0.86251581, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88612747, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1315918, + "step": 15004, + "time_per_iteration": 2.843803644180298 + }, + { + "auxiliary_loss_clip": 0.01330397, + "auxiliary_loss_mlp": 0.01032868, + "balance_loss_clip": 1.22207701, + "balance_loss_mlp": 1.01994014, + "epoch": 0.9021494062828799, + "flos": 24175706787000.0, + "grad_norm": 2.465709415086755, + "language_loss": 0.72766066, + "learning_rate": 9.954253314356575e-08, + "loss": 0.7512933, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12927246, + "step": 15005, + "time_per_iteration": 2.8020212650299072 + }, + { + "auxiliary_loss_clip": 0.01327908, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.21796012, + "balance_loss_mlp": 1.0187887, + "epoch": 0.9022095295355479, + "flos": 21621939284880.0, + "grad_norm": 1.8356610767614994, + "language_loss": 0.71373188, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73732865, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12969971, + "step": 15006, + "time_per_iteration": 2.8361098766326904 + }, + { + "auxiliary_loss_clip": 0.01333306, + "auxiliary_loss_mlp": 0.01027066, + "balance_loss_clip": 1.22324967, + "balance_loss_mlp": 1.01491868, + "epoch": 0.9022696527882158, + "flos": 18729853932960.0, + "grad_norm": 1.8380169820446235, + "language_loss": 0.84339416, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86699784, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12158203, + "step": 15007, + "time_per_iteration": 2.7863287925720215 + }, + { + "auxiliary_loss_clip": 0.01323957, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.21855509, + "balance_loss_mlp": 1.01995933, + "epoch": 0.9023297760408838, + "flos": 26766938824200.0, + "grad_norm": 1.5644687582181205, + "language_loss": 0.78548014, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80904603, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12670898, + "step": 15008, + "time_per_iteration": 2.871609687805176 + }, + { + "auxiliary_loss_clip": 0.01313664, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.21161163, + "balance_loss_mlp": 1.01660872, + "epoch": 0.9023898992935517, + "flos": 20527525510200.0, + "grad_norm": 1.630718484442545, + "language_loss": 0.74058414, + "learning_rate": 9.905775769002156e-08, + "loss": 0.76400793, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.12109375, + "step": 15009, + "time_per_iteration": 2.840428590774536 + }, + { + "auxiliary_loss_clip": 0.01326208, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.22001815, + "balance_loss_mlp": 1.01647425, + "epoch": 0.9024500225462198, + "flos": 17461071549720.0, + "grad_norm": 2.528944863530545, + "language_loss": 0.73415869, + "learning_rate": 9.893674402495399e-08, + "loss": 0.7577098, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12432861, + "step": 15010, + "time_per_iteration": 2.8176586627960205 + }, + { + "auxiliary_loss_clip": 0.01329166, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.22045803, + "balance_loss_mlp": 1.01923919, + "epoch": 0.9025101457988878, + "flos": 20818510993440.0, + "grad_norm": 1.9083303629768384, + "language_loss": 0.74704111, + "learning_rate": 9.881580244839538e-08, + "loss": 0.77065516, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13000488, + "step": 15011, + "time_per_iteration": 2.840066909790039 + }, + { + "auxiliary_loss_clip": 0.01336929, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.22551274, + "balance_loss_mlp": 1.01826, + "epoch": 0.9025702690515557, + "flos": 19031113331280.0, + "grad_norm": 2.012664515646204, + "language_loss": 0.73495233, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75863701, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13287354, + "step": 15012, + "time_per_iteration": 2.9372949600219727 + }, + { + "auxiliary_loss_clip": 0.01321024, + "auxiliary_loss_mlp": 0.01032895, + "balance_loss_clip": 1.21503055, + "balance_loss_mlp": 1.02025843, + "epoch": 0.9026303923042237, + "flos": 19687607284320.0, + "grad_norm": 1.5491412436447825, + "language_loss": 0.69373357, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71727276, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12646484, + "step": 15013, + "time_per_iteration": 2.869727611541748 + }, + { + "auxiliary_loss_clip": 0.01314402, + "auxiliary_loss_mlp": 0.0102737, + "balance_loss_clip": 1.21128583, + "balance_loss_mlp": 1.01582432, + "epoch": 0.9026905155568916, + "flos": 24613504783560.0, + "grad_norm": 1.366660663811599, + "language_loss": 0.73326671, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75668442, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.11535645, + "step": 15014, + "time_per_iteration": 2.8200037479400635 + }, + { + "auxiliary_loss_clip": 0.01332624, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.22384512, + "balance_loss_mlp": 1.0157479, + "epoch": 0.9027506388095596, + "flos": 20526672734640.0, + "grad_norm": 1.786257598078523, + "language_loss": 0.72208345, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74569452, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12744141, + "step": 15015, + "time_per_iteration": 2.9247913360595703 + }, + { + "auxiliary_loss_clip": 0.01327556, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.21929312, + "balance_loss_mlp": 1.01860642, + "epoch": 0.9028107620622275, + "flos": 22789739012040.0, + "grad_norm": 1.7856088062093007, + "language_loss": 0.69293582, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71651918, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12158203, + "step": 15016, + "time_per_iteration": 2.812288999557495 + }, + { + "auxiliary_loss_clip": 0.01321911, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.21541166, + "balance_loss_mlp": 1.01786733, + "epoch": 0.9028708853148956, + "flos": 25415999082720.0, + "grad_norm": 1.9360989565911264, + "language_loss": 0.70767498, + "learning_rate": 9.809166710436855e-08, + "loss": 0.73119128, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11846924, + "step": 15017, + "time_per_iteration": 2.8032476902008057 + }, + { + "auxiliary_loss_clip": 0.01326846, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.22062492, + "balance_loss_mlp": 1.01942968, + "epoch": 0.9029310085675635, + "flos": 21876313008600.0, + "grad_norm": 1.6730192261540668, + "language_loss": 0.70003313, + "learning_rate": 9.797123027563237e-08, + "loss": 0.72361916, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12329102, + "step": 15018, + "time_per_iteration": 2.7905969619750977 + }, + { + "auxiliary_loss_clip": 0.01326263, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.21870196, + "balance_loss_mlp": 1.01831019, + "epoch": 0.9029911318202315, + "flos": 26220117716280.0, + "grad_norm": 1.6268209081182512, + "language_loss": 0.69546652, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71903896, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12670898, + "step": 15019, + "time_per_iteration": 2.790004014968872 + }, + { + "auxiliary_loss_clip": 0.01318371, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.21363008, + "balance_loss_mlp": 1.017097, + "epoch": 0.9030512550728994, + "flos": 15965674579800.0, + "grad_norm": 4.88111059052779, + "language_loss": 0.72332472, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74679577, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11627197, + "step": 15020, + "time_per_iteration": 2.735180377960205 + }, + { + "auxiliary_loss_clip": 0.01329733, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.22089481, + "balance_loss_mlp": 1.01439357, + "epoch": 0.9031113783255674, + "flos": 23992891639560.0, + "grad_norm": 1.445286034948544, + "language_loss": 0.74534518, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76891708, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13061523, + "step": 15021, + "time_per_iteration": 2.813443899154663 + }, + { + "auxiliary_loss_clip": 0.01331934, + "auxiliary_loss_mlp": 0.01031646, + "balance_loss_clip": 1.22068846, + "balance_loss_mlp": 1.01866412, + "epoch": 0.9031715015782353, + "flos": 22242389995440.0, + "grad_norm": 2.2039460173511904, + "language_loss": 0.72919488, + "learning_rate": 9.749020425753251e-08, + "loss": 0.75283062, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12988281, + "step": 15022, + "time_per_iteration": 2.8731818199157715 + }, + { + "auxiliary_loss_clip": 0.01309146, + "auxiliary_loss_mlp": 0.01027096, + "balance_loss_clip": 1.20808041, + "balance_loss_mlp": 1.01468635, + "epoch": 0.9032316248309034, + "flos": 26328815960760.0, + "grad_norm": 2.1078685736704346, + "language_loss": 0.73303616, + "learning_rate": 9.737012810001943e-08, + "loss": 0.75639856, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.12402344, + "step": 15023, + "time_per_iteration": 2.843458414077759 + }, + { + "auxiliary_loss_clip": 0.01325166, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.21795774, + "balance_loss_mlp": 1.01757431, + "epoch": 0.9032917480835713, + "flos": 22641667639560.0, + "grad_norm": 1.8908442466895679, + "language_loss": 0.82917345, + "learning_rate": 9.725012409042155e-08, + "loss": 0.85272169, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12084961, + "step": 15024, + "time_per_iteration": 2.9401941299438477 + }, + { + "auxiliary_loss_clip": 0.01326838, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.21770334, + "balance_loss_mlp": 1.01542974, + "epoch": 0.9033518713362393, + "flos": 23884112178360.0, + "grad_norm": 1.4949022387231183, + "language_loss": 0.69921374, + "learning_rate": 9.713019223328966e-08, + "loss": 0.72275788, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12164307, + "step": 15025, + "time_per_iteration": 4.319639205932617 + }, + { + "auxiliary_loss_clip": 0.01323232, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.21726966, + "balance_loss_mlp": 1.02327991, + "epoch": 0.9034119945889073, + "flos": 26910786927240.0, + "grad_norm": 1.6690400727057213, + "language_loss": 0.77211773, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79570693, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12408447, + "step": 15026, + "time_per_iteration": 2.832374334335327 + }, + { + "auxiliary_loss_clip": 0.01324937, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.21796024, + "balance_loss_mlp": 1.0168407, + "epoch": 0.9034721178415752, + "flos": 20855082144600.0, + "grad_norm": 1.7716388837634658, + "language_loss": 0.69039202, + "learning_rate": 9.68905449946129e-08, + "loss": 0.71393275, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12298584, + "step": 15027, + "time_per_iteration": 4.378148317337036 + }, + { + "auxiliary_loss_clip": 0.01312133, + "auxiliary_loss_mlp": 0.01025891, + "balance_loss_clip": 1.20973516, + "balance_loss_mlp": 1.01389861, + "epoch": 0.9035322410942432, + "flos": 22239100718280.0, + "grad_norm": 1.4608921750139692, + "language_loss": 0.75852561, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78190589, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.11993408, + "step": 15028, + "time_per_iteration": 2.814950466156006 + }, + { + "auxiliary_loss_clip": 0.0132085, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.21499836, + "balance_loss_mlp": 1.01673794, + "epoch": 0.9035923643469111, + "flos": 25929213449760.0, + "grad_norm": 1.751645077984368, + "language_loss": 0.69356918, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71707201, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12689209, + "step": 15029, + "time_per_iteration": 4.356348514556885 + }, + { + "auxiliary_loss_clip": 0.01334608, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.22396016, + "balance_loss_mlp": 1.01805997, + "epoch": 0.9036524875995792, + "flos": 20344710362760.0, + "grad_norm": 1.7650016720713078, + "language_loss": 0.73852253, + "learning_rate": 9.653161539369858e-08, + "loss": 0.76218593, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13665771, + "step": 15030, + "time_per_iteration": 2.807286500930786 + }, + { + "auxiliary_loss_clip": 0.01329829, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.21963847, + "balance_loss_mlp": 1.02273369, + "epoch": 0.9037126108522471, + "flos": 40122051787440.0, + "grad_norm": 1.9665799896548577, + "language_loss": 0.68574965, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70940393, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12860107, + "step": 15031, + "time_per_iteration": 3.0409915447235107 + }, + { + "auxiliary_loss_clip": 0.01319599, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.21360517, + "balance_loss_mlp": 1.01636767, + "epoch": 0.9037727341049151, + "flos": 23337372287160.0, + "grad_norm": 1.5359508008142533, + "language_loss": 0.7653991, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78888261, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12371826, + "step": 15032, + "time_per_iteration": 2.81811261177063 + }, + { + "auxiliary_loss_clip": 0.01332205, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.22359514, + "balance_loss_mlp": 1.02169371, + "epoch": 0.903832857357583, + "flos": 12826443792240.0, + "grad_norm": 1.8125439017000864, + "language_loss": 0.75710952, + "learning_rate": 9.617333541017502e-08, + "loss": 0.78076982, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12127686, + "step": 15033, + "time_per_iteration": 2.7994298934936523 + }, + { + "auxiliary_loss_clip": 0.01327452, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.21896291, + "balance_loss_mlp": 1.02139974, + "epoch": 0.903892980610251, + "flos": 25708608900360.0, + "grad_norm": 2.0306737479463033, + "language_loss": 0.74143577, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76505554, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13134766, + "step": 15034, + "time_per_iteration": 2.8888003826141357 + }, + { + "auxiliary_loss_clip": 0.0132481, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.21863961, + "balance_loss_mlp": 1.01562142, + "epoch": 0.9039531038629189, + "flos": 14688202007520.0, + "grad_norm": 1.6088003115241265, + "language_loss": 0.63497031, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65850127, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12701416, + "step": 15035, + "time_per_iteration": 4.402267932891846 + }, + { + "auxiliary_loss_clip": 0.0132815, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.22081292, + "balance_loss_mlp": 1.02554154, + "epoch": 0.904013227115587, + "flos": 24030315566280.0, + "grad_norm": 2.432512440763882, + "language_loss": 0.62493068, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64859962, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13208008, + "step": 15036, + "time_per_iteration": 2.814288854598999 + }, + { + "auxiliary_loss_clip": 0.01320168, + "auxiliary_loss_mlp": 0.01025979, + "balance_loss_clip": 1.21608996, + "balance_loss_mlp": 1.01352191, + "epoch": 0.9040733503682549, + "flos": 22861581846840.0, + "grad_norm": 1.5377435842977565, + "language_loss": 0.82283378, + "learning_rate": 9.569663949272455e-08, + "loss": 0.8462953, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12463379, + "step": 15037, + "time_per_iteration": 2.7736051082611084 + }, + { + "auxiliary_loss_clip": 0.01332376, + "auxiliary_loss_mlp": 0.01028183, + "balance_loss_clip": 1.22215247, + "balance_loss_mlp": 1.01566601, + "epoch": 0.9041334736209229, + "flos": 19979811018360.0, + "grad_norm": 2.1225970399606116, + "language_loss": 0.6826424, + "learning_rate": 9.557764603050667e-08, + "loss": 0.70624804, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12506104, + "step": 15038, + "time_per_iteration": 2.847339153289795 + }, + { + "auxiliary_loss_clip": 0.01319788, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.21259725, + "balance_loss_mlp": 1.01768446, + "epoch": 0.9041935968735909, + "flos": 17535107235960.0, + "grad_norm": 1.7613789372268451, + "language_loss": 0.75674403, + "learning_rate": 9.545872478417494e-08, + "loss": 0.7802484, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12963867, + "step": 15039, + "time_per_iteration": 2.7616708278656006 + }, + { + "auxiliary_loss_clip": 0.01321855, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.21670568, + "balance_loss_mlp": 1.01764607, + "epoch": 0.9042537201262588, + "flos": 22784825400480.0, + "grad_norm": 1.5235420868309528, + "language_loss": 0.70622176, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72973961, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12286377, + "step": 15040, + "time_per_iteration": 2.753772497177124 + }, + { + "auxiliary_loss_clip": 0.01320402, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.21517169, + "balance_loss_mlp": 1.01523244, + "epoch": 0.9043138433789268, + "flos": 20600383554000.0, + "grad_norm": 1.5960227268263192, + "language_loss": 0.68174672, + "learning_rate": 9.522109895720709e-08, + "loss": 0.70522761, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12463379, + "step": 15041, + "time_per_iteration": 2.741229772567749 + }, + { + "auxiliary_loss_clip": 0.01324463, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.21754098, + "balance_loss_mlp": 1.01711011, + "epoch": 0.9043739666315948, + "flos": 32969415511800.0, + "grad_norm": 1.8704573000977505, + "language_loss": 0.5754559, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59900469, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13330078, + "step": 15042, + "time_per_iteration": 2.883592367172241 + }, + { + "auxiliary_loss_clip": 0.01145932, + "auxiliary_loss_mlp": 0.01014196, + "balance_loss_clip": 1.10378075, + "balance_loss_mlp": 1.01162076, + "epoch": 0.9044340898842628, + "flos": 67312145971440.0, + "grad_norm": 0.7967229784601795, + "language_loss": 0.56925941, + "learning_rate": 9.498376204786351e-08, + "loss": 0.59086072, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02575684, + "step": 15043, + "time_per_iteration": 3.2749338150024414 + }, + { + "auxiliary_loss_clip": 0.01323982, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.2160846, + "balance_loss_mlp": 1.01364994, + "epoch": 0.9044942131369307, + "flos": 17717759949960.0, + "grad_norm": 1.632589010461219, + "language_loss": 0.70104229, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72455025, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13165283, + "step": 15044, + "time_per_iteration": 2.854891538619995 + }, + { + "auxiliary_loss_clip": 0.01329069, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.21926355, + "balance_loss_mlp": 1.01877761, + "epoch": 0.9045543363895987, + "flos": 17824915076760.0, + "grad_norm": 2.1478968168774637, + "language_loss": 0.70008415, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72369474, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13214111, + "step": 15045, + "time_per_iteration": 2.7956736087799072 + }, + { + "auxiliary_loss_clip": 0.0132933, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.22074771, + "balance_loss_mlp": 1.01797092, + "epoch": 0.9046144596422666, + "flos": 21877896734640.0, + "grad_norm": 2.275452844914575, + "language_loss": 0.65887594, + "learning_rate": 9.462829848313081e-08, + "loss": 0.68247867, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12969971, + "step": 15046, + "time_per_iteration": 2.868753671646118 + }, + { + "auxiliary_loss_clip": 0.01331919, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.22152185, + "balance_loss_mlp": 1.01786208, + "epoch": 0.9046745828949346, + "flos": 17676721879200.0, + "grad_norm": 2.1582584281314086, + "language_loss": 0.62051636, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64414454, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.1305542, + "step": 15047, + "time_per_iteration": 2.8882761001586914 + }, + { + "auxiliary_loss_clip": 0.01321841, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.21620798, + "balance_loss_mlp": 1.01686287, + "epoch": 0.9047347061476025, + "flos": 25708080991680.0, + "grad_norm": 1.4365339659429381, + "language_loss": 0.71221471, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73572111, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.11932373, + "step": 15048, + "time_per_iteration": 2.8962435722351074 + }, + { + "auxiliary_loss_clip": 0.01327101, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.216851, + "balance_loss_mlp": 1.01956403, + "epoch": 0.9047948294002706, + "flos": 15162286896720.0, + "grad_norm": 2.422041758608268, + "language_loss": 0.75015324, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77376008, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.14013672, + "step": 15049, + "time_per_iteration": 2.756040334701538 + }, + { + "auxiliary_loss_clip": 0.01322218, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.21584976, + "balance_loss_mlp": 1.01891279, + "epoch": 0.9048549526529385, + "flos": 21877409434320.0, + "grad_norm": 1.7901639385198254, + "language_loss": 0.75366282, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77720058, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12646484, + "step": 15050, + "time_per_iteration": 2.854665994644165 + }, + { + "auxiliary_loss_clip": 0.01330733, + "auxiliary_loss_mlp": 0.01034694, + "balance_loss_clip": 1.22204256, + "balance_loss_mlp": 1.0221889, + "epoch": 0.9049150759056065, + "flos": 23551438890600.0, + "grad_norm": 1.9525829730817479, + "language_loss": 0.82574451, + "learning_rate": 9.403730430606472e-08, + "loss": 0.84939879, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12518311, + "step": 15051, + "time_per_iteration": 2.818880319595337 + }, + { + "auxiliary_loss_clip": 0.01329197, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.22135735, + "balance_loss_mlp": 1.02122378, + "epoch": 0.9049751991582745, + "flos": 19650630049560.0, + "grad_norm": 1.9308083924434671, + "language_loss": 0.89695752, + "learning_rate": 9.391932227562582e-08, + "loss": 0.92058349, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12164307, + "step": 15052, + "time_per_iteration": 2.730649471282959 + }, + { + "auxiliary_loss_clip": 0.01335532, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.2243768, + "balance_loss_mlp": 1.02218294, + "epoch": 0.9050353224109424, + "flos": 15600450368520.0, + "grad_norm": 2.0796251534632875, + "language_loss": 0.77429873, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79800427, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.12835693, + "step": 15053, + "time_per_iteration": 2.75319504737854 + }, + { + "auxiliary_loss_clip": 0.01320721, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.21506822, + "balance_loss_mlp": 1.02010083, + "epoch": 0.9050954456636104, + "flos": 28189437141960.0, + "grad_norm": 1.534802390259459, + "language_loss": 0.73145044, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75498581, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12713623, + "step": 15054, + "time_per_iteration": 2.7683396339416504 + }, + { + "auxiliary_loss_clip": 0.01321423, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.21552825, + "balance_loss_mlp": 1.01585019, + "epoch": 0.9051555689162784, + "flos": 25736164995600.0, + "grad_norm": 1.6178745098977731, + "language_loss": 0.83450699, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85799831, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.11853027, + "step": 15055, + "time_per_iteration": 2.847228765487671 + }, + { + "auxiliary_loss_clip": 0.01324083, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.21860611, + "balance_loss_mlp": 1.02006412, + "epoch": 0.9052156921689464, + "flos": 23262362000280.0, + "grad_norm": 1.6812592993631605, + "language_loss": 0.8480159, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87158459, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.1272583, + "step": 15056, + "time_per_iteration": 2.808171510696411 + }, + { + "auxiliary_loss_clip": 0.01324642, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.21763885, + "balance_loss_mlp": 1.01602924, + "epoch": 0.9052758154216143, + "flos": 29569557313080.0, + "grad_norm": 1.8038096252864126, + "language_loss": 0.71978837, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74331468, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.11956787, + "step": 15057, + "time_per_iteration": 2.8606338500976562 + }, + { + "auxiliary_loss_clip": 0.01322061, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.21749389, + "balance_loss_mlp": 1.01577616, + "epoch": 0.9053359386742823, + "flos": 22132838975400.0, + "grad_norm": 1.4403841687710484, + "language_loss": 0.80998176, + "learning_rate": 9.321294810356418e-08, + "loss": 0.83348477, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.12463379, + "step": 15058, + "time_per_iteration": 2.945578098297119 + }, + { + "auxiliary_loss_clip": 0.01140251, + "auxiliary_loss_mlp": 0.01003696, + "balance_loss_clip": 1.0980736, + "balance_loss_mlp": 1.00102568, + "epoch": 0.9053960619269502, + "flos": 67107459899160.0, + "grad_norm": 0.6843819542042122, + "language_loss": 0.51433069, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53577012, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.0267334, + "step": 15059, + "time_per_iteration": 3.359215021133423 + }, + { + "auxiliary_loss_clip": 0.01323835, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.21588957, + "balance_loss_mlp": 1.01477146, + "epoch": 0.9054561851796182, + "flos": 15819714842040.0, + "grad_norm": 1.8910886072164128, + "language_loss": 0.67635393, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69986749, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12744141, + "step": 15060, + "time_per_iteration": 2.8965744972229004 + }, + { + "auxiliary_loss_clip": 0.0133596, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.22662008, + "balance_loss_mlp": 1.01928437, + "epoch": 0.9055163084322861, + "flos": 17571556562040.0, + "grad_norm": 1.8808160586574172, + "language_loss": 0.63732469, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66100478, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12786865, + "step": 15061, + "time_per_iteration": 2.7364962100982666 + }, + { + "auxiliary_loss_clip": 0.0132754, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.21995592, + "balance_loss_mlp": 1.01753211, + "epoch": 0.9055764316849542, + "flos": 17644089738960.0, + "grad_norm": 1.8570136061319726, + "language_loss": 0.71789491, + "learning_rate": 9.274347804044058e-08, + "loss": 0.74147403, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12823486, + "step": 15062, + "time_per_iteration": 2.7846312522888184 + }, + { + "auxiliary_loss_clip": 0.01321207, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.21547878, + "balance_loss_mlp": 1.01516223, + "epoch": 0.9056365549376221, + "flos": 20125730147760.0, + "grad_norm": 1.5256630956765207, + "language_loss": 0.70962077, + "learning_rate": 9.2626291321936e-08, + "loss": 0.73310602, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.1217041, + "step": 15063, + "time_per_iteration": 4.25417160987854 + }, + { + "auxiliary_loss_clip": 0.01319387, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.21556902, + "balance_loss_mlp": 1.01820695, + "epoch": 0.9056966781902901, + "flos": 27604420548480.0, + "grad_norm": 1.544957895110784, + "language_loss": 0.72183985, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74533808, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.12225342, + "step": 15064, + "time_per_iteration": 2.885138750076294 + }, + { + "auxiliary_loss_clip": 0.01330528, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.22118986, + "balance_loss_mlp": 1.02063107, + "epoch": 0.9057568014429581, + "flos": 25925477480640.0, + "grad_norm": 1.744245471681056, + "language_loss": 0.69838905, + "learning_rate": 9.23921348727752e-08, + "loss": 0.72202635, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12561035, + "step": 15065, + "time_per_iteration": 4.426839351654053 + }, + { + "auxiliary_loss_clip": 0.01325918, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.21917534, + "balance_loss_mlp": 1.02003682, + "epoch": 0.905816924695626, + "flos": 22935617533080.0, + "grad_norm": 1.6014838931318327, + "language_loss": 0.63704556, + "learning_rate": 9.227516515099743e-08, + "loss": 0.66063106, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12597656, + "step": 15066, + "time_per_iteration": 2.7870938777923584 + }, + { + "auxiliary_loss_clip": 0.01337985, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.22453189, + "balance_loss_mlp": 1.02000165, + "epoch": 0.905877047948294, + "flos": 22161694538160.0, + "grad_norm": 2.0864152420640574, + "language_loss": 0.80538124, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82910395, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14276123, + "step": 15067, + "time_per_iteration": 2.8037638664245605 + }, + { + "auxiliary_loss_clip": 0.01326485, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.21853685, + "balance_loss_mlp": 1.01481438, + "epoch": 0.905937171200962, + "flos": 15309058801680.0, + "grad_norm": 1.6991589697512535, + "language_loss": 0.70354295, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72708982, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1338501, + "step": 15068, + "time_per_iteration": 4.356507301330566 + }, + { + "auxiliary_loss_clip": 0.01317279, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.21337795, + "balance_loss_mlp": 1.01558745, + "epoch": 0.90599729445363, + "flos": 19467611860320.0, + "grad_norm": 1.8699265543257315, + "language_loss": 0.85480684, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87825865, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12304688, + "step": 15069, + "time_per_iteration": 2.790555953979492 + }, + { + "auxiliary_loss_clip": 0.01331018, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.22011352, + "balance_loss_mlp": 1.01717305, + "epoch": 0.9060574177062979, + "flos": 23738314874040.0, + "grad_norm": 2.1486588300346097, + "language_loss": 0.59528959, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61890852, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13708496, + "step": 15070, + "time_per_iteration": 2.727522850036621 + }, + { + "auxiliary_loss_clip": 0.01333023, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.22173858, + "balance_loss_mlp": 1.01203632, + "epoch": 0.9061175409589659, + "flos": 17315599112280.0, + "grad_norm": 2.8206266396932835, + "language_loss": 0.81152511, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83511484, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13922119, + "step": 15071, + "time_per_iteration": 2.785108804702759 + }, + { + "auxiliary_loss_clip": 0.01336001, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.22505689, + "balance_loss_mlp": 1.02342391, + "epoch": 0.9061776642116338, + "flos": 17782049629800.0, + "grad_norm": 2.1407660307958216, + "language_loss": 0.62384439, + "learning_rate": 9.157486613883758e-08, + "loss": 0.6475805, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14190674, + "step": 15072, + "time_per_iteration": 2.698176622390747 + }, + { + "auxiliary_loss_clip": 0.01323876, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.2173202, + "balance_loss_mlp": 1.01589656, + "epoch": 0.9062377874643018, + "flos": 42785410917960.0, + "grad_norm": 1.7181842174451312, + "language_loss": 0.72884488, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75237292, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13043213, + "step": 15073, + "time_per_iteration": 4.39551568031311 + }, + { + "auxiliary_loss_clip": 0.01319898, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.21539235, + "balance_loss_mlp": 1.01497734, + "epoch": 0.9062979107169697, + "flos": 16365845607840.0, + "grad_norm": 2.069553745184734, + "language_loss": 0.81061876, + "learning_rate": 9.134201202899161e-08, + "loss": 0.83409035, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.1227417, + "step": 15074, + "time_per_iteration": 2.8609275817871094 + }, + { + "auxiliary_loss_clip": 0.01141367, + "auxiliary_loss_mlp": 0.01006125, + "balance_loss_clip": 1.09902883, + "balance_loss_mlp": 1.00358605, + "epoch": 0.9063580339696378, + "flos": 69331396698720.0, + "grad_norm": 0.732871475951395, + "language_loss": 0.52400231, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54547727, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02539062, + "step": 15075, + "time_per_iteration": 3.286837339401245 + }, + { + "auxiliary_loss_clip": 0.01141909, + "auxiliary_loss_mlp": 0.01006886, + "balance_loss_clip": 1.0996089, + "balance_loss_mlp": 1.00416803, + "epoch": 0.9064181572223057, + "flos": 58809300904800.0, + "grad_norm": 3.321796401583912, + "language_loss": 0.62221503, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64370298, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02722168, + "step": 15076, + "time_per_iteration": 3.1499364376068115 + }, + { + "auxiliary_loss_clip": 0.01322453, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.21641207, + "balance_loss_mlp": 1.01983738, + "epoch": 0.9064782804749737, + "flos": 21767777197560.0, + "grad_norm": 1.7368891240832134, + "language_loss": 0.82044727, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84399414, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12402344, + "step": 15077, + "time_per_iteration": 2.7964088916778564 + }, + { + "auxiliary_loss_clip": 0.01321417, + "auxiliary_loss_mlp": 0.01025403, + "balance_loss_clip": 1.21799755, + "balance_loss_mlp": 1.01368475, + "epoch": 0.9065384037276417, + "flos": 21402349944480.0, + "grad_norm": 1.7277668019340644, + "language_loss": 0.83859771, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86206585, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.1171875, + "step": 15078, + "time_per_iteration": 2.846238613128662 + }, + { + "auxiliary_loss_clip": 0.0131425, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.21169627, + "balance_loss_mlp": 1.01669359, + "epoch": 0.9065985269803096, + "flos": 38289961302120.0, + "grad_norm": 1.4452723563101373, + "language_loss": 0.65472674, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67815745, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.12121582, + "step": 15079, + "time_per_iteration": 3.086904525756836 + }, + { + "auxiliary_loss_clip": 0.01323469, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.21660709, + "balance_loss_mlp": 1.01200902, + "epoch": 0.9066586502329776, + "flos": 44826248311560.0, + "grad_norm": 1.6752332628424431, + "language_loss": 0.70875448, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73223227, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12298584, + "step": 15080, + "time_per_iteration": 3.0732157230377197 + }, + { + "auxiliary_loss_clip": 0.01335745, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.22568631, + "balance_loss_mlp": 1.01832843, + "epoch": 0.9067187734856456, + "flos": 18628546410000.0, + "grad_norm": 2.019904295448482, + "language_loss": 0.71564317, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73931384, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12994385, + "step": 15081, + "time_per_iteration": 2.817409038543701 + }, + { + "auxiliary_loss_clip": 0.01321872, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.21571851, + "balance_loss_mlp": 1.02131879, + "epoch": 0.9067788967383136, + "flos": 22752761777280.0, + "grad_norm": 1.8763178125289797, + "language_loss": 0.7474587, + "learning_rate": 9.04134910022032e-08, + "loss": 0.77101588, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12530518, + "step": 15082, + "time_per_iteration": 3.001391887664795 + }, + { + "auxiliary_loss_clip": 0.01317921, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.21304011, + "balance_loss_mlp": 1.01974058, + "epoch": 0.9068390199909815, + "flos": 27676303991640.0, + "grad_norm": 1.6843017024743256, + "language_loss": 0.78103048, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80452704, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12005615, + "step": 15083, + "time_per_iteration": 2.8400793075561523 + }, + { + "auxiliary_loss_clip": 0.01318513, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.21532154, + "balance_loss_mlp": 1.01912618, + "epoch": 0.9068991432436495, + "flos": 24249458214720.0, + "grad_norm": 1.6515224869821088, + "language_loss": 0.69070703, + "learning_rate": 9.01820847747028e-08, + "loss": 0.71419924, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.11584473, + "step": 15084, + "time_per_iteration": 2.9138686656951904 + }, + { + "auxiliary_loss_clip": 0.0132353, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.21688485, + "balance_loss_mlp": 1.01627898, + "epoch": 0.9069592664963174, + "flos": 28038563792640.0, + "grad_norm": 2.2552243794010574, + "language_loss": 0.67224318, + "learning_rate": 9.006649028948965e-08, + "loss": 0.69576824, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12689209, + "step": 15085, + "time_per_iteration": 2.8321378231048584 + }, + { + "auxiliary_loss_clip": 0.01143044, + "auxiliary_loss_mlp": 0.01001929, + "balance_loss_clip": 1.10058475, + "balance_loss_mlp": 0.99921077, + "epoch": 0.9070193897489854, + "flos": 68792517848160.0, + "grad_norm": 0.7738210448737978, + "language_loss": 0.61327362, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63472342, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02722168, + "step": 15086, + "time_per_iteration": 3.285283088684082 + }, + { + "auxiliary_loss_clip": 0.01321465, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.21497345, + "balance_loss_mlp": 1.02017105, + "epoch": 0.9070795130016533, + "flos": 23447004523920.0, + "grad_norm": 1.6433194373004043, + "language_loss": 0.72611678, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74966657, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13342285, + "step": 15087, + "time_per_iteration": 2.837160348892212 + }, + { + "auxiliary_loss_clip": 0.01323452, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.21684313, + "balance_loss_mlp": 1.01604128, + "epoch": 0.9071396362543214, + "flos": 18921075010920.0, + "grad_norm": 2.3131835150076774, + "language_loss": 0.76881176, + "learning_rate": 8.972014140059058e-08, + "loss": 0.79232848, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12176514, + "step": 15088, + "time_per_iteration": 2.83469820022583 + }, + { + "auxiliary_loss_clip": 0.013151, + "auxiliary_loss_mlp": 0.01027116, + "balance_loss_clip": 1.21244037, + "balance_loss_mlp": 1.01529026, + "epoch": 0.9071997595069893, + "flos": 25234158535920.0, + "grad_norm": 6.262689983099263, + "language_loss": 0.73494637, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75836855, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.1182251, + "step": 15089, + "time_per_iteration": 2.893998622894287 + }, + { + "auxiliary_loss_clip": 0.01314706, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.21254349, + "balance_loss_mlp": 1.01770353, + "epoch": 0.9072598827596573, + "flos": 24351171821280.0, + "grad_norm": 1.6738331769923975, + "language_loss": 0.75638324, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77982384, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.11633301, + "step": 15090, + "time_per_iteration": 2.7723441123962402 + }, + { + "auxiliary_loss_clip": 0.01328836, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.22017026, + "balance_loss_mlp": 1.01615548, + "epoch": 0.9073200060123253, + "flos": 22680187992000.0, + "grad_norm": 2.299998764923569, + "language_loss": 0.78018498, + "learning_rate": 8.93744444537079e-08, + "loss": 0.80376589, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13092041, + "step": 15091, + "time_per_iteration": 2.984985113143921 + }, + { + "auxiliary_loss_clip": 0.01315034, + "auxiliary_loss_mlp": 0.01026236, + "balance_loss_clip": 1.2129879, + "balance_loss_mlp": 1.01536417, + "epoch": 0.9073801292649932, + "flos": 23700687905520.0, + "grad_norm": 1.4402996916785662, + "language_loss": 0.86160016, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88501281, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.10870361, + "step": 15092, + "time_per_iteration": 2.8560492992401123 + }, + { + "auxiliary_loss_clip": 0.01324344, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.21790218, + "balance_loss_mlp": 1.02038944, + "epoch": 0.9074402525176612, + "flos": 25381052265960.0, + "grad_norm": 1.538212104833242, + "language_loss": 0.78925794, + "learning_rate": 8.914434207073296e-08, + "loss": 0.81282717, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12200928, + "step": 15093, + "time_per_iteration": 2.840282678604126 + }, + { + "auxiliary_loss_clip": 0.01142185, + "auxiliary_loss_mlp": 0.01006652, + "balance_loss_clip": 1.09945798, + "balance_loss_mlp": 1.00405335, + "epoch": 0.9075003757703292, + "flos": 67663197865080.0, + "grad_norm": 0.7582018090050592, + "language_loss": 0.56999207, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59148043, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02600098, + "step": 15094, + "time_per_iteration": 3.189011335372925 + }, + { + "auxiliary_loss_clip": 0.0133236, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.22282195, + "balance_loss_mlp": 1.0186305, + "epoch": 0.9075604990229972, + "flos": 22458527625240.0, + "grad_norm": 2.2471563996522237, + "language_loss": 0.71962655, + "learning_rate": 8.891452952710742e-08, + "loss": 0.74327213, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13555908, + "step": 15095, + "time_per_iteration": 2.837425470352173 + }, + { + "auxiliary_loss_clip": 0.01324677, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.21729505, + "balance_loss_mlp": 1.0227108, + "epoch": 0.9076206222756651, + "flos": 19541403896400.0, + "grad_norm": 1.6236205835370732, + "language_loss": 0.74311239, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76671267, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12646484, + "step": 15096, + "time_per_iteration": 2.756838798522949 + }, + { + "auxiliary_loss_clip": 0.01331792, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.22227669, + "balance_loss_mlp": 1.02149963, + "epoch": 0.9076807455283331, + "flos": 30122956975320.0, + "grad_norm": 1.7149300574405528, + "language_loss": 0.56660855, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59027755, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13616943, + "step": 15097, + "time_per_iteration": 2.80680251121521 + }, + { + "auxiliary_loss_clip": 0.01318126, + "auxiliary_loss_mlp": 0.01024195, + "balance_loss_clip": 1.21308541, + "balance_loss_mlp": 1.01232743, + "epoch": 0.907740868781001, + "flos": 18702176012640.0, + "grad_norm": 1.54050547471354, + "language_loss": 0.80030382, + "learning_rate": 8.857035423668935e-08, + "loss": 0.82372701, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11871338, + "step": 15098, + "time_per_iteration": 2.752912759780884 + }, + { + "auxiliary_loss_clip": 0.0133386, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.22275484, + "balance_loss_mlp": 1.01581812, + "epoch": 0.907800992033669, + "flos": 22644063532800.0, + "grad_norm": 1.6127722903201094, + "language_loss": 0.66480917, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68843651, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13067627, + "step": 15099, + "time_per_iteration": 2.767404794692993 + }, + { + "auxiliary_loss_clip": 0.01327248, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.21816456, + "balance_loss_mlp": 1.02041733, + "epoch": 0.907861115286337, + "flos": 21292595882640.0, + "grad_norm": 2.0982887033231528, + "language_loss": 0.71242446, + "learning_rate": 8.834126644384477e-08, + "loss": 0.73603451, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13342285, + "step": 15100, + "time_per_iteration": 2.828646421432495 + }, + { + "auxiliary_loss_clip": 0.01143724, + "auxiliary_loss_mlp": 0.01007506, + "balance_loss_clip": 1.10150981, + "balance_loss_mlp": 1.00493097, + "epoch": 0.907921238539005, + "flos": 69755347244520.0, + "grad_norm": 0.658339742138984, + "language_loss": 0.53525233, + "learning_rate": 8.822683128068775e-08, + "loss": 0.5567646, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02575684, + "step": 15101, + "time_per_iteration": 3.3342063426971436 + }, + { + "auxiliary_loss_clip": 0.01327927, + "auxiliary_loss_mlp": 0.0102662, + "balance_loss_clip": 1.22100127, + "balance_loss_mlp": 1.0136801, + "epoch": 0.9079813617916729, + "flos": 23482682291160.0, + "grad_norm": 1.6208596464716822, + "language_loss": 0.68457448, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70811987, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12927246, + "step": 15102, + "time_per_iteration": 4.442996501922607 + }, + { + "auxiliary_loss_clip": 0.01321921, + "auxiliary_loss_mlp": 0.01024459, + "balance_loss_clip": 1.21574426, + "balance_loss_mlp": 1.01223445, + "epoch": 0.9080414850443409, + "flos": 22935130232760.0, + "grad_norm": 1.8167765185787765, + "language_loss": 0.79402447, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81748831, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12219238, + "step": 15103, + "time_per_iteration": 2.9021224975585938 + }, + { + "auxiliary_loss_clip": 0.01325302, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.21612954, + "balance_loss_mlp": 1.01774335, + "epoch": 0.9081016082970089, + "flos": 26182693789560.0, + "grad_norm": 1.669800401442571, + "language_loss": 0.71873319, + "learning_rate": 8.78839607763413e-08, + "loss": 0.74229044, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.12677002, + "step": 15104, + "time_per_iteration": 4.505504369735718 + }, + { + "auxiliary_loss_clip": 0.01319257, + "auxiliary_loss_mlp": 0.01026001, + "balance_loss_clip": 1.21413374, + "balance_loss_mlp": 1.01399016, + "epoch": 0.9081617315496768, + "flos": 24467545045800.0, + "grad_norm": 1.6958664839501219, + "language_loss": 0.77594674, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79939938, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11999512, + "step": 15105, + "time_per_iteration": 4.3237786293029785 + }, + { + "auxiliary_loss_clip": 0.01327131, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.21969914, + "balance_loss_mlp": 1.0206486, + "epoch": 0.9082218548023449, + "flos": 24751342849320.0, + "grad_norm": 1.8604680562911298, + "language_loss": 0.74243963, + "learning_rate": 8.765574297104628e-08, + "loss": 0.76604164, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12414551, + "step": 15106, + "time_per_iteration": 2.837374448776245 + }, + { + "auxiliary_loss_clip": 0.01328977, + "auxiliary_loss_mlp": 0.01032089, + "balance_loss_clip": 1.22040033, + "balance_loss_mlp": 1.0190953, + "epoch": 0.9082819780550128, + "flos": 24426100891440.0, + "grad_norm": 1.5623254282609915, + "language_loss": 0.80318129, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82679188, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12994385, + "step": 15107, + "time_per_iteration": 2.8915762901306152 + }, + { + "auxiliary_loss_clip": 0.01146237, + "auxiliary_loss_mlp": 0.01002626, + "balance_loss_clip": 1.10336876, + "balance_loss_mlp": 1.00025392, + "epoch": 0.9083421013076808, + "flos": 59625967521600.0, + "grad_norm": 1.4253680245996112, + "language_loss": 0.59727567, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61876434, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02368164, + "step": 15108, + "time_per_iteration": 3.2620012760162354 + }, + { + "auxiliary_loss_clip": 0.01326632, + "auxiliary_loss_mlp": 0.01025413, + "balance_loss_clip": 1.21757746, + "balance_loss_mlp": 1.01231146, + "epoch": 0.9084022245603487, + "flos": 33626802848760.0, + "grad_norm": 1.4979661896862524, + "language_loss": 0.74055076, + "learning_rate": 8.73139601460482e-08, + "loss": 0.76407123, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13098145, + "step": 15109, + "time_per_iteration": 2.831902503967285 + }, + { + "auxiliary_loss_clip": 0.01316236, + "auxiliary_loss_mlp": 0.01027922, + "balance_loss_clip": 1.21121383, + "balance_loss_mlp": 1.01600134, + "epoch": 0.9084623478130167, + "flos": 24976982835360.0, + "grad_norm": 1.5039561067533387, + "language_loss": 0.71947974, + "learning_rate": 8.720017759045073e-08, + "loss": 0.74292135, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.11914062, + "step": 15110, + "time_per_iteration": 2.8257675170898438 + }, + { + "auxiliary_loss_clip": 0.01320525, + "auxiliary_loss_mlp": 0.01026374, + "balance_loss_clip": 1.21563649, + "balance_loss_mlp": 1.01408386, + "epoch": 0.9085224710656846, + "flos": 31467358770840.0, + "grad_norm": 1.920211038868928, + "language_loss": 0.689601, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71306998, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12280273, + "step": 15111, + "time_per_iteration": 2.823605537414551 + }, + { + "auxiliary_loss_clip": 0.01146791, + "auxiliary_loss_mlp": 0.01001, + "balance_loss_clip": 1.10428238, + "balance_loss_mlp": 0.99853289, + "epoch": 0.9085825943183526, + "flos": 64931407002000.0, + "grad_norm": 0.6869761554997613, + "language_loss": 0.51804829, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53952622, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0246582, + "step": 15112, + "time_per_iteration": 4.699567079544067 + }, + { + "auxiliary_loss_clip": 0.01328638, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.22024643, + "balance_loss_mlp": 1.01859617, + "epoch": 0.9086427175710206, + "flos": 18957930420600.0, + "grad_norm": 2.6291397155273244, + "language_loss": 0.70052743, + "learning_rate": 8.685926514226837e-08, + "loss": 0.72411978, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12011719, + "step": 15113, + "time_per_iteration": 2.8294217586517334 + }, + { + "auxiliary_loss_clip": 0.01322919, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.21574831, + "balance_loss_mlp": 1.0159775, + "epoch": 0.9087028408236886, + "flos": 34020638972640.0, + "grad_norm": 2.2976360637441746, + "language_loss": 0.79413545, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81764293, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1184082, + "step": 15114, + "time_per_iteration": 2.973280668258667 + }, + { + "auxiliary_loss_clip": 0.01336492, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.22460485, + "balance_loss_mlp": 1.01725137, + "epoch": 0.9087629640763565, + "flos": 21949617744360.0, + "grad_norm": 1.8309653729918647, + "language_loss": 0.70699573, + "learning_rate": 8.663235290207405e-08, + "loss": 0.73067099, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 1.11767578, + "router_z_loss_mlp": 0.13775635, + "step": 15115, + "time_per_iteration": 2.8163387775421143 + }, + { + "auxiliary_loss_clip": 0.01335911, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.22403109, + "balance_loss_mlp": 1.01911569, + "epoch": 0.9088230873290245, + "flos": 21768020847720.0, + "grad_norm": 1.5177913826774654, + "language_loss": 0.66034943, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68403584, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13616943, + "step": 15116, + "time_per_iteration": 2.8080005645751953 + }, + { + "auxiliary_loss_clip": 0.01316812, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.21359968, + "balance_loss_mlp": 1.01930642, + "epoch": 0.9088832105816925, + "flos": 21545995005720.0, + "grad_norm": 1.479868796444601, + "language_loss": 0.69433755, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71782774, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.12908936, + "step": 15117, + "time_per_iteration": 2.756293296813965 + }, + { + "auxiliary_loss_clip": 0.01325356, + "auxiliary_loss_mlp": 0.01028297, + "balance_loss_clip": 1.21943688, + "balance_loss_mlp": 1.01635242, + "epoch": 0.9089433338343604, + "flos": 26002558793880.0, + "grad_norm": 1.4925263564515028, + "language_loss": 0.74990666, + "learning_rate": 8.629252871571745e-08, + "loss": 0.77344328, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.11956787, + "step": 15118, + "time_per_iteration": 2.886230945587158 + }, + { + "auxiliary_loss_clip": 0.01336372, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.2229569, + "balance_loss_mlp": 1.02077937, + "epoch": 0.9090034570870285, + "flos": 21183369729480.0, + "grad_norm": 2.19700456914001, + "language_loss": 0.73555893, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75926703, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.13665771, + "step": 15119, + "time_per_iteration": 2.7895960807800293 + }, + { + "auxiliary_loss_clip": 0.01337805, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.22555411, + "balance_loss_mlp": 1.01314068, + "epoch": 0.9090635803396964, + "flos": 16145890792200.0, + "grad_norm": 2.1823623929273954, + "language_loss": 0.71768212, + "learning_rate": 8.60663420908827e-08, + "loss": 0.74132651, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.1350708, + "step": 15120, + "time_per_iteration": 2.789696455001831 + }, + { + "auxiliary_loss_clip": 0.0133069, + "auxiliary_loss_mlp": 0.01025599, + "balance_loss_clip": 1.22299612, + "balance_loss_mlp": 1.013291, + "epoch": 0.9091237035923644, + "flos": 20595997851120.0, + "grad_norm": 2.2222572059549113, + "language_loss": 0.6635015, + "learning_rate": 8.595335764115596e-08, + "loss": 0.68706441, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12304688, + "step": 15121, + "time_per_iteration": 2.786227226257324 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.21836972, + "balance_loss_mlp": 1.02265429, + "epoch": 0.9091838268450323, + "flos": 52236750629880.0, + "grad_norm": 1.7530439088890069, + "language_loss": 0.70766091, + "learning_rate": 8.58404457722699e-08, + "loss": 0.73126942, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12530518, + "step": 15122, + "time_per_iteration": 3.055321216583252 + }, + { + "auxiliary_loss_clip": 0.01321602, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.21595263, + "balance_loss_mlp": 1.01598406, + "epoch": 0.9092439500977003, + "flos": 20564665178400.0, + "grad_norm": 1.283829672445228, + "language_loss": 0.74762917, + "learning_rate": 8.572760648850575e-08, + "loss": 0.77112931, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12438965, + "step": 15123, + "time_per_iteration": 2.843780755996704 + }, + { + "auxiliary_loss_clip": 0.01314776, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.21057844, + "balance_loss_mlp": 1.01684213, + "epoch": 0.9093040733503682, + "flos": 28623296127600.0, + "grad_norm": 2.5134733924503387, + "language_loss": 0.76049995, + "learning_rate": 8.561483979414253e-08, + "loss": 0.78393948, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12335205, + "step": 15124, + "time_per_iteration": 2.905972719192505 + }, + { + "auxiliary_loss_clip": 0.01325589, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.21861315, + "balance_loss_mlp": 1.01601195, + "epoch": 0.9093641966030362, + "flos": 23445380189520.0, + "grad_norm": 2.153536045306202, + "language_loss": 0.73052132, + "learning_rate": 8.55021456934566e-08, + "loss": 0.75406903, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1315918, + "step": 15125, + "time_per_iteration": 2.9642395973205566 + }, + { + "auxiliary_loss_clip": 0.0132144, + "auxiliary_loss_mlp": 0.01037804, + "balance_loss_clip": 1.21660721, + "balance_loss_mlp": 1.02572262, + "epoch": 0.9094243198557042, + "flos": 16804577596680.0, + "grad_norm": 1.9096507831230725, + "language_loss": 0.79199588, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81558841, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12084961, + "step": 15126, + "time_per_iteration": 2.8073012828826904 + }, + { + "auxiliary_loss_clip": 0.01321897, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.21731746, + "balance_loss_mlp": 1.02115369, + "epoch": 0.9094844431083722, + "flos": 24277785868800.0, + "grad_norm": 1.437871027087334, + "language_loss": 0.75177801, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77533215, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.12353516, + "step": 15127, + "time_per_iteration": 2.826037645339966 + }, + { + "auxiliary_loss_clip": 0.01326772, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.21725893, + "balance_loss_mlp": 1.01694047, + "epoch": 0.9095445663610401, + "flos": 21949698961080.0, + "grad_norm": 1.9411791088652999, + "language_loss": 0.63446975, + "learning_rate": 8.516449899618173e-08, + "loss": 0.65803736, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13049316, + "step": 15128, + "time_per_iteration": 2.7611985206604004 + }, + { + "auxiliary_loss_clip": 0.0132021, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.21421695, + "balance_loss_mlp": 1.01405406, + "epoch": 0.9096046896137081, + "flos": 19797726821400.0, + "grad_norm": 1.6542541495980565, + "language_loss": 0.76626909, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78973579, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12414551, + "step": 15129, + "time_per_iteration": 2.8074731826782227 + }, + { + "auxiliary_loss_clip": 0.01326646, + "auxiliary_loss_mlp": 0.01028827, + "balance_loss_clip": 1.21871781, + "balance_loss_mlp": 1.01588094, + "epoch": 0.909664812866376, + "flos": 22643251365600.0, + "grad_norm": 1.7568073981145338, + "language_loss": 0.83421189, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85776657, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12945557, + "step": 15130, + "time_per_iteration": 2.8220767974853516 + }, + { + "auxiliary_loss_clip": 0.01327321, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.21937585, + "balance_loss_mlp": 1.01804793, + "epoch": 0.909724936119044, + "flos": 39858622399440.0, + "grad_norm": 1.6899052700277466, + "language_loss": 0.74973047, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77331394, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.13000488, + "step": 15131, + "time_per_iteration": 2.9601492881774902 + }, + { + "auxiliary_loss_clip": 0.01324803, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.21736932, + "balance_loss_mlp": 1.02043962, + "epoch": 0.9097850593717121, + "flos": 35077344562080.0, + "grad_norm": 2.0562461962968666, + "language_loss": 0.59324306, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61682338, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12780762, + "step": 15132, + "time_per_iteration": 2.878798246383667 + }, + { + "auxiliary_loss_clip": 0.01325362, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.21835756, + "balance_loss_mlp": 1.01819408, + "epoch": 0.90984518262438, + "flos": 23372684579160.0, + "grad_norm": 1.3756353402394088, + "language_loss": 0.82613909, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84969151, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11676025, + "step": 15133, + "time_per_iteration": 2.7589666843414307 + }, + { + "auxiliary_loss_clip": 0.01330403, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.22170925, + "balance_loss_mlp": 1.01561272, + "epoch": 0.909905305877048, + "flos": 27528598094400.0, + "grad_norm": 1.6108409740277414, + "language_loss": 0.74303985, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76663178, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1315918, + "step": 15134, + "time_per_iteration": 2.82383394241333 + }, + { + "auxiliary_loss_clip": 0.01341346, + "auxiliary_loss_mlp": 0.01034943, + "balance_loss_clip": 1.22722983, + "balance_loss_mlp": 1.02175212, + "epoch": 0.9099654291297159, + "flos": 24352593113880.0, + "grad_norm": 1.4912241486920679, + "language_loss": 0.73208213, + "learning_rate": 8.437919827761786e-08, + "loss": 0.75584507, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13189697, + "step": 15135, + "time_per_iteration": 2.7842183113098145 + }, + { + "auxiliary_loss_clip": 0.01321521, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.21619785, + "balance_loss_mlp": 1.016868, + "epoch": 0.9100255523823839, + "flos": 21220225139160.0, + "grad_norm": 2.140705110288838, + "language_loss": 0.7014029, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72490633, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1194458, + "step": 15136, + "time_per_iteration": 2.892794609069824 + }, + { + "auxiliary_loss_clip": 0.01145132, + "auxiliary_loss_mlp": 0.01010621, + "balance_loss_clip": 1.10244775, + "balance_loss_mlp": 1.00798655, + "epoch": 0.9100856756350518, + "flos": 46063999261800.0, + "grad_norm": 0.8190192883257212, + "language_loss": 0.59318531, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61474282, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 15137, + "time_per_iteration": 3.02406907081604 + }, + { + "auxiliary_loss_clip": 0.01329504, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.22124982, + "balance_loss_mlp": 1.02173972, + "epoch": 0.9101457988877198, + "flos": 20234753259120.0, + "grad_norm": 1.569485228238605, + "language_loss": 0.82781291, + "learning_rate": 8.40437303497834e-08, + "loss": 0.85145271, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12731934, + "step": 15138, + "time_per_iteration": 2.796529769897461 + }, + { + "auxiliary_loss_clip": 0.01315259, + "auxiliary_loss_mlp": 0.01027495, + "balance_loss_clip": 1.2131418, + "balance_loss_mlp": 1.0159142, + "epoch": 0.9102059221403878, + "flos": 26620857261360.0, + "grad_norm": 1.8419584933434467, + "language_loss": 0.81333697, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83676451, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.11578369, + "step": 15139, + "time_per_iteration": 2.8827600479125977 + }, + { + "auxiliary_loss_clip": 0.01321225, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.214728, + "balance_loss_mlp": 1.01721954, + "epoch": 0.9102660453930558, + "flos": 21913655718600.0, + "grad_norm": 1.6155487709698917, + "language_loss": 0.77399445, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79750538, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12634277, + "step": 15140, + "time_per_iteration": 2.859555244445801 + }, + { + "auxiliary_loss_clip": 0.01322671, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.21620893, + "balance_loss_mlp": 1.01662183, + "epoch": 0.9103261686457237, + "flos": 36183900236400.0, + "grad_norm": 1.531626797524089, + "language_loss": 0.6615414, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68505788, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12365723, + "step": 15141, + "time_per_iteration": 4.370828151702881 + }, + { + "auxiliary_loss_clip": 0.01328454, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.21947789, + "balance_loss_mlp": 1.0229305, + "epoch": 0.9103862918983917, + "flos": 23883868528200.0, + "grad_norm": 1.6541398487609957, + "language_loss": 0.75413191, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77777189, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12609863, + "step": 15142, + "time_per_iteration": 2.8255674839019775 + }, + { + "auxiliary_loss_clip": 0.01321825, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.21584892, + "balance_loss_mlp": 1.02224755, + "epoch": 0.9104464151510596, + "flos": 14943672156960.0, + "grad_norm": 1.6073097745503073, + "language_loss": 0.64440238, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66796231, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.11932373, + "step": 15143, + "time_per_iteration": 4.403769016265869 + }, + { + "auxiliary_loss_clip": 0.01325196, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.21571136, + "balance_loss_mlp": 1.02211571, + "epoch": 0.9105065384037276, + "flos": 33662277574200.0, + "grad_norm": 1.7155557836455573, + "language_loss": 0.61215353, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63576293, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.13616943, + "step": 15144, + "time_per_iteration": 4.397980690002441 + }, + { + "auxiliary_loss_clip": 0.01312059, + "auxiliary_loss_mlp": 0.01028002, + "balance_loss_clip": 1.21080589, + "balance_loss_mlp": 1.01584256, + "epoch": 0.9105666616563957, + "flos": 24322113216720.0, + "grad_norm": 1.6334993087642407, + "language_loss": 0.71207303, + "learning_rate": 8.326351491278382e-08, + "loss": 0.73547363, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.12164307, + "step": 15145, + "time_per_iteration": 2.936476707458496 + }, + { + "auxiliary_loss_clip": 0.01314647, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.21039176, + "balance_loss_mlp": 1.01800621, + "epoch": 0.9106267849090636, + "flos": 29977809404760.0, + "grad_norm": 1.6442007928722397, + "language_loss": 0.71071249, + "learning_rate": 8.315234626222545e-08, + "loss": 0.73416042, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.12139893, + "step": 15146, + "time_per_iteration": 2.9920339584350586 + }, + { + "auxiliary_loss_clip": 0.01322043, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.21522379, + "balance_loss_mlp": 1.01849914, + "epoch": 0.9106869081617316, + "flos": 25343222255640.0, + "grad_norm": 1.7871118849067797, + "language_loss": 0.72941279, + "learning_rate": 8.304125029872233e-08, + "loss": 0.75293171, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.11352539, + "step": 15147, + "time_per_iteration": 2.7951102256774902 + }, + { + "auxiliary_loss_clip": 0.01330522, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.2203536, + "balance_loss_mlp": 1.01543713, + "epoch": 0.9107470314143995, + "flos": 18191641797360.0, + "grad_norm": 2.4050517867696906, + "language_loss": 0.8021729, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82575995, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12756348, + "step": 15148, + "time_per_iteration": 2.758687973022461 + }, + { + "auxiliary_loss_clip": 0.01329315, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.21990681, + "balance_loss_mlp": 1.02253652, + "epoch": 0.9108071546670675, + "flos": 23556190068720.0, + "grad_norm": 1.5965180285522753, + "language_loss": 0.68070138, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70435721, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13720703, + "step": 15149, + "time_per_iteration": 2.7590994834899902 + }, + { + "auxiliary_loss_clip": 0.01326804, + "auxiliary_loss_mlp": 0.01029929, + "balance_loss_clip": 1.21880651, + "balance_loss_mlp": 1.01681042, + "epoch": 0.9108672779197354, + "flos": 25635913290000.0, + "grad_norm": 2.9721957299795965, + "language_loss": 0.63505888, + "learning_rate": 8.270839857265776e-08, + "loss": 0.6586262, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13128662, + "step": 15150, + "time_per_iteration": 2.853215217590332 + }, + { + "auxiliary_loss_clip": 0.01328407, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.22131896, + "balance_loss_mlp": 1.0158639, + "epoch": 0.9109274011724035, + "flos": 22343291434800.0, + "grad_norm": 1.9865980144689974, + "language_loss": 0.72899234, + "learning_rate": 8.259759339947514e-08, + "loss": 0.75256234, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12750244, + "step": 15151, + "time_per_iteration": 4.316228151321411 + }, + { + "auxiliary_loss_clip": 0.0132522, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.21858037, + "balance_loss_mlp": 1.01561618, + "epoch": 0.9109875244250714, + "flos": 26693918346960.0, + "grad_norm": 1.6107557661551286, + "language_loss": 0.64715242, + "learning_rate": 8.248686093438429e-08, + "loss": 0.67068815, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12750244, + "step": 15152, + "time_per_iteration": 2.774240493774414 + }, + { + "auxiliary_loss_clip": 0.01324729, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.21764278, + "balance_loss_mlp": 1.01615083, + "epoch": 0.9110476476777394, + "flos": 22935333274560.0, + "grad_norm": 1.817009450086546, + "language_loss": 0.74104589, + "learning_rate": 8.23762011815834e-08, + "loss": 0.76458442, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12976074, + "step": 15153, + "time_per_iteration": 2.7534594535827637 + }, + { + "auxiliary_loss_clip": 0.01329111, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.22065783, + "balance_loss_mlp": 1.01613319, + "epoch": 0.9111077709304073, + "flos": 13475871890640.0, + "grad_norm": 1.7072218097560945, + "language_loss": 0.71991587, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74349928, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13085938, + "step": 15154, + "time_per_iteration": 2.7203218936920166 + }, + { + "auxiliary_loss_clip": 0.01321619, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.21636534, + "balance_loss_mlp": 1.01910126, + "epoch": 0.9111678941830753, + "flos": 20855528836560.0, + "grad_norm": 1.6834755486721449, + "language_loss": 0.81776297, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84128922, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.11907959, + "step": 15155, + "time_per_iteration": 2.809643268585205 + }, + { + "auxiliary_loss_clip": 0.01323509, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.21721601, + "balance_loss_mlp": 1.01582623, + "epoch": 0.9112280174357432, + "flos": 19686957550560.0, + "grad_norm": 1.476281311048987, + "language_loss": 0.60091716, + "learning_rate": 8.204465823887252e-08, + "loss": 0.62443519, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12457275, + "step": 15156, + "time_per_iteration": 2.803422689437866 + }, + { + "auxiliary_loss_clip": 0.01330703, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.21888983, + "balance_loss_mlp": 1.01824152, + "epoch": 0.9112881406884112, + "flos": 25452651450600.0, + "grad_norm": 1.8149963995142915, + "language_loss": 0.74225307, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76587701, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13433838, + "step": 15157, + "time_per_iteration": 2.791942834854126 + }, + { + "auxiliary_loss_clip": 0.01327416, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.21911025, + "balance_loss_mlp": 1.02140903, + "epoch": 0.9113482639410793, + "flos": 33072712844400.0, + "grad_norm": 1.5936529355588762, + "language_loss": 0.59599471, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61959821, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.11523438, + "step": 15158, + "time_per_iteration": 2.9494800567626953 + }, + { + "auxiliary_loss_clip": 0.01323096, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.21627069, + "balance_loss_mlp": 1.02642381, + "epoch": 0.9114083871937472, + "flos": 21840675849720.0, + "grad_norm": 1.5072324212481978, + "language_loss": 0.68264568, + "learning_rate": 8.171376985767375e-08, + "loss": 0.70626205, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12115479, + "step": 15159, + "time_per_iteration": 2.8410027027130127 + }, + { + "auxiliary_loss_clip": 0.01324702, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.21632195, + "balance_loss_mlp": 1.01593256, + "epoch": 0.9114685104464152, + "flos": 27094698500400.0, + "grad_norm": 1.8808220142749446, + "language_loss": 0.78415465, + "learning_rate": 8.160361920824588e-08, + "loss": 0.8076914, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13012695, + "step": 15160, + "time_per_iteration": 2.826648473739624 + }, + { + "auxiliary_loss_clip": 0.01329988, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.22211051, + "balance_loss_mlp": 1.016891, + "epoch": 0.9115286336990831, + "flos": 17971483939920.0, + "grad_norm": 1.6813612962237927, + "language_loss": 0.69490194, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71850616, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13568115, + "step": 15161, + "time_per_iteration": 2.752863645553589 + }, + { + "auxiliary_loss_clip": 0.0132489, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.2169373, + "balance_loss_mlp": 1.01651287, + "epoch": 0.9115887569517511, + "flos": 22935049016040.0, + "grad_norm": 1.7818405352143425, + "language_loss": 0.76695478, + "learning_rate": 8.138353615091321e-08, + "loss": 0.79050487, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13623047, + "step": 15162, + "time_per_iteration": 2.8664462566375732 + }, + { + "auxiliary_loss_clip": 0.01328113, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.22046697, + "balance_loss_mlp": 1.02168989, + "epoch": 0.911648880204419, + "flos": 23993825631840.0, + "grad_norm": 1.996516541007526, + "language_loss": 0.67237616, + "learning_rate": 8.127360375135395e-08, + "loss": 0.69600129, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12719727, + "step": 15163, + "time_per_iteration": 2.840165376663208 + }, + { + "auxiliary_loss_clip": 0.01335994, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.22441423, + "balance_loss_mlp": 1.02041948, + "epoch": 0.911709003457087, + "flos": 17059682270880.0, + "grad_norm": 2.313015789772253, + "language_loss": 0.70921552, + "learning_rate": 8.116374411009186e-08, + "loss": 0.73290879, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.12921143, + "step": 15164, + "time_per_iteration": 2.7888336181640625 + }, + { + "auxiliary_loss_clip": 0.01319191, + "auxiliary_loss_mlp": 0.01034035, + "balance_loss_clip": 1.21639645, + "balance_loss_mlp": 1.02120185, + "epoch": 0.911769126709755, + "flos": 21658429219320.0, + "grad_norm": 1.4648250717126214, + "language_loss": 0.7616055, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78513777, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.12841797, + "step": 15165, + "time_per_iteration": 2.7837483882904053 + }, + { + "auxiliary_loss_clip": 0.01327822, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.21961486, + "balance_loss_mlp": 1.01909959, + "epoch": 0.911829249962423, + "flos": 24795710805600.0, + "grad_norm": 2.14335956585487, + "language_loss": 0.72147197, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74506819, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1270752, + "step": 15166, + "time_per_iteration": 2.858208417892456 + }, + { + "auxiliary_loss_clip": 0.013318, + "auxiliary_loss_mlp": 0.01032603, + "balance_loss_clip": 1.22225416, + "balance_loss_mlp": 1.0192101, + "epoch": 0.9118893732150909, + "flos": 20964308297760.0, + "grad_norm": 1.7013898036524597, + "language_loss": 0.73053753, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75418156, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13397217, + "step": 15167, + "time_per_iteration": 2.8139638900756836 + }, + { + "auxiliary_loss_clip": 0.01141234, + "auxiliary_loss_mlp": 0.01000498, + "balance_loss_clip": 1.09887123, + "balance_loss_mlp": 0.9979586, + "epoch": 0.9119494964677589, + "flos": 67933287024120.0, + "grad_norm": 0.8160153307314547, + "language_loss": 0.65637672, + "learning_rate": 8.072503321129298e-08, + "loss": 0.6777941, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02539062, + "step": 15168, + "time_per_iteration": 3.1935336589813232 + }, + { + "auxiliary_loss_clip": 0.01326017, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.21947742, + "balance_loss_mlp": 1.01954377, + "epoch": 0.9120096197204268, + "flos": 18556135058160.0, + "grad_norm": 1.9927210645485613, + "language_loss": 0.78612071, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80969805, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12164307, + "step": 15169, + "time_per_iteration": 2.9087626934051514 + }, + { + "auxiliary_loss_clip": 0.01324166, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.21674943, + "balance_loss_mlp": 1.01622486, + "epoch": 0.9120697429730948, + "flos": 19030382380800.0, + "grad_norm": 1.640743103568866, + "language_loss": 0.8233819, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84690857, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12286377, + "step": 15170, + "time_per_iteration": 2.7571372985839844 + }, + { + "auxiliary_loss_clip": 0.01329246, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.22147572, + "balance_loss_mlp": 1.01540542, + "epoch": 0.9121298662257629, + "flos": 17167933823400.0, + "grad_norm": 1.865134011687619, + "language_loss": 0.77765208, + "learning_rate": 8.039676420316799e-08, + "loss": 0.8012256, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.1270752, + "step": 15171, + "time_per_iteration": 2.697277784347534 + }, + { + "auxiliary_loss_clip": 0.01319885, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.21436834, + "balance_loss_mlp": 1.01931262, + "epoch": 0.9121899894784308, + "flos": 19687485459240.0, + "grad_norm": 1.2653105497616832, + "language_loss": 0.67135876, + "learning_rate": 8.02874867780241e-08, + "loss": 0.6948781, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12713623, + "step": 15172, + "time_per_iteration": 2.7497122287750244 + }, + { + "auxiliary_loss_clip": 0.01326833, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.21929669, + "balance_loss_mlp": 1.01797593, + "epoch": 0.9122501127310988, + "flos": 22240725052680.0, + "grad_norm": 1.6726492492479301, + "language_loss": 0.75200421, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77558374, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13153076, + "step": 15173, + "time_per_iteration": 2.7411510944366455 + }, + { + "auxiliary_loss_clip": 0.01334416, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.22238183, + "balance_loss_mlp": 1.01921582, + "epoch": 0.9123102359837667, + "flos": 15960882793320.0, + "grad_norm": 2.119309492814404, + "language_loss": 0.66250885, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68618912, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 1.12158203, + "router_z_loss_mlp": 0.1439209, + "step": 15174, + "time_per_iteration": 2.746565341949463 + }, + { + "auxiliary_loss_clip": 0.01326713, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.21720243, + "balance_loss_mlp": 1.0165906, + "epoch": 0.9123703592364347, + "flos": 25161300492120.0, + "grad_norm": 1.5771630001728527, + "language_loss": 0.75360692, + "learning_rate": 7.996009129329894e-08, + "loss": 0.77717876, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13885498, + "step": 15175, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.01143295, + "auxiliary_loss_mlp": 0.01003235, + "balance_loss_clip": 1.10083604, + "balance_loss_mlp": 1.00075519, + "epoch": 0.9124304824891026, + "flos": 60816490949160.0, + "grad_norm": 0.9609022848026679, + "language_loss": 0.58446729, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60593259, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02478027, + "step": 15176, + "time_per_iteration": 3.3623249530792236 + }, + { + "auxiliary_loss_clip": 0.01328173, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.21996391, + "balance_loss_mlp": 1.0201683, + "epoch": 0.9124906057417707, + "flos": 18155598554880.0, + "grad_norm": 1.9187357800724998, + "language_loss": 0.65810585, + "learning_rate": 7.97421916704475e-08, + "loss": 0.68171167, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12261963, + "step": 15177, + "time_per_iteration": 2.8350424766540527 + }, + { + "auxiliary_loss_clip": 0.01321228, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.21591306, + "balance_loss_mlp": 1.01784301, + "epoch": 0.9125507289944386, + "flos": 11689773696000.0, + "grad_norm": 1.915495328305365, + "language_loss": 0.81281507, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83632541, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11956787, + "step": 15178, + "time_per_iteration": 2.7103588581085205 + }, + { + "auxiliary_loss_clip": 0.0132301, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.2168529, + "balance_loss_mlp": 1.01630402, + "epoch": 0.9126108522471066, + "flos": 17753315892120.0, + "grad_norm": 1.9164377703116993, + "language_loss": 0.78948653, + "learning_rate": 7.952458331306711e-08, + "loss": 0.81300569, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12597656, + "step": 15179, + "time_per_iteration": 4.108703851699829 + }, + { + "auxiliary_loss_clip": 0.01324401, + "auxiliary_loss_mlp": 0.01029344, + "balance_loss_clip": 1.21922243, + "balance_loss_mlp": 1.01704764, + "epoch": 0.9126709754997745, + "flos": 27641438391600.0, + "grad_norm": 1.5743671874434524, + "language_loss": 0.68412745, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70766491, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12298584, + "step": 15180, + "time_per_iteration": 2.8907434940338135 + }, + { + "auxiliary_loss_clip": 0.01318266, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.21338844, + "balance_loss_mlp": 1.01548386, + "epoch": 0.9127310987524425, + "flos": 15929306470440.0, + "grad_norm": 1.8949764604400812, + "language_loss": 0.75423229, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77769017, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12036133, + "step": 15181, + "time_per_iteration": 4.421180009841919 + }, + { + "auxiliary_loss_clip": 0.01334268, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.22362423, + "balance_loss_mlp": 1.01718032, + "epoch": 0.9127912220051104, + "flos": 21541040785800.0, + "grad_norm": 1.6419516972012176, + "language_loss": 0.74835593, + "learning_rate": 7.919871697194614e-08, + "loss": 0.77199417, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.12371826, + "step": 15182, + "time_per_iteration": 4.266456842422485 + }, + { + "auxiliary_loss_clip": 0.0132856, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.21934295, + "balance_loss_mlp": 1.01818943, + "epoch": 0.9128513452577784, + "flos": 24069242002320.0, + "grad_norm": 1.4549497751029943, + "language_loss": 0.76397932, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78757238, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.12561035, + "step": 15183, + "time_per_iteration": 2.825289011001587 + }, + { + "auxiliary_loss_clip": 0.01331761, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.22212625, + "balance_loss_mlp": 1.01640546, + "epoch": 0.9129114685104465, + "flos": 16220779254000.0, + "grad_norm": 2.0229932036148206, + "language_loss": 0.76893473, + "learning_rate": 7.898183692255256e-08, + "loss": 0.79254556, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12921143, + "step": 15184, + "time_per_iteration": 2.7131717205047607 + }, + { + "auxiliary_loss_clip": 0.01325986, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.2185626, + "balance_loss_mlp": 1.01822376, + "epoch": 0.9129715917631144, + "flos": 19388256478920.0, + "grad_norm": 1.6146025186725779, + "language_loss": 0.74761707, + "learning_rate": 7.887350616360233e-08, + "loss": 0.77118492, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12579346, + "step": 15185, + "time_per_iteration": 2.811018466949463 + }, + { + "auxiliary_loss_clip": 0.01323476, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.21696258, + "balance_loss_mlp": 1.01952887, + "epoch": 0.9130317150157824, + "flos": 20594779600320.0, + "grad_norm": 1.8732001627085084, + "language_loss": 0.68541074, + "learning_rate": 7.876524825396158e-08, + "loss": 0.7089656, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12481689, + "step": 15186, + "time_per_iteration": 2.8326892852783203 + }, + { + "auxiliary_loss_clip": 0.01336914, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.22350168, + "balance_loss_mlp": 1.01970029, + "epoch": 0.9130918382684503, + "flos": 20193512146560.0, + "grad_norm": 1.799740605594922, + "language_loss": 0.77797067, + "learning_rate": 7.865706319773502e-08, + "loss": 0.80167085, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.13415527, + "step": 15187, + "time_per_iteration": 2.7756128311157227 + }, + { + "auxiliary_loss_clip": 0.0132449, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.2169615, + "balance_loss_mlp": 1.01766646, + "epoch": 0.9131519615211183, + "flos": 25562405512440.0, + "grad_norm": 1.8067205259721764, + "language_loss": 0.65830612, + "learning_rate": 7.854895099902515e-08, + "loss": 0.68184453, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.11688232, + "step": 15188, + "time_per_iteration": 4.29918360710144 + }, + { + "auxiliary_loss_clip": 0.01320286, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.21436071, + "balance_loss_mlp": 1.02040517, + "epoch": 0.9132120847737862, + "flos": 17936131039560.0, + "grad_norm": 1.6816946535684716, + "language_loss": 0.76386672, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78739738, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.1237793, + "step": 15189, + "time_per_iteration": 2.744048833847046 + }, + { + "auxiliary_loss_clip": 0.0131964, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.21406972, + "balance_loss_mlp": 1.01831758, + "epoch": 0.9132722080264543, + "flos": 20052466020360.0, + "grad_norm": 1.6528478170145882, + "language_loss": 0.75849718, + "learning_rate": 7.8332945190551e-08, + "loss": 0.78198957, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.11273193, + "step": 15190, + "time_per_iteration": 2.896811008453369 + }, + { + "auxiliary_loss_clip": 0.01144534, + "auxiliary_loss_mlp": 0.0100171, + "balance_loss_clip": 1.10190618, + "balance_loss_mlp": 0.99915904, + "epoch": 0.9133323312791222, + "flos": 70456858887600.0, + "grad_norm": 0.8870805741902517, + "language_loss": 0.57416797, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59563041, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0255127, + "step": 15191, + "time_per_iteration": 3.369863271713257 + }, + { + "auxiliary_loss_clip": 0.01331455, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.22191846, + "balance_loss_mlp": 1.01748824, + "epoch": 0.9133924545317902, + "flos": 25489303818480.0, + "grad_norm": 1.7617033201373786, + "language_loss": 0.74970913, + "learning_rate": 7.81172308613034e-08, + "loss": 0.77333355, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13482666, + "step": 15192, + "time_per_iteration": 2.7670774459838867 + }, + { + "auxiliary_loss_clip": 0.0131903, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.21450472, + "balance_loss_mlp": 1.01819277, + "epoch": 0.9134525777844581, + "flos": 39937896564120.0, + "grad_norm": 1.603430975082704, + "language_loss": 0.69087851, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71437407, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12335205, + "step": 15193, + "time_per_iteration": 2.928522825241089 + }, + { + "auxiliary_loss_clip": 0.01319785, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.21485209, + "balance_loss_mlp": 1.02424085, + "epoch": 0.9135127010371261, + "flos": 20891653295760.0, + "grad_norm": 1.5390089036256542, + "language_loss": 0.73635459, + "learning_rate": 7.790180804400215e-08, + "loss": 0.7599225, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.12768555, + "step": 15194, + "time_per_iteration": 2.7160744667053223 + }, + { + "auxiliary_loss_clip": 0.01332656, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.22095609, + "balance_loss_mlp": 1.01687229, + "epoch": 0.913572824289794, + "flos": 20818104909840.0, + "grad_norm": 2.5653016638303137, + "language_loss": 0.62025315, + "learning_rate": 7.779420596254383e-08, + "loss": 0.64388984, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.14147949, + "step": 15195, + "time_per_iteration": 2.767519950866699 + }, + { + "auxiliary_loss_clip": 0.01329989, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.22234774, + "balance_loss_mlp": 1.01973975, + "epoch": 0.913632947542462, + "flos": 25708893158880.0, + "grad_norm": 1.4625410428136485, + "language_loss": 0.71445084, + "learning_rate": 7.768667677132201e-08, + "loss": 0.7380743, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12615967, + "step": 15196, + "time_per_iteration": 2.914397954940796 + }, + { + "auxiliary_loss_clip": 0.01324503, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.21807539, + "balance_loss_mlp": 1.01914835, + "epoch": 0.9136930707951301, + "flos": 26291798117640.0, + "grad_norm": 1.5140732492708757, + "language_loss": 0.71665817, + "learning_rate": 7.757922047441411e-08, + "loss": 0.74021429, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11956787, + "step": 15197, + "time_per_iteration": 2.8588979244232178 + }, + { + "auxiliary_loss_clip": 0.01331583, + "auxiliary_loss_mlp": 0.01026894, + "balance_loss_clip": 1.22135258, + "balance_loss_mlp": 1.01400757, + "epoch": 0.913753194047798, + "flos": 22097120599800.0, + "grad_norm": 2.7533201748651037, + "language_loss": 0.78040242, + "learning_rate": 7.747183707589489e-08, + "loss": 0.80398721, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12896729, + "step": 15198, + "time_per_iteration": 2.814131259918213 + }, + { + "auxiliary_loss_clip": 0.01321015, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.21549404, + "balance_loss_mlp": 1.0230546, + "epoch": 0.913813317300466, + "flos": 23592558178080.0, + "grad_norm": 1.3435648738724937, + "language_loss": 0.67905593, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70261961, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12310791, + "step": 15199, + "time_per_iteration": 2.821277379989624 + }, + { + "auxiliary_loss_clip": 0.01331113, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.22240746, + "balance_loss_mlp": 1.02336657, + "epoch": 0.9138734405531339, + "flos": 28882339812720.0, + "grad_norm": 1.5018600594913671, + "language_loss": 0.67757839, + "learning_rate": 7.725728899030714e-08, + "loss": 0.70124471, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.12145996, + "step": 15200, + "time_per_iteration": 2.89320969581604 + }, + { + "auxiliary_loss_clip": 0.01316932, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.21243036, + "balance_loss_mlp": 1.0182476, + "epoch": 0.9139335638058019, + "flos": 22826350771560.0, + "grad_norm": 1.6397574215503041, + "language_loss": 0.71713221, + "learning_rate": 7.715012431137435e-08, + "loss": 0.74060178, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.11773682, + "step": 15201, + "time_per_iteration": 2.8143508434295654 + }, + { + "auxiliary_loss_clip": 0.01325776, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.21833563, + "balance_loss_mlp": 1.01395679, + "epoch": 0.9139936870584698, + "flos": 18008704824840.0, + "grad_norm": 1.801169534784985, + "language_loss": 0.70772505, + "learning_rate": 7.704303254710165e-08, + "loss": 0.73124349, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12121582, + "step": 15202, + "time_per_iteration": 2.9156980514526367 + }, + { + "auxiliary_loss_clip": 0.0132738, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.21902096, + "balance_loss_mlp": 1.01886821, + "epoch": 0.9140538103111379, + "flos": 15817846857480.0, + "grad_norm": 1.827355932932087, + "language_loss": 0.66869688, + "learning_rate": 7.693601370155001e-08, + "loss": 0.69228983, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13049316, + "step": 15203, + "time_per_iteration": 2.8214447498321533 + }, + { + "auxiliary_loss_clip": 0.01330623, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.22305489, + "balance_loss_mlp": 1.01867867, + "epoch": 0.9141139335638058, + "flos": 23992241905800.0, + "grad_norm": 3.1879615614315098, + "language_loss": 0.69012994, + "learning_rate": 7.682906777877751e-08, + "loss": 0.71375269, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12963867, + "step": 15204, + "time_per_iteration": 2.8622004985809326 + }, + { + "auxiliary_loss_clip": 0.01326389, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.21778059, + "balance_loss_mlp": 1.01682448, + "epoch": 0.9141740568164738, + "flos": 24029747049240.0, + "grad_norm": 2.0513728029513874, + "language_loss": 0.60109174, + "learning_rate": 7.672219478283915e-08, + "loss": 0.62465572, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1317749, + "step": 15205, + "time_per_iteration": 2.877211809158325 + }, + { + "auxiliary_loss_clip": 0.01319947, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.21591568, + "balance_loss_mlp": 1.01683569, + "epoch": 0.9142341800691417, + "flos": 27024276958200.0, + "grad_norm": 1.6786536073985519, + "language_loss": 0.81519628, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83869684, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.1328125, + "step": 15206, + "time_per_iteration": 2.8619236946105957 + }, + { + "auxiliary_loss_clip": 0.01330261, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.22126019, + "balance_loss_mlp": 1.01357257, + "epoch": 0.9142943033218097, + "flos": 20417365364760.0, + "grad_norm": 2.537600782415353, + "language_loss": 0.74172497, + "learning_rate": 7.650866758767382e-08, + "loss": 0.76529562, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13232422, + "step": 15207, + "time_per_iteration": 2.891465663909912 + }, + { + "auxiliary_loss_clip": 0.01318964, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.21256459, + "balance_loss_mlp": 1.02162552, + "epoch": 0.9143544265744776, + "flos": 19760059244520.0, + "grad_norm": 1.5407918231262296, + "language_loss": 0.73018062, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75371772, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13128662, + "step": 15208, + "time_per_iteration": 2.7385013103485107 + }, + { + "auxiliary_loss_clip": 0.01323819, + "auxiliary_loss_mlp": 0.01026594, + "balance_loss_clip": 1.21860766, + "balance_loss_mlp": 1.01453042, + "epoch": 0.9144145498271457, + "flos": 17169923633040.0, + "grad_norm": 3.381323268858311, + "language_loss": 0.86315048, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88665462, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.1206665, + "step": 15209, + "time_per_iteration": 2.7831833362579346 + }, + { + "auxiliary_loss_clip": 0.01321502, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.2152946, + "balance_loss_mlp": 1.02327275, + "epoch": 0.9144746730798137, + "flos": 23730518068920.0, + "grad_norm": 1.5541381538170196, + "language_loss": 0.7555899, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77916193, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12432861, + "step": 15210, + "time_per_iteration": 2.8574721813201904 + }, + { + "auxiliary_loss_clip": 0.01327087, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.21895385, + "balance_loss_mlp": 1.01784718, + "epoch": 0.9145347963324816, + "flos": 25853309778960.0, + "grad_norm": 1.7145500232937485, + "language_loss": 0.78276634, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80634087, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.12524414, + "step": 15211, + "time_per_iteration": 2.8987553119659424 + }, + { + "auxiliary_loss_clip": 0.01325729, + "auxiliary_loss_mlp": 0.01025795, + "balance_loss_clip": 1.2182312, + "balance_loss_mlp": 1.0136776, + "epoch": 0.9145949195851496, + "flos": 19247210352720.0, + "grad_norm": 1.5915779809919401, + "language_loss": 0.83158445, + "learning_rate": 7.597612610270986e-08, + "loss": 0.85509968, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12127686, + "step": 15212, + "time_per_iteration": 2.8515963554382324 + }, + { + "auxiliary_loss_clip": 0.01317777, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.21384871, + "balance_loss_mlp": 1.0167594, + "epoch": 0.9146550428378175, + "flos": 18300989775600.0, + "grad_norm": 1.679731611594971, + "language_loss": 0.84046626, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86392814, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 1.03759766, + "router_z_loss_mlp": 0.11645508, + "step": 15213, + "time_per_iteration": 2.857041835784912 + }, + { + "auxiliary_loss_clip": 0.01327693, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.21961069, + "balance_loss_mlp": 1.01813257, + "epoch": 0.9147151660904855, + "flos": 20089118388240.0, + "grad_norm": 1.6762158688965587, + "language_loss": 0.70778227, + "learning_rate": 7.576362019471894e-08, + "loss": 0.73136544, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12475586, + "step": 15214, + "time_per_iteration": 2.77506947517395 + }, + { + "auxiliary_loss_clip": 0.01334229, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.22373354, + "balance_loss_mlp": 1.02163851, + "epoch": 0.9147752893431534, + "flos": 24394483960200.0, + "grad_norm": 1.6915559435906153, + "language_loss": 0.63082355, + "learning_rate": 7.565747668956413e-08, + "loss": 0.65451449, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13226318, + "step": 15215, + "time_per_iteration": 2.8001749515533447 + }, + { + "auxiliary_loss_clip": 0.01336666, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.2253437, + "balance_loss_mlp": 1.01653004, + "epoch": 0.9148354125958215, + "flos": 18154989429480.0, + "grad_norm": 2.7296935714921156, + "language_loss": 0.76990116, + "learning_rate": 7.555140615567058e-08, + "loss": 0.79356134, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12835693, + "step": 15216, + "time_per_iteration": 2.806459426879883 + }, + { + "auxiliary_loss_clip": 0.01324584, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.21810961, + "balance_loss_mlp": 1.02100849, + "epoch": 0.9148955358484894, + "flos": 23372765795880.0, + "grad_norm": 2.122209414115213, + "language_loss": 0.68583739, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70941985, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12652588, + "step": 15217, + "time_per_iteration": 4.338181972503662 + }, + { + "auxiliary_loss_clip": 0.01326664, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.22048116, + "balance_loss_mlp": 1.01616085, + "epoch": 0.9149556591011574, + "flos": 18080953743240.0, + "grad_norm": 3.7257264366112572, + "language_loss": 0.80420136, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82775819, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12866211, + "step": 15218, + "time_per_iteration": 2.8139162063598633 + }, + { + "auxiliary_loss_clip": 0.01143243, + "auxiliary_loss_mlp": 0.01005509, + "balance_loss_clip": 1.10104418, + "balance_loss_mlp": 1.00267208, + "epoch": 0.9150157823538253, + "flos": 54598745518200.0, + "grad_norm": 0.8497811606978476, + "language_loss": 0.59175837, + "learning_rate": 7.523363242176595e-08, + "loss": 0.6132459, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02832031, + "step": 15219, + "time_per_iteration": 3.2490053176879883 + }, + { + "auxiliary_loss_clip": 0.0131795, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.21271455, + "balance_loss_mlp": 1.02166724, + "epoch": 0.9150759056064933, + "flos": 17897407645320.0, + "grad_norm": 1.7017599936319876, + "language_loss": 0.78435147, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80786979, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12213135, + "step": 15220, + "time_per_iteration": 4.352906942367554 + }, + { + "auxiliary_loss_clip": 0.01330415, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.21927488, + "balance_loss_mlp": 1.02074409, + "epoch": 0.9151360288591612, + "flos": 18077583249360.0, + "grad_norm": 2.7410728051590016, + "language_loss": 0.65669781, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68034458, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13513184, + "step": 15221, + "time_per_iteration": 4.272027492523193 + }, + { + "auxiliary_loss_clip": 0.01324278, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.21594942, + "balance_loss_mlp": 1.02268243, + "epoch": 0.9151961521118293, + "flos": 19359279091080.0, + "grad_norm": 1.9115824856617945, + "language_loss": 0.84754598, + "learning_rate": 7.491651557384692e-08, + "loss": 0.87114012, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12451172, + "step": 15222, + "time_per_iteration": 2.8403449058532715 + }, + { + "auxiliary_loss_clip": 0.01143025, + "auxiliary_loss_mlp": 0.01002742, + "balance_loss_clip": 1.10102916, + "balance_loss_mlp": 1.00021446, + "epoch": 0.9152562753644973, + "flos": 72162667708560.0, + "grad_norm": 0.7288758678990306, + "language_loss": 0.49656564, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51802325, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.02526855, + "step": 15223, + "time_per_iteration": 3.3109259605407715 + }, + { + "auxiliary_loss_clip": 0.01329914, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.22139549, + "balance_loss_mlp": 1.02663565, + "epoch": 0.9153163986171652, + "flos": 20781899233920.0, + "grad_norm": 3.0901673333314577, + "language_loss": 0.72706306, + "learning_rate": 7.470546933201349e-08, + "loss": 0.75076163, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13305664, + "step": 15224, + "time_per_iteration": 2.8713581562042236 + }, + { + "auxiliary_loss_clip": 0.01321661, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.21521938, + "balance_loss_mlp": 1.01366282, + "epoch": 0.9153765218698332, + "flos": 23045412203280.0, + "grad_norm": 1.8984083918644539, + "language_loss": 0.81710649, + "learning_rate": 7.460005572013895e-08, + "loss": 0.84058797, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12841797, + "step": 15225, + "time_per_iteration": 2.764971971511841 + }, + { + "auxiliary_loss_clip": 0.01325922, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.21893072, + "balance_loss_mlp": 1.01632166, + "epoch": 0.9154366451225011, + "flos": 28997454178080.0, + "grad_norm": 1.3000348991995079, + "language_loss": 0.71555716, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73910344, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1239624, + "step": 15226, + "time_per_iteration": 2.8455216884613037 + }, + { + "auxiliary_loss_clip": 0.01327225, + "auxiliary_loss_mlp": 0.01027106, + "balance_loss_clip": 1.21866155, + "balance_loss_mlp": 1.01441622, + "epoch": 0.9154967683751691, + "flos": 22314598305480.0, + "grad_norm": 2.433018745091948, + "language_loss": 0.75238967, + "learning_rate": 7.43894475344613e-08, + "loss": 0.77593303, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12689209, + "step": 15227, + "time_per_iteration": 4.333539009094238 + }, + { + "auxiliary_loss_clip": 0.01323288, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.21710634, + "balance_loss_mlp": 1.01777744, + "epoch": 0.915556891627837, + "flos": 24576852415680.0, + "grad_norm": 1.524923610533926, + "language_loss": 0.74303508, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76656795, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12231445, + "step": 15228, + "time_per_iteration": 2.7974040508270264 + }, + { + "auxiliary_loss_clip": 0.01324531, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.21769142, + "balance_loss_mlp": 1.01958299, + "epoch": 0.9156170148805051, + "flos": 22169897426880.0, + "grad_norm": 1.5051387370802827, + "language_loss": 0.72070783, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74426937, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12036133, + "step": 15229, + "time_per_iteration": 2.788494110107422 + }, + { + "auxiliary_loss_clip": 0.01322012, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.21397448, + "balance_loss_mlp": 1.02462721, + "epoch": 0.915677138133173, + "flos": 20925260036640.0, + "grad_norm": 1.5524584929985434, + "language_loss": 0.83127522, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85487306, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13146973, + "step": 15230, + "time_per_iteration": 2.7934985160827637 + }, + { + "auxiliary_loss_clip": 0.01322854, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.21740365, + "balance_loss_mlp": 1.01565289, + "epoch": 0.915737261385841, + "flos": 24349506878520.0, + "grad_norm": 1.525866862988241, + "language_loss": 0.84108233, + "learning_rate": 7.396910742713957e-08, + "loss": 0.86458999, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12249756, + "step": 15231, + "time_per_iteration": 2.7886552810668945 + }, + { + "auxiliary_loss_clip": 0.01322323, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.21653414, + "balance_loss_mlp": 1.01561999, + "epoch": 0.9157973846385089, + "flos": 26767141866000.0, + "grad_norm": 1.5211999674854535, + "language_loss": 0.72485089, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74835652, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12615967, + "step": 15232, + "time_per_iteration": 2.819495439529419 + }, + { + "auxiliary_loss_clip": 0.01327528, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.21961784, + "balance_loss_mlp": 1.01746953, + "epoch": 0.9158575078911769, + "flos": 18483358231080.0, + "grad_norm": 2.247936477981068, + "language_loss": 0.67763233, + "learning_rate": 7.375937556925338e-08, + "loss": 0.70120633, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12402344, + "step": 15233, + "time_per_iteration": 2.8389034271240234 + }, + { + "auxiliary_loss_clip": 0.01327218, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.21810615, + "balance_loss_mlp": 1.019207, + "epoch": 0.9159176311438448, + "flos": 21803982873480.0, + "grad_norm": 2.1443353044158293, + "language_loss": 0.70368409, + "learning_rate": 7.365461920317861e-08, + "loss": 0.72728288, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13446045, + "step": 15234, + "time_per_iteration": 2.9372997283935547 + }, + { + "auxiliary_loss_clip": 0.01330617, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.22173643, + "balance_loss_mlp": 1.01730895, + "epoch": 0.9159777543965129, + "flos": 24788279475720.0, + "grad_norm": 1.5804311281659602, + "language_loss": 0.88374019, + "learning_rate": 7.354993588431391e-08, + "loss": 0.9073475, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12811279, + "step": 15235, + "time_per_iteration": 2.9140207767486572 + }, + { + "auxiliary_loss_clip": 0.01327699, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.21891785, + "balance_loss_mlp": 1.01697326, + "epoch": 0.9160378776491809, + "flos": 26874256384440.0, + "grad_norm": 1.684381524103063, + "language_loss": 0.77564657, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79922438, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13116455, + "step": 15236, + "time_per_iteration": 2.8857946395874023 + }, + { + "auxiliary_loss_clip": 0.01145056, + "auxiliary_loss_mlp": 0.01002582, + "balance_loss_clip": 1.1026206, + "balance_loss_mlp": 0.99960178, + "epoch": 0.9160980009018488, + "flos": 70594128436320.0, + "grad_norm": 0.6847908137772751, + "language_loss": 0.62274337, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64421976, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02978516, + "step": 15237, + "time_per_iteration": 3.254887104034424 + }, + { + "auxiliary_loss_clip": 0.01327419, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.21870363, + "balance_loss_mlp": 1.01684427, + "epoch": 0.9161581241545168, + "flos": 16293637297800.0, + "grad_norm": 3.373063241288312, + "language_loss": 0.7475493, + "learning_rate": 7.323632425066151e-08, + "loss": 0.77113074, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13867188, + "step": 15238, + "time_per_iteration": 2.7669901847839355 + }, + { + "auxiliary_loss_clip": 0.01326445, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.21785247, + "balance_loss_mlp": 1.01438022, + "epoch": 0.9162182474071847, + "flos": 18442238943600.0, + "grad_norm": 1.6614000570446406, + "language_loss": 0.74752533, + "learning_rate": 7.313193316030464e-08, + "loss": 0.77105999, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12652588, + "step": 15239, + "time_per_iteration": 2.774388074874878 + }, + { + "auxiliary_loss_clip": 0.01327439, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.21864414, + "balance_loss_mlp": 1.02011871, + "epoch": 0.9162783706598527, + "flos": 19171347290280.0, + "grad_norm": 2.313450914116159, + "language_loss": 0.64085704, + "learning_rate": 7.302761513697819e-08, + "loss": 0.66445738, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12481689, + "step": 15240, + "time_per_iteration": 2.760390520095825 + }, + { + "auxiliary_loss_clip": 0.01325988, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.22010267, + "balance_loss_mlp": 1.018785, + "epoch": 0.9163384939125206, + "flos": 20417933881800.0, + "grad_norm": 1.7655100013072949, + "language_loss": 0.76368403, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78725278, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12121582, + "step": 15241, + "time_per_iteration": 2.8019683361053467 + }, + { + "auxiliary_loss_clip": 0.01346666, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.23045993, + "balance_loss_mlp": 1.02008665, + "epoch": 0.9163986171651887, + "flos": 19650630049560.0, + "grad_norm": 2.252933136591054, + "language_loss": 0.67883635, + "learning_rate": 7.281919830723549e-08, + "loss": 0.70264375, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.13983154, + "step": 15242, + "time_per_iteration": 2.8162789344787598 + }, + { + "auxiliary_loss_clip": 0.0132467, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.21695101, + "balance_loss_mlp": 1.01948333, + "epoch": 0.9164587404178566, + "flos": 12826890484200.0, + "grad_norm": 1.7493024671978843, + "language_loss": 0.8112582, + "learning_rate": 7.271509950872334e-08, + "loss": 0.83482301, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12341309, + "step": 15243, + "time_per_iteration": 2.7745463848114014 + }, + { + "auxiliary_loss_clip": 0.0133307, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.22205973, + "balance_loss_mlp": 1.01904571, + "epoch": 0.9165188636705246, + "flos": 22314476480400.0, + "grad_norm": 1.7314540093452684, + "language_loss": 0.82095104, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84459984, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12762451, + "step": 15244, + "time_per_iteration": 2.7757396697998047 + }, + { + "auxiliary_loss_clip": 0.01335807, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.22360039, + "balance_loss_mlp": 1.02316141, + "epoch": 0.9165789869231925, + "flos": 18227928690000.0, + "grad_norm": 4.243612631882319, + "language_loss": 0.73207211, + "learning_rate": 7.250712116415214e-08, + "loss": 0.75579035, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.128479, + "step": 15245, + "time_per_iteration": 2.793440341949463 + }, + { + "auxiliary_loss_clip": 0.01324133, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.21812725, + "balance_loss_mlp": 1.01536345, + "epoch": 0.9166391101758605, + "flos": 13694567847120.0, + "grad_norm": 1.5542008185701168, + "language_loss": 0.75288093, + "learning_rate": 7.240324162598033e-08, + "loss": 0.77639854, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.1227417, + "step": 15246, + "time_per_iteration": 2.8302996158599854 + }, + { + "auxiliary_loss_clip": 0.01328175, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.22038269, + "balance_loss_mlp": 1.01501215, + "epoch": 0.9166992334285284, + "flos": 17351155054440.0, + "grad_norm": 1.9319062238102895, + "language_loss": 0.75276005, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77632451, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.13262939, + "step": 15247, + "time_per_iteration": 2.7663307189941406 + }, + { + "auxiliary_loss_clip": 0.01330725, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.22221696, + "balance_loss_mlp": 1.01764965, + "epoch": 0.9167593566811965, + "flos": 23736243847680.0, + "grad_norm": 1.5814918744147797, + "language_loss": 0.76242077, + "learning_rate": 7.219570183756052e-08, + "loss": 0.78603137, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.12689209, + "step": 15248, + "time_per_iteration": 2.885319709777832 + }, + { + "auxiliary_loss_clip": 0.01330784, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.22195554, + "balance_loss_mlp": 1.01879501, + "epoch": 0.9168194799338644, + "flos": 27823928672160.0, + "grad_norm": 3.070911826229778, + "language_loss": 0.7386384, + "learning_rate": 7.209204159518178e-08, + "loss": 0.76227009, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13592529, + "step": 15249, + "time_per_iteration": 2.8200321197509766 + }, + { + "auxiliary_loss_clip": 0.01323217, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.21571136, + "balance_loss_mlp": 1.016626, + "epoch": 0.9168796031865324, + "flos": 21721500648360.0, + "grad_norm": 2.001044221482222, + "language_loss": 0.76016635, + "learning_rate": 7.198845445926616e-08, + "loss": 0.783692, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1272583, + "step": 15250, + "time_per_iteration": 2.7487940788269043 + }, + { + "auxiliary_loss_clip": 0.01327169, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.22073209, + "balance_loss_mlp": 1.01374054, + "epoch": 0.9169397264392004, + "flos": 23409702422280.0, + "grad_norm": 1.6053358311144303, + "language_loss": 0.7586081, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78214526, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12817383, + "step": 15251, + "time_per_iteration": 2.7586593627929688 + }, + { + "auxiliary_loss_clip": 0.01328699, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.22096717, + "balance_loss_mlp": 1.02196789, + "epoch": 0.9169998496918683, + "flos": 23956239271680.0, + "grad_norm": 1.897475283688295, + "language_loss": 0.80059791, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82424235, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13787842, + "step": 15252, + "time_per_iteration": 2.8253726959228516 + }, + { + "auxiliary_loss_clip": 0.01326133, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.21880996, + "balance_loss_mlp": 1.02015972, + "epoch": 0.9170599729445363, + "flos": 18337033018080.0, + "grad_norm": 1.616242030071554, + "language_loss": 0.77369469, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79728305, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12548828, + "step": 15253, + "time_per_iteration": 2.7463481426239014 + }, + { + "auxiliary_loss_clip": 0.01330847, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.22330189, + "balance_loss_mlp": 1.01718342, + "epoch": 0.9171200961972042, + "flos": 22680066166920.0, + "grad_norm": 2.228882214099025, + "language_loss": 0.73096979, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75457036, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12042236, + "step": 15254, + "time_per_iteration": 2.8427140712738037 + }, + { + "auxiliary_loss_clip": 0.01317209, + "auxiliary_loss_mlp": 0.01027887, + "balance_loss_clip": 1.21328211, + "balance_loss_mlp": 1.01538765, + "epoch": 0.9171802194498723, + "flos": 26724317027400.0, + "grad_norm": 1.5280107046200198, + "language_loss": 0.79215235, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81560332, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.12493896, + "step": 15255, + "time_per_iteration": 2.8827717304229736 + }, + { + "auxiliary_loss_clip": 0.01331963, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.22124243, + "balance_loss_mlp": 1.02090788, + "epoch": 0.9172403427025402, + "flos": 37896409436760.0, + "grad_norm": 2.732868165824387, + "language_loss": 0.6861136, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70977473, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13262939, + "step": 15256, + "time_per_iteration": 2.95733642578125 + }, + { + "auxiliary_loss_clip": 0.01321784, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.21680212, + "balance_loss_mlp": 1.01921499, + "epoch": 0.9173004659552082, + "flos": 17059519837440.0, + "grad_norm": 1.714582265896303, + "language_loss": 0.84499574, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86852473, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11901855, + "step": 15257, + "time_per_iteration": 4.451902627944946 + }, + { + "auxiliary_loss_clip": 0.01321036, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.21627522, + "balance_loss_mlp": 1.01895607, + "epoch": 0.9173605892078761, + "flos": 22206874661640.0, + "grad_norm": 1.5162797836933426, + "language_loss": 0.77636945, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79988313, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.1137085, + "step": 15258, + "time_per_iteration": 4.477877140045166 + }, + { + "auxiliary_loss_clip": 0.01324458, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.21800137, + "balance_loss_mlp": 1.01700473, + "epoch": 0.9174207124605441, + "flos": 16512048995760.0, + "grad_norm": 1.7560177996048607, + "language_loss": 0.7878598, + "learning_rate": 7.105946067406999e-08, + "loss": 0.81141186, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13751221, + "step": 15259, + "time_per_iteration": 4.25730562210083 + }, + { + "auxiliary_loss_clip": 0.01327507, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.22187757, + "balance_loss_mlp": 1.0228883, + "epoch": 0.917480835713212, + "flos": 24541052823360.0, + "grad_norm": 1.530052999720553, + "language_loss": 0.76419938, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78782207, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11877441, + "step": 15260, + "time_per_iteration": 2.8266541957855225 + }, + { + "auxiliary_loss_clip": 0.01324522, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.21796155, + "balance_loss_mlp": 1.02039623, + "epoch": 0.9175409589658801, + "flos": 20884871699640.0, + "grad_norm": 1.6060212913686027, + "language_loss": 0.61476243, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63833702, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12554932, + "step": 15261, + "time_per_iteration": 2.877609968185425 + }, + { + "auxiliary_loss_clip": 0.01321898, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.2159915, + "balance_loss_mlp": 1.01714528, + "epoch": 0.917601082218548, + "flos": 14279218965360.0, + "grad_norm": 1.7841814209575282, + "language_loss": 0.74039066, + "learning_rate": 7.075111255942002e-08, + "loss": 0.76390719, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.1262207, + "step": 15262, + "time_per_iteration": 2.779395818710327 + }, + { + "auxiliary_loss_clip": 0.01334216, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.22255206, + "balance_loss_mlp": 1.01944983, + "epoch": 0.917661205471216, + "flos": 19103971375080.0, + "grad_norm": 1.7139958368213908, + "language_loss": 0.77801073, + "learning_rate": 7.064847616396496e-08, + "loss": 0.80168033, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13305664, + "step": 15263, + "time_per_iteration": 2.7732574939727783 + }, + { + "auxiliary_loss_clip": 0.0133352, + "auxiliary_loss_mlp": 0.01037517, + "balance_loss_clip": 1.22135353, + "balance_loss_mlp": 1.02450538, + "epoch": 0.917721328723884, + "flos": 21111648719760.0, + "grad_norm": 1.6298458667286286, + "language_loss": 0.76100743, + "learning_rate": 7.054591292971324e-08, + "loss": 0.78471774, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13006592, + "step": 15264, + "time_per_iteration": 2.7463226318359375 + }, + { + "auxiliary_loss_clip": 0.01327275, + "auxiliary_loss_mlp": 0.01033436, + "balance_loss_clip": 1.2200141, + "balance_loss_mlp": 1.02121091, + "epoch": 0.9177814519765519, + "flos": 21948074626680.0, + "grad_norm": 1.7120390669201513, + "language_loss": 0.83486879, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85847586, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12237549, + "step": 15265, + "time_per_iteration": 4.294763565063477 + }, + { + "auxiliary_loss_clip": 0.01327224, + "auxiliary_loss_mlp": 0.01041938, + "balance_loss_clip": 1.21613717, + "balance_loss_mlp": 1.02768052, + "epoch": 0.9178415752292199, + "flos": 24211222120800.0, + "grad_norm": 1.52253163392938, + "language_loss": 0.73743296, + "learning_rate": 7.034100596037306e-08, + "loss": 0.76112461, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.14239502, + "step": 15266, + "time_per_iteration": 2.808253765106201 + }, + { + "auxiliary_loss_clip": 0.01329054, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.22061396, + "balance_loss_mlp": 1.01286602, + "epoch": 0.9179016984818879, + "flos": 20046171724560.0, + "grad_norm": 1.6236675913550338, + "language_loss": 0.78244871, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80599308, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12512207, + "step": 15267, + "time_per_iteration": 2.8235960006713867 + }, + { + "auxiliary_loss_clip": 0.01145124, + "auxiliary_loss_mlp": 0.01006531, + "balance_loss_clip": 1.10243416, + "balance_loss_mlp": 1.0039562, + "epoch": 0.9179618217345559, + "flos": 65572811626320.0, + "grad_norm": 0.7435044379400969, + "language_loss": 0.56259966, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58411622, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02575684, + "step": 15268, + "time_per_iteration": 3.319786787033081 + }, + { + "auxiliary_loss_clip": 0.01325569, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.21810472, + "balance_loss_mlp": 1.01699638, + "epoch": 0.9180219449872238, + "flos": 21329613725760.0, + "grad_norm": 1.7416642731988905, + "language_loss": 0.76582074, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78937721, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13110352, + "step": 15269, + "time_per_iteration": 2.8633949756622314 + }, + { + "auxiliary_loss_clip": 0.0132432, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.21700847, + "balance_loss_mlp": 1.02255726, + "epoch": 0.9180820682398918, + "flos": 41067135330480.0, + "grad_norm": 1.9403519433175114, + "language_loss": 0.72930372, + "learning_rate": 6.993207012706936e-08, + "loss": 0.75290048, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12792969, + "step": 15270, + "time_per_iteration": 2.9351115226745605 + }, + { + "auxiliary_loss_clip": 0.0132043, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.21486986, + "balance_loss_mlp": 1.01703453, + "epoch": 0.9181421914925597, + "flos": 28078586654400.0, + "grad_norm": 1.4915094676881453, + "language_loss": 0.79622144, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81972718, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.13104248, + "step": 15271, + "time_per_iteration": 2.8985342979431152 + }, + { + "auxiliary_loss_clip": 0.01327996, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.21908903, + "balance_loss_mlp": 1.01730871, + "epoch": 0.9182023147452277, + "flos": 29175477539040.0, + "grad_norm": 5.683688118795638, + "language_loss": 0.72851622, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75209677, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12756348, + "step": 15272, + "time_per_iteration": 2.8555185794830322 + }, + { + "auxiliary_loss_clip": 0.01325647, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.2177906, + "balance_loss_mlp": 1.02325583, + "epoch": 0.9182624379978956, + "flos": 24066155766960.0, + "grad_norm": 1.9331256705422473, + "language_loss": 0.72679734, + "learning_rate": 6.962613671639105e-08, + "loss": 0.75041145, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.125, + "step": 15273, + "time_per_iteration": 2.8917460441589355 + }, + { + "auxiliary_loss_clip": 0.01314425, + "auxiliary_loss_mlp": 0.01026593, + "balance_loss_clip": 1.21101999, + "balance_loss_mlp": 1.01508355, + "epoch": 0.9183225612505637, + "flos": 23298608284560.0, + "grad_norm": 1.4182569729654353, + "language_loss": 0.74455285, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76796305, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.11517334, + "step": 15274, + "time_per_iteration": 2.8102874755859375 + }, + { + "auxiliary_loss_clip": 0.01326619, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.21754265, + "balance_loss_mlp": 1.02634239, + "epoch": 0.9183826845032316, + "flos": 19613937073320.0, + "grad_norm": 1.4037897841395803, + "language_loss": 0.69184792, + "learning_rate": 6.942254710267902e-08, + "loss": 0.71549433, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.11663818, + "step": 15275, + "time_per_iteration": 2.8091087341308594 + }, + { + "auxiliary_loss_clip": 0.01323789, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.21686053, + "balance_loss_mlp": 1.01734316, + "epoch": 0.9184428077558996, + "flos": 18483723706320.0, + "grad_norm": 1.8086532014962948, + "language_loss": 0.72414762, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74768257, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1237793, + "step": 15276, + "time_per_iteration": 2.8005857467651367 + }, + { + "auxiliary_loss_clip": 0.01329904, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.22288537, + "balance_loss_mlp": 1.01595449, + "epoch": 0.9185029310085676, + "flos": 20745977816520.0, + "grad_norm": 1.5995352560761062, + "language_loss": 0.73547351, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75905496, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12286377, + "step": 15277, + "time_per_iteration": 2.8280773162841797 + }, + { + "auxiliary_loss_clip": 0.01143387, + "auxiliary_loss_mlp": 0.01004282, + "balance_loss_clip": 1.10034633, + "balance_loss_mlp": 1.00136137, + "epoch": 0.9185630542612355, + "flos": 68225165891280.0, + "grad_norm": 0.717400756336828, + "language_loss": 0.59277171, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61424839, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.0291748, + "step": 15278, + "time_per_iteration": 3.461318016052246 + }, + { + "auxiliary_loss_clip": 0.01315668, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.21016669, + "balance_loss_mlp": 1.01647615, + "epoch": 0.9186231775139035, + "flos": 12243254574960.0, + "grad_norm": 1.6765776601091542, + "language_loss": 0.63779521, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66123438, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.11785889, + "step": 15279, + "time_per_iteration": 2.9114465713500977 + }, + { + "auxiliary_loss_clip": 0.01143189, + "auxiliary_loss_mlp": 0.00999884, + "balance_loss_clip": 1.10059798, + "balance_loss_mlp": 0.99736899, + "epoch": 0.9186833007665715, + "flos": 63954462877560.0, + "grad_norm": 0.8564548562852996, + "language_loss": 0.60212678, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62355751, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02514648, + "step": 15280, + "time_per_iteration": 3.242314577102661 + }, + { + "auxiliary_loss_clip": 0.01331286, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.22291636, + "balance_loss_mlp": 1.02175629, + "epoch": 0.9187434240192395, + "flos": 19979526759840.0, + "grad_norm": 1.6982166503588516, + "language_loss": 0.70068753, + "learning_rate": 6.881353536939815e-08, + "loss": 0.7243467, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12878418, + "step": 15281, + "time_per_iteration": 2.9382500648498535 + }, + { + "auxiliary_loss_clip": 0.01328997, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.21992683, + "balance_loss_mlp": 1.01585734, + "epoch": 0.9188035472719074, + "flos": 25233224543640.0, + "grad_norm": 1.585071950958082, + "language_loss": 0.84571505, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86930364, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.14007568, + "step": 15282, + "time_per_iteration": 2.800832986831665 + }, + { + "auxiliary_loss_clip": 0.01324746, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.21929717, + "balance_loss_mlp": 1.01696026, + "epoch": 0.9188636705245754, + "flos": 18410134712040.0, + "grad_norm": 1.8889334652953091, + "language_loss": 0.60364544, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62719524, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.13305664, + "step": 15283, + "time_per_iteration": 2.7457969188690186 + }, + { + "auxiliary_loss_clip": 0.01336847, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.22462511, + "balance_loss_mlp": 1.01919782, + "epoch": 0.9189237937772433, + "flos": 23774439333240.0, + "grad_norm": 1.6346957683903123, + "language_loss": 0.65779191, + "learning_rate": 6.851001806641554e-08, + "loss": 0.68147707, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.12475586, + "step": 15284, + "time_per_iteration": 2.7653489112854004 + }, + { + "auxiliary_loss_clip": 0.01322255, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.21563244, + "balance_loss_mlp": 1.01727605, + "epoch": 0.9189839170299113, + "flos": 21219534797040.0, + "grad_norm": 1.7289088068741691, + "language_loss": 0.73703396, + "learning_rate": 6.840899211156292e-08, + "loss": 0.76055503, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12567139, + "step": 15285, + "time_per_iteration": 2.7649219036102295 + }, + { + "auxiliary_loss_clip": 0.01325167, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.21831858, + "balance_loss_mlp": 1.01942313, + "epoch": 0.9190440402825792, + "flos": 16731760161240.0, + "grad_norm": 1.7369125778468066, + "language_loss": 0.72027373, + "learning_rate": 6.830803940283458e-08, + "loss": 0.74384731, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12768555, + "step": 15286, + "time_per_iteration": 2.7658019065856934 + }, + { + "auxiliary_loss_clip": 0.01322755, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.21557593, + "balance_loss_mlp": 1.01850486, + "epoch": 0.9191041635352473, + "flos": 23446517223600.0, + "grad_norm": 1.7915112111105156, + "language_loss": 0.73423868, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75778186, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1305542, + "step": 15287, + "time_per_iteration": 2.796889543533325 + }, + { + "auxiliary_loss_clip": 0.01330339, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.22117448, + "balance_loss_mlp": 1.01951015, + "epoch": 0.9191642867879152, + "flos": 18812254941360.0, + "grad_norm": 4.590322597603798, + "language_loss": 0.65108073, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67471814, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13885498, + "step": 15288, + "time_per_iteration": 2.8069353103637695 + }, + { + "auxiliary_loss_clip": 0.01323914, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.21742153, + "balance_loss_mlp": 1.02315259, + "epoch": 0.9192244100405832, + "flos": 32167367904600.0, + "grad_norm": 1.730955494410395, + "language_loss": 0.7093873, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73298395, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12597656, + "step": 15289, + "time_per_iteration": 2.944376230239868 + }, + { + "auxiliary_loss_clip": 0.01327934, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.2198379, + "balance_loss_mlp": 1.01721168, + "epoch": 0.9192845332932512, + "flos": 16361378688240.0, + "grad_norm": 1.7763748100339902, + "language_loss": 0.74890077, + "learning_rate": 6.790496110568921e-08, + "loss": 0.77247477, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12249756, + "step": 15290, + "time_per_iteration": 2.754927158355713 + }, + { + "auxiliary_loss_clip": 0.01322184, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.21741581, + "balance_loss_mlp": 1.01809359, + "epoch": 0.9193446565459191, + "flos": 26620085702520.0, + "grad_norm": 1.8893073010962522, + "language_loss": 0.72511822, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74863911, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.1182251, + "step": 15291, + "time_per_iteration": 2.77001690864563 + }, + { + "auxiliary_loss_clip": 0.01320214, + "auxiliary_loss_mlp": 0.01026416, + "balance_loss_clip": 1.21544218, + "balance_loss_mlp": 1.01401269, + "epoch": 0.9194047797985871, + "flos": 22497332236200.0, + "grad_norm": 1.7126827350151492, + "language_loss": 0.71379483, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73726112, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.1239624, + "step": 15292, + "time_per_iteration": 2.843916416168213 + }, + { + "auxiliary_loss_clip": 0.01328229, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.21914649, + "balance_loss_mlp": 1.01632738, + "epoch": 0.9194649030512551, + "flos": 25083772486920.0, + "grad_norm": 4.656004405140868, + "language_loss": 0.73292077, + "learning_rate": 6.760342165443988e-08, + "loss": 0.7564978, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.1315918, + "step": 15293, + "time_per_iteration": 2.87546706199646 + }, + { + "auxiliary_loss_clip": 0.01321761, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.21630275, + "balance_loss_mlp": 1.01810527, + "epoch": 0.9195250263039231, + "flos": 11914885773360.0, + "grad_norm": 1.8714711616251198, + "language_loss": 0.78425419, + "learning_rate": 6.750305505228837e-08, + "loss": 0.8077811, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12811279, + "step": 15294, + "time_per_iteration": 2.9029042720794678 + }, + { + "auxiliary_loss_clip": 0.0132996, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.21995282, + "balance_loss_mlp": 1.01821125, + "epoch": 0.919585149556591, + "flos": 21839051515320.0, + "grad_norm": 1.480700545176906, + "language_loss": 0.77501822, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79863989, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13983154, + "step": 15295, + "time_per_iteration": 4.227335453033447 + }, + { + "auxiliary_loss_clip": 0.01314451, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.21181297, + "balance_loss_mlp": 1.01791966, + "epoch": 0.919645272809259, + "flos": 28189965050640.0, + "grad_norm": 2.0272871712550655, + "language_loss": 0.71976554, + "learning_rate": 6.730254169322114e-08, + "loss": 0.74320602, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.11682129, + "step": 15296, + "time_per_iteration": 2.8412461280822754 + }, + { + "auxiliary_loss_clip": 0.01324612, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.21780145, + "balance_loss_mlp": 1.02112341, + "epoch": 0.9197053960619269, + "flos": 18337439101680.0, + "grad_norm": 1.9062554284418105, + "language_loss": 0.75164562, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77522707, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12414551, + "step": 15297, + "time_per_iteration": 4.3266706466674805 + }, + { + "auxiliary_loss_clip": 0.0132885, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.22070146, + "balance_loss_mlp": 1.01673293, + "epoch": 0.9197655193145949, + "flos": 28189274708520.0, + "grad_norm": 1.5451157415777583, + "language_loss": 0.73914677, + "learning_rate": 6.710232148647676e-08, + "loss": 0.76274204, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1395874, + "step": 15298, + "time_per_iteration": 4.354564905166626 + }, + { + "auxiliary_loss_clip": 0.01331828, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.22213125, + "balance_loss_mlp": 1.02450204, + "epoch": 0.9198256425672628, + "flos": 17310157592040.0, + "grad_norm": 2.250144306517088, + "language_loss": 0.79590297, + "learning_rate": 6.70023213247175e-08, + "loss": 0.8195948, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12841797, + "step": 15299, + "time_per_iteration": 2.9114511013031006 + }, + { + "auxiliary_loss_clip": 0.01322645, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.21640503, + "balance_loss_mlp": 1.01854491, + "epoch": 0.9198857658199309, + "flos": 17863069953960.0, + "grad_norm": 1.9504951626983218, + "language_loss": 0.64405996, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66759741, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12554932, + "step": 15300, + "time_per_iteration": 2.821197986602783 + }, + { + "auxiliary_loss_clip": 0.01311305, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.20875049, + "balance_loss_mlp": 1.0173862, + "epoch": 0.9199458890725988, + "flos": 22132676541960.0, + "grad_norm": 1.6636799331894958, + "language_loss": 0.69500315, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71840072, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.11065674, + "step": 15301, + "time_per_iteration": 2.8231310844421387 + }, + { + "auxiliary_loss_clip": 0.0133264, + "auxiliary_loss_mlp": 0.01039003, + "balance_loss_clip": 1.22348416, + "balance_loss_mlp": 1.02511513, + "epoch": 0.9200060123252668, + "flos": 16038329581800.0, + "grad_norm": 2.827452914396899, + "language_loss": 0.71199441, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73571086, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13903809, + "step": 15302, + "time_per_iteration": 2.736039400100708 + }, + { + "auxiliary_loss_clip": 0.0132997, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.22189856, + "balance_loss_mlp": 1.01869893, + "epoch": 0.9200661355779348, + "flos": 26869464597960.0, + "grad_norm": 2.0921686116090403, + "language_loss": 0.76718879, + "learning_rate": 6.660305371021579e-08, + "loss": 0.79079455, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.11895752, + "step": 15303, + "time_per_iteration": 2.8084840774536133 + }, + { + "auxiliary_loss_clip": 0.01325307, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.21929407, + "balance_loss_mlp": 1.01920033, + "epoch": 0.9201262588306027, + "flos": 12790603591560.0, + "grad_norm": 1.9794533823681226, + "language_loss": 0.88085544, + "learning_rate": 6.650342008365006e-08, + "loss": 0.90442228, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12176514, + "step": 15304, + "time_per_iteration": 4.262352228164673 + }, + { + "auxiliary_loss_clip": 0.01343611, + "auxiliary_loss_mlp": 0.01035381, + "balance_loss_clip": 1.23235822, + "balance_loss_mlp": 1.02101636, + "epoch": 0.9201863820832707, + "flos": 20636548621560.0, + "grad_norm": 2.0064443521027515, + "language_loss": 0.775653, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79944289, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14367676, + "step": 15305, + "time_per_iteration": 2.7448649406433105 + }, + { + "auxiliary_loss_clip": 0.01326119, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.21761036, + "balance_loss_mlp": 1.0255456, + "epoch": 0.9202465053359387, + "flos": 26401430354400.0, + "grad_norm": 2.1869651017377105, + "language_loss": 0.81888199, + "learning_rate": 6.630437278944501e-08, + "loss": 0.84252334, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12469482, + "step": 15306, + "time_per_iteration": 2.8501198291778564 + }, + { + "auxiliary_loss_clip": 0.01325628, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.22048044, + "balance_loss_mlp": 1.023615, + "epoch": 0.9203066285886067, + "flos": 10491737721840.0, + "grad_norm": 1.9274376720987567, + "language_loss": 0.72183412, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74544382, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11724854, + "step": 15307, + "time_per_iteration": 2.7125766277313232 + }, + { + "auxiliary_loss_clip": 0.01332572, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.22281778, + "balance_loss_mlp": 1.01766229, + "epoch": 0.9203667518412746, + "flos": 19395444158640.0, + "grad_norm": 2.513048114920517, + "language_loss": 0.78976649, + "learning_rate": 6.610561879896526e-08, + "loss": 0.81339902, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13031006, + "step": 15308, + "time_per_iteration": 2.8502297401428223 + }, + { + "auxiliary_loss_clip": 0.01325768, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.21869385, + "balance_loss_mlp": 1.01666486, + "epoch": 0.9204268750939426, + "flos": 15929306470440.0, + "grad_norm": 1.844976646298114, + "language_loss": 0.78352094, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80707216, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12689209, + "step": 15309, + "time_per_iteration": 2.790008783340454 + }, + { + "auxiliary_loss_clip": 0.01326645, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.2182827, + "balance_loss_mlp": 1.01590466, + "epoch": 0.9204869983466105, + "flos": 16476086970000.0, + "grad_norm": 1.8798858230732682, + "language_loss": 0.66447526, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68802738, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12670898, + "step": 15310, + "time_per_iteration": 2.733637571334839 + }, + { + "auxiliary_loss_clip": 0.01325448, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.21762204, + "balance_loss_mlp": 1.01817679, + "epoch": 0.9205471215992785, + "flos": 21543680329200.0, + "grad_norm": 1.7036792782182313, + "language_loss": 0.66307962, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68664688, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13098145, + "step": 15311, + "time_per_iteration": 2.8153648376464844 + }, + { + "auxiliary_loss_clip": 0.01326564, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.21836615, + "balance_loss_mlp": 1.0249294, + "epoch": 0.9206072448519464, + "flos": 25010873834760.0, + "grad_norm": 1.6796353044588663, + "language_loss": 0.76668698, + "learning_rate": 6.570899084972503e-08, + "loss": 0.79033065, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12884521, + "step": 15312, + "time_per_iteration": 2.9090988636016846 + }, + { + "auxiliary_loss_clip": 0.01323308, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.21859896, + "balance_loss_mlp": 1.01832294, + "epoch": 0.9206673681046145, + "flos": 20527647335280.0, + "grad_norm": 2.1475584576391946, + "language_loss": 0.7944541, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81799459, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.12420654, + "step": 15313, + "time_per_iteration": 2.8439035415649414 + }, + { + "auxiliary_loss_clip": 0.0132863, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.2200737, + "balance_loss_mlp": 1.01655126, + "epoch": 0.9207274913572824, + "flos": 20887957935000.0, + "grad_norm": 1.6563135334132626, + "language_loss": 0.78723395, + "learning_rate": 6.55111169511251e-08, + "loss": 0.81081426, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12854004, + "step": 15314, + "time_per_iteration": 2.8126933574676514 + }, + { + "auxiliary_loss_clip": 0.01341193, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.22820425, + "balance_loss_mlp": 1.01935267, + "epoch": 0.9207876146099504, + "flos": 22713063782400.0, + "grad_norm": 2.6901353772064356, + "language_loss": 0.79353184, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81727195, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.13476562, + "step": 15315, + "time_per_iteration": 2.8102970123291016 + }, + { + "auxiliary_loss_clip": 0.01331449, + "auxiliary_loss_mlp": 0.01038154, + "balance_loss_clip": 1.22011518, + "balance_loss_mlp": 1.02490401, + "epoch": 0.9208477378626184, + "flos": 18510954934680.0, + "grad_norm": 2.0106006178759044, + "language_loss": 0.76634318, + "learning_rate": 6.531353647657156e-08, + "loss": 0.79003918, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13250732, + "step": 15316, + "time_per_iteration": 2.823967695236206 + }, + { + "auxiliary_loss_clip": 0.01331968, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.22238421, + "balance_loss_mlp": 1.0175488, + "epoch": 0.9209078611152863, + "flos": 23004414740880.0, + "grad_norm": 1.5282729707296692, + "language_loss": 0.69419825, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71782929, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.13586426, + "step": 15317, + "time_per_iteration": 2.8716156482696533 + }, + { + "auxiliary_loss_clip": 0.01328081, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.22004151, + "balance_loss_mlp": 1.02127051, + "epoch": 0.9209679843679544, + "flos": 24066927325800.0, + "grad_norm": 1.6466020599885087, + "language_loss": 0.83698404, + "learning_rate": 6.511624945603378e-08, + "loss": 0.86060584, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.1282959, + "step": 15318, + "time_per_iteration": 2.836143970489502 + }, + { + "auxiliary_loss_clip": 0.01325805, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.21754742, + "balance_loss_mlp": 1.02046204, + "epoch": 0.9210281076206223, + "flos": 13557379515120.0, + "grad_norm": 1.858185098124462, + "language_loss": 0.85593605, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87952578, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1270752, + "step": 15319, + "time_per_iteration": 2.9150848388671875 + }, + { + "auxiliary_loss_clip": 0.01143891, + "auxiliary_loss_mlp": 0.01001885, + "balance_loss_clip": 1.10119629, + "balance_loss_mlp": 0.99958462, + "epoch": 0.9210882308732903, + "flos": 71443467801720.0, + "grad_norm": 0.7837108647432363, + "language_loss": 0.56316102, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58461869, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02294922, + "step": 15320, + "time_per_iteration": 3.3635354042053223 + }, + { + "auxiliary_loss_clip": 0.01337249, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.22495556, + "balance_loss_mlp": 1.02308834, + "epoch": 0.9211483541259582, + "flos": 18512944744320.0, + "grad_norm": 2.0983785828293513, + "language_loss": 0.64046681, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66421133, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14111328, + "step": 15321, + "time_per_iteration": 2.787856101989746 + }, + { + "auxiliary_loss_clip": 0.01313845, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.21204841, + "balance_loss_mlp": 1.01526713, + "epoch": 0.9212084773786262, + "flos": 23263458426000.0, + "grad_norm": 1.7707790967640553, + "language_loss": 0.71678829, + "learning_rate": 6.47225558966582e-08, + "loss": 0.7401979, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.11853027, + "step": 15322, + "time_per_iteration": 2.7998239994049072 + }, + { + "auxiliary_loss_clip": 0.0132295, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.21649027, + "balance_loss_mlp": 1.0192132, + "epoch": 0.9212686006312941, + "flos": 16293840339600.0, + "grad_norm": 1.8538368338910272, + "language_loss": 0.70236468, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72591251, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12628174, + "step": 15323, + "time_per_iteration": 2.8262593746185303 + }, + { + "auxiliary_loss_clip": 0.01333875, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.22263157, + "balance_loss_mlp": 1.02062833, + "epoch": 0.9213287238839621, + "flos": 19789726974480.0, + "grad_norm": 1.706196960298147, + "language_loss": 0.74723899, + "learning_rate": 6.452614941753597e-08, + "loss": 0.77092886, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.14489746, + "step": 15324, + "time_per_iteration": 2.8572380542755127 + }, + { + "auxiliary_loss_clip": 0.01321261, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.21504426, + "balance_loss_mlp": 1.02383614, + "epoch": 0.92138884713663, + "flos": 21035054706840.0, + "grad_norm": 1.7933488925172394, + "language_loss": 0.71078545, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73436582, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12969971, + "step": 15325, + "time_per_iteration": 2.7758028507232666 + }, + { + "auxiliary_loss_clip": 0.01323227, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.21669173, + "balance_loss_mlp": 1.02048898, + "epoch": 0.9214489703892981, + "flos": 28593465964200.0, + "grad_norm": 1.5342013571726731, + "language_loss": 0.78696132, + "learning_rate": 6.433003651186109e-08, + "loss": 0.81052727, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12878418, + "step": 15326, + "time_per_iteration": 2.80590558052063 + }, + { + "auxiliary_loss_clip": 0.01331574, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.22242725, + "balance_loss_mlp": 1.02084374, + "epoch": 0.921509093641966, + "flos": 16365683174400.0, + "grad_norm": 2.6858307593029505, + "language_loss": 0.7182765, + "learning_rate": 6.42320901583635e-08, + "loss": 0.74193871, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13800049, + "step": 15327, + "time_per_iteration": 2.7664496898651123 + }, + { + "auxiliary_loss_clip": 0.01334614, + "auxiliary_loss_mlp": 0.0103651, + "balance_loss_clip": 1.22339869, + "balance_loss_mlp": 1.02293801, + "epoch": 0.921569216894634, + "flos": 26836142115600.0, + "grad_norm": 2.330296481772428, + "language_loss": 0.7787255, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80243671, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13592529, + "step": 15328, + "time_per_iteration": 2.8562912940979004 + }, + { + "auxiliary_loss_clip": 0.01320719, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.21498144, + "balance_loss_mlp": 1.01858091, + "epoch": 0.921629340147302, + "flos": 24650603843400.0, + "grad_norm": 2.2237498429732567, + "language_loss": 0.71766669, + "learning_rate": 6.4036417668619e-08, + "loss": 0.7411828, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12316895, + "step": 15329, + "time_per_iteration": 2.8913443088531494 + }, + { + "auxiliary_loss_clip": 0.01322713, + "auxiliary_loss_mlp": 0.01026517, + "balance_loss_clip": 1.21621966, + "balance_loss_mlp": 1.01453042, + "epoch": 0.9216894633999699, + "flos": 15090809537160.0, + "grad_norm": 1.7874805927857755, + "language_loss": 0.87037373, + "learning_rate": 6.393869153979192e-08, + "loss": 0.893866, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11981201, + "step": 15330, + "time_per_iteration": 2.840761423110962 + }, + { + "auxiliary_loss_clip": 0.01326435, + "auxiliary_loss_mlp": 0.01035668, + "balance_loss_clip": 1.21747398, + "balance_loss_mlp": 1.02306783, + "epoch": 0.921749586652638, + "flos": 19208730608640.0, + "grad_norm": 2.1019170393605724, + "language_loss": 0.76168764, + "learning_rate": 6.384103882660397e-08, + "loss": 0.78530866, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12609863, + "step": 15331, + "time_per_iteration": 2.793833017349243 + }, + { + "auxiliary_loss_clip": 0.01326801, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.21908355, + "balance_loss_mlp": 1.01846957, + "epoch": 0.9218097099053059, + "flos": 20527403685120.0, + "grad_norm": 1.6203759473684285, + "language_loss": 0.76041794, + "learning_rate": 6.374345953275794e-08, + "loss": 0.78400016, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12969971, + "step": 15332, + "time_per_iteration": 2.808290481567383 + }, + { + "auxiliary_loss_clip": 0.01323835, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.21674442, + "balance_loss_mlp": 1.01901269, + "epoch": 0.9218698331579739, + "flos": 17353226080800.0, + "grad_norm": 1.8148201194804965, + "language_loss": 0.75125802, + "learning_rate": 6.364595366195358e-08, + "loss": 0.77480817, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12158203, + "step": 15333, + "time_per_iteration": 2.799652338027954 + }, + { + "auxiliary_loss_clip": 0.01144656, + "auxiliary_loss_mlp": 0.01000361, + "balance_loss_clip": 1.10238838, + "balance_loss_mlp": 0.9979288, + "epoch": 0.9219299564106418, + "flos": 61971905126520.0, + "grad_norm": 0.8077947444013907, + "language_loss": 0.52958715, + "learning_rate": 6.354852121788879e-08, + "loss": 0.55103731, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02429199, + "step": 15334, + "time_per_iteration": 4.670477390289307 + }, + { + "auxiliary_loss_clip": 0.01319172, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.21527803, + "balance_loss_mlp": 1.01938653, + "epoch": 0.9219900796633098, + "flos": 15705493860600.0, + "grad_norm": 1.6895292988937243, + "language_loss": 0.62908751, + "learning_rate": 6.345116220425839e-08, + "loss": 0.65259635, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.12329102, + "step": 15335, + "time_per_iteration": 2.7755305767059326 + }, + { + "auxiliary_loss_clip": 0.01326521, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.21987808, + "balance_loss_mlp": 1.01805973, + "epoch": 0.9220502029159777, + "flos": 24937609707360.0, + "grad_norm": 1.5929893057541633, + "language_loss": 0.71723378, + "learning_rate": 6.335387662475366e-08, + "loss": 0.74080598, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12646484, + "step": 15336, + "time_per_iteration": 4.422336101531982 + }, + { + "auxiliary_loss_clip": 0.01320239, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.21585393, + "balance_loss_mlp": 1.01935053, + "epoch": 0.9221103261686457, + "flos": 15671196777600.0, + "grad_norm": 2.037038046371778, + "language_loss": 0.71847188, + "learning_rate": 6.325666448306433e-08, + "loss": 0.74197751, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.10980225, + "step": 15337, + "time_per_iteration": 2.7450571060180664 + }, + { + "auxiliary_loss_clip": 0.01145008, + "auxiliary_loss_mlp": 0.01004768, + "balance_loss_clip": 1.1026063, + "balance_loss_mlp": 1.00239563, + "epoch": 0.9221704494213137, + "flos": 67532182003800.0, + "grad_norm": 0.8829257276920486, + "language_loss": 0.65435827, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67585599, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02368164, + "step": 15338, + "time_per_iteration": 3.2514100074768066 + }, + { + "auxiliary_loss_clip": 0.01326833, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.21833599, + "balance_loss_mlp": 1.01706028, + "epoch": 0.9222305726739817, + "flos": 30233076512400.0, + "grad_norm": 1.6810976973531329, + "language_loss": 0.67569518, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69926107, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12695312, + "step": 15339, + "time_per_iteration": 2.9345052242279053 + }, + { + "auxiliary_loss_clip": 0.01323347, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.21568072, + "balance_loss_mlp": 1.01607466, + "epoch": 0.9222906959266496, + "flos": 25342613130240.0, + "grad_norm": 1.8907023073256208, + "language_loss": 0.73166001, + "learning_rate": 6.296546872173513e-08, + "loss": 0.75517833, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12420654, + "step": 15340, + "time_per_iteration": 2.866239309310913 + }, + { + "auxiliary_loss_clip": 0.0132752, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.22140646, + "balance_loss_mlp": 1.01607072, + "epoch": 0.9223508191793176, + "flos": 27605638799280.0, + "grad_norm": 1.4713712191713684, + "language_loss": 0.70326155, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72682333, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12591553, + "step": 15341, + "time_per_iteration": 2.844453811645508 + }, + { + "auxiliary_loss_clip": 0.01315759, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.21325684, + "balance_loss_mlp": 1.01948559, + "epoch": 0.9224109424319856, + "flos": 27313272631800.0, + "grad_norm": 1.5976620638000105, + "language_loss": 0.67490017, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69836986, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.11724854, + "step": 15342, + "time_per_iteration": 2.7869365215301514 + }, + { + "auxiliary_loss_clip": 0.01325601, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.21819878, + "balance_loss_mlp": 1.02389157, + "epoch": 0.9224710656846535, + "flos": 48215995028640.0, + "grad_norm": 2.0697726445823235, + "language_loss": 0.6932224, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71683383, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.11639404, + "step": 15343, + "time_per_iteration": 4.51086950302124 + }, + { + "auxiliary_loss_clip": 0.01144368, + "auxiliary_loss_mlp": 0.01005969, + "balance_loss_clip": 1.10178232, + "balance_loss_mlp": 1.0033468, + "epoch": 0.9225311889373216, + "flos": 66739068235800.0, + "grad_norm": 0.724312186167836, + "language_loss": 0.52059698, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54210037, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02624512, + "step": 15344, + "time_per_iteration": 3.4171230792999268 + }, + { + "auxiliary_loss_clip": 0.01315616, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.21321583, + "balance_loss_mlp": 1.01723981, + "epoch": 0.9225913121899895, + "flos": 22276159169760.0, + "grad_norm": 1.5395630815347283, + "language_loss": 0.7075758, + "learning_rate": 6.248161155266162e-08, + "loss": 0.73102355, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.11920166, + "step": 15345, + "time_per_iteration": 2.7429730892181396 + }, + { + "auxiliary_loss_clip": 0.01329714, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.22190928, + "balance_loss_mlp": 1.0203228, + "epoch": 0.9226514354426575, + "flos": 20087372228760.0, + "grad_norm": 1.8184426463554344, + "language_loss": 0.77323973, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79686749, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12738037, + "step": 15346, + "time_per_iteration": 2.7807841300964355 + }, + { + "auxiliary_loss_clip": 0.01338441, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.22614849, + "balance_loss_mlp": 1.02310157, + "epoch": 0.9227115586953254, + "flos": 16075184991480.0, + "grad_norm": 1.7321898978878743, + "language_loss": 0.76537436, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78911644, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.12677002, + "step": 15347, + "time_per_iteration": 2.7140390872955322 + }, + { + "auxiliary_loss_clip": 0.01317493, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.21437109, + "balance_loss_mlp": 1.02131438, + "epoch": 0.9227716819479934, + "flos": 20450525413680.0, + "grad_norm": 1.475769781084593, + "language_loss": 0.77028233, + "learning_rate": 6.219217887256367e-08, + "loss": 0.79378176, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.11126709, + "step": 15348, + "time_per_iteration": 2.762625217437744 + }, + { + "auxiliary_loss_clip": 0.013301, + "auxiliary_loss_mlp": 0.01030374, + "balance_loss_clip": 1.21968389, + "balance_loss_mlp": 1.01683187, + "epoch": 0.9228318052006613, + "flos": 25012579385880.0, + "grad_norm": 2.070241261148646, + "language_loss": 0.68520784, + "learning_rate": 6.209584827138959e-08, + "loss": 0.70881253, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13543701, + "step": 15349, + "time_per_iteration": 2.87424635887146 + }, + { + "auxiliary_loss_clip": 0.01327476, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.21854544, + "balance_loss_mlp": 1.01753843, + "epoch": 0.9228919284533293, + "flos": 12681093179880.0, + "grad_norm": 2.874446159198003, + "language_loss": 0.86823547, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89181221, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12677002, + "step": 15350, + "time_per_iteration": 2.8180785179138184 + }, + { + "auxiliary_loss_clip": 0.01143683, + "auxiliary_loss_mlp": 0.01001403, + "balance_loss_clip": 1.10139346, + "balance_loss_mlp": 0.99879259, + "epoch": 0.9229520517059973, + "flos": 70000908954120.0, + "grad_norm": 0.8663436186412219, + "language_loss": 0.60407531, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62552619, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02612305, + "step": 15351, + "time_per_iteration": 3.2823476791381836 + }, + { + "auxiliary_loss_clip": 0.01328827, + "auxiliary_loss_mlp": 0.01025336, + "balance_loss_clip": 1.21969235, + "balance_loss_mlp": 1.01324248, + "epoch": 0.9230121749586653, + "flos": 14797996677720.0, + "grad_norm": 1.8684248226600066, + "language_loss": 0.77982903, + "learning_rate": 6.180729739558233e-08, + "loss": 0.80337071, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12097168, + "step": 15352, + "time_per_iteration": 2.809187650680542 + }, + { + "auxiliary_loss_clip": 0.01334627, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.22195208, + "balance_loss_mlp": 1.02155924, + "epoch": 0.9230722982113332, + "flos": 22972513551120.0, + "grad_norm": 1.9027702964328699, + "language_loss": 0.59951913, + "learning_rate": 6.171126075837585e-08, + "loss": 0.62321734, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13647461, + "step": 15353, + "time_per_iteration": 2.8596770763397217 + }, + { + "auxiliary_loss_clip": 0.01321552, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.2165401, + "balance_loss_mlp": 1.01954758, + "epoch": 0.9231324214640012, + "flos": 18556053841440.0, + "grad_norm": 1.5406526206256927, + "language_loss": 0.74846065, + "learning_rate": 6.161529762127293e-08, + "loss": 0.77198994, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.1182251, + "step": 15354, + "time_per_iteration": 2.791672706604004 + }, + { + "auxiliary_loss_clip": 0.01336466, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.22312641, + "balance_loss_mlp": 1.01614952, + "epoch": 0.9231925447166691, + "flos": 22086765468000.0, + "grad_norm": 2.2555957117499323, + "language_loss": 0.65597534, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67963994, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13848877, + "step": 15355, + "time_per_iteration": 2.831435441970825 + }, + { + "auxiliary_loss_clip": 0.01319898, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.21564603, + "balance_loss_mlp": 1.01787412, + "epoch": 0.9232526679693371, + "flos": 26546699750040.0, + "grad_norm": 1.4021111978282832, + "language_loss": 0.7437945, + "learning_rate": 6.142359186192947e-08, + "loss": 0.7672869, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.11468506, + "step": 15356, + "time_per_iteration": 2.853778123855591 + }, + { + "auxiliary_loss_clip": 0.01327494, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.21936786, + "balance_loss_mlp": 1.0165745, + "epoch": 0.9233127912220052, + "flos": 14760735184440.0, + "grad_norm": 1.7239379747295174, + "language_loss": 0.61389059, + "learning_rate": 6.132784924695844e-08, + "loss": 0.6374597, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.1282959, + "step": 15357, + "time_per_iteration": 2.749154806137085 + }, + { + "auxiliary_loss_clip": 0.01333532, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.22276044, + "balance_loss_mlp": 1.01937199, + "epoch": 0.9233729144746731, + "flos": 25266871892880.0, + "grad_norm": 1.432830502925642, + "language_loss": 0.70024455, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72390735, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 1.10791016, + "router_z_loss_mlp": 0.13372803, + "step": 15358, + "time_per_iteration": 2.8931503295898438 + }, + { + "auxiliary_loss_clip": 0.01324658, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.2167803, + "balance_loss_mlp": 1.01880956, + "epoch": 0.9234330377273411, + "flos": 27854936478000.0, + "grad_norm": 3.011690549218329, + "language_loss": 0.73849595, + "learning_rate": 6.113658456457104e-08, + "loss": 0.76205862, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12799072, + "step": 15359, + "time_per_iteration": 2.8161537647247314 + }, + { + "auxiliary_loss_clip": 0.01330247, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.22249746, + "balance_loss_mlp": 1.0180105, + "epoch": 0.923493160980009, + "flos": 24613789042080.0, + "grad_norm": 2.0156317888916355, + "language_loss": 0.64473766, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66834295, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1227417, + "step": 15360, + "time_per_iteration": 2.8537349700927734 + }, + { + "auxiliary_loss_clip": 0.01147252, + "auxiliary_loss_mlp": 0.01006637, + "balance_loss_clip": 1.10476756, + "balance_loss_mlp": 1.00400209, + "epoch": 0.923553284232677, + "flos": 67717108785960.0, + "grad_norm": 0.7801627179295763, + "language_loss": 0.55215013, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57368898, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 15361, + "time_per_iteration": 3.2021915912628174 + }, + { + "auxiliary_loss_clip": 0.01337938, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.22621727, + "balance_loss_mlp": 1.01837504, + "epoch": 0.9236134074853449, + "flos": 18811930074480.0, + "grad_norm": 1.670410425915254, + "language_loss": 0.70025253, + "learning_rate": 6.085023896425112e-08, + "loss": 0.72394872, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13311768, + "step": 15362, + "time_per_iteration": 2.741069793701172 + }, + { + "auxiliary_loss_clip": 0.01336115, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.22499347, + "balance_loss_mlp": 1.01319337, + "epoch": 0.923673530738013, + "flos": 27788169688200.0, + "grad_norm": 1.4643971919663288, + "language_loss": 0.76066029, + "learning_rate": 6.075493749149463e-08, + "loss": 0.7842927, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13928223, + "step": 15363, + "time_per_iteration": 2.809974193572998 + }, + { + "auxiliary_loss_clip": 0.0132781, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.21992064, + "balance_loss_mlp": 1.01700568, + "epoch": 0.9237336539906809, + "flos": 26802779024880.0, + "grad_norm": 2.0646832919994784, + "language_loss": 0.84014714, + "learning_rate": 6.065970955510514e-08, + "loss": 0.86372125, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12585449, + "step": 15364, + "time_per_iteration": 2.8059024810791016 + }, + { + "auxiliary_loss_clip": 0.01321895, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.21608567, + "balance_loss_mlp": 1.01567578, + "epoch": 0.9237937772433489, + "flos": 23593126695120.0, + "grad_norm": 1.4420809185871222, + "language_loss": 0.68446469, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70796061, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12011719, + "step": 15365, + "time_per_iteration": 2.804511785507202 + }, + { + "auxiliary_loss_clip": 0.01327772, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.22048283, + "balance_loss_mlp": 1.01828885, + "epoch": 0.9238539004960168, + "flos": 26146163246760.0, + "grad_norm": 2.0165325024958185, + "language_loss": 0.63128138, + "learning_rate": 6.046947430586913e-08, + "loss": 0.65487081, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12884521, + "step": 15366, + "time_per_iteration": 2.7684147357940674 + }, + { + "auxiliary_loss_clip": 0.01321395, + "auxiliary_loss_mlp": 0.01029036, + "balance_loss_clip": 1.21592641, + "balance_loss_mlp": 1.01653123, + "epoch": 0.9239140237486848, + "flos": 21072803500440.0, + "grad_norm": 1.6056544366423648, + "language_loss": 0.74645191, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76995623, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12512207, + "step": 15367, + "time_per_iteration": 2.7393736839294434 + }, + { + "auxiliary_loss_clip": 0.0131209, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.20967722, + "balance_loss_mlp": 1.01998973, + "epoch": 0.9239741470013527, + "flos": 24613301741760.0, + "grad_norm": 2.135675352921688, + "language_loss": 0.65148026, + "learning_rate": 6.027953324539759e-08, + "loss": 0.6749258, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.12469482, + "step": 15368, + "time_per_iteration": 2.7699966430664062 + }, + { + "auxiliary_loss_clip": 0.01333562, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.22301638, + "balance_loss_mlp": 1.015921, + "epoch": 0.9240342702540207, + "flos": 24723746145720.0, + "grad_norm": 1.639390995256076, + "language_loss": 0.74992973, + "learning_rate": 6.018467304495401e-08, + "loss": 0.77355409, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12969971, + "step": 15369, + "time_per_iteration": 2.856419801712036 + }, + { + "auxiliary_loss_clip": 0.01337986, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.22515893, + "balance_loss_mlp": 1.02227879, + "epoch": 0.9240943935066888, + "flos": 20854879102800.0, + "grad_norm": 1.7769304279912799, + "language_loss": 0.76802129, + "learning_rate": 6.008988640250145e-08, + "loss": 0.79176491, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.14105225, + "step": 15370, + "time_per_iteration": 2.8278417587280273 + }, + { + "auxiliary_loss_clip": 0.01330088, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.22170484, + "balance_loss_mlp": 1.01929998, + "epoch": 0.9241545167593567, + "flos": 24467707479240.0, + "grad_norm": 2.160371784043738, + "language_loss": 0.67215097, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69576907, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12420654, + "step": 15371, + "time_per_iteration": 2.9060471057891846 + }, + { + "auxiliary_loss_clip": 0.01142495, + "auxiliary_loss_mlp": 0.01006005, + "balance_loss_clip": 1.10019684, + "balance_loss_mlp": 1.00346553, + "epoch": 0.9242146400120247, + "flos": 61842757249800.0, + "grad_norm": 0.7410935083185061, + "language_loss": 0.57751775, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59900278, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02539062, + "step": 15372, + "time_per_iteration": 3.2116687297821045 + }, + { + "auxiliary_loss_clip": 0.01323247, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.21909976, + "balance_loss_mlp": 1.01760578, + "epoch": 0.9242747632646926, + "flos": 22052712035160.0, + "grad_norm": 1.6947701210391903, + "language_loss": 0.70088851, + "learning_rate": 5.98059678590237e-08, + "loss": 0.72440952, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.11248779, + "step": 15373, + "time_per_iteration": 4.246969223022461 + }, + { + "auxiliary_loss_clip": 0.01328092, + "auxiliary_loss_mlp": 0.0103967, + "balance_loss_clip": 1.22064745, + "balance_loss_mlp": 1.02674747, + "epoch": 0.9243348865173606, + "flos": 18482911539120.0, + "grad_norm": 2.202287760321663, + "language_loss": 0.75298059, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77665818, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12921143, + "step": 15374, + "time_per_iteration": 2.755415916442871 + }, + { + "auxiliary_loss_clip": 0.01327192, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.21924376, + "balance_loss_mlp": 1.02064216, + "epoch": 0.9243950097700285, + "flos": 23264067551400.0, + "grad_norm": 2.1369805011337766, + "language_loss": 0.65444791, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67804515, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.11901855, + "step": 15375, + "time_per_iteration": 5.844547271728516 + }, + { + "auxiliary_loss_clip": 0.0132378, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.21841383, + "balance_loss_mlp": 1.02148151, + "epoch": 0.9244551330226966, + "flos": 29754687137040.0, + "grad_norm": 1.7335480460452282, + "language_loss": 0.66580355, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68937659, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12036133, + "step": 15376, + "time_per_iteration": 2.85497784614563 + }, + { + "auxiliary_loss_clip": 0.0114235, + "auxiliary_loss_mlp": 0.01000612, + "balance_loss_clip": 1.10026908, + "balance_loss_mlp": 0.99814463, + "epoch": 0.9245152562753645, + "flos": 68880847677120.0, + "grad_norm": 0.6498087262112566, + "language_loss": 0.61099541, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63242507, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.0246582, + "step": 15377, + "time_per_iteration": 3.3450541496276855 + }, + { + "auxiliary_loss_clip": 0.01326774, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.22027469, + "balance_loss_mlp": 1.02177382, + "epoch": 0.9245753795280325, + "flos": 21584190491280.0, + "grad_norm": 1.667643929776482, + "language_loss": 0.74159765, + "learning_rate": 5.933424178131341e-08, + "loss": 0.7652114, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12835693, + "step": 15378, + "time_per_iteration": 2.7321858406066895 + }, + { + "auxiliary_loss_clip": 0.01330154, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.22126293, + "balance_loss_mlp": 1.02011895, + "epoch": 0.9246355027807004, + "flos": 34502804925480.0, + "grad_norm": 2.033323747787361, + "language_loss": 0.62385678, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64748853, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12915039, + "step": 15379, + "time_per_iteration": 2.8879294395446777 + }, + { + "auxiliary_loss_clip": 0.01321173, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.21619391, + "balance_loss_mlp": 1.01510954, + "epoch": 0.9246956260333684, + "flos": 15957187432560.0, + "grad_norm": 1.8971108798504852, + "language_loss": 0.84186828, + "learning_rate": 5.914606645688591e-08, + "loss": 0.86535281, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.1217041, + "step": 15380, + "time_per_iteration": 2.749171257019043 + }, + { + "auxiliary_loss_clip": 0.01330307, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.21993732, + "balance_loss_mlp": 1.01922715, + "epoch": 0.9247557492860363, + "flos": 23373699788160.0, + "grad_norm": 1.4716510007761652, + "language_loss": 0.73771822, + "learning_rate": 5.905208918895233e-08, + "loss": 0.76134694, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13348389, + "step": 15381, + "time_per_iteration": 2.805727243423462 + }, + { + "auxiliary_loss_clip": 0.01327616, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.22036183, + "balance_loss_mlp": 1.02254367, + "epoch": 0.9248158725387043, + "flos": 23045127944760.0, + "grad_norm": 1.7389615221526136, + "language_loss": 0.79013491, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.81376231, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12585449, + "step": 15382, + "time_per_iteration": 4.391753911972046 + }, + { + "auxiliary_loss_clip": 0.01330527, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.22187078, + "balance_loss_mlp": 1.02164388, + "epoch": 0.9248759957913724, + "flos": 22526756316000.0, + "grad_norm": 1.628755420153703, + "language_loss": 0.75249428, + "learning_rate": 5.886435545946455e-08, + "loss": 0.7761445, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12854004, + "step": 15383, + "time_per_iteration": 2.7666656970977783 + }, + { + "auxiliary_loss_clip": 0.01321908, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.21652389, + "balance_loss_mlp": 1.01713514, + "epoch": 0.9249361190440403, + "flos": 25452773275680.0, + "grad_norm": 1.538644562909507, + "language_loss": 0.76056659, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.78407919, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.12207031, + "step": 15384, + "time_per_iteration": 2.8387255668640137 + }, + { + "auxiliary_loss_clip": 0.01317012, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.21297276, + "balance_loss_mlp": 1.01677918, + "epoch": 0.9249962422967083, + "flos": 12381539332680.0, + "grad_norm": 1.888472957506396, + "language_loss": 0.66526014, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68872815, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.13000488, + "step": 15385, + "time_per_iteration": 2.7138829231262207 + }, + { + "auxiliary_loss_clip": 0.01322127, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.21571171, + "balance_loss_mlp": 1.02049613, + "epoch": 0.9250563655493762, + "flos": 22934561715720.0, + "grad_norm": 1.792579161001859, + "language_loss": 0.80709714, + "learning_rate": 5.85833069345496e-08, + "loss": 0.83064693, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.1237793, + "step": 15386, + "time_per_iteration": 2.884709358215332 + }, + { + "auxiliary_loss_clip": 0.01319539, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.21491683, + "balance_loss_mlp": 1.02176762, + "epoch": 0.9251164888020442, + "flos": 18483277014360.0, + "grad_norm": 1.6192875392499142, + "language_loss": 0.75925434, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.78279746, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.13006592, + "step": 15387, + "time_per_iteration": 2.7858774662017822 + }, + { + "auxiliary_loss_clip": 0.0131776, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.21346974, + "balance_loss_mlp": 1.01879203, + "epoch": 0.9251766120547121, + "flos": 33043654239840.0, + "grad_norm": 1.2823013855343788, + "language_loss": 0.69968641, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72316808, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11621094, + "step": 15388, + "time_per_iteration": 2.8733139038085938 + }, + { + "auxiliary_loss_clip": 0.0133104, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.22178531, + "balance_loss_mlp": 1.01571476, + "epoch": 0.9252367353073802, + "flos": 24393103275960.0, + "grad_norm": 1.6377702064182305, + "language_loss": 0.8242808, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84787607, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12774658, + "step": 15389, + "time_per_iteration": 2.8700528144836426 + }, + { + "auxiliary_loss_clip": 0.01345138, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.22975242, + "balance_loss_mlp": 1.02033257, + "epoch": 0.9252968585600481, + "flos": 18921562311240.0, + "grad_norm": 1.5992514190398175, + "language_loss": 0.79561967, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81941009, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.13574219, + "step": 15390, + "time_per_iteration": 2.822211503982544 + }, + { + "auxiliary_loss_clip": 0.0133421, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.22320628, + "balance_loss_mlp": 1.02219903, + "epoch": 0.9253569818127161, + "flos": 21730312662480.0, + "grad_norm": 1.6614599687014944, + "language_loss": 0.7534824, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77718157, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13513184, + "step": 15391, + "time_per_iteration": 2.8247694969177246 + }, + { + "auxiliary_loss_clip": 0.01329261, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.22029006, + "balance_loss_mlp": 1.01777911, + "epoch": 0.925417105065384, + "flos": 34247009909160.0, + "grad_norm": 2.284203223701264, + "language_loss": 0.52843422, + "learning_rate": 5.80231976856802e-08, + "loss": 0.55204737, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.14276123, + "step": 15392, + "time_per_iteration": 2.8825297355651855 + }, + { + "auxiliary_loss_clip": 0.01328389, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.21970606, + "balance_loss_mlp": 1.01609564, + "epoch": 0.925477228318052, + "flos": 25965297300600.0, + "grad_norm": 1.741191303397346, + "language_loss": 0.76968443, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79325235, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.12322998, + "step": 15393, + "time_per_iteration": 2.871955156326294 + }, + { + "auxiliary_loss_clip": 0.01324795, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.21919394, + "balance_loss_mlp": 1.01861835, + "epoch": 0.9255373515707199, + "flos": 11841987121200.0, + "grad_norm": 1.8277024502563928, + "language_loss": 0.70033967, + "learning_rate": 5.783708368464357e-08, + "loss": 0.72390038, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12664795, + "step": 15394, + "time_per_iteration": 2.7641243934631348 + }, + { + "auxiliary_loss_clip": 0.01329793, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.22148252, + "balance_loss_mlp": 1.01536989, + "epoch": 0.925597474823388, + "flos": 21439367787600.0, + "grad_norm": 1.6995651863585226, + "language_loss": 0.72965515, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75323266, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12597656, + "step": 15395, + "time_per_iteration": 2.803173780441284 + }, + { + "auxiliary_loss_clip": 0.01319336, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.2137711, + "balance_loss_mlp": 1.01643229, + "epoch": 0.925657598076056, + "flos": 22862840706000.0, + "grad_norm": 1.8509415358130876, + "language_loss": 0.71650541, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73998463, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.1217041, + "step": 15396, + "time_per_iteration": 2.772942304611206 + }, + { + "auxiliary_loss_clip": 0.0132688, + "auxiliary_loss_mlp": 0.01030052, + "balance_loss_clip": 1.22031617, + "balance_loss_mlp": 1.01702285, + "epoch": 0.9257177213287239, + "flos": 25709745934440.0, + "grad_norm": 1.577669846007423, + "language_loss": 0.87113261, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89470196, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13037109, + "step": 15397, + "time_per_iteration": 2.8482346534729004 + }, + { + "auxiliary_loss_clip": 0.01143331, + "auxiliary_loss_mlp": 0.01003163, + "balance_loss_clip": 1.10090888, + "balance_loss_mlp": 1.00069547, + "epoch": 0.9257778445813919, + "flos": 59607003417480.0, + "grad_norm": 1.0386007230923215, + "language_loss": 0.55176771, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57323265, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.0246582, + "step": 15398, + "time_per_iteration": 3.176410436630249 + }, + { + "auxiliary_loss_clip": 0.01341516, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.2277385, + "balance_loss_mlp": 1.01958334, + "epoch": 0.9258379678340598, + "flos": 27715230427680.0, + "grad_norm": 1.8230627690010095, + "language_loss": 0.76517987, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78893113, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.14031982, + "step": 15399, + "time_per_iteration": 2.838204860687256 + }, + { + "auxiliary_loss_clip": 0.01315241, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.21243238, + "balance_loss_mlp": 1.01778102, + "epoch": 0.9258980910867278, + "flos": 24868974933000.0, + "grad_norm": 1.3630922529413014, + "language_loss": 0.7799083, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80335182, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.11322021, + "step": 15400, + "time_per_iteration": 2.795536756515503 + }, + { + "auxiliary_loss_clip": 0.01144872, + "auxiliary_loss_mlp": 0.01004594, + "balance_loss_clip": 1.10218835, + "balance_loss_mlp": 1.00212669, + "epoch": 0.9259582143393957, + "flos": 63149450860080.0, + "grad_norm": 0.7232627098352127, + "language_loss": 0.5137409, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53523552, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0246582, + "step": 15401, + "time_per_iteration": 3.2267892360687256 + }, + { + "auxiliary_loss_clip": 0.01312014, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.20953703, + "balance_loss_mlp": 1.01721215, + "epoch": 0.9260183375920638, + "flos": 24131582480880.0, + "grad_norm": 1.685070526104527, + "language_loss": 0.82439905, + "learning_rate": 5.709557384259378e-08, + "loss": 0.847812, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.1206665, + "step": 15402, + "time_per_iteration": 2.7698886394500732 + }, + { + "auxiliary_loss_clip": 0.01144009, + "auxiliary_loss_mlp": 0.01009383, + "balance_loss_clip": 1.10161245, + "balance_loss_mlp": 1.00668895, + "epoch": 0.9260784608447317, + "flos": 63060041586600.0, + "grad_norm": 0.7387079201645904, + "language_loss": 0.51236522, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53389913, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02697754, + "step": 15403, + "time_per_iteration": 3.2685964107513428 + }, + { + "auxiliary_loss_clip": 0.0114684, + "auxiliary_loss_mlp": 0.01009847, + "balance_loss_clip": 1.10442758, + "balance_loss_mlp": 1.00712895, + "epoch": 0.9261385840973997, + "flos": 70602899842080.0, + "grad_norm": 0.7894076222913421, + "language_loss": 0.588081, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60964787, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02722168, + "step": 15404, + "time_per_iteration": 3.2185230255126953 + }, + { + "auxiliary_loss_clip": 0.01328492, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.21890664, + "balance_loss_mlp": 1.01706445, + "epoch": 0.9261987073500676, + "flos": 20234753259120.0, + "grad_norm": 2.5915747820186925, + "language_loss": 0.72468877, + "learning_rate": 5.681872319494596e-08, + "loss": 0.74827307, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.12872314, + "step": 15405, + "time_per_iteration": 2.7896244525909424 + }, + { + "auxiliary_loss_clip": 0.01329646, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.21953321, + "balance_loss_mlp": 1.02361417, + "epoch": 0.9262588306027356, + "flos": 20958379477200.0, + "grad_norm": 1.6544668261712216, + "language_loss": 0.68685901, + "learning_rate": 5.672658701232458e-08, + "loss": 0.71052927, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13781738, + "step": 15406, + "time_per_iteration": 2.777784585952759 + }, + { + "auxiliary_loss_clip": 0.01330316, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.22121346, + "balance_loss_mlp": 1.02059269, + "epoch": 0.9263189538554035, + "flos": 22163115830760.0, + "grad_norm": 2.5930207879646088, + "language_loss": 0.76731437, + "learning_rate": 5.663452451882555e-08, + "loss": 0.79095364, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13012695, + "step": 15407, + "time_per_iteration": 2.752606153488159 + }, + { + "auxiliary_loss_clip": 0.01341372, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.22747171, + "balance_loss_mlp": 1.01793122, + "epoch": 0.9263790771080715, + "flos": 18191844839160.0, + "grad_norm": 2.0734605404379125, + "language_loss": 0.72792041, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.7516517, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13824463, + "step": 15408, + "time_per_iteration": 2.779815912246704 + }, + { + "auxiliary_loss_clip": 0.01320603, + "auxiliary_loss_mlp": 0.01026789, + "balance_loss_clip": 1.21519804, + "balance_loss_mlp": 1.0154345, + "epoch": 0.9264392003607396, + "flos": 48187098857520.0, + "grad_norm": 1.6192848688926547, + "language_loss": 0.68664277, + "learning_rate": 5.645062061315675e-08, + "loss": 0.71011674, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11358643, + "step": 15409, + "time_per_iteration": 3.016740083694458 + }, + { + "auxiliary_loss_clip": 0.01329772, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.22130036, + "balance_loss_mlp": 1.02108145, + "epoch": 0.9264993236134075, + "flos": 26394242674680.0, + "grad_norm": 1.7508186692340988, + "language_loss": 0.75975227, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.78339553, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.13470459, + "step": 15410, + "time_per_iteration": 2.8976597785949707 + }, + { + "auxiliary_loss_clip": 0.01332539, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.22352314, + "balance_loss_mlp": 1.01405334, + "epoch": 0.9265594468660755, + "flos": 20924610302880.0, + "grad_norm": 1.4856667622635107, + "language_loss": 0.82123071, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84482253, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12597656, + "step": 15411, + "time_per_iteration": 4.3526787757873535 + }, + { + "auxiliary_loss_clip": 0.01330597, + "auxiliary_loss_mlp": 0.0103275, + "balance_loss_clip": 1.22214437, + "balance_loss_mlp": 1.02076316, + "epoch": 0.9266195701187434, + "flos": 17528853548520.0, + "grad_norm": 1.7652958171608222, + "language_loss": 0.75567973, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77931321, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.11987305, + "step": 15412, + "time_per_iteration": 2.90875506401062 + }, + { + "auxiliary_loss_clip": 0.01324785, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.21760917, + "balance_loss_mlp": 1.01407909, + "epoch": 0.9266796933714114, + "flos": 33694747281000.0, + "grad_norm": 1.6852310092156217, + "language_loss": 0.67400134, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69751453, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12463379, + "step": 15413, + "time_per_iteration": 4.341021776199341 + }, + { + "auxiliary_loss_clip": 0.01334353, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.2253617, + "balance_loss_mlp": 1.02024603, + "epoch": 0.9267398166240793, + "flos": 18921196836000.0, + "grad_norm": 1.7105479833632564, + "language_loss": 0.75997072, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78364766, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13085938, + "step": 15414, + "time_per_iteration": 4.271642684936523 + }, + { + "auxiliary_loss_clip": 0.01320099, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.21403468, + "balance_loss_mlp": 1.01750898, + "epoch": 0.9267999398767474, + "flos": 20482995120480.0, + "grad_norm": 2.1443541511127555, + "language_loss": 0.81367958, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83717889, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12322998, + "step": 15415, + "time_per_iteration": 2.803382396697998 + }, + { + "auxiliary_loss_clip": 0.01333081, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.22387612, + "balance_loss_mlp": 1.02499819, + "epoch": 0.9268600631294153, + "flos": 24794614379880.0, + "grad_norm": 1.9459232261057318, + "language_loss": 0.54528588, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56900114, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.13433838, + "step": 15416, + "time_per_iteration": 2.7790520191192627 + }, + { + "auxiliary_loss_clip": 0.01322792, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.21747756, + "balance_loss_mlp": 1.0204041, + "epoch": 0.9269201863820833, + "flos": 18701769929040.0, + "grad_norm": 1.5031896078318656, + "language_loss": 0.7191819, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74273813, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12420654, + "step": 15417, + "time_per_iteration": 2.7489731311798096 + }, + { + "auxiliary_loss_clip": 0.01329996, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.22381639, + "balance_loss_mlp": 1.01640606, + "epoch": 0.9269803096347512, + "flos": 20929036614120.0, + "grad_norm": 2.055002184865818, + "language_loss": 0.75424051, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77783632, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13183594, + "step": 15418, + "time_per_iteration": 2.726942777633667 + }, + { + "auxiliary_loss_clip": 0.01321239, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.21505666, + "balance_loss_mlp": 1.0152216, + "epoch": 0.9270404328874192, + "flos": 28008733629240.0, + "grad_norm": 1.430532330729651, + "language_loss": 0.76482242, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78831685, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12988281, + "step": 15419, + "time_per_iteration": 2.8657491207122803 + }, + { + "auxiliary_loss_clip": 0.01314071, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.21020794, + "balance_loss_mlp": 1.02037716, + "epoch": 0.9271005561400871, + "flos": 25896053400840.0, + "grad_norm": 1.6216842977497583, + "language_loss": 0.75122344, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.77468526, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11730957, + "step": 15420, + "time_per_iteration": 2.786604881286621 + }, + { + "auxiliary_loss_clip": 0.01334412, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.22356355, + "balance_loss_mlp": 1.01967406, + "epoch": 0.9271606793927551, + "flos": 27059792292000.0, + "grad_norm": 1.520134701622585, + "language_loss": 0.7682941, + "learning_rate": 5.535338891759389e-08, + "loss": 0.7919718, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13677979, + "step": 15421, + "time_per_iteration": 4.294287443161011 + }, + { + "auxiliary_loss_clip": 0.01325511, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.21846783, + "balance_loss_mlp": 1.01505661, + "epoch": 0.9272208026454232, + "flos": 26215001062920.0, + "grad_norm": 1.823309216989619, + "language_loss": 0.72880578, + "learning_rate": 5.526243217829041e-08, + "loss": 0.75233698, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12554932, + "step": 15422, + "time_per_iteration": 2.8634727001190186 + }, + { + "auxiliary_loss_clip": 0.0133229, + "auxiliary_loss_mlp": 0.01038949, + "balance_loss_clip": 1.22201037, + "balance_loss_mlp": 1.02522826, + "epoch": 0.9272809258980911, + "flos": 12462275398320.0, + "grad_norm": 1.917281585451703, + "language_loss": 0.78129864, + "learning_rate": 5.517154918363065e-08, + "loss": 0.80501103, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13708496, + "step": 15423, + "time_per_iteration": 2.7335174083709717 + }, + { + "auxiliary_loss_clip": 0.01333092, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.22364879, + "balance_loss_mlp": 1.01521659, + "epoch": 0.9273410491507591, + "flos": 22862069147160.0, + "grad_norm": 3.3363353693732765, + "language_loss": 0.75751984, + "learning_rate": 5.508073993706053e-08, + "loss": 0.78113377, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13092041, + "step": 15424, + "time_per_iteration": 2.833991289138794 + }, + { + "auxiliary_loss_clip": 0.01145116, + "auxiliary_loss_mlp": 0.01003804, + "balance_loss_clip": 1.10258222, + "balance_loss_mlp": 1.00136054, + "epoch": 0.927401172403427, + "flos": 47678327782920.0, + "grad_norm": 0.7789744558352726, + "language_loss": 0.60686505, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62835425, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02441406, + "step": 15425, + "time_per_iteration": 3.063232421875 + }, + { + "auxiliary_loss_clip": 0.01329806, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.22209013, + "balance_loss_mlp": 1.02075124, + "epoch": 0.927461295656095, + "flos": 29978580963600.0, + "grad_norm": 1.366774238315682, + "language_loss": 0.71032614, + "learning_rate": 5.489934270196106e-08, + "loss": 0.73396319, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13153076, + "step": 15426, + "time_per_iteration": 2.811527729034424 + }, + { + "auxiliary_loss_clip": 0.01321944, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.21573114, + "balance_loss_mlp": 1.01664793, + "epoch": 0.9275214189087629, + "flos": 20380225696560.0, + "grad_norm": 1.7459490787308216, + "language_loss": 0.83103144, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85453606, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.11877441, + "step": 15427, + "time_per_iteration": 2.7815465927124023 + }, + { + "auxiliary_loss_clip": 0.01327673, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.22080016, + "balance_loss_mlp": 1.01898694, + "epoch": 0.927581542161431, + "flos": 22388390341560.0, + "grad_norm": 1.4652511830529236, + "language_loss": 0.7691288, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79272735, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13208008, + "step": 15428, + "time_per_iteration": 2.779886484146118 + }, + { + "auxiliary_loss_clip": 0.01320389, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.21378303, + "balance_loss_mlp": 1.01948893, + "epoch": 0.9276416654140989, + "flos": 23957904214440.0, + "grad_norm": 1.7668876535433689, + "language_loss": 0.75015199, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.77368236, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.1317749, + "step": 15429, + "time_per_iteration": 2.8653321266174316 + }, + { + "auxiliary_loss_clip": 0.01321593, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.21685982, + "balance_loss_mlp": 1.02508354, + "epoch": 0.9277017886667669, + "flos": 13921710342480.0, + "grad_norm": 1.689803069315898, + "language_loss": 0.75179243, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77538574, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12652588, + "step": 15430, + "time_per_iteration": 2.9196012020111084 + }, + { + "auxiliary_loss_clip": 0.01333425, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.22395015, + "balance_loss_mlp": 1.01644242, + "epoch": 0.9277619119194348, + "flos": 35673731496360.0, + "grad_norm": 1.5262767684084317, + "language_loss": 0.76951015, + "learning_rate": 5.444714044648391e-08, + "loss": 0.79314518, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13635254, + "step": 15431, + "time_per_iteration": 3.0304768085479736 + }, + { + "auxiliary_loss_clip": 0.0131874, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.21416926, + "balance_loss_mlp": 1.01959085, + "epoch": 0.9278220351721028, + "flos": 23846444601480.0, + "grad_norm": 1.8340566522759285, + "language_loss": 0.71072626, + "learning_rate": 5.4356921308363e-08, + "loss": 0.73423213, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.12255859, + "step": 15432, + "time_per_iteration": 2.8526225090026855 + }, + { + "auxiliary_loss_clip": 0.01332599, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.22333348, + "balance_loss_mlp": 1.01936245, + "epoch": 0.9278821584247707, + "flos": 15231896271720.0, + "grad_norm": 2.266455892087081, + "language_loss": 0.83521992, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.85886312, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12359619, + "step": 15433, + "time_per_iteration": 2.830707311630249 + }, + { + "auxiliary_loss_clip": 0.01317107, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.21406627, + "balance_loss_mlp": 1.01521218, + "epoch": 0.9279422816774388, + "flos": 24686850127680.0, + "grad_norm": 1.8767118859784757, + "language_loss": 0.67094809, + "learning_rate": 5.417670437248056e-08, + "loss": 0.69439018, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.11883545, + "step": 15434, + "time_per_iteration": 2.894819736480713 + }, + { + "auxiliary_loss_clip": 0.01310281, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.20947814, + "balance_loss_mlp": 1.02216864, + "epoch": 0.9280024049301068, + "flos": 19173540141720.0, + "grad_norm": 1.6630174101429682, + "language_loss": 0.69189602, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71533775, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.11730957, + "step": 15435, + "time_per_iteration": 2.8456146717071533 + }, + { + "auxiliary_loss_clip": 0.01333458, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.22497153, + "balance_loss_mlp": 1.01734233, + "epoch": 0.9280625281827747, + "flos": 11396514144600.0, + "grad_norm": 1.9898130570037544, + "language_loss": 0.7332257, + "learning_rate": 5.399678257985263e-08, + "loss": 0.75686049, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12677002, + "step": 15436, + "time_per_iteration": 2.7774603366851807 + }, + { + "auxiliary_loss_clip": 0.01323439, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.21696317, + "balance_loss_mlp": 1.0179286, + "epoch": 0.9281226514354427, + "flos": 24790391110440.0, + "grad_norm": 1.9219120832176442, + "language_loss": 0.67865056, + "learning_rate": 5.390693237078925e-08, + "loss": 0.70218694, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.1227417, + "step": 15437, + "time_per_iteration": 2.7631475925445557 + }, + { + "auxiliary_loss_clip": 0.0133078, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.22118807, + "balance_loss_mlp": 1.01917577, + "epoch": 0.9281827746881106, + "flos": 15086992351320.0, + "grad_norm": 2.0755621493435563, + "language_loss": 0.71541089, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73904765, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13708496, + "step": 15438, + "time_per_iteration": 2.7261242866516113 + }, + { + "auxiliary_loss_clip": 0.01330011, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.22172117, + "balance_loss_mlp": 1.01713896, + "epoch": 0.9282428979407786, + "flos": 24140556928440.0, + "grad_norm": 1.6980316929208958, + "language_loss": 0.65008211, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.67367703, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12347412, + "step": 15439, + "time_per_iteration": 2.7765228748321533 + }, + { + "auxiliary_loss_clip": 0.01327614, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.22201025, + "balance_loss_mlp": 1.01626337, + "epoch": 0.9283030211934465, + "flos": 24828139904040.0, + "grad_norm": 1.9409906482636428, + "language_loss": 0.70648813, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7300514, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12426758, + "step": 15440, + "time_per_iteration": 2.8003058433532715 + }, + { + "auxiliary_loss_clip": 0.01334311, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.2228651, + "balance_loss_mlp": 1.02329862, + "epoch": 0.9283631444461146, + "flos": 23985460309680.0, + "grad_norm": 1.6986859982105584, + "language_loss": 0.77175516, + "learning_rate": 5.354826952900682e-08, + "loss": 0.79546273, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13153076, + "step": 15441, + "time_per_iteration": 2.764829158782959 + }, + { + "auxiliary_loss_clip": 0.01316935, + "auxiliary_loss_mlp": 0.01026032, + "balance_loss_clip": 1.21414757, + "balance_loss_mlp": 1.01567924, + "epoch": 0.9284232676987825, + "flos": 22789779620400.0, + "grad_norm": 1.6193411781642608, + "language_loss": 0.6497699, + "learning_rate": 5.345878833417949e-08, + "loss": 0.67319953, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.10357666, + "step": 15442, + "time_per_iteration": 2.8885855674743652 + }, + { + "auxiliary_loss_clip": 0.01334992, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.22319102, + "balance_loss_mlp": 1.02306414, + "epoch": 0.9284833909514505, + "flos": 19505279437200.0, + "grad_norm": 1.9620025103892693, + "language_loss": 0.81379467, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83750707, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.1317749, + "step": 15443, + "time_per_iteration": 2.761014938354492 + }, + { + "auxiliary_loss_clip": 0.01329015, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.22064841, + "balance_loss_mlp": 1.01777732, + "epoch": 0.9285435142041184, + "flos": 23191290724320.0, + "grad_norm": 1.8412866747966319, + "language_loss": 0.65559697, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67919278, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12799072, + "step": 15444, + "time_per_iteration": 2.7397358417510986 + }, + { + "auxiliary_loss_clip": 0.01325388, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.21646905, + "balance_loss_mlp": 1.01695871, + "epoch": 0.9286036374567864, + "flos": 17679970548000.0, + "grad_norm": 2.1835545817431137, + "language_loss": 0.73728812, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.76083392, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12231445, + "step": 15445, + "time_per_iteration": 2.731107234954834 + }, + { + "auxiliary_loss_clip": 0.01327556, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.21984231, + "balance_loss_mlp": 1.01656508, + "epoch": 0.9286637607094543, + "flos": 20891612687400.0, + "grad_norm": 1.6638420243893763, + "language_loss": 0.71374941, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73732066, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12994385, + "step": 15446, + "time_per_iteration": 2.7701632976531982 + }, + { + "auxiliary_loss_clip": 0.01342566, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.22941613, + "balance_loss_mlp": 1.01523972, + "epoch": 0.9287238839621224, + "flos": 19030707247680.0, + "grad_norm": 1.7739705237026253, + "language_loss": 0.69784987, + "learning_rate": 5.301248962337523e-08, + "loss": 0.72155964, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.1317749, + "step": 15447, + "time_per_iteration": 2.856214761734009 + }, + { + "auxiliary_loss_clip": 0.01314227, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.21250439, + "balance_loss_mlp": 1.01701999, + "epoch": 0.9287840072147904, + "flos": 20561741376480.0, + "grad_norm": 1.5663223212072905, + "language_loss": 0.73221821, + "learning_rate": 5.292345135757403e-08, + "loss": 0.75564814, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.11743164, + "step": 15448, + "time_per_iteration": 2.861433267593384 + }, + { + "auxiliary_loss_clip": 0.01322011, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.21560287, + "balance_loss_mlp": 1.01331878, + "epoch": 0.9288441304674583, + "flos": 21255699864600.0, + "grad_norm": 1.5120888844176252, + "language_loss": 0.74596453, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76944989, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.13201904, + "step": 15449, + "time_per_iteration": 2.807018280029297 + }, + { + "auxiliary_loss_clip": 0.01323931, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.21721435, + "balance_loss_mlp": 1.01513875, + "epoch": 0.9289042537201263, + "flos": 27675248174280.0, + "grad_norm": 1.999083037743674, + "language_loss": 0.67707199, + "learning_rate": 5.27455963293586e-08, + "loss": 0.70059413, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.13134766, + "step": 15450, + "time_per_iteration": 4.251011610031128 + }, + { + "auxiliary_loss_clip": 0.01326413, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.21770144, + "balance_loss_mlp": 1.01439071, + "epoch": 0.9289643769727942, + "flos": 19322464289760.0, + "grad_norm": 2.0677171694920755, + "language_loss": 0.72344548, + "learning_rate": 5.265677957368875e-08, + "loss": 0.74698246, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12890625, + "step": 15451, + "time_per_iteration": 2.7596840858459473 + }, + { + "auxiliary_loss_clip": 0.01326479, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.21847236, + "balance_loss_mlp": 1.02107561, + "epoch": 0.9290245002254622, + "flos": 14061335176080.0, + "grad_norm": 1.8826100566978894, + "language_loss": 0.73324585, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75684804, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12670898, + "step": 15452, + "time_per_iteration": 4.29036021232605 + }, + { + "auxiliary_loss_clip": 0.01324625, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.21924388, + "balance_loss_mlp": 1.0179354, + "epoch": 0.9290846234781301, + "flos": 20051897503320.0, + "grad_norm": 1.784276959394936, + "language_loss": 0.74100351, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76455611, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.1270752, + "step": 15453, + "time_per_iteration": 4.380360126495361 + }, + { + "auxiliary_loss_clip": 0.01145972, + "auxiliary_loss_mlp": 0.01008858, + "balance_loss_clip": 1.10322523, + "balance_loss_mlp": 1.00629449, + "epoch": 0.9291447467307982, + "flos": 61240847578560.0, + "grad_norm": 0.8241774638579517, + "language_loss": 0.60663939, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62818766, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02563477, + "step": 15454, + "time_per_iteration": 3.2484543323516846 + }, + { + "auxiliary_loss_clip": 0.01328522, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.21993899, + "balance_loss_mlp": 1.02364993, + "epoch": 0.9292048699834661, + "flos": 20557558715400.0, + "grad_norm": 4.340882875725046, + "language_loss": 0.69214576, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71579731, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12969971, + "step": 15455, + "time_per_iteration": 2.81742525100708 + }, + { + "auxiliary_loss_clip": 0.01329172, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.22185874, + "balance_loss_mlp": 1.01908851, + "epoch": 0.9292649932361341, + "flos": 23629007504160.0, + "grad_norm": 1.93979021041506, + "language_loss": 0.64974296, + "learning_rate": 5.22138035143509e-08, + "loss": 0.67335492, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12945557, + "step": 15456, + "time_per_iteration": 2.909843683242798 + }, + { + "auxiliary_loss_clip": 0.01325199, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.21950889, + "balance_loss_mlp": 1.01722908, + "epoch": 0.929325116488802, + "flos": 15013768832280.0, + "grad_norm": 1.7820190288438233, + "language_loss": 0.68322122, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70677137, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12597656, + "step": 15457, + "time_per_iteration": 2.766281843185425 + }, + { + "auxiliary_loss_clip": 0.01326857, + "auxiliary_loss_mlp": 0.01025806, + "balance_loss_clip": 1.2178793, + "balance_loss_mlp": 1.01352763, + "epoch": 0.92938523974147, + "flos": 17971849415160.0, + "grad_norm": 1.8854766568207, + "language_loss": 0.80954874, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83307534, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.12280273, + "step": 15458, + "time_per_iteration": 4.26290225982666 + }, + { + "auxiliary_loss_clip": 0.01324709, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.2173624, + "balance_loss_mlp": 1.01864457, + "epoch": 0.9294453629941379, + "flos": 23008272535080.0, + "grad_norm": 1.6537760846916414, + "language_loss": 0.72429472, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74785179, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12353516, + "step": 15459, + "time_per_iteration": 2.7766404151916504 + }, + { + "auxiliary_loss_clip": 0.01324102, + "auxiliary_loss_mlp": 0.01032974, + "balance_loss_clip": 1.21720088, + "balance_loss_mlp": 1.02027237, + "epoch": 0.929505486246806, + "flos": 17059601054160.0, + "grad_norm": 2.9238605441133703, + "language_loss": 0.59471273, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.61828351, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1270752, + "step": 15460, + "time_per_iteration": 2.7857987880706787 + }, + { + "auxiliary_loss_clip": 0.01329908, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.22033858, + "balance_loss_mlp": 1.01686347, + "epoch": 0.9295656094994739, + "flos": 27345620513520.0, + "grad_norm": 1.6542796808728186, + "language_loss": 0.8071003, + "learning_rate": 5.177267396106733e-08, + "loss": 0.83070219, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13421631, + "step": 15461, + "time_per_iteration": 2.8803484439849854 + }, + { + "auxiliary_loss_clip": 0.01325274, + "auxiliary_loss_mlp": 0.01024331, + "balance_loss_clip": 1.21956241, + "balance_loss_mlp": 1.01220751, + "epoch": 0.9296257327521419, + "flos": 21476020155480.0, + "grad_norm": 2.1152158451988328, + "language_loss": 0.78263968, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80613571, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.12121582, + "step": 15462, + "time_per_iteration": 2.840510129928589 + }, + { + "auxiliary_loss_clip": 0.0132586, + "auxiliary_loss_mlp": 0.01031329, + "balance_loss_clip": 1.21839809, + "balance_loss_mlp": 1.01925957, + "epoch": 0.9296858560048099, + "flos": 16367023250280.0, + "grad_norm": 1.8479531166575969, + "language_loss": 0.62816453, + "learning_rate": 5.159673925518282e-08, + "loss": 0.65173644, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.1206665, + "step": 15463, + "time_per_iteration": 2.7362635135650635 + }, + { + "auxiliary_loss_clip": 0.01326496, + "auxiliary_loss_mlp": 0.01027453, + "balance_loss_clip": 1.21988368, + "balance_loss_mlp": 1.0156219, + "epoch": 0.9297459792574778, + "flos": 29864238157080.0, + "grad_norm": 1.4047093931452517, + "language_loss": 0.71353805, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73707747, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.1184082, + "step": 15464, + "time_per_iteration": 2.839574098587036 + }, + { + "auxiliary_loss_clip": 0.01333171, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.22443008, + "balance_loss_mlp": 1.01521635, + "epoch": 0.9298061025101458, + "flos": 15929144037000.0, + "grad_norm": 1.964823253972459, + "language_loss": 0.7747041, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79831314, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.12524414, + "step": 15465, + "time_per_iteration": 2.7589473724365234 + }, + { + "auxiliary_loss_clip": 0.01145633, + "auxiliary_loss_mlp": 0.01012473, + "balance_loss_clip": 1.10306644, + "balance_loss_mlp": 1.01010025, + "epoch": 0.9298662257628137, + "flos": 64113782765760.0, + "grad_norm": 0.6929966080488967, + "language_loss": 0.56506509, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58664614, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02368164, + "step": 15466, + "time_per_iteration": 3.4252755641937256 + }, + { + "auxiliary_loss_clip": 0.01329403, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.22046638, + "balance_loss_mlp": 1.02094865, + "epoch": 0.9299263490154818, + "flos": 24285623282280.0, + "grad_norm": 1.5095999751425675, + "language_loss": 0.73119593, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.75483209, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13269043, + "step": 15467, + "time_per_iteration": 2.832080841064453 + }, + { + "auxiliary_loss_clip": 0.01329584, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.22140145, + "balance_loss_mlp": 1.0200665, + "epoch": 0.9299864722681497, + "flos": 23299542276840.0, + "grad_norm": 1.5340814516037968, + "language_loss": 0.71957386, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.7432071, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13677979, + "step": 15468, + "time_per_iteration": 2.8275961875915527 + }, + { + "auxiliary_loss_clip": 0.01331649, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.22184467, + "balance_loss_mlp": 1.01416278, + "epoch": 0.9300465955208177, + "flos": 21400766218440.0, + "grad_norm": 1.6487602098956564, + "language_loss": 0.75612491, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77971238, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12945557, + "step": 15469, + "time_per_iteration": 2.8879899978637695 + }, + { + "auxiliary_loss_clip": 0.01330718, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.22106755, + "balance_loss_mlp": 1.02307987, + "epoch": 0.9301067187734856, + "flos": 24576811807320.0, + "grad_norm": 1.7493697215768558, + "language_loss": 0.75958061, + "learning_rate": 5.098329529416379e-08, + "loss": 0.78324831, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12982178, + "step": 15470, + "time_per_iteration": 2.9262807369232178 + }, + { + "auxiliary_loss_clip": 0.01328356, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.22199488, + "balance_loss_mlp": 1.01985669, + "epoch": 0.9301668420261536, + "flos": 22201636183200.0, + "grad_norm": 1.5328052276538102, + "language_loss": 0.74832648, + "learning_rate": 5.089595604367902e-08, + "loss": 0.77193159, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.1229248, + "step": 15471, + "time_per_iteration": 2.8328957557678223 + }, + { + "auxiliary_loss_clip": 0.01325428, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.2188952, + "balance_loss_mlp": 1.01519597, + "epoch": 0.9302269652788215, + "flos": 17751975816240.0, + "grad_norm": 2.407589253094397, + "language_loss": 0.69542593, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71896195, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12982178, + "step": 15472, + "time_per_iteration": 2.8190195560455322 + }, + { + "auxiliary_loss_clip": 0.01312565, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.20985413, + "balance_loss_mlp": 1.01653063, + "epoch": 0.9302870885314896, + "flos": 19395525375360.0, + "grad_norm": 1.6849228535226468, + "language_loss": 0.88583314, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90924084, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.11676025, + "step": 15473, + "time_per_iteration": 2.840651273727417 + }, + { + "auxiliary_loss_clip": 0.01334297, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.22357821, + "balance_loss_mlp": 1.02019906, + "epoch": 0.9303472117841575, + "flos": 21765056437440.0, + "grad_norm": 1.9617327331995882, + "language_loss": 0.64438254, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66807067, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14312744, + "step": 15474, + "time_per_iteration": 3.030486822128296 + }, + { + "auxiliary_loss_clip": 0.01326246, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.21843505, + "balance_loss_mlp": 1.01985645, + "epoch": 0.9304073350368255, + "flos": 19614505590360.0, + "grad_norm": 1.7465769829724471, + "language_loss": 0.74940729, + "learning_rate": 5.054733817702339e-08, + "loss": 0.77299583, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12744141, + "step": 15475, + "time_per_iteration": 2.7796387672424316 + }, + { + "auxiliary_loss_clip": 0.01323672, + "auxiliary_loss_mlp": 0.0102661, + "balance_loss_clip": 1.2163887, + "balance_loss_mlp": 1.01416504, + "epoch": 0.9304674582894935, + "flos": 30447183724200.0, + "grad_norm": 2.698368126134116, + "language_loss": 0.67038894, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.69389176, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12445068, + "step": 15476, + "time_per_iteration": 2.8628692626953125 + }, + { + "auxiliary_loss_clip": 0.01332281, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.22386169, + "balance_loss_mlp": 1.01859069, + "epoch": 0.9305275815421614, + "flos": 17790252518520.0, + "grad_norm": 1.7657514107204348, + "language_loss": 0.68873751, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71237808, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13201904, + "step": 15477, + "time_per_iteration": 2.770876884460449 + }, + { + "auxiliary_loss_clip": 0.01321714, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.21615136, + "balance_loss_mlp": 1.01730442, + "epoch": 0.9305877047948294, + "flos": 25303564869120.0, + "grad_norm": 1.9243869017109103, + "language_loss": 0.58388215, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60739511, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12280273, + "step": 15478, + "time_per_iteration": 2.8287012577056885 + }, + { + "auxiliary_loss_clip": 0.01340528, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.22697043, + "balance_loss_mlp": 1.01862931, + "epoch": 0.9306478280474973, + "flos": 16980936014880.0, + "grad_norm": 2.3345678373445846, + "language_loss": 0.79506302, + "learning_rate": 5.01999030853566e-08, + "loss": 0.81879413, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.1394043, + "step": 15479, + "time_per_iteration": 2.7101330757141113 + }, + { + "auxiliary_loss_clip": 0.01330747, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.2224977, + "balance_loss_mlp": 1.02042186, + "epoch": 0.9307079513001654, + "flos": 35670198569040.0, + "grad_norm": 1.7534391118954304, + "language_loss": 0.68932223, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.71296048, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12658691, + "step": 15480, + "time_per_iteration": 2.880786418914795 + }, + { + "auxiliary_loss_clip": 0.01330674, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.22326946, + "balance_loss_mlp": 1.0240761, + "epoch": 0.9307680745528333, + "flos": 19212832053000.0, + "grad_norm": 1.4872723169474438, + "language_loss": 0.67873144, + "learning_rate": 5.002662914604583e-08, + "loss": 0.70240331, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.12438965, + "step": 15481, + "time_per_iteration": 2.7457802295684814 + }, + { + "auxiliary_loss_clip": 0.01322831, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.21715617, + "balance_loss_mlp": 1.01436806, + "epoch": 0.9308281978055013, + "flos": 19067359615560.0, + "grad_norm": 1.922434852659296, + "language_loss": 0.75485146, + "learning_rate": 4.994010308952701e-08, + "loss": 0.77834588, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.12231445, + "step": 15482, + "time_per_iteration": 2.8205630779266357 + }, + { + "auxiliary_loss_clip": 0.01316582, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.2118175, + "balance_loss_mlp": 1.01953578, + "epoch": 0.9308883210581692, + "flos": 20526388476120.0, + "grad_norm": 1.9845432341494105, + "language_loss": 0.79730552, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82079035, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.1237793, + "step": 15483, + "time_per_iteration": 2.8686141967773438 + }, + { + "auxiliary_loss_clip": 0.01326302, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.21849275, + "balance_loss_mlp": 1.01776123, + "epoch": 0.9309484443108372, + "flos": 13004751411720.0, + "grad_norm": 2.0637680666988834, + "language_loss": 0.75134945, + "learning_rate": 4.976727281916782e-08, + "loss": 0.77491611, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1260376, + "step": 15484, + "time_per_iteration": 2.7337968349456787 + }, + { + "auxiliary_loss_clip": 0.0132993, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.22008729, + "balance_loss_mlp": 1.02254987, + "epoch": 0.9310085675635051, + "flos": 12571582768200.0, + "grad_norm": 2.816691276517382, + "language_loss": 0.76792794, + "learning_rate": 4.968096861188087e-08, + "loss": 0.79158199, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12921143, + "step": 15485, + "time_per_iteration": 2.7795443534851074 + }, + { + "auxiliary_loss_clip": 0.01331278, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.22072911, + "balance_loss_mlp": 1.0187366, + "epoch": 0.9310686908161732, + "flos": 23482885332960.0, + "grad_norm": 1.7306384643793298, + "language_loss": 0.77562547, + "learning_rate": 4.959473836088723e-08, + "loss": 0.79926062, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13513184, + "step": 15486, + "time_per_iteration": 2.8150036334991455 + }, + { + "auxiliary_loss_clip": 0.01331336, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.22124672, + "balance_loss_mlp": 1.0166316, + "epoch": 0.9311288140688411, + "flos": 24175625570280.0, + "grad_norm": 2.104431747273315, + "language_loss": 0.77165246, + "learning_rate": 4.950858206945674e-08, + "loss": 0.7952621, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13000488, + "step": 15487, + "time_per_iteration": 2.845212697982788 + }, + { + "auxiliary_loss_clip": 0.01327401, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.21996295, + "balance_loss_mlp": 1.01532376, + "epoch": 0.9311889373215091, + "flos": 35597015658360.0, + "grad_norm": 1.9720724130557272, + "language_loss": 0.67305779, + "learning_rate": 4.942249974085633e-08, + "loss": 0.6966182, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13311768, + "step": 15488, + "time_per_iteration": 2.9255897998809814 + }, + { + "auxiliary_loss_clip": 0.01314584, + "auxiliary_loss_mlp": 0.01026001, + "balance_loss_clip": 1.21208942, + "balance_loss_mlp": 1.01391292, + "epoch": 0.9312490605741771, + "flos": 20235443601240.0, + "grad_norm": 1.841859698525482, + "language_loss": 0.7533496, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77675545, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.12084961, + "step": 15489, + "time_per_iteration": 4.254619598388672 + }, + { + "auxiliary_loss_clip": 0.01330907, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.22075117, + "balance_loss_mlp": 1.01962614, + "epoch": 0.931309183826845, + "flos": 13954098832560.0, + "grad_norm": 2.3661716388405964, + "language_loss": 0.81397009, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83761179, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 1.10107422, + "router_z_loss_mlp": 0.13623047, + "step": 15490, + "time_per_iteration": 2.794830083847046 + }, + { + "auxiliary_loss_clip": 0.01335093, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.22450769, + "balance_loss_mlp": 1.01890993, + "epoch": 0.931369307079513, + "flos": 20161529740080.0, + "grad_norm": 1.55164939730094, + "language_loss": 0.72182703, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74549842, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13128662, + "step": 15491, + "time_per_iteration": 4.282211065292358 + }, + { + "auxiliary_loss_clip": 0.0131963, + "auxiliary_loss_mlp": 0.01024436, + "balance_loss_clip": 1.21590734, + "balance_loss_mlp": 1.01228261, + "epoch": 0.931429430332181, + "flos": 25344156247920.0, + "grad_norm": 2.0112393085818967, + "language_loss": 0.7432493, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76668996, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 1.03759766, + "router_z_loss_mlp": 0.12158203, + "step": 15492, + "time_per_iteration": 4.392824649810791 + }, + { + "auxiliary_loss_clip": 0.01145061, + "auxiliary_loss_mlp": 0.01017178, + "balance_loss_clip": 1.10289264, + "balance_loss_mlp": 1.01469851, + "epoch": 0.931489553584849, + "flos": 71241038816400.0, + "grad_norm": 0.7098827439983305, + "language_loss": 0.53521025, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55683255, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02478027, + "step": 15493, + "time_per_iteration": 3.2264480590820312 + }, + { + "auxiliary_loss_clip": 0.01325556, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.21895552, + "balance_loss_mlp": 1.02338028, + "epoch": 0.9315496768375169, + "flos": 14647245153480.0, + "grad_norm": 1.8064942809526359, + "language_loss": 0.71496755, + "learning_rate": 4.890755917128531e-08, + "loss": 0.73858172, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12487793, + "step": 15494, + "time_per_iteration": 2.822610378265381 + }, + { + "auxiliary_loss_clip": 0.01332337, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.22248793, + "balance_loss_mlp": 1.01543355, + "epoch": 0.9316098000901849, + "flos": 28335843571680.0, + "grad_norm": 1.53026850985939, + "language_loss": 0.68181568, + "learning_rate": 4.882199467373671e-08, + "loss": 0.7054252, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13183594, + "step": 15495, + "time_per_iteration": 2.8325822353363037 + }, + { + "auxiliary_loss_clip": 0.01319866, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.21471274, + "balance_loss_mlp": 1.01814055, + "epoch": 0.9316699233428528, + "flos": 28518658719120.0, + "grad_norm": 1.6418875931423722, + "language_loss": 0.62004197, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.6435374, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11541748, + "step": 15496, + "time_per_iteration": 2.8729374408721924 + }, + { + "auxiliary_loss_clip": 0.01325379, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.21741509, + "balance_loss_mlp": 1.01855314, + "epoch": 0.9317300465955208, + "flos": 33700026367800.0, + "grad_norm": 1.4115527487969453, + "language_loss": 0.77097666, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79454452, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12860107, + "step": 15497, + "time_per_iteration": 4.354463815689087 + }, + { + "auxiliary_loss_clip": 0.01336581, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.22566843, + "balance_loss_mlp": 1.01830959, + "epoch": 0.9317901698481887, + "flos": 23663345195520.0, + "grad_norm": 1.8700756673444812, + "language_loss": 0.67039216, + "learning_rate": 4.856574512724898e-08, + "loss": 0.69407392, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13299561, + "step": 15498, + "time_per_iteration": 2.7997660636901855 + }, + { + "auxiliary_loss_clip": 0.01326827, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.21868026, + "balance_loss_mlp": 1.01948869, + "epoch": 0.9318502931008568, + "flos": 20964998639880.0, + "grad_norm": 1.6697221301325422, + "language_loss": 0.80330956, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82690501, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13238525, + "step": 15499, + "time_per_iteration": 2.8027868270874023 + }, + { + "auxiliary_loss_clip": 0.01316318, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.21201885, + "balance_loss_mlp": 1.01791346, + "epoch": 0.9319104163535247, + "flos": 23446354790160.0, + "grad_norm": 1.5525660368792158, + "language_loss": 0.77199948, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79546797, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.12615967, + "step": 15500, + "time_per_iteration": 2.9021713733673096 + }, + { + "auxiliary_loss_clip": 0.01320182, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.21575403, + "balance_loss_mlp": 1.01572371, + "epoch": 0.9319705396061927, + "flos": 22352712574320.0, + "grad_norm": 1.673399552841472, + "language_loss": 0.72589701, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.749376, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.11987305, + "step": 15501, + "time_per_iteration": 2.7949345111846924 + }, + { + "auxiliary_loss_clip": 0.01333796, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.22424412, + "balance_loss_mlp": 1.01869726, + "epoch": 0.9320306628588607, + "flos": 20998158688800.0, + "grad_norm": 1.7043683340658504, + "language_loss": 0.66215301, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68580252, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12451172, + "step": 15502, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.01332329, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.22256279, + "balance_loss_mlp": 1.01718593, + "epoch": 0.9320907861115286, + "flos": 24544098450360.0, + "grad_norm": 1.7387176244720302, + "language_loss": 0.65911949, + "learning_rate": 4.814014256446586e-08, + "loss": 0.68273997, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.12542725, + "step": 15503, + "time_per_iteration": 2.8393049240112305 + }, + { + "auxiliary_loss_clip": 0.01328327, + "auxiliary_loss_mlp": 0.01028538, + "balance_loss_clip": 1.21845746, + "balance_loss_mlp": 1.01509094, + "epoch": 0.9321509093641966, + "flos": 19789320890880.0, + "grad_norm": 2.2229395623633947, + "language_loss": 0.75390172, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77747035, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13439941, + "step": 15504, + "time_per_iteration": 2.8070545196533203 + }, + { + "auxiliary_loss_clip": 0.01322177, + "auxiliary_loss_mlp": 0.01026219, + "balance_loss_clip": 1.2151103, + "balance_loss_mlp": 1.01211619, + "epoch": 0.9322110326168646, + "flos": 24978079261080.0, + "grad_norm": 2.6650337174821512, + "language_loss": 0.71317595, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73665988, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.14099121, + "step": 15505, + "time_per_iteration": 2.8708598613739014 + }, + { + "auxiliary_loss_clip": 0.01329395, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.22036409, + "balance_loss_mlp": 1.01753449, + "epoch": 0.9322711558695326, + "flos": 16147961818560.0, + "grad_norm": 1.754598086680568, + "language_loss": 0.75729418, + "learning_rate": 4.788566917763614e-08, + "loss": 0.78090692, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.14355469, + "step": 15506, + "time_per_iteration": 2.903223752975464 + }, + { + "auxiliary_loss_clip": 0.0131921, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.21482003, + "balance_loss_mlp": 1.01377928, + "epoch": 0.9323312791222005, + "flos": 23737705748640.0, + "grad_norm": 1.9035046908190454, + "language_loss": 0.83204365, + "learning_rate": 4.780099275981597e-08, + "loss": 0.8554942, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12060547, + "step": 15507, + "time_per_iteration": 2.871204376220703 + }, + { + "auxiliary_loss_clip": 0.01327258, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.21826875, + "balance_loss_mlp": 1.01643312, + "epoch": 0.9323914023748685, + "flos": 20782914442920.0, + "grad_norm": 1.6144750968239012, + "language_loss": 0.68197012, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70553637, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12921143, + "step": 15508, + "time_per_iteration": 2.8700971603393555 + }, + { + "auxiliary_loss_clip": 0.01319618, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.21456456, + "balance_loss_mlp": 1.0191884, + "epoch": 0.9324515256275364, + "flos": 23920520896080.0, + "grad_norm": 1.5792819281279247, + "language_loss": 0.72800779, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.75152528, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12957764, + "step": 15509, + "time_per_iteration": 2.8788061141967773 + }, + { + "auxiliary_loss_clip": 0.01326114, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.21946836, + "balance_loss_mlp": 1.01686525, + "epoch": 0.9325116488802044, + "flos": 18009882467280.0, + "grad_norm": 1.7333843827703608, + "language_loss": 0.74478161, + "learning_rate": 4.754740768467624e-08, + "loss": 0.7683394, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12799072, + "step": 15510, + "time_per_iteration": 2.755655527114868 + }, + { + "auxiliary_loss_clip": 0.01337084, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.22437418, + "balance_loss_mlp": 1.01679528, + "epoch": 0.9325717721328723, + "flos": 29027406166560.0, + "grad_norm": 2.0804537888565773, + "language_loss": 0.70597565, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72964275, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.128479, + "step": 15511, + "time_per_iteration": 2.87941312789917 + }, + { + "auxiliary_loss_clip": 0.01322016, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.2151531, + "balance_loss_mlp": 1.02147794, + "epoch": 0.9326318953855404, + "flos": 21650510589120.0, + "grad_norm": 1.9831482295790233, + "language_loss": 0.78489363, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80845135, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12298584, + "step": 15512, + "time_per_iteration": 2.772095203399658 + }, + { + "auxiliary_loss_clip": 0.01324968, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.21779513, + "balance_loss_mlp": 1.0154016, + "epoch": 0.9326920186382083, + "flos": 26071071743160.0, + "grad_norm": 1.4697545635191893, + "language_loss": 0.80589044, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.8294276, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.13348389, + "step": 15513, + "time_per_iteration": 2.949467182159424 + }, + { + "auxiliary_loss_clip": 0.01337298, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.22569537, + "balance_loss_mlp": 1.01990843, + "epoch": 0.9327521418908763, + "flos": 12060886119480.0, + "grad_norm": 1.8651205210909703, + "language_loss": 0.80415481, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82786512, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13830566, + "step": 15514, + "time_per_iteration": 2.7906687259674072 + }, + { + "auxiliary_loss_clip": 0.01319862, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.21532071, + "balance_loss_mlp": 1.0220356, + "epoch": 0.9328122651435443, + "flos": 43842075899040.0, + "grad_norm": 1.6712979724260655, + "language_loss": 0.71681011, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.74034941, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12036133, + "step": 15515, + "time_per_iteration": 2.969175100326538 + }, + { + "auxiliary_loss_clip": 0.01338332, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.22659039, + "balance_loss_mlp": 1.01975787, + "epoch": 0.9328723883962122, + "flos": 15199142306400.0, + "grad_norm": 2.223289816452604, + "language_loss": 0.81399757, + "learning_rate": 4.704223662500806e-08, + "loss": 0.83771765, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13909912, + "step": 15516, + "time_per_iteration": 2.738848924636841 + }, + { + "auxiliary_loss_clip": 0.01330056, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.22003949, + "balance_loss_mlp": 1.01861537, + "epoch": 0.9329325116488802, + "flos": 20266004715120.0, + "grad_norm": 1.9727615685145057, + "language_loss": 0.80842161, + "learning_rate": 4.695830062703643e-08, + "loss": 0.83203256, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12426758, + "step": 15517, + "time_per_iteration": 2.743896484375 + }, + { + "auxiliary_loss_clip": 0.01331648, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.22285008, + "balance_loss_mlp": 1.01285219, + "epoch": 0.9329926349015482, + "flos": 13118891176440.0, + "grad_norm": 1.8950862402306932, + "language_loss": 0.74411768, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76770055, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13775635, + "step": 15518, + "time_per_iteration": 2.9453327655792236 + }, + { + "auxiliary_loss_clip": 0.01326234, + "auxiliary_loss_mlp": 0.01030502, + "balance_loss_clip": 1.21914458, + "balance_loss_mlp": 1.01836622, + "epoch": 0.9330527581542162, + "flos": 23045574636720.0, + "grad_norm": 2.057002311804368, + "language_loss": 0.75323135, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77679873, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12127686, + "step": 15519, + "time_per_iteration": 2.8056442737579346 + }, + { + "auxiliary_loss_clip": 0.01321544, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.214872, + "balance_loss_mlp": 1.01997936, + "epoch": 0.9331128814068841, + "flos": 15563838609000.0, + "grad_norm": 1.9970777092816316, + "language_loss": 0.83852732, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.86208475, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.14227295, + "step": 15520, + "time_per_iteration": 2.817718029022217 + }, + { + "auxiliary_loss_clip": 0.01317197, + "auxiliary_loss_mlp": 0.01021737, + "balance_loss_clip": 1.21155465, + "balance_loss_mlp": 1.00997114, + "epoch": 0.9331730046595521, + "flos": 22276687078440.0, + "grad_norm": 1.54717789773521, + "language_loss": 0.76426387, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78765321, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.11779785, + "step": 15521, + "time_per_iteration": 2.777573585510254 + }, + { + "auxiliary_loss_clip": 0.01329994, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.22264433, + "balance_loss_mlp": 1.01895893, + "epoch": 0.93323312791222, + "flos": 15781763006640.0, + "grad_norm": 2.1837268504326746, + "language_loss": 0.77840233, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.80201483, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12280273, + "step": 15522, + "time_per_iteration": 2.7356629371643066 + }, + { + "auxiliary_loss_clip": 0.01324944, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.21706462, + "balance_loss_mlp": 1.01888537, + "epoch": 0.933293251164888, + "flos": 22015166283360.0, + "grad_norm": 1.8713459722061208, + "language_loss": 0.63141811, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.65498561, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12921143, + "step": 15523, + "time_per_iteration": 2.7890090942382812 + }, + { + "auxiliary_loss_clip": 0.01326251, + "auxiliary_loss_mlp": 0.01032801, + "balance_loss_clip": 1.22038388, + "balance_loss_mlp": 1.02002752, + "epoch": 0.933353374417556, + "flos": 26036977701960.0, + "grad_norm": 1.585347390566192, + "language_loss": 0.68568873, + "learning_rate": 4.63728224861577e-08, + "loss": 0.7092793, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12780762, + "step": 15524, + "time_per_iteration": 2.950989007949829 + }, + { + "auxiliary_loss_clip": 0.01329274, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.22050738, + "balance_loss_mlp": 1.01843488, + "epoch": 0.933413497670224, + "flos": 24905546084160.0, + "grad_norm": 1.5666602791833237, + "language_loss": 0.74086285, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76447296, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13311768, + "step": 15525, + "time_per_iteration": 2.834181547164917 + }, + { + "auxiliary_loss_clip": 0.01321616, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.21664524, + "balance_loss_mlp": 1.01931643, + "epoch": 0.9334736209228919, + "flos": 23693012925480.0, + "grad_norm": 1.6978827126501377, + "language_loss": 0.84203666, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.86556721, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12127686, + "step": 15526, + "time_per_iteration": 2.794343948364258 + }, + { + "auxiliary_loss_clip": 0.01329585, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.22058904, + "balance_loss_mlp": 1.01318645, + "epoch": 0.9335337441755599, + "flos": 15381551370240.0, + "grad_norm": 2.5588586031320744, + "language_loss": 0.6941129, + "learning_rate": 4.61230144456366e-08, + "loss": 0.7176677, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.1270752, + "step": 15527, + "time_per_iteration": 4.165215730667114 + }, + { + "auxiliary_loss_clip": 0.01331182, + "auxiliary_loss_mlp": 0.01027338, + "balance_loss_clip": 1.22054195, + "balance_loss_mlp": 1.01328325, + "epoch": 0.9335938674282279, + "flos": 16110984583800.0, + "grad_norm": 1.9151427774240377, + "language_loss": 0.65612984, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67971504, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14038086, + "step": 15528, + "time_per_iteration": 2.841944456100464 + }, + { + "auxiliary_loss_clip": 0.01330661, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.22076559, + "balance_loss_mlp": 1.01958084, + "epoch": 0.9336539906808958, + "flos": 18957564945360.0, + "grad_norm": 2.020290673759804, + "language_loss": 0.74788308, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77151507, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12963867, + "step": 15529, + "time_per_iteration": 2.808367967605591 + }, + { + "auxiliary_loss_clip": 0.01326261, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.22002006, + "balance_loss_mlp": 1.01566792, + "epoch": 0.9337141139335638, + "flos": 18113057974800.0, + "grad_norm": 1.6679397910055578, + "language_loss": 0.63296717, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65650761, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12115479, + "step": 15530, + "time_per_iteration": 5.762489557266235 + }, + { + "auxiliary_loss_clip": 0.01317439, + "auxiliary_loss_mlp": 0.01028143, + "balance_loss_clip": 1.21258581, + "balance_loss_mlp": 1.01581645, + "epoch": 0.9337742371862318, + "flos": 17349936803640.0, + "grad_norm": 1.6427110993109248, + "language_loss": 0.72588468, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74934047, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12316895, + "step": 15531, + "time_per_iteration": 2.8652753829956055 + }, + { + "auxiliary_loss_clip": 0.01321856, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.21588016, + "balance_loss_mlp": 1.01911235, + "epoch": 0.9338343604388998, + "flos": 29064911310000.0, + "grad_norm": 2.6010170594883286, + "language_loss": 0.71249223, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.73603523, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13336182, + "step": 15532, + "time_per_iteration": 2.8387339115142822 + }, + { + "auxiliary_loss_clip": 0.01332432, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.22271657, + "balance_loss_mlp": 1.02065063, + "epoch": 0.9338944836915677, + "flos": 18665523644760.0, + "grad_norm": 1.5426375444480696, + "language_loss": 0.7339195, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.7575835, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13317871, + "step": 15533, + "time_per_iteration": 2.795895576477051 + }, + { + "auxiliary_loss_clip": 0.01321801, + "auxiliary_loss_mlp": 0.01028173, + "balance_loss_clip": 1.21584582, + "balance_loss_mlp": 1.01541209, + "epoch": 0.9339546069442357, + "flos": 16622087316120.0, + "grad_norm": 2.7483000356245024, + "language_loss": 0.80208707, + "learning_rate": 4.554272235700507e-08, + "loss": 0.8255868, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12756348, + "step": 15534, + "time_per_iteration": 2.77750563621521 + }, + { + "auxiliary_loss_clip": 0.01311705, + "auxiliary_loss_mlp": 0.01024624, + "balance_loss_clip": 1.21190262, + "balance_loss_mlp": 1.01347208, + "epoch": 0.9340147301969036, + "flos": 23698007753760.0, + "grad_norm": 1.8986549434581714, + "language_loss": 0.74871933, + "learning_rate": 4.546011991495513e-08, + "loss": 0.77208257, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.11151123, + "step": 15535, + "time_per_iteration": 2.7899839878082275 + }, + { + "auxiliary_loss_clip": 0.01331211, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.22231054, + "balance_loss_mlp": 1.01632524, + "epoch": 0.9340748534495716, + "flos": 28659907887120.0, + "grad_norm": 2.27966379908365, + "language_loss": 0.78356981, + "learning_rate": 4.537759158925292e-08, + "loss": 0.80717444, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12939453, + "step": 15536, + "time_per_iteration": 4.3448402881622314 + }, + { + "auxiliary_loss_clip": 0.01324244, + "auxiliary_loss_mlp": 0.01022143, + "balance_loss_clip": 1.2174952, + "balance_loss_mlp": 1.00988865, + "epoch": 0.9341349767022396, + "flos": 24905018175480.0, + "grad_norm": 1.3877687226874722, + "language_loss": 0.80654109, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.83000493, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12255859, + "step": 15537, + "time_per_iteration": 2.8155148029327393 + }, + { + "auxiliary_loss_clip": 0.01330675, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.22212207, + "balance_loss_mlp": 1.0189451, + "epoch": 0.9341950999549076, + "flos": 29065520435400.0, + "grad_norm": 1.903855806223254, + "language_loss": 0.78193879, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.80555576, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12078857, + "step": 15538, + "time_per_iteration": 2.8271610736846924 + }, + { + "auxiliary_loss_clip": 0.013239, + "auxiliary_loss_mlp": 0.01028673, + "balance_loss_clip": 1.21770203, + "balance_loss_mlp": 1.0159353, + "epoch": 0.9342552232075755, + "flos": 23592395744640.0, + "grad_norm": 1.4449228779901553, + "language_loss": 0.73445666, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75798237, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12738037, + "step": 15539, + "time_per_iteration": 2.854701280593872 + }, + { + "auxiliary_loss_clip": 0.01317672, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.2109822, + "balance_loss_mlp": 1.01833129, + "epoch": 0.9343153464602435, + "flos": 36728366059440.0, + "grad_norm": 1.5328635330439293, + "language_loss": 0.65054721, + "learning_rate": 4.504821951247373e-08, + "loss": 0.67402577, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11865234, + "step": 15540, + "time_per_iteration": 2.8729970455169678 + }, + { + "auxiliary_loss_clip": 0.01323989, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.21699321, + "balance_loss_mlp": 1.01721096, + "epoch": 0.9343754697129115, + "flos": 22241577828240.0, + "grad_norm": 1.6428498221021164, + "language_loss": 0.76669836, + "learning_rate": 4.496606181539864e-08, + "loss": 0.79023659, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1262207, + "step": 15541, + "time_per_iteration": 2.826035976409912 + }, + { + "auxiliary_loss_clip": 0.01321906, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.21586919, + "balance_loss_mlp": 1.01896036, + "epoch": 0.9344355929655794, + "flos": 29715638875920.0, + "grad_norm": 1.8782978597450335, + "language_loss": 0.66966456, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.693205, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.13183594, + "step": 15542, + "time_per_iteration": 2.922048330307007 + }, + { + "auxiliary_loss_clip": 0.01322628, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.21614194, + "balance_loss_mlp": 1.01378107, + "epoch": 0.9344957162182475, + "flos": 18885112985160.0, + "grad_norm": 1.6333835039543294, + "language_loss": 0.69582248, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71931463, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.1282959, + "step": 15543, + "time_per_iteration": 2.8257977962493896 + }, + { + "auxiliary_loss_clip": 0.01335046, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.22378635, + "balance_loss_mlp": 1.01948619, + "epoch": 0.9345558394709154, + "flos": 27423554602320.0, + "grad_norm": 1.6123919881505566, + "language_loss": 0.69521284, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71889234, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 1.11279297, + "router_z_loss_mlp": 0.13421631, + "step": 15544, + "time_per_iteration": 2.9040117263793945 + }, + { + "auxiliary_loss_clip": 0.01329712, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.22154737, + "balance_loss_mlp": 1.02252305, + "epoch": 0.9346159627235834, + "flos": 20746262075040.0, + "grad_norm": 1.6496706119384403, + "language_loss": 0.7717979, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79544616, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.12591553, + "step": 15545, + "time_per_iteration": 2.843731641769409 + }, + { + "auxiliary_loss_clip": 0.01335107, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.22532928, + "balance_loss_mlp": 1.0154537, + "epoch": 0.9346760859762513, + "flos": 21074184184680.0, + "grad_norm": 1.5471040538661203, + "language_loss": 0.69204193, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71566886, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12139893, + "step": 15546, + "time_per_iteration": 2.819538116455078 + }, + { + "auxiliary_loss_clip": 0.01321642, + "auxiliary_loss_mlp": 0.01025893, + "balance_loss_clip": 1.21750939, + "balance_loss_mlp": 1.01424074, + "epoch": 0.9347362092289193, + "flos": 29210871047760.0, + "grad_norm": 1.6706339234291916, + "language_loss": 0.82456529, + "learning_rate": 4.447467257852966e-08, + "loss": 0.8480407, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.11657715, + "step": 15547, + "time_per_iteration": 2.8046438694000244 + }, + { + "auxiliary_loss_clip": 0.01317877, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.213624, + "balance_loss_mlp": 1.01563501, + "epoch": 0.9347963324815872, + "flos": 19432096526520.0, + "grad_norm": 1.7716611061655232, + "language_loss": 0.83710253, + "learning_rate": 4.439303389230087e-08, + "loss": 0.86055255, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.11486816, + "step": 15548, + "time_per_iteration": 2.790501594543457 + }, + { + "auxiliary_loss_clip": 0.01333052, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.22087216, + "balance_loss_mlp": 1.01881289, + "epoch": 0.9348564557342552, + "flos": 36908582271840.0, + "grad_norm": 1.7382353182176293, + "language_loss": 0.65617394, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67983079, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13830566, + "step": 15549, + "time_per_iteration": 2.9290668964385986 + }, + { + "auxiliary_loss_clip": 0.01325901, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.2183162, + "balance_loss_mlp": 1.0172708, + "epoch": 0.9349165789869232, + "flos": 21695244020640.0, + "grad_norm": 1.712370056895534, + "language_loss": 0.80231678, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.825881, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13250732, + "step": 15550, + "time_per_iteration": 2.839200973510742 + }, + { + "auxiliary_loss_clip": 0.01319914, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.21427345, + "balance_loss_mlp": 1.01946914, + "epoch": 0.9349767022395912, + "flos": 18848785484160.0, + "grad_norm": 1.5717617894313949, + "language_loss": 0.75949126, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.78300846, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12353516, + "step": 15551, + "time_per_iteration": 2.911789894104004 + }, + { + "auxiliary_loss_clip": 0.01315076, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.21175134, + "balance_loss_mlp": 1.01889563, + "epoch": 0.9350368254922591, + "flos": 24978932036640.0, + "grad_norm": 1.6360409075512568, + "language_loss": 0.73812401, + "learning_rate": 4.406722074642255e-08, + "loss": 0.76157761, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.1138916, + "step": 15552, + "time_per_iteration": 2.7838897705078125 + }, + { + "auxiliary_loss_clip": 0.01325095, + "auxiliary_loss_mlp": 0.01026456, + "balance_loss_clip": 1.21802616, + "balance_loss_mlp": 1.0143981, + "epoch": 0.9350969487449271, + "flos": 23075120541600.0, + "grad_norm": 2.443972371162703, + "language_loss": 0.77238405, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79589963, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12054443, + "step": 15553, + "time_per_iteration": 2.880791187286377 + }, + { + "auxiliary_loss_clip": 0.01331858, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.22197473, + "balance_loss_mlp": 1.02210975, + "epoch": 0.9351570719975951, + "flos": 18629886485880.0, + "grad_norm": 1.8965563781993995, + "language_loss": 0.78924847, + "learning_rate": 4.390475917613723e-08, + "loss": 0.81291711, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12908936, + "step": 15554, + "time_per_iteration": 2.8455252647399902 + }, + { + "auxiliary_loss_clip": 0.01312836, + "auxiliary_loss_mlp": 0.01022525, + "balance_loss_clip": 1.21070051, + "balance_loss_mlp": 1.01153421, + "epoch": 0.935217195250263, + "flos": 15892654102560.0, + "grad_norm": 1.473052562986234, + "language_loss": 0.69420379, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71755743, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.10986328, + "step": 15555, + "time_per_iteration": 2.806502342224121 + }, + { + "auxiliary_loss_clip": 0.01321368, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.2155714, + "balance_loss_mlp": 1.01849985, + "epoch": 0.935277318502931, + "flos": 24395824036080.0, + "grad_norm": 1.4566777312902934, + "language_loss": 0.75718391, + "learning_rate": 4.374259430715965e-08, + "loss": 0.78071308, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13049316, + "step": 15556, + "time_per_iteration": 2.830448865890503 + }, + { + "auxiliary_loss_clip": 0.01321363, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.21509981, + "balance_loss_mlp": 1.01628268, + "epoch": 0.935337441755599, + "flos": 27606085491240.0, + "grad_norm": 1.4762073714337118, + "language_loss": 0.72692037, + "learning_rate": 4.366162314334953e-08, + "loss": 0.75041527, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.11846924, + "step": 15557, + "time_per_iteration": 2.7922441959381104 + }, + { + "auxiliary_loss_clip": 0.0132288, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.21520782, + "balance_loss_mlp": 1.01833773, + "epoch": 0.935397565008267, + "flos": 20487421431720.0, + "grad_norm": 1.4668779882398573, + "language_loss": 0.63205612, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65559864, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13037109, + "step": 15558, + "time_per_iteration": 2.7561936378479004 + }, + { + "auxiliary_loss_clip": 0.01325931, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.21794808, + "balance_loss_mlp": 1.01958609, + "epoch": 0.9354576882609349, + "flos": 23659000101000.0, + "grad_norm": 2.128976491394298, + "language_loss": 0.73498726, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75858355, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.14099121, + "step": 15559, + "time_per_iteration": 2.7523791790008545 + }, + { + "auxiliary_loss_clip": 0.01322079, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.21765459, + "balance_loss_mlp": 1.01667559, + "epoch": 0.9355178115136029, + "flos": 36691876125000.0, + "grad_norm": 1.6761756069791416, + "language_loss": 0.63874507, + "learning_rate": 4.341915477147062e-08, + "loss": 0.66224277, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.11016846, + "step": 15560, + "time_per_iteration": 2.921964406967163 + }, + { + "auxiliary_loss_clip": 0.0134389, + "auxiliary_loss_mlp": 0.01036749, + "balance_loss_clip": 1.22803736, + "balance_loss_mlp": 1.0217886, + "epoch": 0.9355779347662708, + "flos": 14463861489000.0, + "grad_norm": 5.666719766781145, + "language_loss": 0.64488077, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66868711, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.1496582, + "step": 15561, + "time_per_iteration": 2.6935455799102783 + }, + { + "auxiliary_loss_clip": 0.01323129, + "auxiliary_loss_mlp": 0.01034643, + "balance_loss_clip": 1.21804976, + "balance_loss_mlp": 1.02191734, + "epoch": 0.9356380580189388, + "flos": 23191290724320.0, + "grad_norm": 1.7625772135272453, + "language_loss": 0.75860095, + "learning_rate": 4.325788015381859e-08, + "loss": 0.78217864, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12719727, + "step": 15562, + "time_per_iteration": 2.9521069526672363 + }, + { + "auxiliary_loss_clip": 0.01142808, + "auxiliary_loss_mlp": 0.01002902, + "balance_loss_clip": 1.10011125, + "balance_loss_mlp": 1.0000174, + "epoch": 0.9356981812716068, + "flos": 67486433363280.0, + "grad_norm": 0.9632382012746586, + "language_loss": 0.62481606, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64627314, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02880859, + "step": 15563, + "time_per_iteration": 3.204050064086914 + }, + { + "auxiliary_loss_clip": 0.01317598, + "auxiliary_loss_mlp": 0.0103087, + "balance_loss_clip": 1.21332335, + "balance_loss_mlp": 1.01844835, + "epoch": 0.9357583045242748, + "flos": 24687093777840.0, + "grad_norm": 1.578404166059434, + "language_loss": 0.78193808, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80542278, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12432861, + "step": 15564, + "time_per_iteration": 2.866800308227539 + }, + { + "auxiliary_loss_clip": 0.01333913, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.22301042, + "balance_loss_mlp": 1.0184927, + "epoch": 0.9358184277769427, + "flos": 19468464635880.0, + "grad_norm": 1.6614555376200468, + "language_loss": 0.78125578, + "learning_rate": 4.301652473389694e-08, + "loss": 0.804914, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13415527, + "step": 15565, + "time_per_iteration": 4.254369020462036 + }, + { + "auxiliary_loss_clip": 0.0132353, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.2185328, + "balance_loss_mlp": 1.01712608, + "epoch": 0.9358785510296107, + "flos": 18921724744680.0, + "grad_norm": 2.36815157165822, + "language_loss": 0.72848111, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.75200564, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11798096, + "step": 15566, + "time_per_iteration": 2.8141682147979736 + }, + { + "auxiliary_loss_clip": 0.0132697, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.21775866, + "balance_loss_mlp": 1.01525831, + "epoch": 0.9359386742822787, + "flos": 23446598440320.0, + "grad_norm": 2.242509633398663, + "language_loss": 0.67801839, + "learning_rate": 4.285599216057889e-08, + "loss": 0.70156753, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12701416, + "step": 15567, + "time_per_iteration": 2.863478422164917 + }, + { + "auxiliary_loss_clip": 0.01325265, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.21865225, + "balance_loss_mlp": 1.02004719, + "epoch": 0.9359987975349466, + "flos": 32750475905160.0, + "grad_norm": 1.9340202961617445, + "language_loss": 0.62164122, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64521718, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.1227417, + "step": 15568, + "time_per_iteration": 5.819858074188232 + }, + { + "auxiliary_loss_clip": 0.01324643, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.21872771, + "balance_loss_mlp": 1.01945853, + "epoch": 0.9360589207876147, + "flos": 22824726437160.0, + "grad_norm": 1.4921762371309586, + "language_loss": 0.7888788, + "learning_rate": 4.269575644764556e-08, + "loss": 0.8124457, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12591553, + "step": 15569, + "time_per_iteration": 2.8947653770446777 + }, + { + "auxiliary_loss_clip": 0.01325742, + "auxiliary_loss_mlp": 0.01028664, + "balance_loss_clip": 1.21741009, + "balance_loss_mlp": 1.01544392, + "epoch": 0.9361190440402826, + "flos": 20889988353000.0, + "grad_norm": 2.137235437325806, + "language_loss": 0.69999611, + "learning_rate": 4.261574992142014e-08, + "loss": 0.72354019, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13232422, + "step": 15570, + "time_per_iteration": 2.8319454193115234 + }, + { + "auxiliary_loss_clip": 0.01322943, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.214957, + "balance_loss_mlp": 1.01583076, + "epoch": 0.9361791672929506, + "flos": 19322545506480.0, + "grad_norm": 1.6742168519082723, + "language_loss": 0.79627919, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.8197937, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12683105, + "step": 15571, + "time_per_iteration": 2.772644281387329 + }, + { + "auxiliary_loss_clip": 0.01327969, + "auxiliary_loss_mlp": 0.01032039, + "balance_loss_clip": 1.22011566, + "balance_loss_mlp": 1.01904535, + "epoch": 0.9362392905456185, + "flos": 15162327505080.0, + "grad_norm": 1.7929010283229265, + "language_loss": 0.77652341, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.80012345, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12988281, + "step": 15572, + "time_per_iteration": 2.764214277267456 + }, + { + "auxiliary_loss_clip": 0.01316697, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.21353161, + "balance_loss_mlp": 1.02084017, + "epoch": 0.9362994137982865, + "flos": 22090339003680.0, + "grad_norm": 1.6542080637980454, + "language_loss": 0.78367782, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80717468, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.12158203, + "step": 15573, + "time_per_iteration": 2.7768030166625977 + }, + { + "auxiliary_loss_clip": 0.01316585, + "auxiliary_loss_mlp": 0.01028526, + "balance_loss_clip": 1.21302152, + "balance_loss_mlp": 1.01590192, + "epoch": 0.9363595370509544, + "flos": 23517344849400.0, + "grad_norm": 1.526425777537633, + "language_loss": 0.74930108, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.77275217, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.1262207, + "step": 15574, + "time_per_iteration": 4.284398794174194 + }, + { + "auxiliary_loss_clip": 0.01319356, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.21446252, + "balance_loss_mlp": 1.01786697, + "epoch": 0.9364196603036224, + "flos": 27128914366680.0, + "grad_norm": 1.8033297276941502, + "language_loss": 0.68703884, + "learning_rate": 4.221683071397564e-08, + "loss": 0.71054065, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12939453, + "step": 15575, + "time_per_iteration": 2.812739610671997 + }, + { + "auxiliary_loss_clip": 0.01317278, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.21321702, + "balance_loss_mlp": 1.01958525, + "epoch": 0.9364797835562904, + "flos": 18483723706320.0, + "grad_norm": 1.8003631051761506, + "language_loss": 0.65551841, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67900997, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.12304688, + "step": 15576, + "time_per_iteration": 2.790527820587158 + }, + { + "auxiliary_loss_clip": 0.01321888, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.21323562, + "balance_loss_mlp": 1.01341605, + "epoch": 0.9365399068089584, + "flos": 13009421373120.0, + "grad_norm": 2.7990641407624577, + "language_loss": 0.76741397, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.79090017, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13323975, + "step": 15577, + "time_per_iteration": 2.813431739807129 + }, + { + "auxiliary_loss_clip": 0.01327216, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.21858311, + "balance_loss_mlp": 1.01713347, + "epoch": 0.9366000300616263, + "flos": 25671997140840.0, + "grad_norm": 1.764878317196202, + "language_loss": 0.529827, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.55340004, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1295166, + "step": 15578, + "time_per_iteration": 2.8444199562072754 + }, + { + "auxiliary_loss_clip": 0.0131612, + "auxiliary_loss_mlp": 0.01034247, + "balance_loss_clip": 1.21191883, + "balance_loss_mlp": 1.0217303, + "epoch": 0.9366601533142943, + "flos": 21438271361880.0, + "grad_norm": 1.840027006450749, + "language_loss": 0.70952141, + "learning_rate": 4.189903163783692e-08, + "loss": 0.73302513, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12524414, + "step": 15579, + "time_per_iteration": 2.8259196281433105 + }, + { + "auxiliary_loss_clip": 0.01323316, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.2171762, + "balance_loss_mlp": 1.01551795, + "epoch": 0.9367202765669622, + "flos": 24098016348360.0, + "grad_norm": 1.9189080078696459, + "language_loss": 0.76346958, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78697896, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12103271, + "step": 15580, + "time_per_iteration": 2.7622575759887695 + }, + { + "auxiliary_loss_clip": 0.013327, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.22212529, + "balance_loss_mlp": 1.0210197, + "epoch": 0.9367803998196302, + "flos": 20894049189000.0, + "grad_norm": 2.0968693760819073, + "language_loss": 0.66878295, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.69245833, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13818359, + "step": 15581, + "time_per_iteration": 2.796851634979248 + }, + { + "auxiliary_loss_clip": 0.0132285, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.21644974, + "balance_loss_mlp": 1.0154506, + "epoch": 0.9368405230722983, + "flos": 22569418721160.0, + "grad_norm": 2.2344585329556437, + "language_loss": 0.7681601, + "learning_rate": 4.166146195972042e-08, + "loss": 0.79167414, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.13110352, + "step": 15582, + "time_per_iteration": 2.8352980613708496 + }, + { + "auxiliary_loss_clip": 0.01322458, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.21674812, + "balance_loss_mlp": 1.01783979, + "epoch": 0.9369006463249662, + "flos": 18884909943360.0, + "grad_norm": 2.813614628496953, + "language_loss": 0.74220598, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.76573867, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1295166, + "step": 15583, + "time_per_iteration": 2.842092275619507 + }, + { + "auxiliary_loss_clip": 0.01337186, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.22522426, + "balance_loss_mlp": 1.02374506, + "epoch": 0.9369607695776342, + "flos": 26438163939000.0, + "grad_norm": 2.1294562966595323, + "language_loss": 0.84777862, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.8715229, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.1350708, + "step": 15584, + "time_per_iteration": 2.8429081439971924 + }, + { + "auxiliary_loss_clip": 0.01339475, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.22708249, + "balance_loss_mlp": 1.02253091, + "epoch": 0.9370208928303021, + "flos": 39574702770840.0, + "grad_norm": 1.583418937173506, + "language_loss": 0.72467899, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.7484374, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 1.12353516, + "router_z_loss_mlp": 0.1385498, + "step": 15585, + "time_per_iteration": 3.0187106132507324 + }, + { + "auxiliary_loss_clip": 0.01318984, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.2149564, + "balance_loss_mlp": 1.01571488, + "epoch": 0.9370810160829701, + "flos": 22967884198080.0, + "grad_norm": 1.7239229943470655, + "language_loss": 0.80546832, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82893473, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.11950684, + "step": 15586, + "time_per_iteration": 2.8280041217803955 + }, + { + "auxiliary_loss_clip": 0.01320631, + "auxiliary_loss_mlp": 0.01030835, + "balance_loss_clip": 1.21446359, + "balance_loss_mlp": 1.01838338, + "epoch": 0.937141139335638, + "flos": 23080155978240.0, + "grad_norm": 1.5564404000060463, + "language_loss": 0.76315522, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78666985, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12451172, + "step": 15587, + "time_per_iteration": 2.7964916229248047 + }, + { + "auxiliary_loss_clip": 0.0133091, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.22057748, + "balance_loss_mlp": 1.01809621, + "epoch": 0.937201262588306, + "flos": 16360241654160.0, + "grad_norm": 1.8377997052085973, + "language_loss": 0.88115191, + "learning_rate": 4.118832771491387e-08, + "loss": 0.90477288, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13092041, + "step": 15588, + "time_per_iteration": 2.8495492935180664 + }, + { + "auxiliary_loss_clip": 0.01319699, + "auxiliary_loss_mlp": 0.0102886, + "balance_loss_clip": 1.21670699, + "balance_loss_mlp": 1.01699829, + "epoch": 0.937261385840974, + "flos": 20198994275160.0, + "grad_norm": 1.7009100441953477, + "language_loss": 0.78092784, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80441344, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.11853027, + "step": 15589, + "time_per_iteration": 3.024589776992798 + }, + { + "auxiliary_loss_clip": 0.01318364, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.21395707, + "balance_loss_mlp": 1.02063894, + "epoch": 0.937321509093642, + "flos": 18300502475280.0, + "grad_norm": 1.7623356058089512, + "language_loss": 0.77754855, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80106485, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.1262207, + "step": 15590, + "time_per_iteration": 2.840284824371338 + }, + { + "auxiliary_loss_clip": 0.01335217, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.22429216, + "balance_loss_mlp": 1.02328968, + "epoch": 0.9373816323463099, + "flos": 25890165188640.0, + "grad_norm": 1.5494196749163396, + "language_loss": 0.71199465, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73571956, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13977051, + "step": 15591, + "time_per_iteration": 2.848630666732788 + }, + { + "auxiliary_loss_clip": 0.01337678, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.22564745, + "balance_loss_mlp": 1.01497364, + "epoch": 0.9374417555989779, + "flos": 27204614995680.0, + "grad_norm": 1.8966175593861545, + "language_loss": 0.53729594, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.56096351, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14111328, + "step": 15592, + "time_per_iteration": 2.807157278060913 + }, + { + "auxiliary_loss_clip": 0.01322883, + "auxiliary_loss_mlp": 0.01025664, + "balance_loss_clip": 1.21635962, + "balance_loss_mlp": 1.01409459, + "epoch": 0.9375018788516458, + "flos": 23626286744040.0, + "grad_norm": 1.452194583697205, + "language_loss": 0.67432833, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69781381, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11572266, + "step": 15593, + "time_per_iteration": 2.848076343536377 + }, + { + "auxiliary_loss_clip": 0.01324326, + "auxiliary_loss_mlp": 0.01028976, + "balance_loss_clip": 1.21645415, + "balance_loss_mlp": 1.0161612, + "epoch": 0.9375620021043138, + "flos": 22684492478160.0, + "grad_norm": 1.5083426614191855, + "language_loss": 0.74420202, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.767735, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12799072, + "step": 15594, + "time_per_iteration": 2.844956874847412 + }, + { + "auxiliary_loss_clip": 0.01320791, + "auxiliary_loss_mlp": 0.01025888, + "balance_loss_clip": 1.21624136, + "balance_loss_mlp": 1.01403248, + "epoch": 0.9376221253569819, + "flos": 27565615937520.0, + "grad_norm": 1.5702539780198188, + "language_loss": 0.73790765, + "learning_rate": 4.063971747165351e-08, + "loss": 0.76137441, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.11859131, + "step": 15595, + "time_per_iteration": 2.8701670169830322 + }, + { + "auxiliary_loss_clip": 0.01327159, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.2187463, + "balance_loss_mlp": 1.01491785, + "epoch": 0.9376822486096498, + "flos": 24134709324600.0, + "grad_norm": 1.7228424892782956, + "language_loss": 0.76112437, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78467071, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12548828, + "step": 15596, + "time_per_iteration": 2.839226245880127 + }, + { + "auxiliary_loss_clip": 0.01319928, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.21255374, + "balance_loss_mlp": 1.01620436, + "epoch": 0.9377423718623178, + "flos": 22789211103360.0, + "grad_norm": 3.0902243542490493, + "language_loss": 0.78774148, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.81122196, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.11920166, + "step": 15597, + "time_per_iteration": 2.7961394786834717 + }, + { + "auxiliary_loss_clip": 0.01332105, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.22106242, + "balance_loss_mlp": 1.01889443, + "epoch": 0.9378024951149857, + "flos": 19173215274840.0, + "grad_norm": 1.518246106009256, + "language_loss": 0.8109653, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83460569, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.1305542, + "step": 15598, + "time_per_iteration": 2.745372772216797 + }, + { + "auxiliary_loss_clip": 0.01340944, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.22716546, + "balance_loss_mlp": 1.01743186, + "epoch": 0.9378626183676537, + "flos": 23509710477720.0, + "grad_norm": 5.608930637637119, + "language_loss": 0.63124168, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65495777, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.13238525, + "step": 15599, + "time_per_iteration": 2.7723309993743896 + }, + { + "auxiliary_loss_clip": 0.01328442, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.21927381, + "balance_loss_mlp": 1.01742792, + "epoch": 0.9379227416203216, + "flos": 18410256537120.0, + "grad_norm": 1.7102061768230212, + "language_loss": 0.73501241, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75859892, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12786865, + "step": 15600, + "time_per_iteration": 2.763221263885498 + }, + { + "auxiliary_loss_clip": 0.01321573, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.21621752, + "balance_loss_mlp": 1.01960802, + "epoch": 0.9379828648729897, + "flos": 17826376977720.0, + "grad_norm": 1.8833446428468337, + "language_loss": 0.69788629, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.72141737, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11932373, + "step": 15601, + "time_per_iteration": 2.8396379947662354 + }, + { + "auxiliary_loss_clip": 0.01142827, + "auxiliary_loss_mlp": 0.01004294, + "balance_loss_clip": 1.10017896, + "balance_loss_mlp": 1.00186253, + "epoch": 0.9380429881256576, + "flos": 68039995458960.0, + "grad_norm": 1.6534163554346768, + "language_loss": 0.58157337, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60304463, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02429199, + "step": 15602, + "time_per_iteration": 3.506732225418091 + }, + { + "auxiliary_loss_clip": 0.0132755, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.21895921, + "balance_loss_mlp": 1.02148533, + "epoch": 0.9381031113783256, + "flos": 20781858625560.0, + "grad_norm": 2.01104970024026, + "language_loss": 0.71910715, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74272037, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12310791, + "step": 15603, + "time_per_iteration": 2.813174247741699 + }, + { + "auxiliary_loss_clip": 0.01309612, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.20843232, + "balance_loss_mlp": 1.01769888, + "epoch": 0.9381632346309935, + "flos": 19029651430320.0, + "grad_norm": 1.6663486922065216, + "language_loss": 0.75999022, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78337717, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.11376953, + "step": 15604, + "time_per_iteration": 4.205943584442139 + }, + { + "auxiliary_loss_clip": 0.01327222, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.21874642, + "balance_loss_mlp": 1.01638961, + "epoch": 0.9382233578836615, + "flos": 23519131617240.0, + "grad_norm": 2.509178717287152, + "language_loss": 0.6582756, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.68184072, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12896729, + "step": 15605, + "time_per_iteration": 2.912724018096924 + }, + { + "auxiliary_loss_clip": 0.0133619, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.22399092, + "balance_loss_mlp": 1.01903927, + "epoch": 0.9382834811363294, + "flos": 43075421800560.0, + "grad_norm": 6.355738512448531, + "language_loss": 0.67652339, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.70021319, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13757324, + "step": 15606, + "time_per_iteration": 3.086616277694702 + }, + { + "auxiliary_loss_clip": 0.01312247, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.20980334, + "balance_loss_mlp": 1.01532185, + "epoch": 0.9383436043889974, + "flos": 16440693461280.0, + "grad_norm": 2.171126268158619, + "language_loss": 0.7779184, + "learning_rate": 3.970771343058166e-08, + "loss": 0.80130947, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.11529541, + "step": 15607, + "time_per_iteration": 5.853420972824097 + }, + { + "auxiliary_loss_clip": 0.01330995, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.22288895, + "balance_loss_mlp": 1.01798463, + "epoch": 0.9384037276416655, + "flos": 20745490516200.0, + "grad_norm": 1.9139529201053564, + "language_loss": 0.82589823, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84950978, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12182617, + "step": 15608, + "time_per_iteration": 2.860865831375122 + }, + { + "auxiliary_loss_clip": 0.01321851, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.21618414, + "balance_loss_mlp": 1.02030313, + "epoch": 0.9384638508943334, + "flos": 19067400223920.0, + "grad_norm": 2.1603674074257317, + "language_loss": 0.68866855, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.71222246, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.13232422, + "step": 15609, + "time_per_iteration": 2.880753755569458 + }, + { + "auxiliary_loss_clip": 0.01332974, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.22186148, + "balance_loss_mlp": 1.01815689, + "epoch": 0.9385239741470014, + "flos": 23410230330960.0, + "grad_norm": 2.1270678512917187, + "language_loss": 0.75730819, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.78095698, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.13751221, + "step": 15610, + "time_per_iteration": 2.7923474311828613 + }, + { + "auxiliary_loss_clip": 0.0132635, + "auxiliary_loss_mlp": 0.01024669, + "balance_loss_clip": 1.21853805, + "balance_loss_mlp": 1.01271224, + "epoch": 0.9385840973996693, + "flos": 12828799077120.0, + "grad_norm": 1.869473257649862, + "language_loss": 0.75284827, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77635849, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.11962891, + "step": 15611, + "time_per_iteration": 2.774216890335083 + }, + { + "auxiliary_loss_clip": 0.01325426, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.21915412, + "balance_loss_mlp": 1.01674879, + "epoch": 0.9386442206523373, + "flos": 15491224215360.0, + "grad_norm": 2.5979408882012356, + "language_loss": 0.65749317, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68104458, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12963867, + "step": 15612, + "time_per_iteration": 2.846677541732788 + }, + { + "auxiliary_loss_clip": 0.01321339, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.21655536, + "balance_loss_mlp": 1.01811028, + "epoch": 0.9387043439050052, + "flos": 21183816421440.0, + "grad_norm": 1.7976802451147813, + "language_loss": 0.57295954, + "learning_rate": 3.924572515435742e-08, + "loss": 0.5964762, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12219238, + "step": 15613, + "time_per_iteration": 4.334366798400879 + }, + { + "auxiliary_loss_clip": 0.01329905, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.22058082, + "balance_loss_mlp": 1.02022326, + "epoch": 0.9387644671576733, + "flos": 27673461406440.0, + "grad_norm": 1.9036826701869594, + "language_loss": 0.70664227, + "learning_rate": 3.916898732330764e-08, + "loss": 0.73027176, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12823486, + "step": 15614, + "time_per_iteration": 2.8066675662994385 + }, + { + "auxiliary_loss_clip": 0.01332639, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.22172284, + "balance_loss_mlp": 1.01797187, + "epoch": 0.9388245904103412, + "flos": 18840054686760.0, + "grad_norm": 1.8736417807238506, + "language_loss": 0.81081337, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.8344546, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13500977, + "step": 15615, + "time_per_iteration": 2.8339529037475586 + }, + { + "auxiliary_loss_clip": 0.01320523, + "auxiliary_loss_mlp": 0.01025446, + "balance_loss_clip": 1.21456707, + "balance_loss_mlp": 1.01338768, + "epoch": 0.9388847136630092, + "flos": 25489344426840.0, + "grad_norm": 1.5036824363108965, + "language_loss": 0.71967447, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74313414, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12054443, + "step": 15616, + "time_per_iteration": 2.7879538536071777 + }, + { + "auxiliary_loss_clip": 0.01326711, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.22006261, + "balance_loss_mlp": 1.01656723, + "epoch": 0.9389448369156771, + "flos": 18739762372800.0, + "grad_norm": 1.7834890776317287, + "language_loss": 0.66146469, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68502426, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12683105, + "step": 15617, + "time_per_iteration": 2.784181833267212 + }, + { + "auxiliary_loss_clip": 0.01337173, + "auxiliary_loss_mlp": 0.01030812, + "balance_loss_clip": 1.22603297, + "balance_loss_mlp": 1.01688838, + "epoch": 0.9390049601683451, + "flos": 21721825515240.0, + "grad_norm": 2.8373284545772317, + "language_loss": 0.73528296, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75896281, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13928223, + "step": 15618, + "time_per_iteration": 2.7513427734375 + }, + { + "auxiliary_loss_clip": 0.0134165, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.22804689, + "balance_loss_mlp": 1.01763117, + "epoch": 0.939065083421013, + "flos": 19396012675680.0, + "grad_norm": 1.9577886428753604, + "language_loss": 0.70196408, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72569233, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.13531494, + "step": 15619, + "time_per_iteration": 2.8153183460235596 + }, + { + "auxiliary_loss_clip": 0.0133083, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.22241974, + "balance_loss_mlp": 1.01944232, + "epoch": 0.939125206673681, + "flos": 24686971952760.0, + "grad_norm": 1.676037146776955, + "language_loss": 0.77763391, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.80126619, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12945557, + "step": 15620, + "time_per_iteration": 2.8854823112487793 + }, + { + "auxiliary_loss_clip": 0.01318353, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.21349585, + "balance_loss_mlp": 1.01546597, + "epoch": 0.9391853299263491, + "flos": 16330289665680.0, + "grad_norm": 1.9907608784037885, + "language_loss": 0.74264777, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.76610941, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.12335205, + "step": 15621, + "time_per_iteration": 2.8053128719329834 + }, + { + "auxiliary_loss_clip": 0.01335366, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.22395897, + "balance_loss_mlp": 1.01596141, + "epoch": 0.939245453179017, + "flos": 11659496840640.0, + "grad_norm": 1.860451314798016, + "language_loss": 0.66364336, + "learning_rate": 3.855776169545688e-08, + "loss": 0.6872921, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13531494, + "step": 15622, + "time_per_iteration": 2.7993435859680176 + }, + { + "auxiliary_loss_clip": 0.01319779, + "auxiliary_loss_mlp": 0.0102556, + "balance_loss_clip": 1.21465158, + "balance_loss_mlp": 1.01300681, + "epoch": 0.939305576431685, + "flos": 23153907405960.0, + "grad_norm": 1.673916493886277, + "language_loss": 0.71804523, + "learning_rate": 3.848169316300209e-08, + "loss": 0.74149865, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.12561035, + "step": 15623, + "time_per_iteration": 2.8092153072357178 + }, + { + "auxiliary_loss_clip": 0.01331021, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.22306323, + "balance_loss_mlp": 1.01477647, + "epoch": 0.9393656996843529, + "flos": 33293804694120.0, + "grad_norm": 1.9053786375611437, + "language_loss": 0.73048478, + "learning_rate": 3.84056990115178e-08, + "loss": 0.75406963, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12683105, + "step": 15624, + "time_per_iteration": 2.9647436141967773 + }, + { + "auxiliary_loss_clip": 0.01321026, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.21593308, + "balance_loss_mlp": 1.01913047, + "epoch": 0.9394258229370209, + "flos": 21694472461800.0, + "grad_norm": 1.6910838499251717, + "language_loss": 0.89477408, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91829675, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12097168, + "step": 15625, + "time_per_iteration": 2.8326704502105713 + }, + { + "auxiliary_loss_clip": 0.01322765, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.21570611, + "balance_loss_mlp": 1.01812541, + "epoch": 0.9394859461896888, + "flos": 23878954916640.0, + "grad_norm": 1.6026714949565122, + "language_loss": 0.83970046, + "learning_rate": 3.825393386298592e-08, + "loss": 0.86324149, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13214111, + "step": 15626, + "time_per_iteration": 2.884368896484375 + }, + { + "auxiliary_loss_clip": 0.01143891, + "auxiliary_loss_mlp": 0.01001711, + "balance_loss_clip": 1.10111427, + "balance_loss_mlp": 0.99907631, + "epoch": 0.9395460694423569, + "flos": 61580830371120.0, + "grad_norm": 0.8631384902194449, + "language_loss": 0.56141406, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58287007, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02636719, + "step": 15627, + "time_per_iteration": 3.2203664779663086 + }, + { + "auxiliary_loss_clip": 0.01324943, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.21890473, + "balance_loss_mlp": 1.0178287, + "epoch": 0.9396061926950248, + "flos": 21000595190400.0, + "grad_norm": 1.4058725723569359, + "language_loss": 0.70173734, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72528666, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12145996, + "step": 15628, + "time_per_iteration": 2.829658269882202 + }, + { + "auxiliary_loss_clip": 0.0132271, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.21620107, + "balance_loss_mlp": 1.01741076, + "epoch": 0.9396663159476928, + "flos": 27493204585680.0, + "grad_norm": 2.080403656932253, + "language_loss": 0.75563025, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77915514, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12353516, + "step": 15629, + "time_per_iteration": 2.8191211223602295 + }, + { + "auxiliary_loss_clip": 0.01315007, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.21252608, + "balance_loss_mlp": 1.01436949, + "epoch": 0.9397264392003607, + "flos": 19432218351600.0, + "grad_norm": 1.578714650859377, + "language_loss": 0.74042767, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76384199, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 1.02490234, + "router_z_loss_mlp": 0.12060547, + "step": 15630, + "time_per_iteration": 2.7383899688720703 + }, + { + "auxiliary_loss_clip": 0.01318512, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.21451592, + "balance_loss_mlp": 1.0201931, + "epoch": 0.9397865624530287, + "flos": 18009435775320.0, + "grad_norm": 2.0337711614206966, + "language_loss": 0.6934216, + "learning_rate": 3.787582286001845e-08, + "loss": 0.7169292, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.1204834, + "step": 15631, + "time_per_iteration": 2.7170865535736084 + }, + { + "auxiliary_loss_clip": 0.01322535, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.218153, + "balance_loss_mlp": 1.01842928, + "epoch": 0.9398466857056966, + "flos": 22569702979680.0, + "grad_norm": 2.811725313136008, + "language_loss": 0.75168771, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77521807, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.12078857, + "step": 15632, + "time_per_iteration": 2.74619722366333 + }, + { + "auxiliary_loss_clip": 0.01344052, + "auxiliary_loss_mlp": 0.01029377, + "balance_loss_clip": 1.23260844, + "balance_loss_mlp": 1.01527429, + "epoch": 0.9399068089583646, + "flos": 24541134040080.0, + "grad_norm": 1.4692810469416129, + "language_loss": 0.74721754, + "learning_rate": 3.772509926639622e-08, + "loss": 0.77095187, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14111328, + "step": 15633, + "time_per_iteration": 2.9332427978515625 + }, + { + "auxiliary_loss_clip": 0.01329461, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.21992755, + "balance_loss_mlp": 1.02186668, + "epoch": 0.9399669322110327, + "flos": 25635953898360.0, + "grad_norm": 1.8139597555449969, + "language_loss": 0.73101246, + "learning_rate": 3.764984908264823e-08, + "loss": 0.75466645, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.14068604, + "step": 15634, + "time_per_iteration": 2.909874677658081 + }, + { + "auxiliary_loss_clip": 0.01333662, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.22300458, + "balance_loss_mlp": 1.01692176, + "epoch": 0.9400270554637006, + "flos": 17093451445200.0, + "grad_norm": 2.0986155892532428, + "language_loss": 0.69262475, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71626908, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13861084, + "step": 15635, + "time_per_iteration": 2.810483694076538 + }, + { + "auxiliary_loss_clip": 0.01317868, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.21415317, + "balance_loss_mlp": 1.01797032, + "epoch": 0.9400871787163686, + "flos": 22060305798480.0, + "grad_norm": 1.5182882395698383, + "language_loss": 0.74467331, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76815653, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.12475586, + "step": 15636, + "time_per_iteration": 2.8543319702148438 + }, + { + "auxiliary_loss_clip": 0.01328814, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.22126961, + "balance_loss_mlp": 1.01981854, + "epoch": 0.9401473019690365, + "flos": 16987189702320.0, + "grad_norm": 2.110753867526084, + "language_loss": 0.83365828, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.85727471, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13012695, + "step": 15637, + "time_per_iteration": 2.8620526790618896 + }, + { + "auxiliary_loss_clip": 0.01327453, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.21992791, + "balance_loss_mlp": 1.01842117, + "epoch": 0.9402074252217045, + "flos": 19686632683680.0, + "grad_norm": 1.9917559079533318, + "language_loss": 0.69508725, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.718678, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13208008, + "step": 15638, + "time_per_iteration": 2.7686569690704346 + }, + { + "auxiliary_loss_clip": 0.01313604, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.21129906, + "balance_loss_mlp": 1.0212779, + "epoch": 0.9402675484743724, + "flos": 24759748779840.0, + "grad_norm": 1.6174948730972547, + "language_loss": 0.85148865, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87494969, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.11224365, + "step": 15639, + "time_per_iteration": 2.776632070541382 + }, + { + "auxiliary_loss_clip": 0.01326887, + "auxiliary_loss_mlp": 0.01026082, + "balance_loss_clip": 1.21948862, + "balance_loss_mlp": 1.01324964, + "epoch": 0.9403276717270405, + "flos": 25564395322080.0, + "grad_norm": 1.4694846208276733, + "language_loss": 0.78287923, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80640894, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.128479, + "step": 15640, + "time_per_iteration": 2.8568403720855713 + }, + { + "auxiliary_loss_clip": 0.01333474, + "auxiliary_loss_mlp": 0.01025679, + "balance_loss_clip": 1.22306359, + "balance_loss_mlp": 1.01335883, + "epoch": 0.9403877949797084, + "flos": 26695786331520.0, + "grad_norm": 37.16061634106633, + "language_loss": 0.74160337, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76519489, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12304688, + "step": 15641, + "time_per_iteration": 2.887075901031494 + }, + { + "auxiliary_loss_clip": 0.0133723, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.22463977, + "balance_loss_mlp": 1.02132845, + "epoch": 0.9404479182323764, + "flos": 15015718033560.0, + "grad_norm": 1.989355471443445, + "language_loss": 0.82505018, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84878254, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14672852, + "step": 15642, + "time_per_iteration": 2.748216390609741 + }, + { + "auxiliary_loss_clip": 0.01320559, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.21566033, + "balance_loss_mlp": 1.01939607, + "epoch": 0.9405080414850443, + "flos": 24979094470080.0, + "grad_norm": 1.8560974057351867, + "language_loss": 0.69031072, + "learning_rate": 3.697594633355084e-08, + "loss": 0.71383369, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12347412, + "step": 15643, + "time_per_iteration": 4.250083923339844 + }, + { + "auxiliary_loss_clip": 0.01330925, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.22084713, + "balance_loss_mlp": 1.01866829, + "epoch": 0.9405681647377123, + "flos": 20848950282240.0, + "grad_norm": 1.7733204442962642, + "language_loss": 0.77042258, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.79404873, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13018799, + "step": 15644, + "time_per_iteration": 2.7585508823394775 + }, + { + "auxiliary_loss_clip": 0.01322799, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.21814942, + "balance_loss_mlp": 1.02076948, + "epoch": 0.9406282879903802, + "flos": 23810888659320.0, + "grad_norm": 1.4841312516724865, + "language_loss": 0.67469889, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69825524, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.1206665, + "step": 15645, + "time_per_iteration": 2.8116419315338135 + }, + { + "auxiliary_loss_clip": 0.01320115, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.21674132, + "balance_loss_mlp": 1.01564598, + "epoch": 0.9406884112430483, + "flos": 27681014561400.0, + "grad_norm": 1.4463289059571822, + "language_loss": 0.70401478, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72749925, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12695312, + "step": 15646, + "time_per_iteration": 4.451687812805176 + }, + { + "auxiliary_loss_clip": 0.0132385, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.21770191, + "balance_loss_mlp": 1.01673293, + "epoch": 0.9407485344957163, + "flos": 23080155978240.0, + "grad_norm": 1.6169076614911744, + "language_loss": 0.74190605, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76543725, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12548828, + "step": 15647, + "time_per_iteration": 4.4265875816345215 + }, + { + "auxiliary_loss_clip": 0.01141365, + "auxiliary_loss_mlp": 0.00999704, + "balance_loss_clip": 1.09920418, + "balance_loss_mlp": 0.99683136, + "epoch": 0.9408086577483842, + "flos": 71029733581440.0, + "grad_norm": 0.8779632086029427, + "language_loss": 0.63534635, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65675706, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02868652, + "step": 15648, + "time_per_iteration": 3.455333709716797 + }, + { + "auxiliary_loss_clip": 0.01318306, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.21451771, + "balance_loss_mlp": 1.01991451, + "epoch": 0.9408687810010522, + "flos": 23736040805880.0, + "grad_norm": 1.4048383559113813, + "language_loss": 0.66707361, + "learning_rate": 3.653002741939337e-08, + "loss": 0.69057018, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.11431885, + "step": 15649, + "time_per_iteration": 2.895169734954834 + }, + { + "auxiliary_loss_clip": 0.01327382, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.2190547, + "balance_loss_mlp": 1.01862156, + "epoch": 0.9409289042537201, + "flos": 18373929036120.0, + "grad_norm": 1.888576134402067, + "language_loss": 0.77609617, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79968011, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12402344, + "step": 15650, + "time_per_iteration": 2.80324125289917 + }, + { + "auxiliary_loss_clip": 0.01324507, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.2185365, + "balance_loss_mlp": 1.0161829, + "epoch": 0.9409890275063881, + "flos": 23883624878040.0, + "grad_norm": 2.4871283416057093, + "language_loss": 0.74308491, + "learning_rate": 3.638198339114451e-08, + "loss": 0.7666167, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12506104, + "step": 15651, + "time_per_iteration": 2.7825374603271484 + }, + { + "auxiliary_loss_clip": 0.013224, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.21649146, + "balance_loss_mlp": 1.01677394, + "epoch": 0.941049150759056, + "flos": 16549472922480.0, + "grad_norm": 1.997294231795253, + "language_loss": 0.72245622, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74597597, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12805176, + "step": 15652, + "time_per_iteration": 4.2009642124176025 + }, + { + "auxiliary_loss_clip": 0.01342632, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.22904611, + "balance_loss_mlp": 1.02571082, + "epoch": 0.9411092740117241, + "flos": 25123795348680.0, + "grad_norm": 1.8968827911096873, + "language_loss": 0.66488522, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68871534, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.14660645, + "step": 15653, + "time_per_iteration": 2.8128929138183594 + }, + { + "auxiliary_loss_clip": 0.0133093, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.22267437, + "balance_loss_mlp": 1.01899481, + "epoch": 0.941169397264392, + "flos": 21147245270280.0, + "grad_norm": 1.8235225555304624, + "language_loss": 0.77760577, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.80123746, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13250732, + "step": 15654, + "time_per_iteration": 2.722442388534546 + }, + { + "auxiliary_loss_clip": 0.01343336, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.22960413, + "balance_loss_mlp": 1.0186168, + "epoch": 0.94122952051706, + "flos": 38516088588480.0, + "grad_norm": 1.5383171835252973, + "language_loss": 0.7030046, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72675133, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.12731934, + "step": 15655, + "time_per_iteration": 2.957550525665283 + }, + { + "auxiliary_loss_clip": 0.01325529, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.21785808, + "balance_loss_mlp": 1.0144217, + "epoch": 0.9412896437697279, + "flos": 18373807211040.0, + "grad_norm": 1.8852918211062037, + "language_loss": 0.72581661, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74935317, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13696289, + "step": 15656, + "time_per_iteration": 2.748331308364868 + }, + { + "auxiliary_loss_clip": 0.01328423, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.22107828, + "balance_loss_mlp": 1.01726377, + "epoch": 0.9413497670223959, + "flos": 25890490055520.0, + "grad_norm": 5.337508921983008, + "language_loss": 0.78484511, + "learning_rate": 3.593963845018377e-08, + "loss": 0.80842102, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.11914062, + "step": 15657, + "time_per_iteration": 2.811382532119751 + }, + { + "auxiliary_loss_clip": 0.01323141, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.21555424, + "balance_loss_mlp": 1.01380539, + "epoch": 0.9414098902750638, + "flos": 16622737049880.0, + "grad_norm": 1.9232669189176719, + "language_loss": 0.84170288, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86519885, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12646484, + "step": 15658, + "time_per_iteration": 2.722691059112549 + }, + { + "auxiliary_loss_clip": 0.01336664, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.22454095, + "balance_loss_mlp": 1.01851678, + "epoch": 0.9414700135277319, + "flos": 18629724052440.0, + "grad_norm": 2.3859203620679557, + "language_loss": 0.707569, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.7312659, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.14508057, + "step": 15659, + "time_per_iteration": 2.7705276012420654 + }, + { + "auxiliary_loss_clip": 0.01322374, + "auxiliary_loss_mlp": 0.01034236, + "balance_loss_clip": 1.2158494, + "balance_loss_mlp": 1.02223742, + "epoch": 0.9415301367803999, + "flos": 26284935304800.0, + "grad_norm": 1.8268919957146732, + "language_loss": 0.80047256, + "learning_rate": 3.571947138643172e-08, + "loss": 0.82403868, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.12005615, + "step": 15660, + "time_per_iteration": 2.8594870567321777 + }, + { + "auxiliary_loss_clip": 0.01316035, + "auxiliary_loss_mlp": 0.01024562, + "balance_loss_clip": 1.21271873, + "balance_loss_mlp": 1.01233065, + "epoch": 0.9415902600330678, + "flos": 23267031961680.0, + "grad_norm": 1.4947782150377218, + "language_loss": 0.68361032, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70701623, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12231445, + "step": 15661, + "time_per_iteration": 2.8390135765075684 + }, + { + "auxiliary_loss_clip": 0.01322568, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.2142117, + "balance_loss_mlp": 1.01759791, + "epoch": 0.9416503832857358, + "flos": 14723067607560.0, + "grad_norm": 2.8160620426744867, + "language_loss": 0.66492105, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68844908, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12664795, + "step": 15662, + "time_per_iteration": 2.7044262886047363 + }, + { + "auxiliary_loss_clip": 0.01140711, + "auxiliary_loss_mlp": 0.01003326, + "balance_loss_clip": 1.09861696, + "balance_loss_mlp": 1.00061953, + "epoch": 0.9417105065384037, + "flos": 70327247337720.0, + "grad_norm": 0.7833118583902458, + "language_loss": 0.59305179, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61449218, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02709961, + "step": 15663, + "time_per_iteration": 3.3432118892669678 + }, + { + "auxiliary_loss_clip": 0.01337078, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.22518587, + "balance_loss_mlp": 1.01868737, + "epoch": 0.9417706297910717, + "flos": 34064519628600.0, + "grad_norm": 1.8840794310377786, + "language_loss": 0.6695959, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69329417, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.14074707, + "step": 15664, + "time_per_iteration": 2.8816945552825928 + }, + { + "auxiliary_loss_clip": 0.01321776, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.2162447, + "balance_loss_mlp": 1.01399314, + "epoch": 0.9418307530437396, + "flos": 16475924536560.0, + "grad_norm": 1.8370387886268518, + "language_loss": 0.7372781, + "learning_rate": 3.535401603143207e-08, + "loss": 0.76075864, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.1227417, + "step": 15665, + "time_per_iteration": 2.717027425765991 + }, + { + "auxiliary_loss_clip": 0.01318298, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.21415615, + "balance_loss_mlp": 1.01883864, + "epoch": 0.9418908762964077, + "flos": 11257092352800.0, + "grad_norm": 1.941065121276219, + "language_loss": 0.63711309, + "learning_rate": 3.528114844807773e-08, + "loss": 0.66060799, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12365723, + "step": 15666, + "time_per_iteration": 2.754596471786499 + }, + { + "auxiliary_loss_clip": 0.0132591, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.2185359, + "balance_loss_mlp": 1.01870799, + "epoch": 0.9419509995490756, + "flos": 18443010502440.0, + "grad_norm": 1.8267772114186722, + "language_loss": 0.79025412, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81383252, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.13220215, + "step": 15667, + "time_per_iteration": 2.7536933422088623 + }, + { + "auxiliary_loss_clip": 0.0132249, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.21745801, + "balance_loss_mlp": 1.01709569, + "epoch": 0.9420111228017436, + "flos": 20742485497560.0, + "grad_norm": 1.6417205769829648, + "language_loss": 0.75676841, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.78027958, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.11529541, + "step": 15668, + "time_per_iteration": 2.718134880065918 + }, + { + "auxiliary_loss_clip": 0.01327239, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.21766114, + "balance_loss_mlp": 1.01974082, + "epoch": 0.9420712460544115, + "flos": 21146879795040.0, + "grad_norm": 2.816608270598485, + "language_loss": 0.59042275, + "learning_rate": 3.506299272306723e-08, + "loss": 0.61402154, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12896729, + "step": 15669, + "time_per_iteration": 2.7051281929016113 + }, + { + "auxiliary_loss_clip": 0.01317814, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.21499169, + "balance_loss_mlp": 1.01593423, + "epoch": 0.9421313693070795, + "flos": 15856042343040.0, + "grad_norm": 1.514949070449945, + "language_loss": 0.77322674, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.7966851, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.12084961, + "step": 15670, + "time_per_iteration": 2.751938581466675 + }, + { + "auxiliary_loss_clip": 0.01323817, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.21717978, + "balance_loss_mlp": 1.01982236, + "epoch": 0.9421914925597474, + "flos": 32422310145360.0, + "grad_norm": 1.8403532735681167, + "language_loss": 0.65044647, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67402595, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.14324951, + "step": 15671, + "time_per_iteration": 2.904020309448242 + }, + { + "auxiliary_loss_clip": 0.01324808, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.21819043, + "balance_loss_mlp": 1.02213359, + "epoch": 0.9422516158124155, + "flos": 19723203834840.0, + "grad_norm": 1.4745686921219976, + "language_loss": 0.79352158, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81711376, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.12268066, + "step": 15672, + "time_per_iteration": 2.7878785133361816 + }, + { + "auxiliary_loss_clip": 0.01335321, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.22314179, + "balance_loss_mlp": 1.01831746, + "epoch": 0.9423117390650835, + "flos": 16257634663680.0, + "grad_norm": 1.9059488929617432, + "language_loss": 0.73617107, + "learning_rate": 3.47731615843776e-08, + "loss": 0.7598412, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13372803, + "step": 15673, + "time_per_iteration": 2.7140891551971436 + }, + { + "auxiliary_loss_clip": 0.0132837, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.22118235, + "balance_loss_mlp": 1.01572633, + "epoch": 0.9423718623177514, + "flos": 31802996468880.0, + "grad_norm": 1.4563103411343568, + "language_loss": 0.70080137, + "learning_rate": 3.470089009683974e-08, + "loss": 0.7243765, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13415527, + "step": 15674, + "time_per_iteration": 2.8025758266448975 + }, + { + "auxiliary_loss_clip": 0.01326936, + "auxiliary_loss_mlp": 0.01021742, + "balance_loss_clip": 1.22022069, + "balance_loss_mlp": 1.00948715, + "epoch": 0.9424319855704194, + "flos": 23337128637000.0, + "grad_norm": 1.7888069163025868, + "language_loss": 0.8129527, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83643949, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12261963, + "step": 15675, + "time_per_iteration": 2.7864902019500732 + }, + { + "auxiliary_loss_clip": 0.0132345, + "auxiliary_loss_mlp": 0.01030018, + "balance_loss_clip": 1.21733689, + "balance_loss_mlp": 1.01721501, + "epoch": 0.9424921088230873, + "flos": 20782589576040.0, + "grad_norm": 2.4491552310770786, + "language_loss": 0.62694067, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.65047538, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12823486, + "step": 15676, + "time_per_iteration": 2.7099430561065674 + }, + { + "auxiliary_loss_clip": 0.01324928, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.21744215, + "balance_loss_mlp": 1.02144909, + "epoch": 0.9425522320757553, + "flos": 19031438198160.0, + "grad_norm": 1.704488818796953, + "language_loss": 0.67973441, + "learning_rate": 3.448452279120984e-08, + "loss": 0.70332646, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12835693, + "step": 15677, + "time_per_iteration": 2.7799901962280273 + }, + { + "auxiliary_loss_clip": 0.01328476, + "auxiliary_loss_mlp": 0.01033253, + "balance_loss_clip": 1.21899104, + "balance_loss_mlp": 1.01965737, + "epoch": 0.9426123553284232, + "flos": 25160813191800.0, + "grad_norm": 2.9314013913262693, + "language_loss": 0.64411628, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66773355, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13592529, + "step": 15678, + "time_per_iteration": 2.9971511363983154 + }, + { + "auxiliary_loss_clip": 0.01324326, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.21807086, + "balance_loss_mlp": 1.01394463, + "epoch": 0.9426724785810913, + "flos": 21184425546840.0, + "grad_norm": 1.4218767785749344, + "language_loss": 0.74823844, + "learning_rate": 3.434065057895097e-08, + "loss": 0.77174902, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12799072, + "step": 15679, + "time_per_iteration": 2.848564863204956 + }, + { + "auxiliary_loss_clip": 0.0133705, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.22704649, + "balance_loss_mlp": 1.02110326, + "epoch": 0.9427326018337592, + "flos": 14761019442960.0, + "grad_norm": 4.4628858738348764, + "language_loss": 0.77306056, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79677397, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.13201904, + "step": 15680, + "time_per_iteration": 2.782557487487793 + }, + { + "auxiliary_loss_clip": 0.01317206, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.21167827, + "balance_loss_mlp": 1.01575375, + "epoch": 0.9427927250864272, + "flos": 20928630530520.0, + "grad_norm": 1.7470141957828818, + "language_loss": 0.75359488, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77704477, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12042236, + "step": 15681, + "time_per_iteration": 2.7923896312713623 + }, + { + "auxiliary_loss_clip": 0.01329433, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.22067881, + "balance_loss_mlp": 1.02343428, + "epoch": 0.9428528483390951, + "flos": 19756891792440.0, + "grad_norm": 4.169331780492892, + "language_loss": 0.66317284, + "learning_rate": 3.412540130236086e-08, + "loss": 0.6868341, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13244629, + "step": 15682, + "time_per_iteration": 4.254220485687256 + }, + { + "auxiliary_loss_clip": 0.01325713, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.21875334, + "balance_loss_mlp": 1.01665163, + "epoch": 0.9429129715917631, + "flos": 24540362481240.0, + "grad_norm": 1.7418667004463484, + "language_loss": 0.77015841, + "learning_rate": 3.405380063219665e-08, + "loss": 0.79370898, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12683105, + "step": 15683, + "time_per_iteration": 2.8600196838378906 + }, + { + "auxiliary_loss_clip": 0.01331911, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.22227132, + "balance_loss_mlp": 1.01793158, + "epoch": 0.942973094844431, + "flos": 17963321659560.0, + "grad_norm": 3.6602183467564693, + "language_loss": 0.76081204, + "learning_rate": 3.398227451090885e-08, + "loss": 0.78444433, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1338501, + "step": 15684, + "time_per_iteration": 2.8116366863250732 + }, + { + "auxiliary_loss_clip": 0.01319055, + "auxiliary_loss_mlp": 0.01023791, + "balance_loss_clip": 1.2143501, + "balance_loss_mlp": 1.01233482, + "epoch": 0.9430332180970991, + "flos": 26142914577960.0, + "grad_norm": 1.5396551528952758, + "language_loss": 0.77463353, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79806197, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.11462402, + "step": 15685, + "time_per_iteration": 4.363309621810913 + }, + { + "auxiliary_loss_clip": 0.0132342, + "auxiliary_loss_mlp": 0.01025539, + "balance_loss_clip": 1.21903443, + "balance_loss_mlp": 1.01347458, + "epoch": 0.943093341349767, + "flos": 23956685963640.0, + "grad_norm": 1.7290113257064785, + "language_loss": 0.75816774, + "learning_rate": 3.383944592581023e-08, + "loss": 0.78165728, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12054443, + "step": 15686, + "time_per_iteration": 4.28046989440918 + }, + { + "auxiliary_loss_clip": 0.01333594, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.224123, + "balance_loss_mlp": 1.01547432, + "epoch": 0.943153464602435, + "flos": 17973027057600.0, + "grad_norm": 1.6839004277703282, + "language_loss": 0.80551219, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82913452, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13165283, + "step": 15687, + "time_per_iteration": 2.732461452484131 + }, + { + "auxiliary_loss_clip": 0.01331736, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.22072983, + "balance_loss_mlp": 1.01654005, + "epoch": 0.943213587855103, + "flos": 14505589901880.0, + "grad_norm": 2.826952208874954, + "language_loss": 0.75967461, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78329831, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.14086914, + "step": 15688, + "time_per_iteration": 2.730027914047241 + }, + { + "auxiliary_loss_clip": 0.0131442, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.21194029, + "balance_loss_mlp": 1.01432681, + "epoch": 0.9432737111077709, + "flos": 28992378133080.0, + "grad_norm": 1.8401502677755002, + "language_loss": 0.68684918, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.7102654, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.12884521, + "step": 15689, + "time_per_iteration": 2.8089852333068848 + }, + { + "auxiliary_loss_clip": 0.0131909, + "auxiliary_loss_mlp": 0.01030797, + "balance_loss_clip": 1.21440887, + "balance_loss_mlp": 1.02019978, + "epoch": 0.9433338343604389, + "flos": 21613127270760.0, + "grad_norm": 1.8975561513307815, + "language_loss": 0.80365103, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82714993, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.10595703, + "step": 15690, + "time_per_iteration": 4.229931831359863 + }, + { + "auxiliary_loss_clip": 0.01322909, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.21688426, + "balance_loss_mlp": 1.01556408, + "epoch": 0.9433939576131068, + "flos": 33188801810400.0, + "grad_norm": 1.8034477421491741, + "language_loss": 0.60785222, + "learning_rate": 3.348367925792317e-08, + "loss": 0.6313647, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.12780762, + "step": 15691, + "time_per_iteration": 2.8508949279785156 + }, + { + "auxiliary_loss_clip": 0.0132993, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.22088933, + "balance_loss_mlp": 1.01970863, + "epoch": 0.9434540808657749, + "flos": 20491401051000.0, + "grad_norm": 1.4294960823678717, + "language_loss": 0.66590357, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68952835, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1282959, + "step": 15692, + "time_per_iteration": 2.908088207244873 + }, + { + "auxiliary_loss_clip": 0.01327479, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.21961546, + "balance_loss_mlp": 1.01675034, + "epoch": 0.9435142041184428, + "flos": 21547903598640.0, + "grad_norm": 2.0988676719559614, + "language_loss": 0.75421005, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77777815, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12567139, + "step": 15693, + "time_per_iteration": 2.854217529296875 + }, + { + "auxiliary_loss_clip": 0.01324262, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.21769631, + "balance_loss_mlp": 1.0180105, + "epoch": 0.9435743273711108, + "flos": 25014487978800.0, + "grad_norm": 1.685195364113527, + "language_loss": 0.73301375, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75656128, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12469482, + "step": 15694, + "time_per_iteration": 2.8502891063690186 + }, + { + "auxiliary_loss_clip": 0.0114212, + "auxiliary_loss_mlp": 0.01002542, + "balance_loss_clip": 1.09945369, + "balance_loss_mlp": 1.00006223, + "epoch": 0.9436344506237787, + "flos": 60175265739480.0, + "grad_norm": 0.6954771834720003, + "language_loss": 0.50678307, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52822971, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02478027, + "step": 15695, + "time_per_iteration": 3.3699870109558105 + }, + { + "auxiliary_loss_clip": 0.01318516, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.21605444, + "balance_loss_mlp": 1.02044618, + "epoch": 0.9436945738764467, + "flos": 22242430603800.0, + "grad_norm": 1.5585374475364004, + "language_loss": 0.65036696, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67387199, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.11553955, + "step": 15696, + "time_per_iteration": 2.867872953414917 + }, + { + "auxiliary_loss_clip": 0.01325424, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.21826255, + "balance_loss_mlp": 1.01532412, + "epoch": 0.9437546971291146, + "flos": 25050531221280.0, + "grad_norm": 1.676416802751418, + "language_loss": 0.66401541, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68754232, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11938477, + "step": 15697, + "time_per_iteration": 2.934115171432495 + }, + { + "auxiliary_loss_clip": 0.01140983, + "auxiliary_loss_mlp": 0.01001971, + "balance_loss_clip": 1.09849083, + "balance_loss_mlp": 0.99933642, + "epoch": 0.9438148203817827, + "flos": 56805440745960.0, + "grad_norm": 0.8437920244236822, + "language_loss": 0.6332339, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65466344, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 15698, + "time_per_iteration": 3.1794512271881104 + }, + { + "auxiliary_loss_clip": 0.01332803, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.22156942, + "balance_loss_mlp": 1.02502751, + "epoch": 0.9438749436344506, + "flos": 22351534931880.0, + "grad_norm": 1.7942718930529595, + "language_loss": 0.69990945, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72362483, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.137146, + "step": 15699, + "time_per_iteration": 2.8358395099639893 + }, + { + "auxiliary_loss_clip": 0.01318452, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.21435726, + "balance_loss_mlp": 1.01682043, + "epoch": 0.9439350668871186, + "flos": 13374767409480.0, + "grad_norm": 1.9847024418377899, + "language_loss": 0.74647063, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76994348, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12023926, + "step": 15700, + "time_per_iteration": 2.817640542984009 + }, + { + "auxiliary_loss_clip": 0.01323432, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.21841049, + "balance_loss_mlp": 1.0188278, + "epoch": 0.9439951901397866, + "flos": 17789846434920.0, + "grad_norm": 1.4722804848887565, + "language_loss": 0.70633787, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72988087, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12036133, + "step": 15701, + "time_per_iteration": 2.836822032928467 + }, + { + "auxiliary_loss_clip": 0.01340214, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.22629642, + "balance_loss_mlp": 1.01569045, + "epoch": 0.9440553133924545, + "flos": 18884178992880.0, + "grad_norm": 1.829407547359765, + "language_loss": 0.78173375, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80542374, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.13104248, + "step": 15702, + "time_per_iteration": 2.777754068374634 + }, + { + "auxiliary_loss_clip": 0.01334164, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.22404337, + "balance_loss_mlp": 1.01408494, + "epoch": 0.9441154366451225, + "flos": 19577690789040.0, + "grad_norm": 1.8008623695589487, + "language_loss": 0.66537416, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68897808, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12133789, + "step": 15703, + "time_per_iteration": 2.8197717666625977 + }, + { + "auxiliary_loss_clip": 0.01330014, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.22160399, + "balance_loss_mlp": 1.01384807, + "epoch": 0.9441755598977905, + "flos": 30300493035960.0, + "grad_norm": 3.142315318508543, + "language_loss": 0.73478103, + "learning_rate": 3.256741150552833e-08, + "loss": 0.758349, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1293335, + "step": 15704, + "time_per_iteration": 2.9256227016448975 + }, + { + "auxiliary_loss_clip": 0.01319374, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.21455109, + "balance_loss_mlp": 1.01879072, + "epoch": 0.9442356831504585, + "flos": 20672794905840.0, + "grad_norm": 1.6901146208979592, + "language_loss": 0.74241924, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76593268, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.13171387, + "step": 15705, + "time_per_iteration": 2.7911508083343506 + }, + { + "auxiliary_loss_clip": 0.01321153, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.21543407, + "balance_loss_mlp": 1.01748395, + "epoch": 0.9442958064031264, + "flos": 16111593709200.0, + "grad_norm": 1.680293505926316, + "language_loss": 0.77570879, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79921639, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12139893, + "step": 15706, + "time_per_iteration": 2.7458138465881348 + }, + { + "auxiliary_loss_clip": 0.01308945, + "auxiliary_loss_mlp": 0.0102556, + "balance_loss_clip": 1.20615935, + "balance_loss_mlp": 1.01368642, + "epoch": 0.9443559296557944, + "flos": 20452068531360.0, + "grad_norm": 1.9938431528330212, + "language_loss": 0.69430804, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71765316, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.11871338, + "step": 15707, + "time_per_iteration": 2.827043056488037 + }, + { + "auxiliary_loss_clip": 0.01316237, + "auxiliary_loss_mlp": 0.0102844, + "balance_loss_clip": 1.21419263, + "balance_loss_mlp": 1.01707947, + "epoch": 0.9444160529084623, + "flos": 21615198297120.0, + "grad_norm": 4.015150055792731, + "language_loss": 0.6924789, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71592569, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.11358643, + "step": 15708, + "time_per_iteration": 2.889878034591675 + }, + { + "auxiliary_loss_clip": 0.01321763, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.21692252, + "balance_loss_mlp": 1.01584339, + "epoch": 0.9444761761611303, + "flos": 18446340387960.0, + "grad_norm": 8.547541626351755, + "language_loss": 0.71504802, + "learning_rate": 3.221835774749748e-08, + "loss": 0.73854327, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.11907959, + "step": 15709, + "time_per_iteration": 2.7285826206207275 + }, + { + "auxiliary_loss_clip": 0.01322341, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.21791673, + "balance_loss_mlp": 1.02194488, + "epoch": 0.9445362994137982, + "flos": 20961749971080.0, + "grad_norm": 1.9190535203387022, + "language_loss": 0.84952831, + "learning_rate": 3.214877084074774e-08, + "loss": 0.87309945, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.1282959, + "step": 15710, + "time_per_iteration": 2.759859323501587 + }, + { + "auxiliary_loss_clip": 0.01333409, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.22225952, + "balance_loss_mlp": 1.01950979, + "epoch": 0.9445964226664663, + "flos": 20308179819960.0, + "grad_norm": 1.5851196585244156, + "language_loss": 0.71595526, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73962212, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 1.11181641, + "router_z_loss_mlp": 0.13757324, + "step": 15711, + "time_per_iteration": 2.7338221073150635 + }, + { + "auxiliary_loss_clip": 0.01330291, + "auxiliary_loss_mlp": 0.01030655, + "balance_loss_clip": 1.22310257, + "balance_loss_mlp": 1.01812029, + "epoch": 0.9446565459191342, + "flos": 26401674004560.0, + "grad_norm": 1.5950747320594054, + "language_loss": 0.69687331, + "learning_rate": 3.200982089323179e-08, + "loss": 0.72048277, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12536621, + "step": 15712, + "time_per_iteration": 2.803011178970337 + }, + { + "auxiliary_loss_clip": 0.01336592, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.22617066, + "balance_loss_mlp": 1.0202198, + "epoch": 0.9447166691718022, + "flos": 16549107447240.0, + "grad_norm": 2.12982489850213, + "language_loss": 0.71273881, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73644382, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13696289, + "step": 15713, + "time_per_iteration": 2.731048345565796 + }, + { + "auxiliary_loss_clip": 0.0132121, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.21690381, + "balance_loss_mlp": 1.01665902, + "epoch": 0.9447767924244702, + "flos": 29170076627160.0, + "grad_norm": 1.4908754483422715, + "language_loss": 0.76845467, + "learning_rate": 3.187116945125212e-08, + "loss": 0.79196811, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.13470459, + "step": 15714, + "time_per_iteration": 2.8216590881347656 + }, + { + "auxiliary_loss_clip": 0.01329811, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.21986806, + "balance_loss_mlp": 1.02055478, + "epoch": 0.9448369156771381, + "flos": 19278786675600.0, + "grad_norm": 2.2017336389452553, + "language_loss": 0.67880672, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.70244133, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.1307373, + "step": 15715, + "time_per_iteration": 2.8072476387023926 + }, + { + "auxiliary_loss_clip": 0.01328957, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.22020888, + "balance_loss_mlp": 1.01577425, + "epoch": 0.9448970389298061, + "flos": 23846363384760.0, + "grad_norm": 2.4626895236515893, + "language_loss": 0.75012636, + "learning_rate": 3.173281653583948e-08, + "loss": 0.7737087, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1350708, + "step": 15716, + "time_per_iteration": 2.825007200241089 + }, + { + "auxiliary_loss_clip": 0.0132774, + "auxiliary_loss_mlp": 0.01028457, + "balance_loss_clip": 1.22027397, + "balance_loss_mlp": 1.01552844, + "epoch": 0.944957162182474, + "flos": 22387334524200.0, + "grad_norm": 5.525584510717708, + "language_loss": 0.62469649, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64825845, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1293335, + "step": 15717, + "time_per_iteration": 2.8572933673858643 + }, + { + "auxiliary_loss_clip": 0.01324922, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.21796811, + "balance_loss_mlp": 1.0199666, + "epoch": 0.9450172854351421, + "flos": 17388335331000.0, + "grad_norm": 1.6079379387336479, + "language_loss": 0.79332662, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81689674, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12133789, + "step": 15718, + "time_per_iteration": 2.741058349609375 + }, + { + "auxiliary_loss_clip": 0.01141502, + "auxiliary_loss_mlp": 0.01001958, + "balance_loss_clip": 1.09913886, + "balance_loss_mlp": 0.9995023, + "epoch": 0.94507740868781, + "flos": 68482544633640.0, + "grad_norm": 0.7503997288528, + "language_loss": 0.57918638, + "learning_rate": 3.152584694592719e-08, + "loss": 0.60062099, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02453613, + "step": 15719, + "time_per_iteration": 3.3983874320983887 + }, + { + "auxiliary_loss_clip": 0.01331162, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.22175288, + "balance_loss_mlp": 1.02074409, + "epoch": 0.945137531940478, + "flos": 21147488920440.0, + "grad_norm": 1.5754404029821267, + "language_loss": 0.75980777, + "learning_rate": 3.145700636861193e-08, + "loss": 0.78345746, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.13067627, + "step": 15720, + "time_per_iteration": 4.281872034072876 + }, + { + "auxiliary_loss_clip": 0.01321101, + "auxiliary_loss_mlp": 0.01028289, + "balance_loss_clip": 1.21572626, + "balance_loss_mlp": 1.01656437, + "epoch": 0.9451976551931459, + "flos": 24539144230440.0, + "grad_norm": 1.9293850231561434, + "language_loss": 0.72807127, + "learning_rate": 3.138824043864452e-08, + "loss": 0.75156522, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11730957, + "step": 15721, + "time_per_iteration": 2.855769395828247 + }, + { + "auxiliary_loss_clip": 0.01326534, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.21942329, + "balance_loss_mlp": 1.02130103, + "epoch": 0.9452577784458139, + "flos": 23445826881480.0, + "grad_norm": 1.9585266825659156, + "language_loss": 0.85245669, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87606514, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13012695, + "step": 15722, + "time_per_iteration": 2.9666693210601807 + }, + { + "auxiliary_loss_clip": 0.01144662, + "auxiliary_loss_mlp": 0.01010156, + "balance_loss_clip": 1.10232115, + "balance_loss_mlp": 1.0074259, + "epoch": 0.9453179016984818, + "flos": 52032568839120.0, + "grad_norm": 0.9016126906341769, + "language_loss": 0.64571619, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66726434, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02734375, + "step": 15723, + "time_per_iteration": 3.2172346115112305 + }, + { + "auxiliary_loss_clip": 0.0133222, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.22370958, + "balance_loss_mlp": 1.01931524, + "epoch": 0.9453780249511499, + "flos": 13476034324080.0, + "grad_norm": 3.2937651592618282, + "language_loss": 0.73133314, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75497717, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12860107, + "step": 15724, + "time_per_iteration": 4.3706746101379395 + }, + { + "auxiliary_loss_clip": 0.01325082, + "auxiliary_loss_mlp": 0.01025797, + "balance_loss_clip": 1.21782875, + "balance_loss_mlp": 1.01338768, + "epoch": 0.9454381482038178, + "flos": 23263783292880.0, + "grad_norm": 2.006634078904701, + "language_loss": 0.84335536, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86686414, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12390137, + "step": 15725, + "time_per_iteration": 4.1863744258880615 + }, + { + "auxiliary_loss_clip": 0.01328089, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.22120333, + "balance_loss_mlp": 1.01721203, + "epoch": 0.9454982714564858, + "flos": 19500893734320.0, + "grad_norm": 1.625618102565743, + "language_loss": 0.70957446, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73314893, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.121521, + "step": 15726, + "time_per_iteration": 2.7928953170776367 + }, + { + "auxiliary_loss_clip": 0.01325149, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.2176584, + "balance_loss_mlp": 1.02069986, + "epoch": 0.9455583947091538, + "flos": 23263336600920.0, + "grad_norm": 1.643935624008172, + "language_loss": 0.61365104, + "learning_rate": 3.097721259896735e-08, + "loss": 0.63724512, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13537598, + "step": 15727, + "time_per_iteration": 2.756838798522949 + }, + { + "auxiliary_loss_clip": 0.0131996, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.21596181, + "balance_loss_mlp": 1.02041292, + "epoch": 0.9456185179618217, + "flos": 17677168571160.0, + "grad_norm": 1.849867025645382, + "language_loss": 0.82395434, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.84748209, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12408447, + "step": 15728, + "time_per_iteration": 2.78579044342041 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01005983, + "balance_loss_clip": 1.10145068, + "balance_loss_mlp": 1.00373006, + "epoch": 0.9456786412144897, + "flos": 61429063637880.0, + "grad_norm": 0.7457556615428974, + "language_loss": 0.59170747, + "learning_rate": 3.08408006157368e-08, + "loss": 0.6132049, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02258301, + "step": 15729, + "time_per_iteration": 4.714662551879883 + }, + { + "auxiliary_loss_clip": 0.01318677, + "auxiliary_loss_mlp": 0.0102452, + "balance_loss_clip": 1.21287346, + "balance_loss_mlp": 1.01174676, + "epoch": 0.9457387644671577, + "flos": 18593031076200.0, + "grad_norm": 1.8272729384321973, + "language_loss": 0.76722014, + "learning_rate": 3.077270662890052e-08, + "loss": 0.7906521, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12780762, + "step": 15730, + "time_per_iteration": 2.8228743076324463 + }, + { + "auxiliary_loss_clip": 0.01329155, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.22055686, + "balance_loss_mlp": 1.01852298, + "epoch": 0.9457988877198257, + "flos": 21113882179560.0, + "grad_norm": 1.7786963888747442, + "language_loss": 0.63128853, + "learning_rate": 3.070468731536047e-08, + "loss": 0.65490168, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13647461, + "step": 15731, + "time_per_iteration": 2.785590648651123 + }, + { + "auxiliary_loss_clip": 0.0132334, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.21515095, + "balance_loss_mlp": 1.01767814, + "epoch": 0.9458590109724936, + "flos": 26694121388760.0, + "grad_norm": 1.8250430066926038, + "language_loss": 0.64497483, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66852069, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13580322, + "step": 15732, + "time_per_iteration": 2.8273158073425293 + }, + { + "auxiliary_loss_clip": 0.01334235, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.22239208, + "balance_loss_mlp": 1.01170111, + "epoch": 0.9459191342251616, + "flos": 18666295203600.0, + "grad_norm": 1.944648229264516, + "language_loss": 0.84851754, + "learning_rate": 3.056887271848363e-08, + "loss": 0.87211066, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13360596, + "step": 15733, + "time_per_iteration": 2.7478630542755127 + }, + { + "auxiliary_loss_clip": 0.01318962, + "auxiliary_loss_mlp": 0.01029149, + "balance_loss_clip": 1.21499002, + "balance_loss_mlp": 1.01811004, + "epoch": 0.9459792574778295, + "flos": 23402961434520.0, + "grad_norm": 1.4237652642984182, + "language_loss": 0.72416532, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74764645, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.11035156, + "step": 15734, + "time_per_iteration": 2.8632166385650635 + }, + { + "auxiliary_loss_clip": 0.0131053, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.20840728, + "balance_loss_mlp": 1.01782274, + "epoch": 0.9460393807304975, + "flos": 24398829054720.0, + "grad_norm": 1.400572086212863, + "language_loss": 0.86995536, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89335132, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.11248779, + "step": 15735, + "time_per_iteration": 2.8759384155273438 + }, + { + "auxiliary_loss_clip": 0.01325863, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.21883237, + "balance_loss_mlp": 1.01499903, + "epoch": 0.9460995039831654, + "flos": 21943770140520.0, + "grad_norm": 1.9788832964534702, + "language_loss": 0.67941415, + "learning_rate": 3.036571093728102e-08, + "loss": 0.70294219, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1192627, + "step": 15736, + "time_per_iteration": 2.914759874343872 + }, + { + "auxiliary_loss_clip": 0.01141353, + "auxiliary_loss_mlp": 0.01001182, + "balance_loss_clip": 1.09911442, + "balance_loss_mlp": 0.99858344, + "epoch": 0.9461596272358335, + "flos": 70338292811640.0, + "grad_norm": 0.8745905308281882, + "language_loss": 0.65311021, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67453551, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02600098, + "step": 15737, + "time_per_iteration": 3.2791147232055664 + }, + { + "auxiliary_loss_clip": 0.01141356, + "auxiliary_loss_mlp": 0.01001355, + "balance_loss_clip": 1.09920907, + "balance_loss_mlp": 0.99849397, + "epoch": 0.9462197504885014, + "flos": 58607782284600.0, + "grad_norm": 0.795611561709913, + "language_loss": 0.58903325, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.61046034, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02856445, + "step": 15738, + "time_per_iteration": 3.283719301223755 + }, + { + "auxiliary_loss_clip": 0.01318833, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.21509683, + "balance_loss_mlp": 1.01749325, + "epoch": 0.9462798737411694, + "flos": 23438111293080.0, + "grad_norm": 1.847503701262233, + "language_loss": 0.71351957, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73700035, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.11755371, + "step": 15739, + "time_per_iteration": 2.8710780143737793 + }, + { + "auxiliary_loss_clip": 0.01328802, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.22090185, + "balance_loss_mlp": 1.01715112, + "epoch": 0.9463399969938374, + "flos": 25051546430280.0, + "grad_norm": 2.6983142012177397, + "language_loss": 0.64406919, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66765422, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12536621, + "step": 15740, + "time_per_iteration": 2.8439502716064453 + }, + { + "auxiliary_loss_clip": 0.01320228, + "auxiliary_loss_mlp": 0.01030549, + "balance_loss_clip": 1.21580958, + "balance_loss_mlp": 1.01796079, + "epoch": 0.9464001202465053, + "flos": 24357953417400.0, + "grad_norm": 1.6896754483604446, + "language_loss": 0.66523004, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68873787, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12591553, + "step": 15741, + "time_per_iteration": 2.826843500137329 + }, + { + "auxiliary_loss_clip": 0.01328244, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.22071147, + "balance_loss_mlp": 1.01429892, + "epoch": 0.9464602434991733, + "flos": 17169923633040.0, + "grad_norm": 1.8034005371529647, + "language_loss": 0.7579475, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.78149617, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12316895, + "step": 15742, + "time_per_iteration": 2.8336126804351807 + }, + { + "auxiliary_loss_clip": 0.01317182, + "auxiliary_loss_mlp": 0.01028755, + "balance_loss_clip": 1.2129004, + "balance_loss_mlp": 1.01685786, + "epoch": 0.9465203667518413, + "flos": 19942915000320.0, + "grad_norm": 1.8980283172558707, + "language_loss": 0.73059797, + "learning_rate": 2.989428100602187e-08, + "loss": 0.75405729, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11901855, + "step": 15743, + "time_per_iteration": 2.7826592922210693 + }, + { + "auxiliary_loss_clip": 0.01332081, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.22259772, + "balance_loss_mlp": 1.01575446, + "epoch": 0.9465804900045093, + "flos": 20125121022360.0, + "grad_norm": 2.004844844414881, + "language_loss": 0.79792893, + "learning_rate": 2.982723267901943e-08, + "loss": 0.82154179, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13452148, + "step": 15744, + "time_per_iteration": 2.801302433013916 + }, + { + "auxiliary_loss_clip": 0.01332069, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.22259641, + "balance_loss_mlp": 1.02239263, + "epoch": 0.9466406132571772, + "flos": 23916500668440.0, + "grad_norm": 1.6926488444408387, + "language_loss": 0.78787333, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.81155747, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13952637, + "step": 15745, + "time_per_iteration": 2.7404189109802246 + }, + { + "auxiliary_loss_clip": 0.01329911, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.21993971, + "balance_loss_mlp": 1.01266956, + "epoch": 0.9467007365098452, + "flos": 19937392263360.0, + "grad_norm": 1.472954394620014, + "language_loss": 0.70444202, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72799724, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1295166, + "step": 15746, + "time_per_iteration": 2.7508723735809326 + }, + { + "auxiliary_loss_clip": 0.01325579, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.21846032, + "balance_loss_mlp": 1.01681542, + "epoch": 0.9467608597625131, + "flos": 19314261401040.0, + "grad_norm": 2.0830708076366617, + "language_loss": 0.56802011, + "learning_rate": 2.962653596305964e-08, + "loss": 0.59157896, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13513184, + "step": 15747, + "time_per_iteration": 2.7380659580230713 + }, + { + "auxiliary_loss_clip": 0.01140483, + "auxiliary_loss_mlp": 0.0100196, + "balance_loss_clip": 1.09863794, + "balance_loss_mlp": 0.99957561, + "epoch": 0.9468209830151811, + "flos": 69646121091360.0, + "grad_norm": 0.6584461772083248, + "language_loss": 0.53309321, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55451763, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.02380371, + "step": 15748, + "time_per_iteration": 3.4238839149475098 + }, + { + "auxiliary_loss_clip": 0.01327484, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.22014821, + "balance_loss_mlp": 1.02359843, + "epoch": 0.946881106267849, + "flos": 27022408973640.0, + "grad_norm": 1.598430797990008, + "language_loss": 0.66426343, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68790323, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12884521, + "step": 15749, + "time_per_iteration": 2.8542041778564453 + }, + { + "auxiliary_loss_clip": 0.01328964, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.21955872, + "balance_loss_mlp": 1.01992548, + "epoch": 0.9469412295205171, + "flos": 20194486747200.0, + "grad_norm": 4.692464225558806, + "language_loss": 0.76644164, + "learning_rate": 2.942651169791621e-08, + "loss": 0.7900753, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.14489746, + "step": 15750, + "time_per_iteration": 2.767221689224243 + }, + { + "auxiliary_loss_clip": 0.01321274, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.21510005, + "balance_loss_mlp": 1.01483583, + "epoch": 0.947001352773185, + "flos": 21329938592640.0, + "grad_norm": 1.6724892977482821, + "language_loss": 0.68448371, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70797265, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12786865, + "step": 15751, + "time_per_iteration": 2.7952752113342285 + }, + { + "auxiliary_loss_clip": 0.01327273, + "auxiliary_loss_mlp": 0.01028147, + "balance_loss_clip": 1.21853232, + "balance_loss_mlp": 1.01582074, + "epoch": 0.947061476025853, + "flos": 21949008618960.0, + "grad_norm": 1.6639772764323606, + "language_loss": 0.66097927, + "learning_rate": 2.929353580532723e-08, + "loss": 0.68453348, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12329102, + "step": 15752, + "time_per_iteration": 2.7325148582458496 + }, + { + "auxiliary_loss_clip": 0.01320582, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.21368575, + "balance_loss_mlp": 1.01177597, + "epoch": 0.947121599278521, + "flos": 21399344925840.0, + "grad_norm": 1.7316050977223383, + "language_loss": 0.71810138, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.74155307, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12823486, + "step": 15753, + "time_per_iteration": 2.827460765838623 + }, + { + "auxiliary_loss_clip": 0.013348, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.22333908, + "balance_loss_mlp": 1.02052891, + "epoch": 0.9471817225311889, + "flos": 23080927537080.0, + "grad_norm": 1.9853276244904894, + "language_loss": 0.70071346, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72440827, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14147949, + "step": 15754, + "time_per_iteration": 2.917201042175293 + }, + { + "auxiliary_loss_clip": 0.01336472, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.22494805, + "balance_loss_mlp": 1.01732564, + "epoch": 0.947241845783857, + "flos": 11914439081400.0, + "grad_norm": 2.0513379997473735, + "language_loss": 0.79422784, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.81789458, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.12878418, + "step": 15755, + "time_per_iteration": 2.8226821422576904 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.22688711, + "balance_loss_mlp": 1.02210295, + "epoch": 0.9473019690365249, + "flos": 20745571732920.0, + "grad_norm": 2.367986891319361, + "language_loss": 0.7546277, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77839684, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.14550781, + "step": 15756, + "time_per_iteration": 2.822141408920288 + }, + { + "auxiliary_loss_clip": 0.0132739, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.21892357, + "balance_loss_mlp": 1.01579237, + "epoch": 0.9473620922891929, + "flos": 17644455214200.0, + "grad_norm": 2.0097031166261714, + "language_loss": 0.75051165, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.7740742, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.13079834, + "step": 15757, + "time_per_iteration": 2.794027805328369 + }, + { + "auxiliary_loss_clip": 0.01332569, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.22250164, + "balance_loss_mlp": 1.01617908, + "epoch": 0.9474222155418608, + "flos": 23555256076440.0, + "grad_norm": 1.8824094994370242, + "language_loss": 0.80035496, + "learning_rate": 2.889640171327512e-08, + "loss": 0.82397389, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13153076, + "step": 15758, + "time_per_iteration": 2.83780574798584 + }, + { + "auxiliary_loss_clip": 0.01321175, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.21517813, + "balance_loss_mlp": 1.01574111, + "epoch": 0.9474823387945288, + "flos": 27095551275960.0, + "grad_norm": 1.4181493076130132, + "language_loss": 0.71909297, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74258792, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12579346, + "step": 15759, + "time_per_iteration": 4.319457054138184 + }, + { + "auxiliary_loss_clip": 0.01317153, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.21544158, + "balance_loss_mlp": 1.01708961, + "epoch": 0.9475424620471967, + "flos": 22971741992280.0, + "grad_norm": 1.7849705582263473, + "language_loss": 0.75617725, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77963078, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.11102295, + "step": 15760, + "time_per_iteration": 2.807858943939209 + }, + { + "auxiliary_loss_clip": 0.01325166, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.21867824, + "balance_loss_mlp": 1.01690245, + "epoch": 0.9476025852998647, + "flos": 20052506628720.0, + "grad_norm": 1.7877512165310874, + "language_loss": 0.73137259, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.75491357, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12036133, + "step": 15761, + "time_per_iteration": 2.8323845863342285 + }, + { + "auxiliary_loss_clip": 0.01324367, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.2183497, + "balance_loss_mlp": 1.0199517, + "epoch": 0.9476627085525327, + "flos": 14979552966000.0, + "grad_norm": 2.017830932938675, + "language_loss": 0.7205956, + "learning_rate": 2.863314050734722e-08, + "loss": 0.7441625, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.1237793, + "step": 15762, + "time_per_iteration": 2.7579925060272217 + }, + { + "auxiliary_loss_clip": 0.01338625, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.22586393, + "balance_loss_mlp": 1.02238345, + "epoch": 0.9477228318052007, + "flos": 18702460271160.0, + "grad_norm": 2.011174673917371, + "language_loss": 0.67150319, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69525409, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14086914, + "step": 15763, + "time_per_iteration": 4.3419647216796875 + }, + { + "auxiliary_loss_clip": 0.01326731, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.21899104, + "balance_loss_mlp": 1.02074218, + "epoch": 0.9477829550578686, + "flos": 23879929517280.0, + "grad_norm": 1.5346227056318749, + "language_loss": 0.70154577, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.72514528, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12481689, + "step": 15764, + "time_per_iteration": 2.7992515563964844 + }, + { + "auxiliary_loss_clip": 0.01312121, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.21209264, + "balance_loss_mlp": 1.01813364, + "epoch": 0.9478430783105366, + "flos": 22567753778400.0, + "grad_norm": 1.7155984909927204, + "language_loss": 0.71249712, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73591173, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.11212158, + "step": 15765, + "time_per_iteration": 2.8025217056274414 + }, + { + "auxiliary_loss_clip": 0.0114366, + "auxiliary_loss_mlp": 0.01002844, + "balance_loss_clip": 1.10105085, + "balance_loss_mlp": 1.00034094, + "epoch": 0.9479032015632046, + "flos": 60870547322280.0, + "grad_norm": 0.8110509001787594, + "language_loss": 0.59131247, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61277747, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02502441, + "step": 15766, + "time_per_iteration": 3.1262290477752686 + }, + { + "auxiliary_loss_clip": 0.01323644, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.2160697, + "balance_loss_mlp": 1.02321577, + "epoch": 0.9479633248158725, + "flos": 14687958357360.0, + "grad_norm": 1.7373120313422052, + "language_loss": 0.75131637, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.77490652, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12158203, + "step": 15767, + "time_per_iteration": 4.22212290763855 + }, + { + "auxiliary_loss_clip": 0.01336671, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.22509503, + "balance_loss_mlp": 1.02224302, + "epoch": 0.9480234480685406, + "flos": 20338009983360.0, + "grad_norm": 1.9729298373579944, + "language_loss": 0.73263395, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75635839, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13525391, + "step": 15768, + "time_per_iteration": 2.721454620361328 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01003794, + "balance_loss_clip": 1.10096765, + "balance_loss_mlp": 1.00114775, + "epoch": 0.9480835713212085, + "flos": 70309843332480.0, + "grad_norm": 0.7342538385037428, + "language_loss": 0.55330598, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57477701, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02648926, + "step": 15769, + "time_per_iteration": 3.255002021789551 + }, + { + "auxiliary_loss_clip": 0.01327153, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.2181344, + "balance_loss_mlp": 1.01863563, + "epoch": 0.9481436945738765, + "flos": 25455575252520.0, + "grad_norm": 1.3218854007491456, + "language_loss": 0.77666491, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.80025029, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12762451, + "step": 15770, + "time_per_iteration": 2.774221420288086 + }, + { + "auxiliary_loss_clip": 0.0132655, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.22005832, + "balance_loss_mlp": 1.0194478, + "epoch": 0.9482038178265444, + "flos": 26985634780680.0, + "grad_norm": 2.178487351434901, + "language_loss": 0.79783303, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82143408, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.14111328, + "step": 15771, + "time_per_iteration": 2.8014109134674072 + }, + { + "auxiliary_loss_clip": 0.01321452, + "auxiliary_loss_mlp": 0.01026773, + "balance_loss_clip": 1.21632504, + "balance_loss_mlp": 1.01418447, + "epoch": 0.9482639410792124, + "flos": 17789968260000.0, + "grad_norm": 1.9463811548458476, + "language_loss": 0.69996876, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.72345102, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12585449, + "step": 15772, + "time_per_iteration": 2.8044989109039307 + }, + { + "auxiliary_loss_clip": 0.01324911, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.21855783, + "balance_loss_mlp": 1.01824033, + "epoch": 0.9483240643318803, + "flos": 21001691616120.0, + "grad_norm": 1.5705364044382408, + "language_loss": 0.73952299, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.76308393, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.1295166, + "step": 15773, + "time_per_iteration": 2.815969228744507 + }, + { + "auxiliary_loss_clip": 0.01335168, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.2249918, + "balance_loss_mlp": 1.0180316, + "epoch": 0.9483841875845483, + "flos": 20088021962520.0, + "grad_norm": 2.1284762899777907, + "language_loss": 0.63632828, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.65999407, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.1338501, + "step": 15774, + "time_per_iteration": 2.8261067867279053 + }, + { + "auxiliary_loss_clip": 0.01329619, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.22079015, + "balance_loss_mlp": 1.01684439, + "epoch": 0.9484443108372163, + "flos": 20818551601800.0, + "grad_norm": 1.6734222098328992, + "language_loss": 0.59709865, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.62069887, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13555908, + "step": 15775, + "time_per_iteration": 2.8173415660858154 + }, + { + "auxiliary_loss_clip": 0.01328123, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.2191956, + "balance_loss_mlp": 1.01350594, + "epoch": 0.9485044340898843, + "flos": 36436081108680.0, + "grad_norm": 1.4884985620428937, + "language_loss": 0.61792439, + "learning_rate": 2.772114638584555e-08, + "loss": 0.64147246, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.13165283, + "step": 15776, + "time_per_iteration": 2.9807910919189453 + }, + { + "auxiliary_loss_clip": 0.0132813, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.21848655, + "balance_loss_mlp": 1.01734567, + "epoch": 0.9485645573425522, + "flos": 22607817248520.0, + "grad_norm": 1.8686860168651243, + "language_loss": 0.74248374, + "learning_rate": 2.765656478622458e-08, + "loss": 0.76607734, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13897705, + "step": 15777, + "time_per_iteration": 2.867774724960327 + }, + { + "auxiliary_loss_clip": 0.01347105, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.23095, + "balance_loss_mlp": 1.02236986, + "epoch": 0.9486246805952202, + "flos": 22022841263400.0, + "grad_norm": 2.2292281594037098, + "language_loss": 0.72682381, + "learning_rate": 2.759205797806441e-08, + "loss": 0.75065887, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.14031982, + "step": 15778, + "time_per_iteration": 2.8632171154022217 + }, + { + "auxiliary_loss_clip": 0.01313756, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.2137332, + "balance_loss_mlp": 1.02134132, + "epoch": 0.9486848038478882, + "flos": 16513064204760.0, + "grad_norm": 3.8916838104572333, + "language_loss": 0.70195287, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.72541428, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.11035156, + "step": 15779, + "time_per_iteration": 2.7986154556274414 + }, + { + "auxiliary_loss_clip": 0.01327305, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.21932864, + "balance_loss_mlp": 1.0152204, + "epoch": 0.9487449271005561, + "flos": 19249037728920.0, + "grad_norm": 2.106364715765301, + "language_loss": 0.78392339, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80747843, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12969971, + "step": 15780, + "time_per_iteration": 2.9762837886810303 + }, + { + "auxiliary_loss_clip": 0.01322791, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.2162807, + "balance_loss_mlp": 1.01771975, + "epoch": 0.9488050503532242, + "flos": 21767777197560.0, + "grad_norm": 6.564227180097064, + "language_loss": 0.66418481, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.6877135, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12341309, + "step": 15781, + "time_per_iteration": 2.8450710773468018 + }, + { + "auxiliary_loss_clip": 0.01313281, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.20870852, + "balance_loss_mlp": 1.01729727, + "epoch": 0.9488651736058921, + "flos": 18373482344160.0, + "grad_norm": 2.001202490892584, + "language_loss": 0.79930937, + "learning_rate": 2.733477870890999e-08, + "loss": 0.8227424, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12738037, + "step": 15782, + "time_per_iteration": 2.7195510864257812 + }, + { + "auxiliary_loss_clip": 0.01143836, + "auxiliary_loss_mlp": 0.01014108, + "balance_loss_clip": 1.10174513, + "balance_loss_mlp": 1.0114255, + "epoch": 0.9489252968585601, + "flos": 70102071024840.0, + "grad_norm": 0.7321284013399976, + "language_loss": 0.59844136, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.62002075, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02685547, + "step": 15783, + "time_per_iteration": 3.3546884059906006 + }, + { + "auxiliary_loss_clip": 0.01327642, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.21878743, + "balance_loss_mlp": 1.01885176, + "epoch": 0.948985420111228, + "flos": 27861271382160.0, + "grad_norm": 1.5618536440022996, + "language_loss": 0.73919469, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76279008, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13043213, + "step": 15784, + "time_per_iteration": 2.8362877368927 + }, + { + "auxiliary_loss_clip": 0.01329971, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.2208221, + "balance_loss_mlp": 1.01326251, + "epoch": 0.949045543363896, + "flos": 24321382266240.0, + "grad_norm": 1.8732765008676382, + "language_loss": 0.70336479, + "learning_rate": 2.714260468695806e-08, + "loss": 0.72693241, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13543701, + "step": 15785, + "time_per_iteration": 2.79004168510437 + }, + { + "auxiliary_loss_clip": 0.01334001, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.22414505, + "balance_loss_mlp": 1.01738846, + "epoch": 0.9491056666165639, + "flos": 24246818671320.0, + "grad_norm": 1.4411874401169213, + "language_loss": 0.7638706, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78751165, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12719727, + "step": 15786, + "time_per_iteration": 2.7488033771514893 + }, + { + "auxiliary_loss_clip": 0.01324075, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.21812153, + "balance_loss_mlp": 1.01802361, + "epoch": 0.949165789869232, + "flos": 24536139211800.0, + "grad_norm": 1.6111808561433445, + "language_loss": 0.79300708, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81654477, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.11676025, + "step": 15787, + "time_per_iteration": 2.7487964630126953 + }, + { + "auxiliary_loss_clip": 0.01321406, + "auxiliary_loss_mlp": 0.01029924, + "balance_loss_clip": 1.21757042, + "balance_loss_mlp": 1.0177114, + "epoch": 0.9492259131218999, + "flos": 22240278360720.0, + "grad_norm": 1.5408533183717776, + "language_loss": 0.7663188, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78983212, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.12225342, + "step": 15788, + "time_per_iteration": 2.7907767295837402 + }, + { + "auxiliary_loss_clip": 0.01332384, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.22230816, + "balance_loss_mlp": 1.01945221, + "epoch": 0.9492860363745679, + "flos": 22971498342120.0, + "grad_norm": 1.6960975623015486, + "language_loss": 0.72145963, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.74511504, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 1.10009766, + "router_z_loss_mlp": 0.13696289, + "step": 15789, + "time_per_iteration": 2.7652971744537354 + }, + { + "auxiliary_loss_clip": 0.01322578, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.21611321, + "balance_loss_mlp": 1.01619077, + "epoch": 0.9493461596272358, + "flos": 18375025461840.0, + "grad_norm": 1.748778602713827, + "language_loss": 0.73424017, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75776547, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13757324, + "step": 15790, + "time_per_iteration": 2.8953444957733154 + }, + { + "auxiliary_loss_clip": 0.01334942, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.22489548, + "balance_loss_mlp": 1.0181663, + "epoch": 0.9494062828799038, + "flos": 20016625819680.0, + "grad_norm": 2.184082263399038, + "language_loss": 0.77925569, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.80291456, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12786865, + "step": 15791, + "time_per_iteration": 2.7974212169647217 + }, + { + "auxiliary_loss_clip": 0.01338895, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.22629523, + "balance_loss_mlp": 1.0205276, + "epoch": 0.9494664061325718, + "flos": 27233470558440.0, + "grad_norm": 2.067931056813035, + "language_loss": 0.74068999, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.76441187, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.12774658, + "step": 15792, + "time_per_iteration": 2.906526803970337 + }, + { + "auxiliary_loss_clip": 0.0132421, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.21773553, + "balance_loss_mlp": 1.01911378, + "epoch": 0.9495265293852397, + "flos": 18374497553160.0, + "grad_norm": 2.244984943221752, + "language_loss": 0.78410244, + "learning_rate": 2.663343248754679e-08, + "loss": 0.8076576, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12200928, + "step": 15793, + "time_per_iteration": 2.8595638275146484 + }, + { + "auxiliary_loss_clip": 0.01326468, + "auxiliary_loss_mlp": 0.01025799, + "balance_loss_clip": 1.22002363, + "balance_loss_mlp": 1.01392579, + "epoch": 0.9495866526379078, + "flos": 23080968145440.0, + "grad_norm": 1.5691468998200353, + "language_loss": 0.77489996, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79842257, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11871338, + "step": 15794, + "time_per_iteration": 2.902773141860962 + }, + { + "auxiliary_loss_clip": 0.01332265, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.22166741, + "balance_loss_mlp": 1.01778984, + "epoch": 0.9496467758905757, + "flos": 17534376285480.0, + "grad_norm": 1.8440700996668717, + "language_loss": 0.61082506, + "learning_rate": 2.650688769211107e-08, + "loss": 0.63446456, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13891602, + "step": 15795, + "time_per_iteration": 2.784372329711914 + }, + { + "auxiliary_loss_clip": 0.01318448, + "auxiliary_loss_mlp": 0.0103341, + "balance_loss_clip": 1.21524525, + "balance_loss_mlp": 1.01923048, + "epoch": 0.9497068991432437, + "flos": 24139257460920.0, + "grad_norm": 1.6292911421219634, + "language_loss": 0.79275775, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81627631, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.1418457, + "step": 15796, + "time_per_iteration": 2.8007466793060303 + }, + { + "auxiliary_loss_clip": 0.01332022, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.22375405, + "balance_loss_mlp": 1.01799273, + "epoch": 0.9497670223959116, + "flos": 20308220428320.0, + "grad_norm": 2.0035488719585324, + "language_loss": 0.76188844, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.78552616, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13763428, + "step": 15797, + "time_per_iteration": 4.211870431900024 + }, + { + "auxiliary_loss_clip": 0.01330649, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.22279596, + "balance_loss_mlp": 1.01680875, + "epoch": 0.9498271456485796, + "flos": 13702892560920.0, + "grad_norm": 3.153771980543062, + "language_loss": 0.66423577, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.68784094, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1307373, + "step": 15798, + "time_per_iteration": 2.769789457321167 + }, + { + "auxiliary_loss_clip": 0.01337032, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.22578835, + "balance_loss_mlp": 1.0200212, + "epoch": 0.9498872689012475, + "flos": 20818957685400.0, + "grad_norm": 1.8790070642781593, + "language_loss": 0.77671015, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.80041516, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 1.11181641, + "router_z_loss_mlp": 0.13433838, + "step": 15799, + "time_per_iteration": 2.788530111312866 + }, + { + "auxiliary_loss_clip": 0.01318436, + "auxiliary_loss_mlp": 0.01028433, + "balance_loss_clip": 1.21454978, + "balance_loss_mlp": 1.01606536, + "epoch": 0.9499473921539155, + "flos": 21037653641880.0, + "grad_norm": 1.5991807034992338, + "language_loss": 0.70970392, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73317254, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.12365723, + "step": 15800, + "time_per_iteration": 2.8255703449249268 + }, + { + "auxiliary_loss_clip": 0.01319603, + "auxiliary_loss_mlp": 0.01023299, + "balance_loss_clip": 1.21322012, + "balance_loss_mlp": 1.01111627, + "epoch": 0.9500075154065835, + "flos": 21003965684280.0, + "grad_norm": 1.553664104992981, + "language_loss": 0.72337449, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.74680352, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.12182617, + "step": 15801, + "time_per_iteration": 2.9986891746520996 + }, + { + "auxiliary_loss_clip": 0.01324837, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.21718979, + "balance_loss_mlp": 1.01624465, + "epoch": 0.9500676386592515, + "flos": 25127815576320.0, + "grad_norm": 1.5354115286613697, + "language_loss": 0.80958265, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83311647, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12316895, + "step": 15802, + "time_per_iteration": 4.426571607589722 + }, + { + "auxiliary_loss_clip": 0.01335369, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.22626305, + "balance_loss_mlp": 1.02465165, + "epoch": 0.9501277619119194, + "flos": 27528963569640.0, + "grad_norm": 1.6169673065927759, + "language_loss": 0.68234187, + "learning_rate": 2.60037021038646e-08, + "loss": 0.70607567, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.13342285, + "step": 15803, + "time_per_iteration": 4.240710735321045 + }, + { + "auxiliary_loss_clip": 0.01325605, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.21987569, + "balance_loss_mlp": 1.01583719, + "epoch": 0.9501878851645874, + "flos": 20818917077040.0, + "grad_norm": 1.6105710031559017, + "language_loss": 0.76191354, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78545284, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.12493896, + "step": 15804, + "time_per_iteration": 2.7610676288604736 + }, + { + "auxiliary_loss_clip": 0.01333383, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.22376871, + "balance_loss_mlp": 1.01987219, + "epoch": 0.9502480084172553, + "flos": 18374010252840.0, + "grad_norm": 1.5318792262489451, + "language_loss": 0.73224378, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75590599, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12976074, + "step": 15805, + "time_per_iteration": 2.7563416957855225 + }, + { + "auxiliary_loss_clip": 0.01325211, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.21749568, + "balance_loss_mlp": 1.02411652, + "epoch": 0.9503081316699233, + "flos": 23554809384480.0, + "grad_norm": 1.3719807729034241, + "language_loss": 0.80308765, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82670629, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12542725, + "step": 15806, + "time_per_iteration": 2.8267507553100586 + }, + { + "auxiliary_loss_clip": 0.01327804, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.21815896, + "balance_loss_mlp": 1.02268195, + "epoch": 0.9503682549225914, + "flos": 18044951109120.0, + "grad_norm": 1.9080910307415786, + "language_loss": 0.82501251, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84864545, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.12811279, + "step": 15807, + "time_per_iteration": 4.304141998291016 + }, + { + "auxiliary_loss_clip": 0.01322982, + "auxiliary_loss_mlp": 0.01026261, + "balance_loss_clip": 1.21639323, + "balance_loss_mlp": 1.01383901, + "epoch": 0.9504283781752593, + "flos": 25891830131400.0, + "grad_norm": 1.8472872366326754, + "language_loss": 0.71964604, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.74313843, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12432861, + "step": 15808, + "time_per_iteration": 2.7731196880340576 + }, + { + "auxiliary_loss_clip": 0.01322575, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.21595311, + "balance_loss_mlp": 1.01534176, + "epoch": 0.9504885014279273, + "flos": 22128696922680.0, + "grad_norm": 1.5061468502214745, + "language_loss": 0.69799376, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72150254, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.1295166, + "step": 15809, + "time_per_iteration": 2.7817728519439697 + }, + { + "auxiliary_loss_clip": 0.01321992, + "auxiliary_loss_mlp": 0.01023694, + "balance_loss_clip": 1.21508956, + "balance_loss_mlp": 1.01090908, + "epoch": 0.9505486246805952, + "flos": 21620924075880.0, + "grad_norm": 3.3866986992276513, + "language_loss": 0.75783765, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.78129447, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12805176, + "step": 15810, + "time_per_iteration": 2.9959099292755127 + }, + { + "auxiliary_loss_clip": 0.01323795, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.21613669, + "balance_loss_mlp": 1.02519393, + "epoch": 0.9506087479332632, + "flos": 22533334870320.0, + "grad_norm": 1.3533224432356157, + "language_loss": 0.80509907, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82872033, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13140869, + "step": 15811, + "time_per_iteration": 2.848695755004883 + }, + { + "auxiliary_loss_clip": 0.01330653, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.22308993, + "balance_loss_mlp": 1.01854825, + "epoch": 0.9506688711859311, + "flos": 27533186839080.0, + "grad_norm": 3.775437526506173, + "language_loss": 0.70346719, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72708297, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.1237793, + "step": 15812, + "time_per_iteration": 2.867105484008789 + }, + { + "auxiliary_loss_clip": 0.01332788, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.22323036, + "balance_loss_mlp": 1.01965094, + "epoch": 0.9507289944385992, + "flos": 19870666081920.0, + "grad_norm": 1.6753700184154914, + "language_loss": 0.6573689, + "learning_rate": 2.538145713158446e-08, + "loss": 0.68102503, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13195801, + "step": 15813, + "time_per_iteration": 2.879281759262085 + }, + { + "auxiliary_loss_clip": 0.01325908, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.21758461, + "balance_loss_mlp": 1.01887703, + "epoch": 0.9507891176912671, + "flos": 25198927460640.0, + "grad_norm": 1.4249214822540612, + "language_loss": 0.70894098, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.73251998, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13104248, + "step": 15814, + "time_per_iteration": 2.8398189544677734 + }, + { + "auxiliary_loss_clip": 0.01323531, + "auxiliary_loss_mlp": 0.01022077, + "balance_loss_clip": 1.21856499, + "balance_loss_mlp": 1.01040614, + "epoch": 0.9508492409439351, + "flos": 24904936958760.0, + "grad_norm": 2.080163041880947, + "language_loss": 0.63072062, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65417671, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11669922, + "step": 15815, + "time_per_iteration": 2.7783968448638916 + }, + { + "auxiliary_loss_clip": 0.0132436, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.21759903, + "balance_loss_mlp": 1.01458156, + "epoch": 0.950909364196603, + "flos": 29789512128720.0, + "grad_norm": 1.712971134911088, + "language_loss": 0.58688951, + "learning_rate": 2.519624364862061e-08, + "loss": 0.61039805, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.11907959, + "step": 15816, + "time_per_iteration": 2.878955364227295 + }, + { + "auxiliary_loss_clip": 0.01324126, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.2178905, + "balance_loss_mlp": 1.02323067, + "epoch": 0.950969487449271, + "flos": 24723218237040.0, + "grad_norm": 1.3214978133018356, + "language_loss": 0.73648143, + "learning_rate": 2.513465558735994e-08, + "loss": 0.76008105, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12597656, + "step": 15817, + "time_per_iteration": 2.790367603302002 + }, + { + "auxiliary_loss_clip": 0.01332529, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.22164345, + "balance_loss_mlp": 1.01604021, + "epoch": 0.9510296107019389, + "flos": 13703176819440.0, + "grad_norm": 1.5426356923243283, + "language_loss": 0.60117203, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62480628, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14855957, + "step": 15818, + "time_per_iteration": 2.779085397720337 + }, + { + "auxiliary_loss_clip": 0.0132435, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.21700621, + "balance_loss_mlp": 1.0198524, + "epoch": 0.9510897339546069, + "flos": 17316411279480.0, + "grad_norm": 1.661516901596187, + "language_loss": 0.69537604, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71894968, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13153076, + "step": 15819, + "time_per_iteration": 2.735727071762085 + }, + { + "auxiliary_loss_clip": 0.01331437, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.22237444, + "balance_loss_mlp": 1.01541376, + "epoch": 0.951149857207275, + "flos": 14798118502800.0, + "grad_norm": 1.7186862866985955, + "language_loss": 0.74429423, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76788723, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12457275, + "step": 15820, + "time_per_iteration": 2.763319492340088 + }, + { + "auxiliary_loss_clip": 0.0133662, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.22691965, + "balance_loss_mlp": 1.01962972, + "epoch": 0.9512099804599429, + "flos": 19395931458960.0, + "grad_norm": 1.5878759720025786, + "language_loss": 0.78929412, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.81298888, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13232422, + "step": 15821, + "time_per_iteration": 2.86000657081604 + }, + { + "auxiliary_loss_clip": 0.0132611, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.21925569, + "balance_loss_mlp": 1.01654458, + "epoch": 0.9512701037126109, + "flos": 36764855993880.0, + "grad_norm": 1.5862737503178452, + "language_loss": 0.71044201, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73399961, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13104248, + "step": 15822, + "time_per_iteration": 2.8568902015686035 + }, + { + "auxiliary_loss_clip": 0.01326286, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.21978295, + "balance_loss_mlp": 1.01928544, + "epoch": 0.9513302269652788, + "flos": 22643454407400.0, + "grad_norm": 1.6523602044313508, + "language_loss": 0.66194731, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68552947, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12652588, + "step": 15823, + "time_per_iteration": 2.7490851879119873 + }, + { + "auxiliary_loss_clip": 0.01323429, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.21797895, + "balance_loss_mlp": 1.01698971, + "epoch": 0.9513903502179468, + "flos": 22898396648160.0, + "grad_norm": 1.612536399059309, + "language_loss": 0.76990229, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79342997, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12341309, + "step": 15824, + "time_per_iteration": 2.845015287399292 + }, + { + "auxiliary_loss_clip": 0.0133682, + "auxiliary_loss_mlp": 0.0102756, + "balance_loss_clip": 1.22579467, + "balance_loss_mlp": 1.01391101, + "epoch": 0.9514504734706147, + "flos": 27934494901200.0, + "grad_norm": 1.9228763454132742, + "language_loss": 0.74189699, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.76554078, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13653564, + "step": 15825, + "time_per_iteration": 2.8665313720703125 + }, + { + "auxiliary_loss_clip": 0.01143939, + "auxiliary_loss_mlp": 0.01003095, + "balance_loss_clip": 1.10122108, + "balance_loss_mlp": 1.00068688, + "epoch": 0.9515105967232828, + "flos": 67382161430040.0, + "grad_norm": 0.8362286297086894, + "language_loss": 0.53481686, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55628717, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02404785, + "step": 15826, + "time_per_iteration": 3.222086191177368 + }, + { + "auxiliary_loss_clip": 0.01329129, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.22089827, + "balance_loss_mlp": 1.02125359, + "epoch": 0.9515707199759507, + "flos": 25851807269640.0, + "grad_norm": 1.8112263178067303, + "language_loss": 0.72895807, + "learning_rate": 2.452289414874076e-08, + "loss": 0.75259089, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12890625, + "step": 15827, + "time_per_iteration": 2.784869432449341 + }, + { + "auxiliary_loss_clip": 0.0132558, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.21781611, + "balance_loss_mlp": 1.01704061, + "epoch": 0.9516308432286187, + "flos": 21833203911480.0, + "grad_norm": 1.9230928015766011, + "language_loss": 0.74271286, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.7662679, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12878418, + "step": 15828, + "time_per_iteration": 2.780646562576294 + }, + { + "auxiliary_loss_clip": 0.01326329, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.22080326, + "balance_loss_mlp": 1.02161479, + "epoch": 0.9516909664812866, + "flos": 27275077146240.0, + "grad_norm": 1.6544011712635713, + "language_loss": 0.73494571, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75854266, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.11743164, + "step": 15829, + "time_per_iteration": 2.7812368869781494 + }, + { + "auxiliary_loss_clip": 0.01327391, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.22014832, + "balance_loss_mlp": 1.0160625, + "epoch": 0.9517510897339546, + "flos": 21220346964240.0, + "grad_norm": 2.5273218167248066, + "language_loss": 0.61436617, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63791955, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.11877441, + "step": 15830, + "time_per_iteration": 2.7856009006500244 + }, + { + "auxiliary_loss_clip": 0.01329378, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.21943474, + "balance_loss_mlp": 1.01441884, + "epoch": 0.9518112129866225, + "flos": 18738625338720.0, + "grad_norm": 2.479774362664783, + "language_loss": 0.73602146, + "learning_rate": 2.428028693179729e-08, + "loss": 0.75960064, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.14123535, + "step": 15831, + "time_per_iteration": 2.722235918045044 + }, + { + "auxiliary_loss_clip": 0.01318854, + "auxiliary_loss_mlp": 0.01025374, + "balance_loss_clip": 1.21317267, + "balance_loss_mlp": 1.01292884, + "epoch": 0.9518713362392905, + "flos": 16768047053880.0, + "grad_norm": 1.6629571707714612, + "language_loss": 0.65643561, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67987788, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12438965, + "step": 15832, + "time_per_iteration": 2.8135759830474854 + }, + { + "auxiliary_loss_clip": 0.01320299, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.21726108, + "balance_loss_mlp": 1.0216558, + "epoch": 0.9519314594919586, + "flos": 15234738856920.0, + "grad_norm": 1.7372857437687015, + "language_loss": 0.78134072, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80488473, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.12445068, + "step": 15833, + "time_per_iteration": 2.741344690322876 + }, + { + "auxiliary_loss_clip": 0.01318108, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.21292162, + "balance_loss_mlp": 1.01751208, + "epoch": 0.9519915827446265, + "flos": 19357492323240.0, + "grad_norm": 1.872109839619903, + "language_loss": 0.74819881, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.77167416, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.11920166, + "step": 15834, + "time_per_iteration": 2.760504961013794 + }, + { + "auxiliary_loss_clip": 0.01340603, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.22805929, + "balance_loss_mlp": 1.02273989, + "epoch": 0.9520517059972945, + "flos": 22270514607720.0, + "grad_norm": 2.149888013076929, + "language_loss": 0.76427364, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78804123, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.13415527, + "step": 15835, + "time_per_iteration": 2.754521369934082 + }, + { + "auxiliary_loss_clip": 0.01327135, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.21799183, + "balance_loss_mlp": 1.01492763, + "epoch": 0.9521118292499624, + "flos": 14865900501600.0, + "grad_norm": 1.7959519091793728, + "language_loss": 0.664267, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68781739, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12976074, + "step": 15836, + "time_per_iteration": 4.165642738342285 + }, + { + "auxiliary_loss_clip": 0.01319788, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.21539938, + "balance_loss_mlp": 1.01418591, + "epoch": 0.9521719525026304, + "flos": 23513446446840.0, + "grad_norm": 1.6017417345057812, + "language_loss": 0.7046147, + "learning_rate": 2.391862373676057e-08, + "loss": 0.7280823, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.12786865, + "step": 15837, + "time_per_iteration": 2.8085567951202393 + }, + { + "auxiliary_loss_clip": 0.01325218, + "auxiliary_loss_mlp": 0.01029667, + "balance_loss_clip": 1.21591091, + "balance_loss_mlp": 1.01559424, + "epoch": 0.9522320757552983, + "flos": 19719142998840.0, + "grad_norm": 1.7688248377816398, + "language_loss": 0.73249215, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75604105, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.14080811, + "step": 15838, + "time_per_iteration": 2.782478094100952 + }, + { + "auxiliary_loss_clip": 0.01327491, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.21972513, + "balance_loss_mlp": 1.01946497, + "epoch": 0.9522921990079664, + "flos": 25926817556520.0, + "grad_norm": 1.8212258887641135, + "language_loss": 0.78342265, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80702025, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12811279, + "step": 15839, + "time_per_iteration": 2.8362221717834473 + }, + { + "auxiliary_loss_clip": 0.01331912, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.22245073, + "balance_loss_mlp": 1.02019334, + "epoch": 0.9523523222606343, + "flos": 19212791444640.0, + "grad_norm": 1.4372043050751926, + "language_loss": 0.80684626, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.83049357, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.12609863, + "step": 15840, + "time_per_iteration": 4.213912010192871 + }, + { + "auxiliary_loss_clip": 0.01317513, + "auxiliary_loss_mlp": 0.01026795, + "balance_loss_clip": 1.21401799, + "balance_loss_mlp": 1.01595855, + "epoch": 0.9524124455133023, + "flos": 20925950378760.0, + "grad_norm": 1.9026660553068633, + "language_loss": 0.73485667, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75829983, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.10839844, + "step": 15841, + "time_per_iteration": 4.282743453979492 + }, + { + "auxiliary_loss_clip": 0.01313603, + "auxiliary_loss_mlp": 0.01023763, + "balance_loss_clip": 1.21218324, + "balance_loss_mlp": 1.01221132, + "epoch": 0.9524725687659702, + "flos": 18848216967120.0, + "grad_norm": 2.148962582897335, + "language_loss": 0.79457545, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81794912, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.11553955, + "step": 15842, + "time_per_iteration": 2.690471887588501 + }, + { + "auxiliary_loss_clip": 0.01323962, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.21761644, + "balance_loss_mlp": 1.02228141, + "epoch": 0.9525326920186382, + "flos": 22679863125120.0, + "grad_norm": 1.6656258225851708, + "language_loss": 0.73053384, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.75412637, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.13018799, + "step": 15843, + "time_per_iteration": 2.7803592681884766 + }, + { + "auxiliary_loss_clip": 0.01327575, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.21768141, + "balance_loss_mlp": 1.01682901, + "epoch": 0.9525928152713061, + "flos": 22091191779240.0, + "grad_norm": 1.5651790183420595, + "language_loss": 0.7829138, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80650425, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.14630127, + "step": 15844, + "time_per_iteration": 2.7197213172912598 + }, + { + "auxiliary_loss_clip": 0.01334943, + "auxiliary_loss_mlp": 0.01035393, + "balance_loss_clip": 1.22271419, + "balance_loss_mlp": 1.02073622, + "epoch": 0.9526529385239741, + "flos": 20709690923880.0, + "grad_norm": 1.7610196290350593, + "language_loss": 0.70392323, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72762656, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14672852, + "step": 15845, + "time_per_iteration": 2.806081533432007 + }, + { + "auxiliary_loss_clip": 0.01333154, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.22285521, + "balance_loss_mlp": 1.02049327, + "epoch": 0.9527130617766422, + "flos": 23373334312920.0, + "grad_norm": 1.6114904926042342, + "language_loss": 0.75835395, + "learning_rate": 2.338118708818282e-08, + "loss": 0.78202653, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.1361084, + "step": 15846, + "time_per_iteration": 4.297208786010742 + }, + { + "auxiliary_loss_clip": 0.01325825, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.21799898, + "balance_loss_mlp": 1.01588881, + "epoch": 0.9527731850293101, + "flos": 18990156477240.0, + "grad_norm": 1.6073569908351304, + "language_loss": 0.78661281, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.81015444, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12457275, + "step": 15847, + "time_per_iteration": 2.690290689468384 + }, + { + "auxiliary_loss_clip": 0.01321981, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.2167263, + "balance_loss_mlp": 1.02244687, + "epoch": 0.9528333082819781, + "flos": 19322789156640.0, + "grad_norm": 1.6781289894472289, + "language_loss": 0.78123736, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80480534, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.1237793, + "step": 15848, + "time_per_iteration": 2.7262074947357178 + }, + { + "auxiliary_loss_clip": 0.01333927, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.22257972, + "balance_loss_mlp": 1.02299047, + "epoch": 0.952893431534646, + "flos": 23956645355280.0, + "grad_norm": 1.6041411822021419, + "language_loss": 0.72595811, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74965817, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13098145, + "step": 15849, + "time_per_iteration": 2.7667768001556396 + }, + { + "auxiliary_loss_clip": 0.01335391, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.22347569, + "balance_loss_mlp": 1.01956713, + "epoch": 0.952953554787314, + "flos": 21035217140280.0, + "grad_norm": 1.5240573305291039, + "language_loss": 0.7514416, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77512395, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 1.11865234, + "router_z_loss_mlp": 0.13275146, + "step": 15850, + "time_per_iteration": 2.8061037063598633 + }, + { + "auxiliary_loss_clip": 0.01324579, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.21685696, + "balance_loss_mlp": 1.01879478, + "epoch": 0.9530136780399819, + "flos": 22388024866320.0, + "grad_norm": 2.045848403801259, + "language_loss": 0.72959304, + "learning_rate": 2.308523444215482e-08, + "loss": 0.75314689, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12017822, + "step": 15851, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.01321084, + "auxiliary_loss_mlp": 0.01023898, + "balance_loss_clip": 1.21423674, + "balance_loss_mlp": 1.01142311, + "epoch": 0.95307380129265, + "flos": 22164171648120.0, + "grad_norm": 1.670790846053329, + "language_loss": 0.8006891, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.824139, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12481689, + "step": 15852, + "time_per_iteration": 2.8600401878356934 + }, + { + "auxiliary_loss_clip": 0.01326688, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.21820867, + "balance_loss_mlp": 1.0216608, + "epoch": 0.9531339245453179, + "flos": 44031225950640.0, + "grad_norm": 2.0911819835905567, + "language_loss": 0.60065442, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.62426925, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13146973, + "step": 15853, + "time_per_iteration": 2.999415874481201 + }, + { + "auxiliary_loss_clip": 0.01314788, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.21143293, + "balance_loss_mlp": 1.01834571, + "epoch": 0.9531940477979859, + "flos": 20271568060440.0, + "grad_norm": 1.7502682540708767, + "language_loss": 0.72856271, + "learning_rate": 2.290856241425998e-08, + "loss": 0.75201106, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.11706543, + "step": 15854, + "time_per_iteration": 2.855867862701416 + }, + { + "auxiliary_loss_clip": 0.01327016, + "auxiliary_loss_mlp": 0.01028594, + "balance_loss_clip": 1.21746957, + "balance_loss_mlp": 1.01623178, + "epoch": 0.9532541710506538, + "flos": 25340948187480.0, + "grad_norm": 2.028357348133339, + "language_loss": 0.6808486, + "learning_rate": 2.284982167833127e-08, + "loss": 0.70440471, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12371826, + "step": 15855, + "time_per_iteration": 2.9732296466827393 + }, + { + "auxiliary_loss_clip": 0.01323469, + "auxiliary_loss_mlp": 0.0102874, + "balance_loss_clip": 1.2161231, + "balance_loss_mlp": 1.01649165, + "epoch": 0.9533142943033218, + "flos": 26474978740320.0, + "grad_norm": 1.435695178128837, + "language_loss": 0.7655549, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78907704, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12243652, + "step": 15856, + "time_per_iteration": 2.9056570529937744 + }, + { + "auxiliary_loss_clip": 0.01322799, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.21708906, + "balance_loss_mlp": 1.01645851, + "epoch": 0.9533744175559897, + "flos": 23661883294560.0, + "grad_norm": 1.6681542659713489, + "language_loss": 0.78123182, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80474389, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11938477, + "step": 15857, + "time_per_iteration": 2.8751323223114014 + }, + { + "auxiliary_loss_clip": 0.01139812, + "auxiliary_loss_mlp": 0.01002688, + "balance_loss_clip": 1.09752142, + "balance_loss_mlp": 1.00028002, + "epoch": 0.9534345408086577, + "flos": 61066275928200.0, + "grad_norm": 0.7056844205532211, + "language_loss": 0.62652934, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64795434, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02404785, + "step": 15858, + "time_per_iteration": 3.2676329612731934 + }, + { + "auxiliary_loss_clip": 0.01325281, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.2184, + "balance_loss_mlp": 1.01432586, + "epoch": 0.9534946640613258, + "flos": 18956184261120.0, + "grad_norm": 1.4554495380571102, + "language_loss": 0.56856668, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.5920862, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12359619, + "step": 15859, + "time_per_iteration": 2.772850751876831 + }, + { + "auxiliary_loss_clip": 0.01320206, + "auxiliary_loss_mlp": 0.01025614, + "balance_loss_clip": 1.21684754, + "balance_loss_mlp": 1.01365101, + "epoch": 0.9535547873139937, + "flos": 16658658467280.0, + "grad_norm": 2.0406338810955584, + "language_loss": 0.8245815, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84803969, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.11962891, + "step": 15860, + "time_per_iteration": 2.7337679862976074 + }, + { + "auxiliary_loss_clip": 0.01328041, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.21949077, + "balance_loss_mlp": 1.01461399, + "epoch": 0.9536149105666617, + "flos": 20672673080760.0, + "grad_norm": 1.9649495173178588, + "language_loss": 0.66873729, + "learning_rate": 2.249895178891159e-08, + "loss": 0.69228679, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12304688, + "step": 15861, + "time_per_iteration": 2.7905311584472656 + }, + { + "auxiliary_loss_clip": 0.01328791, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.22058439, + "balance_loss_mlp": 1.0206548, + "epoch": 0.9536750338193296, + "flos": 30706592884560.0, + "grad_norm": 1.9103420808796165, + "language_loss": 0.65547144, + "learning_rate": 2.244073591573037e-08, + "loss": 0.6790967, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13092041, + "step": 15862, + "time_per_iteration": 2.967299461364746 + }, + { + "auxiliary_loss_clip": 0.01317387, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.21433067, + "balance_loss_mlp": 1.01562846, + "epoch": 0.9537351570719976, + "flos": 20409446734560.0, + "grad_norm": 2.010706185729184, + "language_loss": 0.68132514, + "learning_rate": 2.238259503179485e-08, + "loss": 0.70477939, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.12408447, + "step": 15863, + "time_per_iteration": 2.772329330444336 + }, + { + "auxiliary_loss_clip": 0.01326794, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.21947491, + "balance_loss_mlp": 1.01512039, + "epoch": 0.9537952803246655, + "flos": 29934578482560.0, + "grad_norm": 1.6505715844003206, + "language_loss": 0.78652054, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.81007361, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13409424, + "step": 15864, + "time_per_iteration": 2.781989812850952 + }, + { + "auxiliary_loss_clip": 0.01326433, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.21989512, + "balance_loss_mlp": 1.01729608, + "epoch": 0.9538554035773336, + "flos": 20526226042680.0, + "grad_norm": 1.9186738063275752, + "language_loss": 0.59592557, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61948395, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12121582, + "step": 15865, + "time_per_iteration": 2.7782177925109863 + }, + { + "auxiliary_loss_clip": 0.0132199, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.21442246, + "balance_loss_mlp": 1.01587558, + "epoch": 0.9539155268300015, + "flos": 18411231137760.0, + "grad_norm": 1.650948523330975, + "language_loss": 0.70109957, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.7246033, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12506104, + "step": 15866, + "time_per_iteration": 2.7318508625030518 + }, + { + "auxiliary_loss_clip": 0.01325134, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.21697807, + "balance_loss_mlp": 1.01804447, + "epoch": 0.9539756500826695, + "flos": 26218858857120.0, + "grad_norm": 2.4061531110289494, + "language_loss": 0.84623063, + "learning_rate": 2.215078143255855e-08, + "loss": 0.86979389, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.13134766, + "step": 15867, + "time_per_iteration": 2.901700735092163 + }, + { + "auxiliary_loss_clip": 0.01143722, + "auxiliary_loss_mlp": 0.01004675, + "balance_loss_clip": 1.10075521, + "balance_loss_mlp": 1.00217175, + "epoch": 0.9540357733353374, + "flos": 68305942565280.0, + "grad_norm": 0.7540333771086478, + "language_loss": 0.61872125, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.64020526, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02502441, + "step": 15868, + "time_per_iteration": 3.3657009601593018 + }, + { + "auxiliary_loss_clip": 0.01325833, + "auxiliary_loss_mlp": 0.01027626, + "balance_loss_clip": 1.2189517, + "balance_loss_mlp": 1.01503146, + "epoch": 0.9540958965880054, + "flos": 21293286224760.0, + "grad_norm": 1.702369246277899, + "language_loss": 0.60153562, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62507021, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12591553, + "step": 15869, + "time_per_iteration": 2.807525634765625 + }, + { + "auxiliary_loss_clip": 0.01327169, + "auxiliary_loss_mlp": 0.01030906, + "balance_loss_clip": 1.22024047, + "balance_loss_mlp": 1.01939654, + "epoch": 0.9541560198406733, + "flos": 19755632933280.0, + "grad_norm": 2.25926999799102, + "language_loss": 0.71287131, + "learning_rate": 2.197770872795579e-08, + "loss": 0.7364521, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1151123, + "step": 15870, + "time_per_iteration": 2.7839126586914062 + }, + { + "auxiliary_loss_clip": 0.01323827, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.21736145, + "balance_loss_mlp": 1.01691353, + "epoch": 0.9542161430933414, + "flos": 24720781735440.0, + "grad_norm": 1.925090875647961, + "language_loss": 0.7637412, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78727686, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1282959, + "step": 15871, + "time_per_iteration": 2.874687671661377 + }, + { + "auxiliary_loss_clip": 0.01328808, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.22097874, + "balance_loss_mlp": 1.01861751, + "epoch": 0.9542762663460094, + "flos": 31072020137640.0, + "grad_norm": 1.8394091753887407, + "language_loss": 0.58544648, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60905373, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.1328125, + "step": 15872, + "time_per_iteration": 2.9173712730407715 + }, + { + "auxiliary_loss_clip": 0.01333993, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.2232964, + "balance_loss_mlp": 1.0176785, + "epoch": 0.9543363895986773, + "flos": 20781574367040.0, + "grad_norm": 1.6178001112000087, + "language_loss": 0.75181293, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.77547157, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14202881, + "step": 15873, + "time_per_iteration": 2.8873977661132812 + }, + { + "auxiliary_loss_clip": 0.01325667, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.21815228, + "balance_loss_mlp": 1.01799822, + "epoch": 0.9543965128513453, + "flos": 24468194779560.0, + "grad_norm": 2.004974351186748, + "language_loss": 0.62575281, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.6493156, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12615967, + "step": 15874, + "time_per_iteration": 4.24894118309021 + }, + { + "auxiliary_loss_clip": 0.01317209, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.21219206, + "balance_loss_mlp": 1.02126908, + "epoch": 0.9544566361040132, + "flos": 15264893887200.0, + "grad_norm": 2.073561980676614, + "language_loss": 0.89664567, + "learning_rate": 2.169075438538104e-08, + "loss": 0.92015338, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12286377, + "step": 15875, + "time_per_iteration": 2.8240408897399902 + }, + { + "auxiliary_loss_clip": 0.01335349, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.22424197, + "balance_loss_mlp": 1.01657319, + "epoch": 0.9545167593566812, + "flos": 25923974971320.0, + "grad_norm": 2.3601825081441166, + "language_loss": 0.67973411, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.70339441, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.14123535, + "step": 15876, + "time_per_iteration": 2.814639091491699 + }, + { + "auxiliary_loss_clip": 0.01333488, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.22315335, + "balance_loss_mlp": 1.01787555, + "epoch": 0.9545768826093491, + "flos": 25633517396760.0, + "grad_norm": 1.8888214351572858, + "language_loss": 0.69380593, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71745694, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13751221, + "step": 15877, + "time_per_iteration": 2.8693268299102783 + }, + { + "auxiliary_loss_clip": 0.01333917, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.22436297, + "balance_loss_mlp": 1.01380897, + "epoch": 0.9546370058620172, + "flos": 22496560677360.0, + "grad_norm": 1.621086514328618, + "language_loss": 0.70977664, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.73339164, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13787842, + "step": 15878, + "time_per_iteration": 2.7913241386413574 + }, + { + "auxiliary_loss_clip": 0.01321247, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.21473181, + "balance_loss_mlp": 1.01497459, + "epoch": 0.9546971291146851, + "flos": 24615169726320.0, + "grad_norm": 1.263178403381106, + "language_loss": 0.68344545, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70692766, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.11999512, + "step": 15879, + "time_per_iteration": 5.7612690925598145 + }, + { + "auxiliary_loss_clip": 0.01322563, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.21756995, + "balance_loss_mlp": 1.01615644, + "epoch": 0.9547572523673531, + "flos": 28663643856240.0, + "grad_norm": 2.1281277228265614, + "language_loss": 0.84854221, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.87205309, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12371826, + "step": 15880, + "time_per_iteration": 2.8015077114105225 + }, + { + "auxiliary_loss_clip": 0.01327213, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.2190901, + "balance_loss_mlp": 1.02157235, + "epoch": 0.954817375620021, + "flos": 33809211912600.0, + "grad_norm": 1.7603548099247233, + "language_loss": 0.72352612, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74714565, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.13165283, + "step": 15881, + "time_per_iteration": 2.8553590774536133 + }, + { + "auxiliary_loss_clip": 0.01324975, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.21806479, + "balance_loss_mlp": 1.01879907, + "epoch": 0.954877498872689, + "flos": 14432244557760.0, + "grad_norm": 1.818807298428968, + "language_loss": 0.71517575, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73874378, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.13024902, + "step": 15882, + "time_per_iteration": 2.78299617767334 + }, + { + "auxiliary_loss_clip": 0.01331081, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.22292304, + "balance_loss_mlp": 1.01915598, + "epoch": 0.9549376221253569, + "flos": 59279348585160.0, + "grad_norm": 1.5852579135161828, + "language_loss": 0.65795594, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68158209, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.1237793, + "step": 15883, + "time_per_iteration": 3.095208168029785 + }, + { + "auxiliary_loss_clip": 0.01327272, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.21915174, + "balance_loss_mlp": 1.01549172, + "epoch": 0.954997745378025, + "flos": 17278824919320.0, + "grad_norm": 2.4387672203304787, + "language_loss": 0.78293616, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80650198, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13812256, + "step": 15884, + "time_per_iteration": 4.261118173599243 + }, + { + "auxiliary_loss_clip": 0.01331442, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.22163701, + "balance_loss_mlp": 1.01598454, + "epoch": 0.955057868630693, + "flos": 13010355365400.0, + "grad_norm": 1.7273211275445692, + "language_loss": 0.77908754, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.80269575, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13391113, + "step": 15885, + "time_per_iteration": 2.758866310119629 + }, + { + "auxiliary_loss_clip": 0.01329291, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.22110128, + "balance_loss_mlp": 1.01962698, + "epoch": 0.9551179918833609, + "flos": 22642804673640.0, + "grad_norm": 2.0807518724472103, + "language_loss": 0.70558095, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72919792, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.12792969, + "step": 15886, + "time_per_iteration": 2.909102439880371 + }, + { + "auxiliary_loss_clip": 0.01337388, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.2261188, + "balance_loss_mlp": 1.02063274, + "epoch": 0.9551781151360289, + "flos": 21547741165200.0, + "grad_norm": 1.7701267969486796, + "language_loss": 0.72874677, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.75247157, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.14459229, + "step": 15887, + "time_per_iteration": 2.7689223289489746 + }, + { + "auxiliary_loss_clip": 0.01315865, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.21160424, + "balance_loss_mlp": 1.01432586, + "epoch": 0.9552382383886968, + "flos": 20706888947040.0, + "grad_norm": 2.260185171604427, + "language_loss": 0.5693922, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59281468, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.1204834, + "step": 15888, + "time_per_iteration": 2.753063917160034 + }, + { + "auxiliary_loss_clip": 0.01142358, + "auxiliary_loss_mlp": 0.01007606, + "balance_loss_clip": 1.09957385, + "balance_loss_mlp": 1.00472152, + "epoch": 0.9552983616413648, + "flos": 67785459301800.0, + "grad_norm": 0.7836873769415412, + "language_loss": 0.57880723, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.60030687, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02880859, + "step": 15889, + "time_per_iteration": 3.3417880535125732 + }, + { + "auxiliary_loss_clip": 0.01332861, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.22171068, + "balance_loss_mlp": 1.01320004, + "epoch": 0.9553584848940327, + "flos": 21585002658480.0, + "grad_norm": 1.3244130364569282, + "language_loss": 0.67232406, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69592285, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13824463, + "step": 15890, + "time_per_iteration": 2.8029701709747314 + }, + { + "auxiliary_loss_clip": 0.01323448, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.21701121, + "balance_loss_mlp": 1.0182848, + "epoch": 0.9554186081467008, + "flos": 24213699230760.0, + "grad_norm": 1.4281866880765148, + "language_loss": 0.74240649, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76594096, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.1171875, + "step": 15891, + "time_per_iteration": 2.8184566497802734 + }, + { + "auxiliary_loss_clip": 0.01316077, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.21247458, + "balance_loss_mlp": 1.01730239, + "epoch": 0.9554787313993687, + "flos": 16255685462400.0, + "grad_norm": 1.7927743420885833, + "language_loss": 0.7810539, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80450469, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.11706543, + "step": 15892, + "time_per_iteration": 2.789886236190796 + }, + { + "auxiliary_loss_clip": 0.01321737, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.21553862, + "balance_loss_mlp": 1.01510549, + "epoch": 0.9555388546520367, + "flos": 23409377555400.0, + "grad_norm": 1.3594667882715754, + "language_loss": 0.70340109, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.72689533, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12573242, + "step": 15893, + "time_per_iteration": 2.856036901473999 + }, + { + "auxiliary_loss_clip": 0.01322655, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.21732068, + "balance_loss_mlp": 1.01755738, + "epoch": 0.9555989779047046, + "flos": 14798240327880.0, + "grad_norm": 1.7583586830510252, + "language_loss": 0.66046923, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.68401301, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.14160156, + "step": 15894, + "time_per_iteration": 2.746431827545166 + }, + { + "auxiliary_loss_clip": 0.01330504, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.22068048, + "balance_loss_mlp": 1.02056682, + "epoch": 0.9556591011573726, + "flos": 22241862086760.0, + "grad_norm": 2.161346715284101, + "language_loss": 0.81795919, + "learning_rate": 2.056169412853581e-08, + "loss": 0.84160089, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13104248, + "step": 15895, + "time_per_iteration": 2.7644872665405273 + }, + { + "auxiliary_loss_clip": 0.01326138, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.21846282, + "balance_loss_mlp": 1.02167225, + "epoch": 0.9557192244100405, + "flos": 27861108948720.0, + "grad_norm": 2.551993863503997, + "language_loss": 0.72619092, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74980187, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13293457, + "step": 15896, + "time_per_iteration": 2.8164188861846924 + }, + { + "auxiliary_loss_clip": 0.01320678, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.21520245, + "balance_loss_mlp": 1.017349, + "epoch": 0.9557793476627086, + "flos": 17607274937640.0, + "grad_norm": 1.7943440433433693, + "language_loss": 0.79421639, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81772232, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.12573242, + "step": 15897, + "time_per_iteration": 2.8646440505981445 + }, + { + "auxiliary_loss_clip": 0.01324881, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.21776748, + "balance_loss_mlp": 1.01851702, + "epoch": 0.9558394709153766, + "flos": 23880416817600.0, + "grad_norm": 1.5876479178168776, + "language_loss": 0.72740066, + "learning_rate": 2.03949242614303e-08, + "loss": 0.75096488, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13024902, + "step": 15898, + "time_per_iteration": 2.8836097717285156 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01002385, + "balance_loss_clip": 1.09913528, + "balance_loss_mlp": 0.99977458, + "epoch": 0.9558995941680445, + "flos": 53695268562960.0, + "grad_norm": 0.8687411384478667, + "language_loss": 0.5239445, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54538238, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02612305, + "step": 15899, + "time_per_iteration": 3.213913679122925 + }, + { + "auxiliary_loss_clip": 0.01334826, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.22345543, + "balance_loss_mlp": 1.01761103, + "epoch": 0.9559597174207125, + "flos": 13767507107640.0, + "grad_norm": 2.133505358106549, + "language_loss": 0.69306564, + "learning_rate": 2.028411968062782e-08, + "loss": 0.7167272, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13708496, + "step": 15900, + "time_per_iteration": 2.808258056640625 + }, + { + "auxiliary_loss_clip": 0.01328445, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.21974814, + "balance_loss_mlp": 1.01480317, + "epoch": 0.9560198406733804, + "flos": 19940925190680.0, + "grad_norm": 1.9087234611447377, + "language_loss": 0.83159316, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.8551625, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13684082, + "step": 15901, + "time_per_iteration": 2.699880361557007 + }, + { + "auxiliary_loss_clip": 0.01142731, + "auxiliary_loss_mlp": 0.010042, + "balance_loss_clip": 1.09926677, + "balance_loss_mlp": 1.00150633, + "epoch": 0.9560799639260484, + "flos": 57302696027520.0, + "grad_norm": 0.7124435545304189, + "language_loss": 0.54288119, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56435049, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02697754, + "step": 15902, + "time_per_iteration": 3.2664856910705566 + }, + { + "auxiliary_loss_clip": 0.01312084, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.2106353, + "balance_loss_mlp": 1.01821971, + "epoch": 0.9561400871787163, + "flos": 18921927786480.0, + "grad_norm": 1.5797489809251248, + "language_loss": 0.85503638, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87844837, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.10894775, + "step": 15903, + "time_per_iteration": 2.758967161178589 + }, + { + "auxiliary_loss_clip": 0.01328523, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.22096467, + "balance_loss_mlp": 1.01319075, + "epoch": 0.9562002104313844, + "flos": 18042839474400.0, + "grad_norm": 1.8978853459481573, + "language_loss": 0.80604517, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82958192, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.11968994, + "step": 15904, + "time_per_iteration": 2.73012113571167 + }, + { + "auxiliary_loss_clip": 0.01332848, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.22348189, + "balance_loss_mlp": 1.01903951, + "epoch": 0.9562603336840523, + "flos": 24723096411960.0, + "grad_norm": 2.1875271114589836, + "language_loss": 0.60028386, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62393403, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.13128662, + "step": 15905, + "time_per_iteration": 2.770390272140503 + }, + { + "auxiliary_loss_clip": 0.01325326, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.21871877, + "balance_loss_mlp": 1.0160675, + "epoch": 0.9563204569367203, + "flos": 21181989045240.0, + "grad_norm": 1.9252827320989143, + "language_loss": 0.70124328, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72478831, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13110352, + "step": 15906, + "time_per_iteration": 2.686338186264038 + }, + { + "auxiliary_loss_clip": 0.01337989, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.22747874, + "balance_loss_mlp": 1.01565444, + "epoch": 0.9563805801893882, + "flos": 20234550217320.0, + "grad_norm": 1.9687264403602387, + "language_loss": 0.71119535, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73486358, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13171387, + "step": 15907, + "time_per_iteration": 2.7529895305633545 + }, + { + "auxiliary_loss_clip": 0.01320606, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.21551251, + "balance_loss_mlp": 1.01950192, + "epoch": 0.9564407034420562, + "flos": 25416526991400.0, + "grad_norm": 3.9443898394149493, + "language_loss": 0.70916402, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.73269063, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12548828, + "step": 15908, + "time_per_iteration": 2.7372348308563232 + }, + { + "auxiliary_loss_clip": 0.01327202, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.22096562, + "balance_loss_mlp": 1.01916695, + "epoch": 0.9565008266947241, + "flos": 18628424584920.0, + "grad_norm": 1.7084560460296527, + "language_loss": 0.83490479, + "learning_rate": 1.978921532427802e-08, + "loss": 0.85848719, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11877441, + "step": 15909, + "time_per_iteration": 2.757150173187256 + }, + { + "auxiliary_loss_clip": 0.01321603, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.21485329, + "balance_loss_mlp": 1.02127647, + "epoch": 0.9565609499473922, + "flos": 24867391206960.0, + "grad_norm": 2.0701625920008806, + "language_loss": 0.67706466, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.70061386, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12023926, + "step": 15910, + "time_per_iteration": 2.9289824962615967 + }, + { + "auxiliary_loss_clip": 0.01336587, + "auxiliary_loss_mlp": 0.0103269, + "balance_loss_clip": 1.22565079, + "balance_loss_mlp": 1.02019119, + "epoch": 0.9566210732000601, + "flos": 21803454964800.0, + "grad_norm": 3.7844027556843356, + "language_loss": 0.74532855, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76902133, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.125, + "step": 15911, + "time_per_iteration": 2.839261531829834 + }, + { + "auxiliary_loss_clip": 0.0132793, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.22037005, + "balance_loss_mlp": 1.01907611, + "epoch": 0.9566811964527281, + "flos": 18702257229360.0, + "grad_norm": 2.814517702153246, + "language_loss": 0.6990509, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.72264749, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12640381, + "step": 15912, + "time_per_iteration": 2.8681344985961914 + }, + { + "auxiliary_loss_clip": 0.01328809, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.22340035, + "balance_loss_mlp": 1.02385521, + "epoch": 0.9567413197053961, + "flos": 13003858027800.0, + "grad_norm": 3.001817504269269, + "language_loss": 0.71928197, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74293494, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12646484, + "step": 15913, + "time_per_iteration": 2.7792224884033203 + }, + { + "auxiliary_loss_clip": 0.01320854, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.21526432, + "balance_loss_mlp": 1.01554346, + "epoch": 0.956801442958064, + "flos": 19724056610400.0, + "grad_norm": 2.300195239920832, + "language_loss": 0.73404336, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75752789, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.1204834, + "step": 15914, + "time_per_iteration": 4.2744128704071045 + }, + { + "auxiliary_loss_clip": 0.01325005, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.21799958, + "balance_loss_mlp": 1.01834559, + "epoch": 0.956861566210732, + "flos": 18227360172960.0, + "grad_norm": 1.6703854713445443, + "language_loss": 0.67523098, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69879222, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.12762451, + "step": 15915, + "time_per_iteration": 2.7713685035705566 + }, + { + "auxiliary_loss_clip": 0.01317534, + "auxiliary_loss_mlp": 0.01027377, + "balance_loss_clip": 1.21305478, + "balance_loss_mlp": 1.01480615, + "epoch": 0.9569216894634, + "flos": 22201636183200.0, + "grad_norm": 1.7875464797494698, + "language_loss": 0.64292562, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66637474, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12567139, + "step": 15916, + "time_per_iteration": 2.8178844451904297 + }, + { + "auxiliary_loss_clip": 0.01312875, + "auxiliary_loss_mlp": 0.01025803, + "balance_loss_clip": 1.21158254, + "balance_loss_mlp": 1.01421034, + "epoch": 0.956981812716068, + "flos": 21694310028360.0, + "grad_norm": 2.0279613389909996, + "language_loss": 0.80832803, + "learning_rate": 1.935440639853536e-08, + "loss": 0.83171487, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.11602783, + "step": 15917, + "time_per_iteration": 2.7955946922302246 + }, + { + "auxiliary_loss_clip": 0.01323685, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.21792364, + "balance_loss_mlp": 1.02038431, + "epoch": 0.9570419359687359, + "flos": 13994933861520.0, + "grad_norm": 1.8498051822162085, + "language_loss": 0.73307526, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.756639, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12298584, + "step": 15918, + "time_per_iteration": 4.3393988609313965 + }, + { + "auxiliary_loss_clip": 0.01139889, + "auxiliary_loss_mlp": 0.01009896, + "balance_loss_clip": 1.0974102, + "balance_loss_mlp": 1.00739276, + "epoch": 0.9571020592214039, + "flos": 65214572052960.0, + "grad_norm": 0.6329823932408628, + "language_loss": 0.53189564, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55339348, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02502441, + "step": 15919, + "time_per_iteration": 3.3734872341156006 + }, + { + "auxiliary_loss_clip": 0.01333689, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.22315574, + "balance_loss_mlp": 1.02252626, + "epoch": 0.9571621824740718, + "flos": 17388538372800.0, + "grad_norm": 2.617953023811111, + "language_loss": 0.76077169, + "learning_rate": 1.919259224843972e-08, + "loss": 0.78446835, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13446045, + "step": 15920, + "time_per_iteration": 2.7718300819396973 + }, + { + "auxiliary_loss_clip": 0.01333966, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.22452474, + "balance_loss_mlp": 1.01744306, + "epoch": 0.9572223057267398, + "flos": 14542282878120.0, + "grad_norm": 1.8651334425226005, + "language_loss": 0.79619646, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.8198477, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13720703, + "step": 15921, + "time_per_iteration": 2.7934014797210693 + }, + { + "auxiliary_loss_clip": 0.01337622, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.22473979, + "balance_loss_mlp": 1.01673102, + "epoch": 0.9572824289794077, + "flos": 33954318874800.0, + "grad_norm": 1.7273378653785223, + "language_loss": 0.51113236, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53481102, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13513184, + "step": 15922, + "time_per_iteration": 4.449612617492676 + }, + { + "auxiliary_loss_clip": 0.01331124, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.22175443, + "balance_loss_mlp": 1.01740706, + "epoch": 0.9573425522320758, + "flos": 18699211602360.0, + "grad_norm": 1.9251641612034998, + "language_loss": 0.83780468, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86141968, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12957764, + "step": 15923, + "time_per_iteration": 2.9177701473236084 + }, + { + "auxiliary_loss_clip": 0.01322424, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.21623766, + "balance_loss_mlp": 1.01901424, + "epoch": 0.9574026754847437, + "flos": 28515978567360.0, + "grad_norm": 2.297708146681394, + "language_loss": 0.75795782, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.78149462, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12243652, + "step": 15924, + "time_per_iteration": 2.8343560695648193 + }, + { + "auxiliary_loss_clip": 0.01323613, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.21573448, + "balance_loss_mlp": 1.02408707, + "epoch": 0.9574627987374117, + "flos": 24357709767240.0, + "grad_norm": 1.861094064544069, + "language_loss": 0.86359847, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88720548, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.13012695, + "step": 15925, + "time_per_iteration": 2.940160036087036 + }, + { + "auxiliary_loss_clip": 0.01332774, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.22200155, + "balance_loss_mlp": 1.0217998, + "epoch": 0.9575229219900797, + "flos": 23515558081560.0, + "grad_norm": 1.7115534172376374, + "language_loss": 0.75834453, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.78202426, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.1340332, + "step": 15926, + "time_per_iteration": 2.803267240524292 + }, + { + "auxiliary_loss_clip": 0.01326057, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.21806264, + "balance_loss_mlp": 1.01859033, + "epoch": 0.9575830452427476, + "flos": 22680025558560.0, + "grad_norm": 1.5755870124984794, + "language_loss": 0.77841198, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.80197465, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.11608887, + "step": 15927, + "time_per_iteration": 2.7691402435302734 + }, + { + "auxiliary_loss_clip": 0.01332452, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.22161841, + "balance_loss_mlp": 1.01599169, + "epoch": 0.9576431684954156, + "flos": 30491795330640.0, + "grad_norm": 1.6102592227732464, + "language_loss": 0.69260806, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.71623486, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.14221191, + "step": 15928, + "time_per_iteration": 2.9519073963165283 + }, + { + "auxiliary_loss_clip": 0.01330436, + "auxiliary_loss_mlp": 0.01027468, + "balance_loss_clip": 1.22205377, + "balance_loss_mlp": 1.01450992, + "epoch": 0.9577032917480836, + "flos": 21692157785280.0, + "grad_norm": 1.8226153081904903, + "language_loss": 0.82044518, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84402424, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12969971, + "step": 15929, + "time_per_iteration": 2.8904521465301514 + }, + { + "auxiliary_loss_clip": 0.01338369, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.22520971, + "balance_loss_mlp": 1.0256176, + "epoch": 0.9577634150007516, + "flos": 29029152326040.0, + "grad_norm": 2.0716023275741957, + "language_loss": 0.72489321, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74866956, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.13641357, + "step": 15930, + "time_per_iteration": 3.005143165588379 + }, + { + "auxiliary_loss_clip": 0.01324861, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.21848965, + "balance_loss_mlp": 1.01564765, + "epoch": 0.9578235382534195, + "flos": 19287070781040.0, + "grad_norm": 1.610651262534641, + "language_loss": 0.62432933, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64785624, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12194824, + "step": 15931, + "time_per_iteration": 2.907559394836426 + }, + { + "auxiliary_loss_clip": 0.0132141, + "auxiliary_loss_mlp": 0.01026502, + "balance_loss_clip": 1.21696234, + "balance_loss_mlp": 1.01439047, + "epoch": 0.9578836615060875, + "flos": 13703298644520.0, + "grad_norm": 1.8109020743969744, + "language_loss": 0.69262135, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71610045, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.12109375, + "step": 15932, + "time_per_iteration": 2.9362430572509766 + }, + { + "auxiliary_loss_clip": 0.01335595, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.22367787, + "balance_loss_mlp": 1.02274537, + "epoch": 0.9579437847587554, + "flos": 17058788886960.0, + "grad_norm": 4.192139133146807, + "language_loss": 0.75757319, + "learning_rate": 1.849920999338961e-08, + "loss": 0.78129762, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 1.11962891, + "router_z_loss_mlp": 0.14093018, + "step": 15933, + "time_per_iteration": 2.9143714904785156 + }, + { + "auxiliary_loss_clip": 0.01139807, + "auxiliary_loss_mlp": 0.0100145, + "balance_loss_clip": 1.09713459, + "balance_loss_mlp": 0.99881536, + "epoch": 0.9580039080114234, + "flos": 60584150583720.0, + "grad_norm": 0.719520429877926, + "language_loss": 0.57345784, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59487033, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02636719, + "step": 15934, + "time_per_iteration": 3.4614741802215576 + }, + { + "auxiliary_loss_clip": 0.01140236, + "auxiliary_loss_mlp": 0.01007638, + "balance_loss_clip": 1.09805512, + "balance_loss_mlp": 1.00489616, + "epoch": 0.9580640312640913, + "flos": 66250567378800.0, + "grad_norm": 0.9154486951016042, + "language_loss": 0.66014433, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.6816231, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02746582, + "step": 15935, + "time_per_iteration": 3.183499336242676 + }, + { + "auxiliary_loss_clip": 0.01140542, + "auxiliary_loss_mlp": 0.01003274, + "balance_loss_clip": 1.09823275, + "balance_loss_mlp": 1.00067544, + "epoch": 0.9581241545167594, + "flos": 62232167062440.0, + "grad_norm": 0.7861169938733423, + "language_loss": 0.5708729, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59231102, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02600098, + "step": 15936, + "time_per_iteration": 3.2581980228424072 + }, + { + "auxiliary_loss_clip": 0.01331535, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.22271085, + "balance_loss_mlp": 1.01854992, + "epoch": 0.9581842777694273, + "flos": 23773261690800.0, + "grad_norm": 1.5614712919147222, + "language_loss": 0.78975689, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.8133896, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.1317749, + "step": 15937, + "time_per_iteration": 2.8530945777893066 + }, + { + "auxiliary_loss_clip": 0.01324327, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.21708775, + "balance_loss_mlp": 1.01702714, + "epoch": 0.9582444010220953, + "flos": 21217788637560.0, + "grad_norm": 1.6145331775354776, + "language_loss": 0.68687975, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.71042109, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12786865, + "step": 15938, + "time_per_iteration": 2.8280386924743652 + }, + { + "auxiliary_loss_clip": 0.01329399, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.22115397, + "balance_loss_mlp": 1.01519442, + "epoch": 0.9583045242747633, + "flos": 23810401359000.0, + "grad_norm": 2.2554098386899635, + "language_loss": 0.66318166, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.68675339, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12573242, + "step": 15939, + "time_per_iteration": 2.7577621936798096 + }, + { + "auxiliary_loss_clip": 0.01327543, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.21960843, + "balance_loss_mlp": 1.01834309, + "epoch": 0.9583646475274312, + "flos": 24136496092440.0, + "grad_norm": 1.646241364027285, + "language_loss": 0.74070966, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.76429403, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12548828, + "step": 15940, + "time_per_iteration": 2.8355939388275146 + }, + { + "auxiliary_loss_clip": 0.01325781, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.21793544, + "balance_loss_mlp": 1.0169692, + "epoch": 0.9584247707800992, + "flos": 20891572079040.0, + "grad_norm": 1.5652342415248954, + "language_loss": 0.73078668, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75434661, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13238525, + "step": 15941, + "time_per_iteration": 2.766969919204712 + }, + { + "auxiliary_loss_clip": 0.01323271, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.2160356, + "balance_loss_mlp": 1.01947439, + "epoch": 0.9584848940327672, + "flos": 26073183377880.0, + "grad_norm": 1.693213331891811, + "language_loss": 0.71666718, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.74021834, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12365723, + "step": 15942, + "time_per_iteration": 2.8146262168884277 + }, + { + "auxiliary_loss_clip": 0.01330339, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.22110081, + "balance_loss_mlp": 1.01897979, + "epoch": 0.9585450172854352, + "flos": 34499312606520.0, + "grad_norm": 1.7368401512347278, + "language_loss": 0.71898872, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74261653, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.13464355, + "step": 15943, + "time_per_iteration": 2.9268736839294434 + }, + { + "auxiliary_loss_clip": 0.01329803, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.22160935, + "balance_loss_mlp": 1.01817298, + "epoch": 0.9586051405381031, + "flos": 23115752528760.0, + "grad_norm": 1.802366756247676, + "language_loss": 0.68796611, + "learning_rate": 1.792242006001965e-08, + "loss": 0.71157515, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12927246, + "step": 15944, + "time_per_iteration": 2.968844413757324 + }, + { + "auxiliary_loss_clip": 0.0132844, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.21936846, + "balance_loss_mlp": 1.02131844, + "epoch": 0.9586652637907711, + "flos": 19608008252760.0, + "grad_norm": 1.7238193461078501, + "language_loss": 0.66330957, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.68693781, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13067627, + "step": 15945, + "time_per_iteration": 2.834529399871826 + }, + { + "auxiliary_loss_clip": 0.01142662, + "auxiliary_loss_mlp": 0.01001138, + "balance_loss_clip": 1.09988856, + "balance_loss_mlp": 0.99837226, + "epoch": 0.958725387043439, + "flos": 72089322364440.0, + "grad_norm": 0.7417643889763359, + "language_loss": 0.6194855, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.6409235, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02770996, + "step": 15946, + "time_per_iteration": 3.318645715713501 + }, + { + "auxiliary_loss_clip": 0.01322059, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.21780694, + "balance_loss_mlp": 1.01796317, + "epoch": 0.958785510296107, + "flos": 28917692713080.0, + "grad_norm": 2.5210971120032464, + "language_loss": 0.75479048, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77831399, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12329102, + "step": 15947, + "time_per_iteration": 2.8321897983551025 + }, + { + "auxiliary_loss_clip": 0.01324917, + "auxiliary_loss_mlp": 0.01026273, + "balance_loss_clip": 1.21884286, + "balance_loss_mlp": 1.01385736, + "epoch": 0.958845633548775, + "flos": 18481612071600.0, + "grad_norm": 2.1618497587844505, + "language_loss": 0.70178819, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72530007, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12408447, + "step": 15948, + "time_per_iteration": 2.781804084777832 + }, + { + "auxiliary_loss_clip": 0.01323343, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.21642745, + "balance_loss_mlp": 1.01706302, + "epoch": 0.958905756801443, + "flos": 24212277938160.0, + "grad_norm": 1.9599467893402704, + "language_loss": 0.7917552, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.8152833, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12408447, + "step": 15949, + "time_per_iteration": 2.8679921627044678 + }, + { + "auxiliary_loss_clip": 0.01330636, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.22165537, + "balance_loss_mlp": 1.01758611, + "epoch": 0.9589658800541109, + "flos": 25013188511280.0, + "grad_norm": 1.762263585126461, + "language_loss": 0.69038427, + "learning_rate": 1.761164038992602e-08, + "loss": 0.71400177, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13519287, + "step": 15950, + "time_per_iteration": 2.8239479064941406 + }, + { + "auxiliary_loss_clip": 0.01325846, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.21733308, + "balance_loss_mlp": 1.02025938, + "epoch": 0.9590260033067789, + "flos": 23519943784440.0, + "grad_norm": 1.6480084958203727, + "language_loss": 0.86141944, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88500047, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.11993408, + "step": 15951, + "time_per_iteration": 2.830756664276123 + }, + { + "auxiliary_loss_clip": 0.01334738, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.2230531, + "balance_loss_mlp": 1.02107048, + "epoch": 0.9590861265594469, + "flos": 25525874969640.0, + "grad_norm": 3.2087257828881306, + "language_loss": 0.80712456, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.83081907, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13647461, + "step": 15952, + "time_per_iteration": 4.2492516040802 + }, + { + "auxiliary_loss_clip": 0.01327232, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.2201488, + "balance_loss_mlp": 1.01398325, + "epoch": 0.9591462498121148, + "flos": 21184384938480.0, + "grad_norm": 2.30457933239249, + "language_loss": 0.69570696, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71924913, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13000488, + "step": 15953, + "time_per_iteration": 2.8171467781066895 + }, + { + "auxiliary_loss_clip": 0.01326181, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.21716809, + "balance_loss_mlp": 1.01403832, + "epoch": 0.9592063730647828, + "flos": 21727713727440.0, + "grad_norm": 3.006687903662412, + "language_loss": 0.592978, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.6165185, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13824463, + "step": 15954, + "time_per_iteration": 2.9676711559295654 + }, + { + "auxiliary_loss_clip": 0.01329562, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.21931553, + "balance_loss_mlp": 1.01927519, + "epoch": 0.9592664963174508, + "flos": 29896545430440.0, + "grad_norm": 2.13467375740734, + "language_loss": 0.73750019, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.76112682, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13818359, + "step": 15955, + "time_per_iteration": 3.0082919597625732 + }, + { + "auxiliary_loss_clip": 0.01330501, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.22062016, + "balance_loss_mlp": 1.01998425, + "epoch": 0.9593266195701188, + "flos": 18003547563120.0, + "grad_norm": 1.7878773760526858, + "language_loss": 0.62739038, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.65102923, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13409424, + "step": 15956, + "time_per_iteration": 4.466093063354492 + }, + { + "auxiliary_loss_clip": 0.01329016, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.22094703, + "balance_loss_mlp": 1.01598382, + "epoch": 0.9593867428227867, + "flos": 18842247538200.0, + "grad_norm": 12.422692074455975, + "language_loss": 0.60100651, + "learning_rate": 1.725248447997507e-08, + "loss": 0.62458879, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13220215, + "step": 15957, + "time_per_iteration": 2.7872495651245117 + }, + { + "auxiliary_loss_clip": 0.01331104, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.22259498, + "balance_loss_mlp": 1.02237535, + "epoch": 0.9594468660754547, + "flos": 29572684156800.0, + "grad_norm": 2.1094220210100474, + "language_loss": 0.74957508, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.77324533, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13537598, + "step": 15958, + "time_per_iteration": 2.8584256172180176 + }, + { + "auxiliary_loss_clip": 0.01322305, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.21564996, + "balance_loss_mlp": 1.01521289, + "epoch": 0.9595069893281226, + "flos": 20708025981120.0, + "grad_norm": 1.5298698620291267, + "language_loss": 0.74618506, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76968992, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12963867, + "step": 15959, + "time_per_iteration": 2.756903886795044 + }, + { + "auxiliary_loss_clip": 0.01334577, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.22413659, + "balance_loss_mlp": 1.01803756, + "epoch": 0.9595671125807906, + "flos": 22458487016880.0, + "grad_norm": 4.970096547523531, + "language_loss": 0.65099525, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67465335, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13195801, + "step": 15960, + "time_per_iteration": 2.7421915531158447 + }, + { + "auxiliary_loss_clip": 0.01321067, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.21600115, + "balance_loss_mlp": 1.01818526, + "epoch": 0.9596272358334585, + "flos": 23920561504440.0, + "grad_norm": 1.6446454695175077, + "language_loss": 0.7876997, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.8112253, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.13311768, + "step": 15961, + "time_per_iteration": 4.266614198684692 + }, + { + "auxiliary_loss_clip": 0.01320691, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.21497285, + "balance_loss_mlp": 1.01402557, + "epoch": 0.9596873590861266, + "flos": 17676600054120.0, + "grad_norm": 2.0716732002408853, + "language_loss": 0.75823003, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78170097, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1237793, + "step": 15962, + "time_per_iteration": 2.697852373123169 + }, + { + "auxiliary_loss_clip": 0.01336881, + "auxiliary_loss_mlp": 0.010304, + "balance_loss_clip": 1.22617662, + "balance_loss_mlp": 1.0170244, + "epoch": 0.9597474823387945, + "flos": 25813611784080.0, + "grad_norm": 1.957936195358937, + "language_loss": 0.71606755, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.73974037, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13391113, + "step": 15963, + "time_per_iteration": 2.7758219242095947 + }, + { + "auxiliary_loss_clip": 0.0131299, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.21139967, + "balance_loss_mlp": 1.01864886, + "epoch": 0.9598076055914625, + "flos": 23773748991120.0, + "grad_norm": 1.4710760297163585, + "language_loss": 0.7433244, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76676148, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.12078857, + "step": 15964, + "time_per_iteration": 2.8286197185516357 + }, + { + "auxiliary_loss_clip": 0.01141817, + "auxiliary_loss_mlp": 0.01003891, + "balance_loss_clip": 1.09894609, + "balance_loss_mlp": 1.00138807, + "epoch": 0.9598677288441305, + "flos": 56528042082120.0, + "grad_norm": 0.8768379413991739, + "language_loss": 0.57678926, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59824634, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02502441, + "step": 15965, + "time_per_iteration": 3.2018020153045654 + }, + { + "auxiliary_loss_clip": 0.01328674, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.22043657, + "balance_loss_mlp": 1.01732707, + "epoch": 0.9599278520967984, + "flos": 23001978239280.0, + "grad_norm": 1.8540156933375702, + "language_loss": 0.79216105, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81574523, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12420654, + "step": 15966, + "time_per_iteration": 2.7630765438079834 + }, + { + "auxiliary_loss_clip": 0.01321184, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.21490622, + "balance_loss_mlp": 1.0183363, + "epoch": 0.9599879753494664, + "flos": 23044924902960.0, + "grad_norm": 1.6248504203566905, + "language_loss": 0.79831171, + "learning_rate": 1.674579558025102e-08, + "loss": 0.82183194, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12512207, + "step": 15967, + "time_per_iteration": 2.81693959236145 + }, + { + "auxiliary_loss_clip": 0.01331539, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.22218633, + "balance_loss_mlp": 1.01748645, + "epoch": 0.9600480986021344, + "flos": 16395756987960.0, + "grad_norm": 1.8815990644373235, + "language_loss": 0.81006551, + "learning_rate": 1.669554028728348e-08, + "loss": 0.83368814, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.13244629, + "step": 15968, + "time_per_iteration": 2.858325719833374 + }, + { + "auxiliary_loss_clip": 0.01337822, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.22600091, + "balance_loss_mlp": 1.02132308, + "epoch": 0.9601082218548024, + "flos": 24281359404480.0, + "grad_norm": 2.3678612560115972, + "language_loss": 0.67934263, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.7030831, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 1.11962891, + "router_z_loss_mlp": 0.14886475, + "step": 15969, + "time_per_iteration": 2.8204987049102783 + }, + { + "auxiliary_loss_clip": 0.0132666, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.21872485, + "balance_loss_mlp": 1.02507401, + "epoch": 0.9601683451074703, + "flos": 19614789848880.0, + "grad_norm": 2.581224047584588, + "language_loss": 0.79856527, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.82220364, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12109375, + "step": 15970, + "time_per_iteration": 2.8293631076812744 + }, + { + "auxiliary_loss_clip": 0.01318857, + "auxiliary_loss_mlp": 0.01028851, + "balance_loss_clip": 1.21554959, + "balance_loss_mlp": 1.01602364, + "epoch": 0.9602284683601383, + "flos": 26656738070400.0, + "grad_norm": 1.6355695644110526, + "language_loss": 0.77387398, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79735106, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.1282959, + "step": 15971, + "time_per_iteration": 2.842142105102539 + }, + { + "auxiliary_loss_clip": 0.01336339, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.22442341, + "balance_loss_mlp": 1.01918602, + "epoch": 0.9602885916128062, + "flos": 15557584921560.0, + "grad_norm": 1.7879950351568328, + "language_loss": 0.67817509, + "learning_rate": 1.64952712054669e-08, + "loss": 0.70186311, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13275146, + "step": 15972, + "time_per_iteration": 2.8315396308898926 + }, + { + "auxiliary_loss_clip": 0.01319707, + "auxiliary_loss_mlp": 0.01024157, + "balance_loss_clip": 1.21302223, + "balance_loss_mlp": 1.01189637, + "epoch": 0.9603487148654742, + "flos": 16505917133400.0, + "grad_norm": 2.111544368000964, + "language_loss": 0.76453114, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78796983, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12255859, + "step": 15973, + "time_per_iteration": 2.808349609375 + }, + { + "auxiliary_loss_clip": 0.01327957, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.22320271, + "balance_loss_mlp": 1.02091098, + "epoch": 0.9604088381181421, + "flos": 20849640624360.0, + "grad_norm": 1.618982894222838, + "language_loss": 0.69569629, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71930963, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12469482, + "step": 15974, + "time_per_iteration": 2.832002639770508 + }, + { + "auxiliary_loss_clip": 0.01329786, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.21984971, + "balance_loss_mlp": 1.01655972, + "epoch": 0.9604689613708102, + "flos": 19687932151200.0, + "grad_norm": 1.7495186382346495, + "language_loss": 0.68083549, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70443451, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13525391, + "step": 15975, + "time_per_iteration": 3.016209840774536 + }, + { + "auxiliary_loss_clip": 0.01317215, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.2145865, + "balance_loss_mlp": 1.01431382, + "epoch": 0.9605290846234781, + "flos": 24102929959920.0, + "grad_norm": 2.0442408867229123, + "language_loss": 0.5638206, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.58725417, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.1182251, + "step": 15976, + "time_per_iteration": 2.898250102996826 + }, + { + "auxiliary_loss_clip": 0.01314329, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.20998549, + "balance_loss_mlp": 1.01787615, + "epoch": 0.9605892078761461, + "flos": 27127899157680.0, + "grad_norm": 1.8225101896597447, + "language_loss": 0.68079746, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70424056, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.12109375, + "step": 15977, + "time_per_iteration": 2.846169948577881 + }, + { + "auxiliary_loss_clip": 0.01325675, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.21787691, + "balance_loss_mlp": 1.02363908, + "epoch": 0.9606493311288141, + "flos": 14140974816000.0, + "grad_norm": 1.7477206192461747, + "language_loss": 0.8257376, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84935975, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12921143, + "step": 15978, + "time_per_iteration": 2.722867250442505 + }, + { + "auxiliary_loss_clip": 0.01333731, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.2232784, + "balance_loss_mlp": 1.02062118, + "epoch": 0.960709454381482, + "flos": 15817359557160.0, + "grad_norm": 2.393915762002696, + "language_loss": 0.83817744, + "learning_rate": 1.614769615070921e-08, + "loss": 0.86185497, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.1340332, + "step": 15979, + "time_per_iteration": 2.7104694843292236 + }, + { + "auxiliary_loss_clip": 0.01332966, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.22361755, + "balance_loss_mlp": 1.02705002, + "epoch": 0.96076957763415, + "flos": 22570596363600.0, + "grad_norm": 1.46381314530693, + "language_loss": 0.80011916, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82384163, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.12219238, + "step": 15980, + "time_per_iteration": 2.8133981227874756 + }, + { + "auxiliary_loss_clip": 0.01329733, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.21921408, + "balance_loss_mlp": 1.0139277, + "epoch": 0.960829700886818, + "flos": 24686687694240.0, + "grad_norm": 1.7019974071092654, + "language_loss": 0.6847136, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70827788, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12780762, + "step": 15981, + "time_per_iteration": 2.780533790588379 + }, + { + "auxiliary_loss_clip": 0.01321193, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.21579242, + "balance_loss_mlp": 1.01360822, + "epoch": 0.960889824139486, + "flos": 26547918000840.0, + "grad_norm": 1.3693489357000903, + "language_loss": 0.69324338, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71670926, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.11798096, + "step": 15982, + "time_per_iteration": 2.8181591033935547 + }, + { + "auxiliary_loss_clip": 0.01143236, + "auxiliary_loss_mlp": 0.01005011, + "balance_loss_clip": 1.10100698, + "balance_loss_mlp": 1.00282907, + "epoch": 0.9609499473921539, + "flos": 71129538595080.0, + "grad_norm": 0.6742713419356602, + "language_loss": 0.53327107, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55475354, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02185059, + "step": 15983, + "time_per_iteration": 3.3415420055389404 + }, + { + "auxiliary_loss_clip": 0.01326717, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.21930337, + "balance_loss_mlp": 1.0224663, + "epoch": 0.9610100706448219, + "flos": 20556584114760.0, + "grad_norm": 1.9402441021907069, + "language_loss": 0.68082815, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70444846, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.128479, + "step": 15984, + "time_per_iteration": 2.75048828125 + }, + { + "auxiliary_loss_clip": 0.01314348, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.21204185, + "balance_loss_mlp": 1.02166915, + "epoch": 0.9610701938974898, + "flos": 14068279205640.0, + "grad_norm": 1.627641167453087, + "language_loss": 0.68000269, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.70348394, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.12109375, + "step": 15985, + "time_per_iteration": 2.744166612625122 + }, + { + "auxiliary_loss_clip": 0.01329217, + "auxiliary_loss_mlp": 0.01029, + "balance_loss_clip": 1.22094691, + "balance_loss_mlp": 1.01678061, + "epoch": 0.9611303171501578, + "flos": 20234834475840.0, + "grad_norm": 1.8436872323009779, + "language_loss": 0.7887435, + "learning_rate": 1.580380726142283e-08, + "loss": 0.8123256, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12219238, + "step": 15986, + "time_per_iteration": 2.8175930976867676 + }, + { + "auxiliary_loss_clip": 0.01327397, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.22032356, + "balance_loss_mlp": 1.0166986, + "epoch": 0.9611904404028258, + "flos": 20954887158240.0, + "grad_norm": 2.151777737113644, + "language_loss": 0.64460444, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.66818714, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1418457, + "step": 15987, + "time_per_iteration": 2.7886862754821777 + }, + { + "auxiliary_loss_clip": 0.01316473, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.21430302, + "balance_loss_mlp": 1.01692283, + "epoch": 0.9612505636554938, + "flos": 24833256557400.0, + "grad_norm": 1.702708531863592, + "language_loss": 0.67149609, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.6949473, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.11743164, + "step": 15988, + "time_per_iteration": 2.908280372619629 + }, + { + "auxiliary_loss_clip": 0.01324295, + "auxiliary_loss_mlp": 0.01035703, + "balance_loss_clip": 1.21686459, + "balance_loss_mlp": 1.02324545, + "epoch": 0.9613106869081617, + "flos": 17169395724360.0, + "grad_norm": 1.813511244191398, + "language_loss": 0.74970102, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.77330101, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12445068, + "step": 15989, + "time_per_iteration": 2.9028642177581787 + }, + { + "auxiliary_loss_clip": 0.01141735, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.09895194, + "balance_loss_mlp": 1.00060451, + "epoch": 0.9613708101608297, + "flos": 61578231436080.0, + "grad_norm": 0.8077251524783181, + "language_loss": 0.63151777, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65296698, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02575684, + "step": 15990, + "time_per_iteration": 3.1685733795166016 + }, + { + "auxiliary_loss_clip": 0.01330807, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.22174382, + "balance_loss_mlp": 1.0162642, + "epoch": 0.9614309334134977, + "flos": 27423717035760.0, + "grad_norm": 1.7938107380601822, + "language_loss": 0.77551436, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.7991153, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13018799, + "step": 15991, + "time_per_iteration": 4.290170192718506 + }, + { + "auxiliary_loss_clip": 0.01337516, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.22500348, + "balance_loss_mlp": 1.01472211, + "epoch": 0.9614910566661656, + "flos": 22824157920120.0, + "grad_norm": 2.290523341026606, + "language_loss": 0.84797776, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.87164152, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.14123535, + "step": 15992, + "time_per_iteration": 2.8782131671905518 + }, + { + "auxiliary_loss_clip": 0.01327742, + "auxiliary_loss_mlp": 0.01024788, + "balance_loss_clip": 1.21924138, + "balance_loss_mlp": 1.01176977, + "epoch": 0.9615511799188337, + "flos": 20672713689120.0, + "grad_norm": 2.0486317566867287, + "language_loss": 0.72466487, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74819016, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13018799, + "step": 15993, + "time_per_iteration": 2.840437412261963 + }, + { + "auxiliary_loss_clip": 0.01328913, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.22113514, + "balance_loss_mlp": 1.01928043, + "epoch": 0.9616113031715016, + "flos": 33156169670160.0, + "grad_norm": 1.5444559879159157, + "language_loss": 0.68579471, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70940357, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.12695312, + "step": 15994, + "time_per_iteration": 2.9721720218658447 + }, + { + "auxiliary_loss_clip": 0.0132436, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.21754706, + "balance_loss_mlp": 1.01893592, + "epoch": 0.9616714264241696, + "flos": 25015300146000.0, + "grad_norm": 1.688813622257062, + "language_loss": 0.84642696, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86998588, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12609863, + "step": 15995, + "time_per_iteration": 5.9153828620910645 + }, + { + "auxiliary_loss_clip": 0.01335517, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.22450161, + "balance_loss_mlp": 1.01770568, + "epoch": 0.9617315496768375, + "flos": 13550719744080.0, + "grad_norm": 1.8569793940526662, + "language_loss": 0.76220155, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78587008, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.1362915, + "step": 15996, + "time_per_iteration": 2.8175125122070312 + }, + { + "auxiliary_loss_clip": 0.01326974, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.21939826, + "balance_loss_mlp": 1.01744747, + "epoch": 0.9617916729295055, + "flos": 11258188778520.0, + "grad_norm": 1.9449707811579722, + "language_loss": 0.77540976, + "learning_rate": 1.52708595287494e-08, + "loss": 0.79898888, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13494873, + "step": 15997, + "time_per_iteration": 2.710881471633911 + }, + { + "auxiliary_loss_clip": 0.01317621, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.21288633, + "balance_loss_mlp": 1.0145781, + "epoch": 0.9618517961821734, + "flos": 22824807653880.0, + "grad_norm": 1.68439292203813, + "language_loss": 0.67612678, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69957048, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.12182617, + "step": 15998, + "time_per_iteration": 2.8354227542877197 + }, + { + "auxiliary_loss_clip": 0.01323153, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.21664715, + "balance_loss_mlp": 1.01552308, + "epoch": 0.9619119194348414, + "flos": 16621640624160.0, + "grad_norm": 1.6806072911099672, + "language_loss": 0.73028481, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.75380969, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.13806152, + "step": 15999, + "time_per_iteration": 2.731492042541504 + }, + { + "auxiliary_loss_clip": 0.01319254, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.21488905, + "balance_loss_mlp": 1.01633692, + "epoch": 0.9619720426875094, + "flos": 24540524914680.0, + "grad_norm": 1.8801322859682579, + "language_loss": 0.65353316, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67700672, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11761475, + "step": 16000, + "time_per_iteration": 4.354433298110962 + }, + { + "auxiliary_loss_clip": 0.01326827, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.21823144, + "balance_loss_mlp": 1.01485038, + "epoch": 0.9620321659401774, + "flos": 20637238963680.0, + "grad_norm": 1.7426990701900908, + "language_loss": 0.75524712, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77879405, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.13018799, + "step": 16001, + "time_per_iteration": 2.7335774898529053 + }, + { + "auxiliary_loss_clip": 0.01326227, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.21889913, + "balance_loss_mlp": 1.01523006, + "epoch": 0.9620922891928453, + "flos": 18519929382240.0, + "grad_norm": 1.6802137194530569, + "language_loss": 0.69002903, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.71357608, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13250732, + "step": 16002, + "time_per_iteration": 2.798734188079834 + }, + { + "auxiliary_loss_clip": 0.01320321, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.21425736, + "balance_loss_mlp": 1.01558709, + "epoch": 0.9621524124455133, + "flos": 28773316701360.0, + "grad_norm": 1.301194321146546, + "language_loss": 0.64990211, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.6733914, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13024902, + "step": 16003, + "time_per_iteration": 2.8795461654663086 + }, + { + "auxiliary_loss_clip": 0.01325162, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.21733665, + "balance_loss_mlp": 1.02567196, + "epoch": 0.9622125356981813, + "flos": 19103565291480.0, + "grad_norm": 1.7798743341999101, + "language_loss": 0.75952792, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78315789, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12176514, + "step": 16004, + "time_per_iteration": 2.762576103210449 + }, + { + "auxiliary_loss_clip": 0.0132392, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.21784377, + "balance_loss_mlp": 1.0149169, + "epoch": 0.9622726589508492, + "flos": 20307651911280.0, + "grad_norm": 2.064727329888265, + "language_loss": 0.7972101, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.82072634, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12792969, + "step": 16005, + "time_per_iteration": 2.778765916824341 + }, + { + "auxiliary_loss_clip": 0.01321255, + "auxiliary_loss_mlp": 0.0102789, + "balance_loss_clip": 1.21716976, + "balance_loss_mlp": 1.01580822, + "epoch": 0.9623327822035173, + "flos": 54939361063320.0, + "grad_norm": 1.7566598200566839, + "language_loss": 0.67554861, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69904006, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.12091064, + "step": 16006, + "time_per_iteration": 3.0901594161987305 + }, + { + "auxiliary_loss_clip": 0.01314506, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.21199083, + "balance_loss_mlp": 1.01950788, + "epoch": 0.9623929054561852, + "flos": 21763635144840.0, + "grad_norm": 1.514023035834821, + "language_loss": 0.78534412, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80879939, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 1.02490234, + "router_z_loss_mlp": 0.1151123, + "step": 16007, + "time_per_iteration": 2.7892699241638184 + }, + { + "auxiliary_loss_clip": 0.01327825, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.21920574, + "balance_loss_mlp": 1.02310681, + "epoch": 0.9624530287088532, + "flos": 17936131039560.0, + "grad_norm": 2.1339807165007127, + "language_loss": 0.68049502, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70413983, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13543701, + "step": 16008, + "time_per_iteration": 2.7971794605255127 + }, + { + "auxiliary_loss_clip": 0.01328203, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.21976221, + "balance_loss_mlp": 1.01787317, + "epoch": 0.9625131519615211, + "flos": 23258382381000.0, + "grad_norm": 2.0605962163998797, + "language_loss": 0.73619497, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75979489, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.13916016, + "step": 16009, + "time_per_iteration": 2.814126491546631 + }, + { + "auxiliary_loss_clip": 0.01322359, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.21649706, + "balance_loss_mlp": 1.01555443, + "epoch": 0.9625732752141891, + "flos": 18921318661080.0, + "grad_norm": 1.7038264554078058, + "language_loss": 0.75972223, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.78322673, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12536621, + "step": 16010, + "time_per_iteration": 2.830261468887329 + }, + { + "auxiliary_loss_clip": 0.01336033, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.22419989, + "balance_loss_mlp": 1.02083111, + "epoch": 0.962633398466857, + "flos": 16257228580080.0, + "grad_norm": 1.9519815446239206, + "language_loss": 0.6962043, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71992391, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 1.11962891, + "router_z_loss_mlp": 0.15081787, + "step": 16011, + "time_per_iteration": 2.799375057220459 + }, + { + "auxiliary_loss_clip": 0.01323211, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.21779013, + "balance_loss_mlp": 1.01845133, + "epoch": 0.962693521719525, + "flos": 54206597964240.0, + "grad_norm": 1.8128222894072854, + "language_loss": 0.68540299, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70894068, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12103271, + "step": 16012, + "time_per_iteration": 3.078568696975708 + }, + { + "auxiliary_loss_clip": 0.01340764, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.22768831, + "balance_loss_mlp": 1.02515554, + "epoch": 0.962753644972193, + "flos": 33111842322240.0, + "grad_norm": 1.8322852448046714, + "language_loss": 0.72746539, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.75126714, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.1427002, + "step": 16013, + "time_per_iteration": 2.93400239944458 + }, + { + "auxiliary_loss_clip": 0.01323892, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.21771979, + "balance_loss_mlp": 1.0202893, + "epoch": 0.962813768224861, + "flos": 42238346159880.0, + "grad_norm": 3.9999069061591417, + "language_loss": 0.63633406, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65991253, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.13653564, + "step": 16014, + "time_per_iteration": 3.053523302078247 + }, + { + "auxiliary_loss_clip": 0.01314528, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.21194315, + "balance_loss_mlp": 1.01580501, + "epoch": 0.9628738914775289, + "flos": 43952479694640.0, + "grad_norm": 1.507573577603468, + "language_loss": 0.72192597, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.74533826, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.10894775, + "step": 16015, + "time_per_iteration": 3.0929486751556396 + }, + { + "auxiliary_loss_clip": 0.01326638, + "auxiliary_loss_mlp": 0.01027226, + "balance_loss_clip": 1.21951985, + "balance_loss_mlp": 1.01453567, + "epoch": 0.9629340147301969, + "flos": 15600409760160.0, + "grad_norm": 2.346143259237916, + "language_loss": 0.77776849, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.80130708, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12695312, + "step": 16016, + "time_per_iteration": 2.85355544090271 + }, + { + "auxiliary_loss_clip": 0.01143521, + "auxiliary_loss_mlp": 0.01010657, + "balance_loss_clip": 1.10043097, + "balance_loss_mlp": 1.00799835, + "epoch": 0.9629941379828649, + "flos": 62965701720360.0, + "grad_norm": 0.8143204529071538, + "language_loss": 0.63118106, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65272284, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02661133, + "step": 16017, + "time_per_iteration": 3.150156259536743 + }, + { + "auxiliary_loss_clip": 0.013291, + "auxiliary_loss_mlp": 0.01027177, + "balance_loss_clip": 1.22087991, + "balance_loss_mlp": 1.01482725, + "epoch": 0.9630542612355328, + "flos": 29905276227840.0, + "grad_norm": 2.0437923500054396, + "language_loss": 0.67237681, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.6959396, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12335205, + "step": 16018, + "time_per_iteration": 2.8461146354675293 + }, + { + "auxiliary_loss_clip": 0.01328341, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.22145629, + "balance_loss_mlp": 1.02185333, + "epoch": 0.9631143844882009, + "flos": 17899072588080.0, + "grad_norm": 1.8658092305235183, + "language_loss": 0.79944265, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.82307088, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 1.06982422, + "router_z_loss_mlp": 0.12646484, + "step": 16019, + "time_per_iteration": 2.700225353240967 + }, + { + "auxiliary_loss_clip": 0.01318436, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.21272576, + "balance_loss_mlp": 1.01721978, + "epoch": 0.9631745077408688, + "flos": 26144579520720.0, + "grad_norm": 1.469412430803846, + "language_loss": 0.71912396, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74259174, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11120605, + "step": 16020, + "time_per_iteration": 2.789407253265381 + }, + { + "auxiliary_loss_clip": 0.01325662, + "auxiliary_loss_mlp": 0.01027161, + "balance_loss_clip": 1.21802616, + "balance_loss_mlp": 1.01485908, + "epoch": 0.9632346309935368, + "flos": 24979175686800.0, + "grad_norm": 1.5991016213939353, + "language_loss": 0.77124298, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79477119, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12298584, + "step": 16021, + "time_per_iteration": 2.767643690109253 + }, + { + "auxiliary_loss_clip": 0.01340314, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.22847927, + "balance_loss_mlp": 1.0190649, + "epoch": 0.9632947542462047, + "flos": 23621738607720.0, + "grad_norm": 2.0047868515940044, + "language_loss": 0.64861941, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.67235792, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14483643, + "step": 16022, + "time_per_iteration": 2.7961573600769043 + }, + { + "auxiliary_loss_clip": 0.01318141, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.21253121, + "balance_loss_mlp": 1.01403272, + "epoch": 0.9633548774988727, + "flos": 26401186704240.0, + "grad_norm": 1.893951990558912, + "language_loss": 0.73075193, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75419271, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.11914062, + "step": 16023, + "time_per_iteration": 2.7687058448791504 + }, + { + "auxiliary_loss_clip": 0.01317573, + "auxiliary_loss_mlp": 0.01023633, + "balance_loss_clip": 1.21216762, + "balance_loss_mlp": 1.01145029, + "epoch": 0.9634150007515406, + "flos": 23772855607200.0, + "grad_norm": 1.501804736640985, + "language_loss": 0.81733835, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.8407504, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12188721, + "step": 16024, + "time_per_iteration": 2.7532269954681396 + }, + { + "auxiliary_loss_clip": 0.0133528, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.22375941, + "balance_loss_mlp": 1.01901913, + "epoch": 0.9634751240042086, + "flos": 24140881795320.0, + "grad_norm": 1.3963902274833127, + "language_loss": 0.81616646, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83984768, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13830566, + "step": 16025, + "time_per_iteration": 2.7989065647125244 + }, + { + "auxiliary_loss_clip": 0.01333358, + "auxiliary_loss_mlp": 0.01024439, + "balance_loss_clip": 1.22378469, + "balance_loss_mlp": 1.0119642, + "epoch": 0.9635352472568766, + "flos": 24353973798120.0, + "grad_norm": 1.7420988721588828, + "language_loss": 0.7689544, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.79253232, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.12463379, + "step": 16026, + "time_per_iteration": 2.8307266235351562 + }, + { + "auxiliary_loss_clip": 0.01325944, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.21748412, + "balance_loss_mlp": 1.01759744, + "epoch": 0.9635953705095446, + "flos": 23989683579120.0, + "grad_norm": 1.7188194212897574, + "language_loss": 0.63488024, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65845156, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13604736, + "step": 16027, + "time_per_iteration": 2.9165725708007812 + }, + { + "auxiliary_loss_clip": 0.01335635, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.22580171, + "balance_loss_mlp": 1.01671231, + "epoch": 0.9636554937622125, + "flos": 19833201546840.0, + "grad_norm": 2.157903681219097, + "language_loss": 0.87415087, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89780354, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1293335, + "step": 16028, + "time_per_iteration": 2.8303110599517822 + }, + { + "auxiliary_loss_clip": 0.01141707, + "auxiliary_loss_mlp": 0.01007165, + "balance_loss_clip": 1.09884572, + "balance_loss_mlp": 1.00432801, + "epoch": 0.9637156170148805, + "flos": 67450349512440.0, + "grad_norm": 0.6854554307501398, + "language_loss": 0.53271168, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55420041, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02832031, + "step": 16029, + "time_per_iteration": 3.301767349243164 + }, + { + "auxiliary_loss_clip": 0.01329436, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.21973729, + "balance_loss_mlp": 1.01502585, + "epoch": 0.9637757402675484, + "flos": 20305499668200.0, + "grad_norm": 1.6386221904930167, + "language_loss": 0.73875833, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76232833, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12548828, + "step": 16030, + "time_per_iteration": 4.366301536560059 + }, + { + "auxiliary_loss_clip": 0.01318273, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.21317244, + "balance_loss_mlp": 1.01890147, + "epoch": 0.9638358635202164, + "flos": 27240211546200.0, + "grad_norm": 1.847002872876011, + "language_loss": 0.66749746, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.69099456, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12536621, + "step": 16031, + "time_per_iteration": 2.9306280612945557 + }, + { + "auxiliary_loss_clip": 0.01142962, + "auxiliary_loss_mlp": 0.01004156, + "balance_loss_clip": 1.10038829, + "balance_loss_mlp": 1.00184345, + "epoch": 0.9638959867728845, + "flos": 70305213979440.0, + "grad_norm": 0.8662048087669547, + "language_loss": 0.60764909, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62912023, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02307129, + "step": 16032, + "time_per_iteration": 3.347841739654541 + }, + { + "auxiliary_loss_clip": 0.01310198, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.20863605, + "balance_loss_mlp": 1.02080286, + "epoch": 0.9639561100255524, + "flos": 25412587980480.0, + "grad_norm": 1.5954920278504097, + "language_loss": 0.66930193, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.69273114, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.1192627, + "step": 16033, + "time_per_iteration": 2.88484263420105 + }, + { + "auxiliary_loss_clip": 0.01327492, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.22174346, + "balance_loss_mlp": 1.01714921, + "epoch": 0.9640162332782204, + "flos": 18118458886680.0, + "grad_norm": 1.5894228996661413, + "language_loss": 0.65907502, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.68265176, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.13024902, + "step": 16034, + "time_per_iteration": 4.344576358795166 + }, + { + "auxiliary_loss_clip": 0.0132315, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.21715474, + "balance_loss_mlp": 1.0179404, + "epoch": 0.9640763565308883, + "flos": 23445502014600.0, + "grad_norm": 2.3589844429252054, + "language_loss": 0.74336517, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.76690298, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12701416, + "step": 16035, + "time_per_iteration": 2.8089687824249268 + }, + { + "auxiliary_loss_clip": 0.01328591, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.22270942, + "balance_loss_mlp": 1.01637173, + "epoch": 0.9641364797835563, + "flos": 22424514800760.0, + "grad_norm": 2.420328760755484, + "language_loss": 0.8222459, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84582192, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.12646484, + "step": 16036, + "time_per_iteration": 2.8067991733551025 + }, + { + "auxiliary_loss_clip": 0.01328297, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.22123432, + "balance_loss_mlp": 1.01700258, + "epoch": 0.9641966030362242, + "flos": 30627765411840.0, + "grad_norm": 2.1575032404655157, + "language_loss": 0.70457816, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72816336, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13214111, + "step": 16037, + "time_per_iteration": 2.813511610031128 + }, + { + "auxiliary_loss_clip": 0.01329813, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.2210536, + "balance_loss_mlp": 1.01739287, + "epoch": 0.9642567262888923, + "flos": 20957201834760.0, + "grad_norm": 1.9251984267493223, + "language_loss": 0.63832474, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.66191924, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12255859, + "step": 16038, + "time_per_iteration": 2.8035054206848145 + }, + { + "auxiliary_loss_clip": 0.01331138, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.22262836, + "balance_loss_mlp": 1.02159083, + "epoch": 0.9643168495415602, + "flos": 22644753874920.0, + "grad_norm": 1.6453715333169168, + "language_loss": 0.7155304, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73918676, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12915039, + "step": 16039, + "time_per_iteration": 4.395143508911133 + }, + { + "auxiliary_loss_clip": 0.01327117, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.2188766, + "balance_loss_mlp": 1.01461565, + "epoch": 0.9643769727942282, + "flos": 20271080760120.0, + "grad_norm": 2.337296786185078, + "language_loss": 0.73738474, + "learning_rate": 1.327491870605657e-08, + "loss": 0.76092511, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12316895, + "step": 16040, + "time_per_iteration": 2.78997540473938 + }, + { + "auxiliary_loss_clip": 0.01330527, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.22166467, + "balance_loss_mlp": 1.01992393, + "epoch": 0.9644370960468961, + "flos": 13885910750160.0, + "grad_norm": 1.8911243991714213, + "language_loss": 0.73518509, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75882041, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13067627, + "step": 16041, + "time_per_iteration": 2.790696144104004 + }, + { + "auxiliary_loss_clip": 0.01311571, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.2107271, + "balance_loss_mlp": 1.0154779, + "epoch": 0.9644972192995641, + "flos": 17242659851760.0, + "grad_norm": 2.608607055616678, + "language_loss": 0.72114658, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74454105, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.12384033, + "step": 16042, + "time_per_iteration": 2.7641096115112305 + }, + { + "auxiliary_loss_clip": 0.01335611, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.22496057, + "balance_loss_mlp": 1.01758909, + "epoch": 0.964557342552232, + "flos": 23845429392480.0, + "grad_norm": 1.670044615783675, + "language_loss": 0.81202686, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83568382, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.12493896, + "step": 16043, + "time_per_iteration": 2.7627129554748535 + }, + { + "auxiliary_loss_clip": 0.01323266, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.21723771, + "balance_loss_mlp": 1.01845551, + "epoch": 0.9646174658049, + "flos": 21658185569160.0, + "grad_norm": 1.447458294856473, + "language_loss": 0.71774787, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.74128163, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.11639404, + "step": 16044, + "time_per_iteration": 2.761204719543457 + }, + { + "auxiliary_loss_clip": 0.01321989, + "auxiliary_loss_mlp": 0.01027143, + "balance_loss_clip": 1.21631241, + "balance_loss_mlp": 1.01418531, + "epoch": 0.9646775890575681, + "flos": 17134124040720.0, + "grad_norm": 1.7155401875072536, + "language_loss": 0.70246804, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72595936, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12969971, + "step": 16045, + "time_per_iteration": 2.7676076889038086 + }, + { + "auxiliary_loss_clip": 0.01329561, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.22170734, + "balance_loss_mlp": 1.01643038, + "epoch": 0.964737712310236, + "flos": 13009624414920.0, + "grad_norm": 1.7205852721478028, + "language_loss": 0.74882609, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77241862, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.13262939, + "step": 16046, + "time_per_iteration": 2.838291645050049 + }, + { + "auxiliary_loss_clip": 0.01332268, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.22197616, + "balance_loss_mlp": 1.02348185, + "epoch": 0.964797835562904, + "flos": 24284526856560.0, + "grad_norm": 2.109422140675685, + "language_loss": 0.62472802, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64842439, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13873291, + "step": 16047, + "time_per_iteration": 2.837122917175293 + }, + { + "auxiliary_loss_clip": 0.01325398, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.21840644, + "balance_loss_mlp": 1.02086294, + "epoch": 0.9648579588155719, + "flos": 20527687943640.0, + "grad_norm": 1.6715918418150222, + "language_loss": 0.68993276, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71351832, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1229248, + "step": 16048, + "time_per_iteration": 2.858607292175293 + }, + { + "auxiliary_loss_clip": 0.01332647, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.22232187, + "balance_loss_mlp": 1.01907492, + "epoch": 0.9649180820682399, + "flos": 32160464483400.0, + "grad_norm": 1.7084062620365057, + "language_loss": 0.64031005, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66396493, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13775635, + "step": 16049, + "time_per_iteration": 2.878480911254883 + }, + { + "auxiliary_loss_clip": 0.01330153, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.22199059, + "balance_loss_mlp": 1.01580238, + "epoch": 0.9649782053209078, + "flos": 20527566118560.0, + "grad_norm": 1.5078144930721282, + "language_loss": 0.70978928, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73337591, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1270752, + "step": 16050, + "time_per_iteration": 2.87636137008667 + }, + { + "auxiliary_loss_clip": 0.01336898, + "auxiliary_loss_mlp": 0.01034568, + "balance_loss_clip": 1.22459161, + "balance_loss_mlp": 1.02042937, + "epoch": 0.9650383285735759, + "flos": 43075503017280.0, + "grad_norm": 1.7772783673713248, + "language_loss": 0.69375253, + "learning_rate": 1.278669873970606e-08, + "loss": 0.71746719, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.14154053, + "step": 16051, + "time_per_iteration": 3.056612253189087 + }, + { + "auxiliary_loss_clip": 0.01142137, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 1.09959698, + "balance_loss_mlp": 0.99815243, + "epoch": 0.9650984518262438, + "flos": 61762955176440.0, + "grad_norm": 0.8411263040979252, + "language_loss": 0.59211034, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61354089, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02770996, + "step": 16052, + "time_per_iteration": 3.34194016456604 + }, + { + "auxiliary_loss_clip": 0.01319139, + "auxiliary_loss_mlp": 0.01028149, + "balance_loss_clip": 1.2148056, + "balance_loss_mlp": 1.0154767, + "epoch": 0.9651585750789118, + "flos": 29795440949280.0, + "grad_norm": 1.544424460771778, + "language_loss": 0.74563789, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76911074, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12664795, + "step": 16053, + "time_per_iteration": 2.8979945182800293 + }, + { + "auxiliary_loss_clip": 0.01332243, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.2239666, + "balance_loss_mlp": 1.01891983, + "epoch": 0.9652186983315797, + "flos": 16877435640480.0, + "grad_norm": 1.9528423485795667, + "language_loss": 0.68709201, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.71073806, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13458252, + "step": 16054, + "time_per_iteration": 2.7660107612609863 + }, + { + "auxiliary_loss_clip": 0.01320772, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.2151978, + "balance_loss_mlp": 1.02117467, + "epoch": 0.9652788215842477, + "flos": 31656062130480.0, + "grad_norm": 1.4366017309792958, + "language_loss": 0.62315089, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.64669603, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12548828, + "step": 16055, + "time_per_iteration": 2.9077565670013428 + }, + { + "auxiliary_loss_clip": 0.01323591, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.21840656, + "balance_loss_mlp": 1.02078557, + "epoch": 0.9653389448369156, + "flos": 24759708171480.0, + "grad_norm": 1.831347448544999, + "language_loss": 0.76792657, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.79149896, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.12860107, + "step": 16056, + "time_per_iteration": 2.770777940750122 + }, + { + "auxiliary_loss_clip": 0.01324315, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.21613729, + "balance_loss_mlp": 1.01534796, + "epoch": 0.9653990680895836, + "flos": 20301357615480.0, + "grad_norm": 1.4985418042070466, + "language_loss": 0.71790266, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.7414245, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12524414, + "step": 16057, + "time_per_iteration": 2.783398151397705 + }, + { + "auxiliary_loss_clip": 0.01323056, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.21776152, + "balance_loss_mlp": 1.01724076, + "epoch": 0.9654591913422517, + "flos": 22533943995720.0, + "grad_norm": 2.3497927764802795, + "language_loss": 0.72184902, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74538112, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.12908936, + "step": 16058, + "time_per_iteration": 2.788581371307373 + }, + { + "auxiliary_loss_clip": 0.013187, + "auxiliary_loss_mlp": 0.01032605, + "balance_loss_clip": 1.21400416, + "balance_loss_mlp": 1.02051091, + "epoch": 0.9655193145949196, + "flos": 26768969242200.0, + "grad_norm": 1.7393549958505476, + "language_loss": 0.74200433, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76551735, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12097168, + "step": 16059, + "time_per_iteration": 2.904000759124756 + }, + { + "auxiliary_loss_clip": 0.01334271, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.22334361, + "balance_loss_mlp": 1.01595938, + "epoch": 0.9655794378475876, + "flos": 41977759357080.0, + "grad_norm": 2.0988156250767194, + "language_loss": 0.73563683, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75926667, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.12762451, + "step": 16060, + "time_per_iteration": 2.9592807292938232 + }, + { + "auxiliary_loss_clip": 0.01319499, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.21697402, + "balance_loss_mlp": 1.01852703, + "epoch": 0.9656395611002555, + "flos": 27715230427680.0, + "grad_norm": 1.5054769179712055, + "language_loss": 0.77005202, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.79355061, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.11828613, + "step": 16061, + "time_per_iteration": 2.7908096313476562 + }, + { + "auxiliary_loss_clip": 0.01141037, + "auxiliary_loss_mlp": 0.01005074, + "balance_loss_clip": 1.09836602, + "balance_loss_mlp": 1.00229633, + "epoch": 0.9656996843529235, + "flos": 68983698317760.0, + "grad_norm": 0.7282370931053552, + "language_loss": 0.64176393, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66322505, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02783203, + "step": 16062, + "time_per_iteration": 3.2705628871917725 + }, + { + "auxiliary_loss_clip": 0.01310513, + "auxiliary_loss_mlp": 0.01025006, + "balance_loss_clip": 1.20742011, + "balance_loss_mlp": 1.01326382, + "epoch": 0.9657598076055914, + "flos": 20636223754680.0, + "grad_norm": 2.2400474014899894, + "language_loss": 0.93658888, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95994401, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.11743164, + "step": 16063, + "time_per_iteration": 2.786504030227661 + }, + { + "auxiliary_loss_clip": 0.01324444, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.21744204, + "balance_loss_mlp": 1.01854289, + "epoch": 0.9658199308582595, + "flos": 20453408607240.0, + "grad_norm": 1.8908456451379276, + "language_loss": 0.82246459, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84602225, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12792969, + "step": 16064, + "time_per_iteration": 2.7958781719207764 + }, + { + "auxiliary_loss_clip": 0.01323045, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.21877158, + "balance_loss_mlp": 1.01878667, + "epoch": 0.9658800541109274, + "flos": 24723502495560.0, + "grad_norm": 1.4733302031915059, + "language_loss": 0.84487712, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86841261, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11712646, + "step": 16065, + "time_per_iteration": 2.8082473278045654 + }, + { + "auxiliary_loss_clip": 0.01323998, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.21672225, + "balance_loss_mlp": 1.01850009, + "epoch": 0.9659401773635954, + "flos": 21614223696480.0, + "grad_norm": 1.779001788852437, + "language_loss": 0.67692357, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.70047539, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.12677002, + "step": 16066, + "time_per_iteration": 2.7756195068359375 + }, + { + "auxiliary_loss_clip": 0.01323752, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.21785736, + "balance_loss_mlp": 1.0173316, + "epoch": 0.9660003006162633, + "flos": 20305824535080.0, + "grad_norm": 1.6699229684284562, + "language_loss": 0.82421935, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84774965, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.1194458, + "step": 16067, + "time_per_iteration": 2.872469186782837 + }, + { + "auxiliary_loss_clip": 0.01324685, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.21845126, + "balance_loss_mlp": 1.01905632, + "epoch": 0.9660604238689313, + "flos": 24467301395640.0, + "grad_norm": 1.716547431951564, + "language_loss": 0.69701952, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.72058642, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12957764, + "step": 16068, + "time_per_iteration": 4.265304803848267 + }, + { + "auxiliary_loss_clip": 0.01309559, + "auxiliary_loss_mlp": 0.0102696, + "balance_loss_clip": 1.20960307, + "balance_loss_mlp": 1.01564097, + "epoch": 0.9661205471215992, + "flos": 19868676272280.0, + "grad_norm": 1.624167658511393, + "language_loss": 0.68279856, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70616376, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.11315918, + "step": 16069, + "time_per_iteration": 2.7816274166107178 + }, + { + "auxiliary_loss_clip": 0.01325282, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.2180891, + "balance_loss_mlp": 1.01546264, + "epoch": 0.9661806703742672, + "flos": 20562715977120.0, + "grad_norm": 1.956164958944805, + "language_loss": 0.88604939, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.90958738, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1305542, + "step": 16070, + "time_per_iteration": 2.808685064315796 + }, + { + "auxiliary_loss_clip": 0.01327267, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.21928954, + "balance_loss_mlp": 1.02229536, + "epoch": 0.9662407936269353, + "flos": 21435550601760.0, + "grad_norm": 2.250993321744054, + "language_loss": 0.76976144, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79339337, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1362915, + "step": 16071, + "time_per_iteration": 2.78328013420105 + }, + { + "auxiliary_loss_clip": 0.01323393, + "auxiliary_loss_mlp": 0.01028035, + "balance_loss_clip": 1.21799755, + "balance_loss_mlp": 1.01420641, + "epoch": 0.9663009168796032, + "flos": 14907019789080.0, + "grad_norm": 4.408318374703616, + "language_loss": 0.66088724, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68440151, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.13830566, + "step": 16072, + "time_per_iteration": 4.300310134887695 + }, + { + "auxiliary_loss_clip": 0.01332008, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.22298443, + "balance_loss_mlp": 1.0190835, + "epoch": 0.9663610401322712, + "flos": 24315534662400.0, + "grad_norm": 5.874986608769004, + "language_loss": 0.77871037, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.80234563, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12432861, + "step": 16073, + "time_per_iteration": 4.387882471084595 + }, + { + "auxiliary_loss_clip": 0.01332083, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.2218554, + "balance_loss_mlp": 1.0192337, + "epoch": 0.9664211633849391, + "flos": 17642140537680.0, + "grad_norm": 2.186250725413334, + "language_loss": 0.76080358, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.7844547, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13793945, + "step": 16074, + "time_per_iteration": 2.840207576751709 + }, + { + "auxiliary_loss_clip": 0.01328177, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.22010398, + "balance_loss_mlp": 1.01720631, + "epoch": 0.9664812866376071, + "flos": 29795522166000.0, + "grad_norm": 1.517154735062993, + "language_loss": 0.75893283, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.78251559, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.12890625, + "step": 16075, + "time_per_iteration": 2.875523805618286 + }, + { + "auxiliary_loss_clip": 0.01332571, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.22419214, + "balance_loss_mlp": 1.01736975, + "epoch": 0.966541409890275, + "flos": 14286325428360.0, + "grad_norm": 1.8394644322181644, + "language_loss": 0.78998482, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81360775, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12347412, + "step": 16076, + "time_per_iteration": 2.875638484954834 + }, + { + "auxiliary_loss_clip": 0.01326188, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.21828902, + "balance_loss_mlp": 1.02609682, + "epoch": 0.9666015331429431, + "flos": 19864615436280.0, + "grad_norm": 1.6190873595897837, + "language_loss": 0.72107226, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74472713, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13226318, + "step": 16077, + "time_per_iteration": 2.7892773151397705 + }, + { + "auxiliary_loss_clip": 0.01325739, + "auxiliary_loss_mlp": 0.01034678, + "balance_loss_clip": 1.21856081, + "balance_loss_mlp": 1.02144003, + "epoch": 0.966661656395611, + "flos": 27131716343520.0, + "grad_norm": 1.6075914676068315, + "language_loss": 0.59077311, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61437726, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13238525, + "step": 16078, + "time_per_iteration": 4.355426549911499 + }, + { + "auxiliary_loss_clip": 0.01332918, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.22228134, + "balance_loss_mlp": 1.01984453, + "epoch": 0.966721779648279, + "flos": 21513403473840.0, + "grad_norm": 1.9874567803758616, + "language_loss": 0.72814298, + "learning_rate": 1.158510609718899e-08, + "loss": 0.7518065, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13592529, + "step": 16079, + "time_per_iteration": 2.8139808177948 + }, + { + "auxiliary_loss_clip": 0.01314014, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.21054709, + "balance_loss_mlp": 1.01219809, + "epoch": 0.9667819029009469, + "flos": 23883381227880.0, + "grad_norm": 1.5346081619436638, + "language_loss": 0.72163653, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74502391, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.12506104, + "step": 16080, + "time_per_iteration": 2.7529892921447754 + }, + { + "auxiliary_loss_clip": 0.01322539, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.21647859, + "balance_loss_mlp": 1.01992059, + "epoch": 0.9668420261536149, + "flos": 21512266439760.0, + "grad_norm": 1.8150290277603487, + "language_loss": 0.74115729, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76470178, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11981201, + "step": 16081, + "time_per_iteration": 2.7548813819885254 + }, + { + "auxiliary_loss_clip": 0.01324361, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.21768785, + "balance_loss_mlp": 1.01550436, + "epoch": 0.9669021494062828, + "flos": 26692862529600.0, + "grad_norm": 1.580603866725759, + "language_loss": 0.67477298, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69829738, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12585449, + "step": 16082, + "time_per_iteration": 2.8137600421905518 + }, + { + "auxiliary_loss_clip": 0.01322331, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.21585917, + "balance_loss_mlp": 1.01606965, + "epoch": 0.9669622726589508, + "flos": 29831037499800.0, + "grad_norm": 1.5061779023660744, + "language_loss": 0.76967824, + "learning_rate": 1.141827483932789e-08, + "loss": 0.79319215, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12988281, + "step": 16083, + "time_per_iteration": 2.8608016967773438 + }, + { + "auxiliary_loss_clip": 0.01327184, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.21905756, + "balance_loss_mlp": 1.01890802, + "epoch": 0.9670223959116189, + "flos": 22926724302240.0, + "grad_norm": 1.7988279673812413, + "language_loss": 0.79683423, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.82042873, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13348389, + "step": 16084, + "time_per_iteration": 2.847630500793457 + }, + { + "auxiliary_loss_clip": 0.01329526, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.2190975, + "balance_loss_mlp": 1.01693273, + "epoch": 0.9670825191642868, + "flos": 18629114927040.0, + "grad_norm": 2.315579973473543, + "language_loss": 0.67441428, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69800651, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 1.10400391, + "router_z_loss_mlp": 0.12750244, + "step": 16085, + "time_per_iteration": 2.8493919372558594 + }, + { + "auxiliary_loss_clip": 0.01339521, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.22817385, + "balance_loss_mlp": 1.01759529, + "epoch": 0.9671426424169548, + "flos": 24503222813040.0, + "grad_norm": 2.185486789625232, + "language_loss": 0.69100177, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71471238, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 1.11376953, + "router_z_loss_mlp": 0.13946533, + "step": 16086, + "time_per_iteration": 3.01318359375 + }, + { + "auxiliary_loss_clip": 0.01326761, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.22040892, + "balance_loss_mlp": 1.02103448, + "epoch": 0.9672027656696227, + "flos": 20375433910080.0, + "grad_norm": 1.4600175492692375, + "language_loss": 0.78422534, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80783069, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12738037, + "step": 16087, + "time_per_iteration": 3.0454869270324707 + }, + { + "auxiliary_loss_clip": 0.01325999, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.2184515, + "balance_loss_mlp": 1.0171901, + "epoch": 0.9672628889222907, + "flos": 18884869335000.0, + "grad_norm": 1.8108374929399995, + "language_loss": 0.71138263, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73494256, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.12799072, + "step": 16088, + "time_per_iteration": 2.8287105560302734 + }, + { + "auxiliary_loss_clip": 0.01320981, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.21700621, + "balance_loss_mlp": 1.01509845, + "epoch": 0.9673230121749586, + "flos": 28701270824760.0, + "grad_norm": 1.495365152841189, + "language_loss": 0.70895487, + "learning_rate": 1.117029020040916e-08, + "loss": 0.73243415, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.11865234, + "step": 16089, + "time_per_iteration": 2.906827211380005 + }, + { + "auxiliary_loss_clip": 0.01331405, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.22409534, + "balance_loss_mlp": 1.02110052, + "epoch": 0.9673831354276267, + "flos": 20489208199560.0, + "grad_norm": 2.0827176996510106, + "language_loss": 0.74964154, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.77328944, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.1227417, + "step": 16090, + "time_per_iteration": 2.82562255859375 + }, + { + "auxiliary_loss_clip": 0.0133815, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.22583723, + "balance_loss_mlp": 1.0182054, + "epoch": 0.9674432586802946, + "flos": 26803266325200.0, + "grad_norm": 1.6876180345290668, + "language_loss": 0.69086921, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71456528, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.13256836, + "step": 16091, + "time_per_iteration": 2.87585711479187 + }, + { + "auxiliary_loss_clip": 0.01322599, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.21727967, + "balance_loss_mlp": 1.01963615, + "epoch": 0.9675033819329626, + "flos": 22316222639880.0, + "grad_norm": 1.7097024259544034, + "language_loss": 0.76996434, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79351878, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.13226318, + "step": 16092, + "time_per_iteration": 2.8119964599609375 + }, + { + "auxiliary_loss_clip": 0.01323455, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.21774244, + "balance_loss_mlp": 1.01726508, + "epoch": 0.9675635051856305, + "flos": 12679347020400.0, + "grad_norm": 1.9550630342594566, + "language_loss": 0.766343, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78986841, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11834717, + "step": 16093, + "time_per_iteration": 2.845242738723755 + }, + { + "auxiliary_loss_clip": 0.0132594, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.21858644, + "balance_loss_mlp": 1.01240301, + "epoch": 0.9676236284382985, + "flos": 24613910867160.0, + "grad_norm": 1.5367479014897028, + "language_loss": 0.69358081, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71710587, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.14172363, + "step": 16094, + "time_per_iteration": 2.802717447280884 + }, + { + "auxiliary_loss_clip": 0.01335088, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.22465336, + "balance_loss_mlp": 1.01475441, + "epoch": 0.9676837516909664, + "flos": 23372319103920.0, + "grad_norm": 1.444366675734313, + "language_loss": 0.76163292, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78525722, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12579346, + "step": 16095, + "time_per_iteration": 2.796295404434204 + }, + { + "auxiliary_loss_clip": 0.01336993, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.2244668, + "balance_loss_mlp": 1.01837051, + "epoch": 0.9677438749436345, + "flos": 20491969568040.0, + "grad_norm": 1.6863627059477408, + "language_loss": 0.70459449, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72828376, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.13568115, + "step": 16096, + "time_per_iteration": 2.796607732772827 + }, + { + "auxiliary_loss_clip": 0.01332536, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.22410226, + "balance_loss_mlp": 1.01877761, + "epoch": 0.9678039981963025, + "flos": 47564049211920.0, + "grad_norm": 1.8350791974684664, + "language_loss": 0.72172773, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.74536955, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12860107, + "step": 16097, + "time_per_iteration": 2.9735047817230225 + }, + { + "auxiliary_loss_clip": 0.01325875, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.21945572, + "balance_loss_mlp": 1.02095616, + "epoch": 0.9678641214489704, + "flos": 25045211526120.0, + "grad_norm": 1.9104847386668746, + "language_loss": 0.78200734, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80560195, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12652588, + "step": 16098, + "time_per_iteration": 2.900608539581299 + }, + { + "auxiliary_loss_clip": 0.01322311, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.21716392, + "balance_loss_mlp": 1.02037239, + "epoch": 0.9679242447016384, + "flos": 19245139326360.0, + "grad_norm": 2.0306156411914613, + "language_loss": 0.90628588, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92982787, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.1151123, + "step": 16099, + "time_per_iteration": 2.840545177459717 + }, + { + "auxiliary_loss_clip": 0.01335341, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.22415006, + "balance_loss_mlp": 1.01715136, + "epoch": 0.9679843679543063, + "flos": 33261131945520.0, + "grad_norm": 1.7036872439695245, + "language_loss": 0.66438794, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68804181, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12896729, + "step": 16100, + "time_per_iteration": 2.9569804668426514 + }, + { + "auxiliary_loss_clip": 0.01328137, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.22121882, + "balance_loss_mlp": 1.017138, + "epoch": 0.9680444912069743, + "flos": 22788764411400.0, + "grad_norm": 1.6548698247367253, + "language_loss": 0.73248893, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.7560662, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12445068, + "step": 16101, + "time_per_iteration": 2.7512776851654053 + }, + { + "auxiliary_loss_clip": 0.01323142, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.21709251, + "balance_loss_mlp": 1.01427603, + "epoch": 0.9681046144596422, + "flos": 24029381574000.0, + "grad_norm": 1.7742720952447573, + "language_loss": 0.73709726, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.76060551, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.13409424, + "step": 16102, + "time_per_iteration": 2.7668097019195557 + }, + { + "auxiliary_loss_clip": 0.01327787, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.22018814, + "balance_loss_mlp": 1.02207804, + "epoch": 0.9681647377123103, + "flos": 23446314181800.0, + "grad_norm": 1.874272090041452, + "language_loss": 0.77706581, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.80069977, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13519287, + "step": 16103, + "time_per_iteration": 2.7635838985443115 + }, + { + "auxiliary_loss_clip": 0.0132785, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.22082162, + "balance_loss_mlp": 1.01825714, + "epoch": 0.9682248609649782, + "flos": 22680309817080.0, + "grad_norm": 1.569529885129919, + "language_loss": 0.80342293, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82700169, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.11767578, + "step": 16104, + "time_per_iteration": 2.7946295738220215 + }, + { + "auxiliary_loss_clip": 0.01315584, + "auxiliary_loss_mlp": 0.0102832, + "balance_loss_clip": 1.21085668, + "balance_loss_mlp": 1.01732302, + "epoch": 0.9682849842176462, + "flos": 24434141346720.0, + "grad_norm": 2.538545167675006, + "language_loss": 0.77925134, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.80269039, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11004639, + "step": 16105, + "time_per_iteration": 2.814605712890625 + }, + { + "auxiliary_loss_clip": 0.01143842, + "auxiliary_loss_mlp": 0.01005685, + "balance_loss_clip": 1.10072184, + "balance_loss_mlp": 1.00313401, + "epoch": 0.9683451074703141, + "flos": 60009367296960.0, + "grad_norm": 0.8218287000950567, + "language_loss": 0.56741416, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58890939, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.0255127, + "step": 16106, + "time_per_iteration": 3.315368413925171 + }, + { + "auxiliary_loss_clip": 0.01141843, + "auxiliary_loss_mlp": 0.010067, + "balance_loss_clip": 1.09980893, + "balance_loss_mlp": 1.00407743, + "epoch": 0.9684052307229821, + "flos": 52709187557520.0, + "grad_norm": 0.8739518404618504, + "language_loss": 0.61635172, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63783705, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.02624512, + "step": 16107, + "time_per_iteration": 4.5350658893585205 + }, + { + "auxiliary_loss_clip": 0.01325336, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.21779728, + "balance_loss_mlp": 1.02042937, + "epoch": 0.96846535397565, + "flos": 22789008061560.0, + "grad_norm": 1.8584140054480918, + "language_loss": 0.74208713, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76568091, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13623047, + "step": 16108, + "time_per_iteration": 2.893615961074829 + }, + { + "auxiliary_loss_clip": 0.01330129, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.2207737, + "balance_loss_mlp": 1.01873612, + "epoch": 0.968525477228318, + "flos": 23328276014520.0, + "grad_norm": 1.9445562045327145, + "language_loss": 0.5770371, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.60066259, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13677979, + "step": 16109, + "time_per_iteration": 2.812056303024292 + }, + { + "auxiliary_loss_clip": 0.01144133, + "auxiliary_loss_mlp": 0.01008784, + "balance_loss_clip": 1.10094547, + "balance_loss_mlp": 1.0061022, + "epoch": 0.9685856004809861, + "flos": 67899111766200.0, + "grad_norm": 0.6629824628838135, + "language_loss": 0.54280877, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56433797, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.02685547, + "step": 16110, + "time_per_iteration": 3.255084991455078 + }, + { + "auxiliary_loss_clip": 0.01334679, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.22310269, + "balance_loss_mlp": 1.02174711, + "epoch": 0.968645723733654, + "flos": 33954440699880.0, + "grad_norm": 1.4669786034606194, + "language_loss": 0.62466919, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64837444, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.14105225, + "step": 16111, + "time_per_iteration": 4.394861459732056 + }, + { + "auxiliary_loss_clip": 0.01320689, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.2150985, + "balance_loss_mlp": 1.01646423, + "epoch": 0.968705846986322, + "flos": 18556094449800.0, + "grad_norm": 1.7359759965050594, + "language_loss": 0.74636441, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76985383, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.11791992, + "step": 16112, + "time_per_iteration": 4.348974943161011 + }, + { + "auxiliary_loss_clip": 0.01315317, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.21152711, + "balance_loss_mlp": 1.01826477, + "epoch": 0.9687659702389899, + "flos": 17425434390840.0, + "grad_norm": 1.9490726866128356, + "language_loss": 0.66997218, + "learning_rate": 1.020550495531558e-08, + "loss": 0.69342279, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.11474609, + "step": 16113, + "time_per_iteration": 2.848085403442383 + }, + { + "auxiliary_loss_clip": 0.01145741, + "auxiliary_loss_mlp": 0.01000666, + "balance_loss_clip": 1.10334611, + "balance_loss_mlp": 0.99820989, + "epoch": 0.9688260934916579, + "flos": 62062833890520.0, + "grad_norm": 0.7090852062117036, + "language_loss": 0.56679988, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58826387, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02453613, + "step": 16114, + "time_per_iteration": 3.2495720386505127 + }, + { + "auxiliary_loss_clip": 0.01331915, + "auxiliary_loss_mlp": 0.01039292, + "balance_loss_clip": 1.22483182, + "balance_loss_mlp": 1.02619135, + "epoch": 0.9688862167443258, + "flos": 15079520413080.0, + "grad_norm": 1.7283262197442062, + "language_loss": 0.82694054, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.85065258, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13092041, + "step": 16115, + "time_per_iteration": 2.6794278621673584 + }, + { + "auxiliary_loss_clip": 0.01314299, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.21262598, + "balance_loss_mlp": 1.01790285, + "epoch": 0.9689463399969939, + "flos": 19943036825400.0, + "grad_norm": 1.6139399284571119, + "language_loss": 0.72265351, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74609625, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.12078857, + "step": 16116, + "time_per_iteration": 4.378063440322876 + }, + { + "auxiliary_loss_clip": 0.01327328, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.21853113, + "balance_loss_mlp": 1.01688957, + "epoch": 0.9690064632496618, + "flos": 19577772005760.0, + "grad_norm": 1.9784191167110818, + "language_loss": 0.76171589, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78528655, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.12854004, + "step": 16117, + "time_per_iteration": 2.814301013946533 + }, + { + "auxiliary_loss_clip": 0.01325072, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.21631575, + "balance_loss_mlp": 1.01570249, + "epoch": 0.9690665865023298, + "flos": 21877409434320.0, + "grad_norm": 1.8145859382345657, + "language_loss": 0.78328782, + "learning_rate": 1.000997769426548e-08, + "loss": 0.80682719, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13153076, + "step": 16118, + "time_per_iteration": 2.770531415939331 + }, + { + "auxiliary_loss_clip": 0.01334589, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.22567081, + "balance_loss_mlp": 1.01947999, + "epoch": 0.9691267097549977, + "flos": 20999214506160.0, + "grad_norm": 1.5961526722817818, + "language_loss": 0.7812463, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80491495, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12792969, + "step": 16119, + "time_per_iteration": 2.8543918132781982 + }, + { + "auxiliary_loss_clip": 0.01315731, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.21301699, + "balance_loss_mlp": 1.01668501, + "epoch": 0.9691868330076657, + "flos": 24284405031480.0, + "grad_norm": 1.380602602270852, + "language_loss": 0.76190978, + "learning_rate": 9.932295003832747e-09, + "loss": 0.78535151, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.11761475, + "step": 16120, + "time_per_iteration": 2.9025044441223145 + }, + { + "auxiliary_loss_clip": 0.01326789, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.21968007, + "balance_loss_mlp": 1.01555419, + "epoch": 0.9692469562603336, + "flos": 17680173589800.0, + "grad_norm": 1.9568296526351294, + "language_loss": 0.69923913, + "learning_rate": 9.89356685323095e-09, + "loss": 0.72278523, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12249756, + "step": 16121, + "time_per_iteration": 2.824124574661255 + }, + { + "auxiliary_loss_clip": 0.01320905, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.21487379, + "balance_loss_mlp": 1.02019513, + "epoch": 0.9693070795130017, + "flos": 26840162343240.0, + "grad_norm": 1.9648103489066804, + "language_loss": 0.69576746, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71930814, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12957764, + "step": 16122, + "time_per_iteration": 2.9211654663085938 + }, + { + "auxiliary_loss_clip": 0.0132771, + "auxiliary_loss_mlp": 0.0102759, + "balance_loss_clip": 1.2197783, + "balance_loss_mlp": 1.01650333, + "epoch": 0.9693672027656697, + "flos": 18081847127160.0, + "grad_norm": 1.7404590462821854, + "language_loss": 0.76100147, + "learning_rate": 9.81633694859907e-09, + "loss": 0.78455448, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.11090088, + "step": 16123, + "time_per_iteration": 2.8309528827667236 + }, + { + "auxiliary_loss_clip": 0.01332832, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.22284257, + "balance_loss_mlp": 1.01942527, + "epoch": 0.9694273260183376, + "flos": 21768142672800.0, + "grad_norm": 1.5674806886195827, + "language_loss": 0.74818259, + "learning_rate": 9.777835197497753e-09, + "loss": 0.77183926, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.13415527, + "step": 16124, + "time_per_iteration": 2.847827196121216 + }, + { + "auxiliary_loss_clip": 0.01331888, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.22324932, + "balance_loss_mlp": 1.01898575, + "epoch": 0.9694874492710056, + "flos": 24431542411680.0, + "grad_norm": 2.278979885003964, + "language_loss": 0.74607623, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76971102, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.1260376, + "step": 16125, + "time_per_iteration": 2.8484256267547607 + }, + { + "auxiliary_loss_clip": 0.0114308, + "auxiliary_loss_mlp": 0.0100166, + "balance_loss_clip": 1.10057878, + "balance_loss_mlp": 0.99913293, + "epoch": 0.9695475725236735, + "flos": 67665147066360.0, + "grad_norm": 0.8788567225034698, + "language_loss": 0.61574131, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63718873, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02526855, + "step": 16126, + "time_per_iteration": 3.2673966884613037 + }, + { + "auxiliary_loss_clip": 0.01313599, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.2103467, + "balance_loss_mlp": 1.02061749, + "epoch": 0.9696076957763415, + "flos": 19133233021440.0, + "grad_norm": 1.7064699004774109, + "language_loss": 0.75037181, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77383727, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12329102, + "step": 16127, + "time_per_iteration": 2.932966709136963 + }, + { + "auxiliary_loss_clip": 0.01332989, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.22124481, + "balance_loss_mlp": 1.02375555, + "epoch": 0.9696678190290094, + "flos": 15491061781920.0, + "grad_norm": 1.6583355266691149, + "language_loss": 0.69482446, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71852839, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13635254, + "step": 16128, + "time_per_iteration": 2.8462390899658203 + }, + { + "auxiliary_loss_clip": 0.01323757, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.21719456, + "balance_loss_mlp": 1.02228355, + "epoch": 0.9697279422816775, + "flos": 36215354734200.0, + "grad_norm": 1.5117348502096475, + "language_loss": 0.64947885, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67307246, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.13330078, + "step": 16129, + "time_per_iteration": 3.0531420707702637 + }, + { + "auxiliary_loss_clip": 0.01339789, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.22752786, + "balance_loss_mlp": 1.02191377, + "epoch": 0.9697880655343454, + "flos": 25489547468640.0, + "grad_norm": 1.869789298821617, + "language_loss": 0.63316447, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65691954, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13806152, + "step": 16130, + "time_per_iteration": 2.826792001724243 + }, + { + "auxiliary_loss_clip": 0.01338132, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.22629094, + "balance_loss_mlp": 1.01741385, + "epoch": 0.9698481887870134, + "flos": 15336777330360.0, + "grad_norm": 2.4296242503586916, + "language_loss": 0.70558524, + "learning_rate": 9.510436165056867e-09, + "loss": 0.72926599, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.12530518, + "step": 16131, + "time_per_iteration": 2.8509833812713623 + }, + { + "auxiliary_loss_clip": 0.01331011, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.22188425, + "balance_loss_mlp": 1.02078176, + "epoch": 0.9699083120396813, + "flos": 21987447754680.0, + "grad_norm": 2.0508580636298417, + "language_loss": 0.76931566, + "learning_rate": 9.472538209986058e-09, + "loss": 0.79296935, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.13568115, + "step": 16132, + "time_per_iteration": 2.871743679046631 + }, + { + "auxiliary_loss_clip": 0.01328198, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.22032821, + "balance_loss_mlp": 1.01669121, + "epoch": 0.9699684352923493, + "flos": 15667623241920.0, + "grad_norm": 2.701742016331866, + "language_loss": 0.79275054, + "learning_rate": 9.434715735916477e-09, + "loss": 0.8163287, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12921143, + "step": 16133, + "time_per_iteration": 2.781834602355957 + }, + { + "auxiliary_loss_clip": 0.01317681, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.21380734, + "balance_loss_mlp": 1.01640594, + "epoch": 0.9700285585450172, + "flos": 21913452676800.0, + "grad_norm": 1.58908749409622, + "language_loss": 0.65403092, + "learning_rate": 9.396968744281863e-09, + "loss": 0.67748713, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11541748, + "step": 16134, + "time_per_iteration": 2.83077335357666 + }, + { + "auxiliary_loss_clip": 0.01327004, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.21911132, + "balance_loss_mlp": 1.01889896, + "epoch": 0.9700886817976853, + "flos": 23920196029200.0, + "grad_norm": 2.063108461698218, + "language_loss": 0.8077935, + "learning_rate": 9.359297236513519e-09, + "loss": 0.83138317, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13079834, + "step": 16135, + "time_per_iteration": 2.7801225185394287 + }, + { + "auxiliary_loss_clip": 0.01335739, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.22540164, + "balance_loss_mlp": 1.01787806, + "epoch": 0.9701488050503532, + "flos": 25453423009440.0, + "grad_norm": 1.739597371052925, + "language_loss": 0.73419559, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75787169, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13983154, + "step": 16136, + "time_per_iteration": 2.8232829570770264 + }, + { + "auxiliary_loss_clip": 0.0132027, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.2145412, + "balance_loss_mlp": 1.01797867, + "epoch": 0.9702089283030212, + "flos": 20595226292280.0, + "grad_norm": 1.390900108977196, + "language_loss": 0.76585126, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78935224, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11846924, + "step": 16137, + "time_per_iteration": 2.7461512088775635 + }, + { + "auxiliary_loss_clip": 0.01142133, + "auxiliary_loss_mlp": 0.01004477, + "balance_loss_clip": 1.09975934, + "balance_loss_mlp": 1.00193739, + "epoch": 0.9702690515556892, + "flos": 70667473780440.0, + "grad_norm": 0.7569660677717882, + "language_loss": 0.54971641, + "learning_rate": 9.246735630678015e-09, + "loss": 0.57118249, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02539062, + "step": 16138, + "time_per_iteration": 3.3615288734436035 + }, + { + "auxiliary_loss_clip": 0.01332703, + "auxiliary_loss_mlp": 0.01029443, + "balance_loss_clip": 1.2235465, + "balance_loss_mlp": 1.016855, + "epoch": 0.9703291748083571, + "flos": 35888204183400.0, + "grad_norm": 1.7797915295532716, + "language_loss": 0.71069193, + "learning_rate": 9.209366072632007e-09, + "loss": 0.73431337, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12567139, + "step": 16139, + "time_per_iteration": 2.8769407272338867 + }, + { + "auxiliary_loss_clip": 0.01333857, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.22522783, + "balance_loss_mlp": 1.01737249, + "epoch": 0.9703892980610251, + "flos": 24321869566560.0, + "grad_norm": 2.480715594278122, + "language_loss": 0.72363013, + "learning_rate": 9.172072005566134e-09, + "loss": 0.7472744, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13189697, + "step": 16140, + "time_per_iteration": 2.8010847568511963 + }, + { + "auxiliary_loss_clip": 0.0132545, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.21627378, + "balance_loss_mlp": 1.02420616, + "epoch": 0.970449421313693, + "flos": 18008217524520.0, + "grad_norm": 2.410426646906403, + "language_loss": 0.6852693, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70890141, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.13574219, + "step": 16141, + "time_per_iteration": 2.7509799003601074 + }, + { + "auxiliary_loss_clip": 0.01313511, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.20991075, + "balance_loss_mlp": 1.0151372, + "epoch": 0.9705095445663611, + "flos": 25343506514160.0, + "grad_norm": 1.79429000515719, + "language_loss": 0.68559062, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70900035, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.12322998, + "step": 16142, + "time_per_iteration": 2.7886874675750732 + }, + { + "auxiliary_loss_clip": 0.01328837, + "auxiliary_loss_mlp": 0.01029779, + "balance_loss_clip": 1.22010398, + "balance_loss_mlp": 1.0175544, + "epoch": 0.970569667819029, + "flos": 26839268959320.0, + "grad_norm": 1.8621113195178358, + "language_loss": 0.55955899, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58314514, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12231445, + "step": 16143, + "time_per_iteration": 2.9573137760162354 + }, + { + "auxiliary_loss_clip": 0.01332805, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.22353399, + "balance_loss_mlp": 1.01861012, + "epoch": 0.970629791071697, + "flos": 25854203162880.0, + "grad_norm": 3.02099513957443, + "language_loss": 0.68064415, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70428228, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.12402344, + "step": 16144, + "time_per_iteration": 2.8872578144073486 + }, + { + "auxiliary_loss_clip": 0.01318028, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.21333265, + "balance_loss_mlp": 1.02301443, + "epoch": 0.9706899143243649, + "flos": 36547297071480.0, + "grad_norm": 1.6992175559195428, + "language_loss": 0.72305, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74658549, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.12506104, + "step": 16145, + "time_per_iteration": 2.9708199501037598 + }, + { + "auxiliary_loss_clip": 0.01328253, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.21818566, + "balance_loss_mlp": 1.01711917, + "epoch": 0.9707500375770329, + "flos": 12271013712000.0, + "grad_norm": 3.1388757992221414, + "language_loss": 0.80764502, + "learning_rate": 8.949892992753395e-09, + "loss": 0.83123356, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1348877, + "step": 16146, + "time_per_iteration": 4.2946391105651855 + }, + { + "auxiliary_loss_clip": 0.01141743, + "auxiliary_loss_mlp": 0.01001917, + "balance_loss_clip": 1.09922278, + "balance_loss_mlp": 0.99925834, + "epoch": 0.9708101608297008, + "flos": 60869833353000.0, + "grad_norm": 0.7580405933914295, + "language_loss": 0.54657644, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56801301, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02661133, + "step": 16147, + "time_per_iteration": 3.3040411472320557 + }, + { + "auxiliary_loss_clip": 0.01329366, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.21981359, + "balance_loss_mlp": 1.01658762, + "epoch": 0.9708702840823689, + "flos": 27130376267640.0, + "grad_norm": 2.558664352511677, + "language_loss": 0.6123578, + "learning_rate": 8.876437313434682e-09, + "loss": 0.6359514, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13409424, + "step": 16148, + "time_per_iteration": 2.852151393890381 + }, + { + "auxiliary_loss_clip": 0.01323704, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.21841943, + "balance_loss_mlp": 1.01897931, + "epoch": 0.9709304073350368, + "flos": 20782670792760.0, + "grad_norm": 2.481804014344762, + "language_loss": 0.73903596, + "learning_rate": 8.839822728487155e-09, + "loss": 0.76258147, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.11871338, + "step": 16149, + "time_per_iteration": 2.78594970703125 + }, + { + "auxiliary_loss_clip": 0.01323441, + "auxiliary_loss_mlp": 0.01034345, + "balance_loss_clip": 1.21641326, + "balance_loss_mlp": 1.02146411, + "epoch": 0.9709905305877048, + "flos": 41941634897880.0, + "grad_norm": 2.8700385126711385, + "language_loss": 0.75235546, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77593327, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12878418, + "step": 16150, + "time_per_iteration": 4.507524013519287 + }, + { + "auxiliary_loss_clip": 0.01340062, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.2270267, + "balance_loss_mlp": 1.01758671, + "epoch": 0.9710506538403728, + "flos": 17170086066480.0, + "grad_norm": 2.0197638303326313, + "language_loss": 0.74254042, + "learning_rate": 8.766820074958214e-09, + "loss": 0.76626891, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.15185547, + "step": 16151, + "time_per_iteration": 2.7443277835845947 + }, + { + "auxiliary_loss_clip": 0.01318181, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.21400988, + "balance_loss_mlp": 1.01817632, + "epoch": 0.9711107770930407, + "flos": 21177765775800.0, + "grad_norm": 5.887320136466745, + "language_loss": 0.75401354, + "learning_rate": 8.730432009145027e-09, + "loss": 0.77750707, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.12994385, + "step": 16152, + "time_per_iteration": 2.7835588455200195 + }, + { + "auxiliary_loss_clip": 0.0132855, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.22179985, + "balance_loss_mlp": 1.01714706, + "epoch": 0.9711709003457087, + "flos": 22242227562000.0, + "grad_norm": 1.6056938552323698, + "language_loss": 0.67401993, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69760376, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12695312, + "step": 16153, + "time_per_iteration": 2.7598352432250977 + }, + { + "auxiliary_loss_clip": 0.01320303, + "auxiliary_loss_mlp": 0.01027246, + "balance_loss_clip": 1.21281505, + "balance_loss_mlp": 1.01483059, + "epoch": 0.9712310235983767, + "flos": 26219508590880.0, + "grad_norm": 1.5405080413464312, + "language_loss": 0.70857179, + "learning_rate": 8.65788240632037e-09, + "loss": 0.73204726, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12414551, + "step": 16154, + "time_per_iteration": 2.827388286590576 + }, + { + "auxiliary_loss_clip": 0.0133428, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.22367489, + "balance_loss_mlp": 1.0193156, + "epoch": 0.9712911468510447, + "flos": 20673119772720.0, + "grad_norm": 1.7673642060386947, + "language_loss": 0.81229049, + "learning_rate": 8.621720872059812e-09, + "loss": 0.83596802, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14160156, + "step": 16155, + "time_per_iteration": 4.283860445022583 + }, + { + "auxiliary_loss_clip": 0.0133625, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.22385359, + "balance_loss_mlp": 1.0170995, + "epoch": 0.9713512701037126, + "flos": 13556851606440.0, + "grad_norm": 2.2685889095253855, + "language_loss": 0.68411911, + "learning_rate": 8.58563485106334e-09, + "loss": 0.70779312, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.14068604, + "step": 16156, + "time_per_iteration": 2.7072036266326904 + }, + { + "auxiliary_loss_clip": 0.01331256, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.22114992, + "balance_loss_mlp": 1.01951957, + "epoch": 0.9714113933563806, + "flos": 25854324987960.0, + "grad_norm": 11.029740070441541, + "language_loss": 0.91675341, + "learning_rate": 8.54962434469919e-09, + "loss": 0.94038486, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.12371826, + "step": 16157, + "time_per_iteration": 2.885723829269409 + }, + { + "auxiliary_loss_clip": 0.0133417, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.22424316, + "balance_loss_mlp": 1.02047873, + "epoch": 0.9714715166090485, + "flos": 12745991985120.0, + "grad_norm": 1.9438870514120816, + "language_loss": 0.72855121, + "learning_rate": 8.513689354332721e-09, + "loss": 0.75221795, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.12030029, + "step": 16158, + "time_per_iteration": 2.8068697452545166 + }, + { + "auxiliary_loss_clip": 0.01324041, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.21919036, + "balance_loss_mlp": 1.02007115, + "epoch": 0.9715316398617165, + "flos": 18410134712040.0, + "grad_norm": 2.521127751452832, + "language_loss": 0.60895741, + "learning_rate": 8.477829881326836e-09, + "loss": 0.63252378, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12518311, + "step": 16159, + "time_per_iteration": 2.732614040374756 + }, + { + "auxiliary_loss_clip": 0.01316838, + "auxiliary_loss_mlp": 0.01025179, + "balance_loss_clip": 1.21339417, + "balance_loss_mlp": 1.01356792, + "epoch": 0.9715917631143844, + "flos": 28919804347800.0, + "grad_norm": 1.5516028956049173, + "language_loss": 0.78702509, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81044525, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.1161499, + "step": 16160, + "time_per_iteration": 2.807640790939331 + }, + { + "auxiliary_loss_clip": 0.01142652, + "auxiliary_loss_mlp": 0.01008005, + "balance_loss_clip": 1.10021925, + "balance_loss_mlp": 1.00560939, + "epoch": 0.9716518863670525, + "flos": 65954384025480.0, + "grad_norm": 0.9787194959529103, + "language_loss": 0.54314673, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56465328, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02392578, + "step": 16161, + "time_per_iteration": 3.291712760925293 + }, + { + "auxiliary_loss_clip": 0.01318058, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.21451521, + "balance_loss_mlp": 1.0179193, + "epoch": 0.9717120096197204, + "flos": 17717313258000.0, + "grad_norm": 1.856351263642578, + "language_loss": 0.72147024, + "learning_rate": 8.3707045800554e-09, + "loss": 0.74495244, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.12255859, + "step": 16162, + "time_per_iteration": 2.7742888927459717 + }, + { + "auxiliary_loss_clip": 0.01319661, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.21428359, + "balance_loss_mlp": 1.01640177, + "epoch": 0.9717721328723884, + "flos": 24468966338400.0, + "grad_norm": 1.7348793564075167, + "language_loss": 0.79317188, + "learning_rate": 8.335147190060787e-09, + "loss": 0.81665742, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12493896, + "step": 16163, + "time_per_iteration": 2.8715641498565674 + }, + { + "auxiliary_loss_clip": 0.01319342, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.21428227, + "balance_loss_mlp": 1.01548505, + "epoch": 0.9718322561250564, + "flos": 20781290108520.0, + "grad_norm": 1.8118098990850293, + "language_loss": 0.73165262, + "learning_rate": 8.299665324196903e-09, + "loss": 0.75512421, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12341309, + "step": 16164, + "time_per_iteration": 2.9387426376342773 + }, + { + "auxiliary_loss_clip": 0.01331944, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.22245026, + "balance_loss_mlp": 1.02338135, + "epoch": 0.9718923793777243, + "flos": 19030585422600.0, + "grad_norm": 1.9044535790242156, + "language_loss": 0.84684718, + "learning_rate": 8.264258983809114e-09, + "loss": 0.87053299, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13244629, + "step": 16165, + "time_per_iteration": 2.765409231185913 + }, + { + "auxiliary_loss_clip": 0.0132371, + "auxiliary_loss_mlp": 0.01025539, + "balance_loss_clip": 1.21706581, + "balance_loss_mlp": 1.01453567, + "epoch": 0.9719525026303923, + "flos": 21876759700560.0, + "grad_norm": 1.6940630116115236, + "language_loss": 0.79542077, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81891328, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.11004639, + "step": 16166, + "time_per_iteration": 2.8356330394744873 + }, + { + "auxiliary_loss_clip": 0.01323467, + "auxiliary_loss_mlp": 0.01025488, + "balance_loss_clip": 1.2169342, + "balance_loss_mlp": 1.01306021, + "epoch": 0.9720126258830603, + "flos": 14433462808560.0, + "grad_norm": 1.6959632022888964, + "language_loss": 0.70722353, + "learning_rate": 8.193672884830195e-09, + "loss": 0.73071313, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12438965, + "step": 16167, + "time_per_iteration": 2.8161075115203857 + }, + { + "auxiliary_loss_clip": 0.01322302, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.21704566, + "balance_loss_mlp": 1.02175188, + "epoch": 0.9720727491357283, + "flos": 26256932517600.0, + "grad_norm": 1.4035453372138085, + "language_loss": 0.75987178, + "learning_rate": 8.158493128915812e-09, + "loss": 0.78343654, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.12414551, + "step": 16168, + "time_per_iteration": 2.880676031112671 + }, + { + "auxiliary_loss_clip": 0.01333002, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.22433197, + "balance_loss_mlp": 1.02098918, + "epoch": 0.9721328723883962, + "flos": 22679213391360.0, + "grad_norm": 2.5715408404709215, + "language_loss": 0.73558891, + "learning_rate": 8.123388903830797e-09, + "loss": 0.75925833, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1293335, + "step": 16169, + "time_per_iteration": 2.961561441421509 + }, + { + "auxiliary_loss_clip": 0.01331259, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.22095919, + "balance_loss_mlp": 1.01568234, + "epoch": 0.9721929956410642, + "flos": 28080007947000.0, + "grad_norm": 1.692209319084963, + "language_loss": 0.58065712, + "learning_rate": 8.088360210906309e-09, + "loss": 0.60426182, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13531494, + "step": 16170, + "time_per_iteration": 2.9453506469726562 + }, + { + "auxiliary_loss_clip": 0.01325716, + "auxiliary_loss_mlp": 0.01027073, + "balance_loss_clip": 1.21727133, + "balance_loss_mlp": 1.01380503, + "epoch": 0.9722531188937321, + "flos": 21001204315800.0, + "grad_norm": 1.8818014847267568, + "language_loss": 0.7160213, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73954916, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1328125, + "step": 16171, + "time_per_iteration": 2.926187515258789 + }, + { + "auxiliary_loss_clip": 0.01328403, + "auxiliary_loss_mlp": 0.01039537, + "balance_loss_clip": 1.2204634, + "balance_loss_mlp": 1.02617919, + "epoch": 0.9723132421464001, + "flos": 16074900732960.0, + "grad_norm": 1.7526742480067867, + "language_loss": 0.68788058, + "learning_rate": 8.018529426850218e-09, + "loss": 0.71156001, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13354492, + "step": 16172, + "time_per_iteration": 2.7638370990753174 + }, + { + "auxiliary_loss_clip": 0.01320166, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.21576869, + "balance_loss_mlp": 1.01667821, + "epoch": 0.972373365399068, + "flos": 27751557928680.0, + "grad_norm": 1.7292193658697559, + "language_loss": 0.8618319, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88531965, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11938477, + "step": 16173, + "time_per_iteration": 2.922611713409424 + }, + { + "auxiliary_loss_clip": 0.01340569, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.22633076, + "balance_loss_mlp": 1.01855326, + "epoch": 0.9724334886517361, + "flos": 23008150710000.0, + "grad_norm": 2.005889774750503, + "language_loss": 0.6421234, + "learning_rate": 7.949000787339289e-09, + "loss": 0.6658653, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.1506958, + "step": 16174, + "time_per_iteration": 2.81567645072937 + }, + { + "auxiliary_loss_clip": 0.01321402, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.21637225, + "balance_loss_mlp": 1.01733589, + "epoch": 0.972493611904404, + "flos": 25452529625520.0, + "grad_norm": 1.4984189328939066, + "language_loss": 0.78258979, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80610007, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12298584, + "step": 16175, + "time_per_iteration": 2.7836661338806152 + }, + { + "auxiliary_loss_clip": 0.0132304, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.21615195, + "balance_loss_mlp": 1.02022171, + "epoch": 0.972553735157072, + "flos": 16987514569200.0, + "grad_norm": 2.6624685941524615, + "language_loss": 0.57458019, + "learning_rate": 7.879774302919307e-09, + "loss": 0.59815359, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.14086914, + "step": 16176, + "time_per_iteration": 2.7452890872955322 + }, + { + "auxiliary_loss_clip": 0.01328981, + "auxiliary_loss_mlp": 0.01025766, + "balance_loss_clip": 1.22299957, + "balance_loss_mlp": 1.01368427, + "epoch": 0.97261385840974, + "flos": 26110323046080.0, + "grad_norm": 1.902032421915148, + "language_loss": 0.72191763, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74546504, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12084961, + "step": 16177, + "time_per_iteration": 2.7890636920928955 + }, + { + "auxiliary_loss_clip": 0.01329028, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.2195828, + "balance_loss_mlp": 1.01868331, + "epoch": 0.9726739816624079, + "flos": 25453463617800.0, + "grad_norm": 1.6775953864314348, + "language_loss": 0.68559086, + "learning_rate": 7.810849984090984e-09, + "loss": 0.70919538, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12750244, + "step": 16178, + "time_per_iteration": 2.838589668273926 + }, + { + "auxiliary_loss_clip": 0.01333192, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.22238231, + "balance_loss_mlp": 1.01928496, + "epoch": 0.972734104915076, + "flos": 29019853011600.0, + "grad_norm": 2.524261475173847, + "language_loss": 0.67536658, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69902337, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.13201904, + "step": 16179, + "time_per_iteration": 2.866718292236328 + }, + { + "auxiliary_loss_clip": 0.01321903, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.21696329, + "balance_loss_mlp": 1.01902509, + "epoch": 0.9727942281677439, + "flos": 23442496995960.0, + "grad_norm": 1.683409974536158, + "language_loss": 0.77470398, + "learning_rate": 7.742227841308624e-09, + "loss": 0.79823452, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12139893, + "step": 16180, + "time_per_iteration": 2.795734167098999 + }, + { + "auxiliary_loss_clip": 0.01335022, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.22306263, + "balance_loss_mlp": 1.02026653, + "epoch": 0.9728543514204119, + "flos": 31731153634080.0, + "grad_norm": 1.4719143516904967, + "language_loss": 0.7664991, + "learning_rate": 7.708030089189188e-09, + "loss": 0.7901848, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1328125, + "step": 16181, + "time_per_iteration": 2.916949510574341 + }, + { + "auxiliary_loss_clip": 0.01326471, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.21950543, + "balance_loss_mlp": 1.0220083, + "epoch": 0.9729144746730798, + "flos": 16293677906160.0, + "grad_norm": 1.6453748244472297, + "language_loss": 0.63279527, + "learning_rate": 7.67390788498079e-09, + "loss": 0.6564073, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12719727, + "step": 16182, + "time_per_iteration": 2.830228090286255 + }, + { + "auxiliary_loss_clip": 0.01326895, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.21853757, + "balance_loss_mlp": 1.01839817, + "epoch": 0.9729745979257478, + "flos": 25046267343480.0, + "grad_norm": 1.9432256223137432, + "language_loss": 0.62457383, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64814878, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.12200928, + "step": 16183, + "time_per_iteration": 2.879223346710205 + }, + { + "auxiliary_loss_clip": 0.01320853, + "auxiliary_loss_mlp": 0.01030848, + "balance_loss_clip": 1.21564329, + "balance_loss_mlp": 1.0176276, + "epoch": 0.9730347211784157, + "flos": 22643982316080.0, + "grad_norm": 2.520205159362074, + "language_loss": 0.78267485, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80619192, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.13226318, + "step": 16184, + "time_per_iteration": 2.8945395946502686 + }, + { + "auxiliary_loss_clip": 0.01320879, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.21482015, + "balance_loss_mlp": 1.01691365, + "epoch": 0.9730948444310837, + "flos": 11002190720400.0, + "grad_norm": 2.293665462136122, + "language_loss": 0.80087364, + "learning_rate": 7.571994572747709e-09, + "loss": 0.82437348, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12194824, + "step": 16185, + "time_per_iteration": 4.232323408126831 + }, + { + "auxiliary_loss_clip": 0.01332554, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.22345245, + "balance_loss_mlp": 1.01682544, + "epoch": 0.9731549676837516, + "flos": 16803765429480.0, + "grad_norm": 1.9882092431216025, + "language_loss": 0.77749741, + "learning_rate": 7.538174573094469e-09, + "loss": 0.80111128, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.12005615, + "step": 16186, + "time_per_iteration": 2.846188545227051 + }, + { + "auxiliary_loss_clip": 0.0132781, + "auxiliary_loss_mlp": 0.01025585, + "balance_loss_clip": 1.22193098, + "balance_loss_mlp": 1.01270437, + "epoch": 0.9732150909364197, + "flos": 21146676753240.0, + "grad_norm": 1.5654827475704785, + "language_loss": 0.65365595, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67718995, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12884521, + "step": 16187, + "time_per_iteration": 2.847372531890869 + }, + { + "auxiliary_loss_clip": 0.01320394, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.21469057, + "balance_loss_mlp": 1.01922941, + "epoch": 0.9732752141890876, + "flos": 33733633108680.0, + "grad_norm": 1.7907420453560563, + "language_loss": 0.80622315, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82974076, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.12133789, + "step": 16188, + "time_per_iteration": 4.357828140258789 + }, + { + "auxiliary_loss_clip": 0.01318745, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.21501744, + "balance_loss_mlp": 1.01942587, + "epoch": 0.9733353374417556, + "flos": 23409661813920.0, + "grad_norm": 2.519745902460096, + "language_loss": 0.78141963, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80492139, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.11993408, + "step": 16189, + "time_per_iteration": 4.397196292877197 + }, + { + "auxiliary_loss_clip": 0.01326304, + "auxiliary_loss_mlp": 0.01027186, + "balance_loss_clip": 1.21960497, + "balance_loss_mlp": 1.01486611, + "epoch": 0.9733954606944236, + "flos": 39173069841840.0, + "grad_norm": 3.7235676310923456, + "language_loss": 0.51079977, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53433466, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12316895, + "step": 16190, + "time_per_iteration": 2.983814239501953 + }, + { + "auxiliary_loss_clip": 0.01326018, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.21854854, + "balance_loss_mlp": 1.01456428, + "epoch": 0.9734555839470915, + "flos": 21986960454360.0, + "grad_norm": 1.5735459671993046, + "language_loss": 0.80937654, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83291471, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13238525, + "step": 16191, + "time_per_iteration": 2.8122782707214355 + }, + { + "auxiliary_loss_clip": 0.01325091, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.21889532, + "balance_loss_mlp": 1.01705921, + "epoch": 0.9735157071997596, + "flos": 16579140652440.0, + "grad_norm": 1.6469688893565857, + "language_loss": 0.83095968, + "learning_rate": 7.336841261255111e-09, + "loss": 0.85450059, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.11938477, + "step": 16192, + "time_per_iteration": 2.7892942428588867 + }, + { + "auxiliary_loss_clip": 0.01333899, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.22533584, + "balance_loss_mlp": 1.01550198, + "epoch": 0.9735758304524275, + "flos": 20227200104160.0, + "grad_norm": 2.7626330953468976, + "language_loss": 0.75030899, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77393436, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13140869, + "step": 16193, + "time_per_iteration": 2.8504278659820557 + }, + { + "auxiliary_loss_clip": 0.01318591, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.21372592, + "balance_loss_mlp": 1.02010763, + "epoch": 0.9736359537050955, + "flos": 23657822458560.0, + "grad_norm": 1.6649551523898956, + "language_loss": 0.85077, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87427509, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.1182251, + "step": 16194, + "time_per_iteration": 4.294155597686768 + }, + { + "auxiliary_loss_clip": 0.0131546, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.2123642, + "balance_loss_mlp": 1.0182724, + "epoch": 0.9736960769577634, + "flos": 15564488342760.0, + "grad_norm": 1.4905626346679532, + "language_loss": 0.7612735, + "learning_rate": 7.237194675009828e-09, + "loss": 0.78473175, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.12103271, + "step": 16195, + "time_per_iteration": 2.7149765491485596 + }, + { + "auxiliary_loss_clip": 0.01141796, + "auxiliary_loss_mlp": 0.01002107, + "balance_loss_clip": 1.09942937, + "balance_loss_mlp": 0.99949616, + "epoch": 0.9737562002104314, + "flos": 65365834504680.0, + "grad_norm": 0.7243175737303448, + "language_loss": 0.52557743, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54701644, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02612305, + "step": 16196, + "time_per_iteration": 3.2295851707458496 + }, + { + "auxiliary_loss_clip": 0.01330577, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.22280335, + "balance_loss_mlp": 1.01693618, + "epoch": 0.9738163234630993, + "flos": 27202219102440.0, + "grad_norm": 1.606753566733985, + "language_loss": 0.76373565, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78733468, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.12390137, + "step": 16197, + "time_per_iteration": 2.8158516883850098 + }, + { + "auxiliary_loss_clip": 0.01338129, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.22586572, + "balance_loss_mlp": 1.01714396, + "epoch": 0.9738764467157673, + "flos": 21074346618120.0, + "grad_norm": 1.7928312205298265, + "language_loss": 0.67786312, + "learning_rate": 7.13822818063492e-09, + "loss": 0.70154423, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.128479, + "step": 16198, + "time_per_iteration": 2.7489330768585205 + }, + { + "auxiliary_loss_clip": 0.01328781, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.22046113, + "balance_loss_mlp": 1.01642597, + "epoch": 0.9739365699684353, + "flos": 21366225485280.0, + "grad_norm": 1.8053660612504006, + "language_loss": 0.78466368, + "learning_rate": 7.10539048654768e-09, + "loss": 0.8082428, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12701416, + "step": 16199, + "time_per_iteration": 2.84261417388916 + }, + { + "auxiliary_loss_clip": 0.01327075, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.22002101, + "balance_loss_mlp": 1.01651812, + "epoch": 0.9739966932211033, + "flos": 21906305605440.0, + "grad_norm": 1.6443325548841796, + "language_loss": 0.79430842, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81787026, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.1260376, + "step": 16200, + "time_per_iteration": 2.818631649017334 + }, + { + "auxiliary_loss_clip": 0.01342829, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.22850513, + "balance_loss_mlp": 1.02220678, + "epoch": 0.9740568164737712, + "flos": 24832809865440.0, + "grad_norm": 1.9105052340333881, + "language_loss": 0.6900543, + "learning_rate": 7.039941811905592e-09, + "loss": 0.71384203, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.13720703, + "step": 16201, + "time_per_iteration": 2.8342936038970947 + }, + { + "auxiliary_loss_clip": 0.01324987, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.21732092, + "balance_loss_mlp": 1.02124548, + "epoch": 0.9741169397264392, + "flos": 23628926287440.0, + "grad_norm": 1.3275757589667199, + "language_loss": 0.72891897, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.7525084, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12713623, + "step": 16202, + "time_per_iteration": 2.8515920639038086 + }, + { + "auxiliary_loss_clip": 0.01330011, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.22021294, + "balance_loss_mlp": 1.01623309, + "epoch": 0.9741770629791072, + "flos": 18845496207000.0, + "grad_norm": 1.8657556703351168, + "language_loss": 0.73171645, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75531411, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 1.09814453, + "router_z_loss_mlp": 0.13525391, + "step": 16203, + "time_per_iteration": 2.794461250305176 + }, + { + "auxiliary_loss_clip": 0.01324554, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.21640968, + "balance_loss_mlp": 1.01665485, + "epoch": 0.9742371862317751, + "flos": 22351372498440.0, + "grad_norm": 1.867158031540504, + "language_loss": 0.77641422, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79994613, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.11987305, + "step": 16204, + "time_per_iteration": 2.931741714477539 + }, + { + "auxiliary_loss_clip": 0.01328582, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.22029638, + "balance_loss_mlp": 1.01979792, + "epoch": 0.9742973094844432, + "flos": 21767858414280.0, + "grad_norm": 2.752801501746581, + "language_loss": 0.80229479, + "learning_rate": 6.909951351435905e-09, + "loss": 0.82591689, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.13830566, + "step": 16205, + "time_per_iteration": 2.9586691856384277 + }, + { + "auxiliary_loss_clip": 0.0132024, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.2138257, + "balance_loss_mlp": 1.01742077, + "epoch": 0.9743574327371111, + "flos": 26254171149120.0, + "grad_norm": 1.527765543490043, + "language_loss": 0.74477327, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76827168, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12164307, + "step": 16206, + "time_per_iteration": 2.9504880905151367 + }, + { + "auxiliary_loss_clip": 0.01328032, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.21908355, + "balance_loss_mlp": 1.01865602, + "epoch": 0.9744175559897791, + "flos": 12353171070240.0, + "grad_norm": 2.0700481030506026, + "language_loss": 0.84335768, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86695606, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1315918, + "step": 16207, + "time_per_iteration": 2.873427391052246 + }, + { + "auxiliary_loss_clip": 0.0131986, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.21442652, + "balance_loss_mlp": 1.01685345, + "epoch": 0.974477679242447, + "flos": 28403097661800.0, + "grad_norm": 1.5749197394767092, + "language_loss": 0.70910472, + "learning_rate": 6.813252072591425e-09, + "loss": 0.7325983, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12652588, + "step": 16208, + "time_per_iteration": 2.8932957649230957 + }, + { + "auxiliary_loss_clip": 0.01311584, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.21091557, + "balance_loss_mlp": 1.01497734, + "epoch": 0.974537802495115, + "flos": 17789887043280.0, + "grad_norm": 1.641822028129418, + "language_loss": 0.77703619, + "learning_rate": 6.781170141698878e-09, + "loss": 0.80041134, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.10943604, + "step": 16209, + "time_per_iteration": 2.9533843994140625 + }, + { + "auxiliary_loss_clip": 0.0133319, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.22272754, + "balance_loss_mlp": 1.01851177, + "epoch": 0.9745979257477829, + "flos": 23847662852280.0, + "grad_norm": 1.5958278249942597, + "language_loss": 0.79473174, + "learning_rate": 6.749163793864144e-09, + "loss": 0.81837469, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12597656, + "step": 16210, + "time_per_iteration": 2.813002586364746 + }, + { + "auxiliary_loss_clip": 0.01325315, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.21751583, + "balance_loss_mlp": 1.01973295, + "epoch": 0.9746580490004509, + "flos": 27021881064960.0, + "grad_norm": 2.6202947359423345, + "language_loss": 0.78154123, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80511087, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1192627, + "step": 16211, + "time_per_iteration": 2.87076735496521 + }, + { + "auxiliary_loss_clip": 0.01338028, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.22458506, + "balance_loss_mlp": 1.01836836, + "epoch": 0.9747181722531189, + "flos": 19796792829120.0, + "grad_norm": 2.2006729251229817, + "language_loss": 0.78688788, + "learning_rate": 6.685377852219787e-09, + "loss": 0.81059241, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.14074707, + "step": 16212, + "time_per_iteration": 2.7404611110687256 + }, + { + "auxiliary_loss_clip": 0.01319485, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.21412706, + "balance_loss_mlp": 1.01990223, + "epoch": 0.9747782955057869, + "flos": 31436838265320.0, + "grad_norm": 1.417251036514215, + "language_loss": 0.80253804, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82605743, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12554932, + "step": 16213, + "time_per_iteration": 2.881484270095825 + }, + { + "auxiliary_loss_clip": 0.0132202, + "auxiliary_loss_mlp": 0.01024215, + "balance_loss_clip": 1.21527696, + "balance_loss_mlp": 1.01171017, + "epoch": 0.9748384187584548, + "flos": 15965593363080.0, + "grad_norm": 4.380322491564765, + "language_loss": 0.66317999, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68664241, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12518311, + "step": 16214, + "time_per_iteration": 2.74998140335083 + }, + { + "auxiliary_loss_clip": 0.01330161, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.22005856, + "balance_loss_mlp": 1.01547623, + "epoch": 0.9748985420111228, + "flos": 20563649969400.0, + "grad_norm": 1.9338223545273308, + "language_loss": 0.74158633, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76517951, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13684082, + "step": 16215, + "time_per_iteration": 2.75848126411438 + }, + { + "auxiliary_loss_clip": 0.0132341, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.21571779, + "balance_loss_mlp": 1.01597738, + "epoch": 0.9749586652637908, + "flos": 36728609709600.0, + "grad_norm": 1.59859095702755, + "language_loss": 0.67065179, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69417357, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12792969, + "step": 16216, + "time_per_iteration": 2.924231767654419 + }, + { + "auxiliary_loss_clip": 0.01334574, + "auxiliary_loss_mlp": 0.01026705, + "balance_loss_clip": 1.22431552, + "balance_loss_mlp": 1.01322865, + "epoch": 0.9750187885164587, + "flos": 11002881062520.0, + "grad_norm": 2.1098600396014864, + "language_loss": 0.71648741, + "learning_rate": 6.527235786226937e-09, + "loss": 0.74010026, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13458252, + "step": 16217, + "time_per_iteration": 2.7818338871002197 + }, + { + "auxiliary_loss_clip": 0.01325306, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.21839333, + "balance_loss_mlp": 1.01768696, + "epoch": 0.9750789117691268, + "flos": 25745098834800.0, + "grad_norm": 1.5519730497207098, + "language_loss": 0.78637409, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80993116, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12713623, + "step": 16218, + "time_per_iteration": 2.8630919456481934 + }, + { + "auxiliary_loss_clip": 0.01315308, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.21138489, + "balance_loss_mlp": 1.01887655, + "epoch": 0.9751390350217947, + "flos": 13337871391440.0, + "grad_norm": 1.8811597542753546, + "language_loss": 0.77394259, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79741007, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.12548828, + "step": 16219, + "time_per_iteration": 2.7248010635375977 + }, + { + "auxiliary_loss_clip": 0.01327946, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.21891904, + "balance_loss_mlp": 1.01552987, + "epoch": 0.9751991582744627, + "flos": 22825822862880.0, + "grad_norm": 2.6095863095337255, + "language_loss": 0.81433475, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83789617, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12670898, + "step": 16220, + "time_per_iteration": 2.8207626342773438 + }, + { + "auxiliary_loss_clip": 0.01320491, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.21512723, + "balance_loss_mlp": 1.01693904, + "epoch": 0.9752592815271306, + "flos": 19650954916440.0, + "grad_norm": 1.701613980196599, + "language_loss": 0.75388479, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77738678, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12774658, + "step": 16221, + "time_per_iteration": 2.7796390056610107 + }, + { + "auxiliary_loss_clip": 0.01320686, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.21541452, + "balance_loss_mlp": 1.01806688, + "epoch": 0.9753194047797986, + "flos": 26696476673640.0, + "grad_norm": 1.43589993860788, + "language_loss": 0.66472054, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68823588, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12786865, + "step": 16222, + "time_per_iteration": 4.304383039474487 + }, + { + "auxiliary_loss_clip": 0.01325476, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.21871662, + "balance_loss_mlp": 1.0157398, + "epoch": 0.9753795280324665, + "flos": 23227009099920.0, + "grad_norm": 1.6297867541496354, + "language_loss": 0.88430047, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90784276, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13018799, + "step": 16223, + "time_per_iteration": 2.80070161819458 + }, + { + "auxiliary_loss_clip": 0.01320017, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.21552455, + "balance_loss_mlp": 1.01876688, + "epoch": 0.9754396512851345, + "flos": 19468586460960.0, + "grad_norm": 1.650888767201241, + "language_loss": 0.74715745, + "learning_rate": 6.309011819690457e-09, + "loss": 0.77066958, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12420654, + "step": 16224, + "time_per_iteration": 2.794987916946411 + }, + { + "auxiliary_loss_clip": 0.01142224, + "auxiliary_loss_mlp": 0.01008169, + "balance_loss_clip": 1.09952807, + "balance_loss_mlp": 1.00578523, + "epoch": 0.9754997745378025, + "flos": 68474585395080.0, + "grad_norm": 0.8270358829033846, + "language_loss": 0.5914669, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61297083, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02380371, + "step": 16225, + "time_per_iteration": 3.226851224899292 + }, + { + "auxiliary_loss_clip": 0.01319858, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.21404564, + "balance_loss_mlp": 1.02426839, + "epoch": 0.9755598977904705, + "flos": 26400699403920.0, + "grad_norm": 3.0051967592031326, + "language_loss": 0.69038093, + "learning_rate": 6.247342505960818e-09, + "loss": 0.71394438, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12225342, + "step": 16226, + "time_per_iteration": 2.8576016426086426 + }, + { + "auxiliary_loss_clip": 0.01331066, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.22353554, + "balance_loss_mlp": 1.01945436, + "epoch": 0.9756200210431384, + "flos": 16622006099400.0, + "grad_norm": 1.652371715240544, + "language_loss": 0.83211088, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85574317, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12719727, + "step": 16227, + "time_per_iteration": 5.724507093429565 + }, + { + "auxiliary_loss_clip": 0.01321253, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.21615744, + "balance_loss_mlp": 1.01743984, + "epoch": 0.9756801442958064, + "flos": 23628317162040.0, + "grad_norm": 1.6824912076048646, + "language_loss": 0.77974474, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80325651, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12487793, + "step": 16228, + "time_per_iteration": 2.8390746116638184 + }, + { + "auxiliary_loss_clip": 0.01143654, + "auxiliary_loss_mlp": 0.01007639, + "balance_loss_clip": 1.10098004, + "balance_loss_mlp": 1.00486183, + "epoch": 0.9757402675484744, + "flos": 61638785146800.0, + "grad_norm": 0.8458155513989842, + "language_loss": 0.55856347, + "learning_rate": 6.155405563025962e-09, + "loss": 0.58007646, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02783203, + "step": 16229, + "time_per_iteration": 3.262882709503174 + }, + { + "auxiliary_loss_clip": 0.01328634, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.2212559, + "balance_loss_mlp": 1.01771295, + "epoch": 0.9758003908011423, + "flos": 24064044132240.0, + "grad_norm": 1.974885041177344, + "language_loss": 0.75375265, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77734834, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.13220215, + "step": 16230, + "time_per_iteration": 2.8997223377227783 + }, + { + "auxiliary_loss_clip": 0.01313682, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.21083546, + "balance_loss_mlp": 1.01509142, + "epoch": 0.9758605140538104, + "flos": 17497520875800.0, + "grad_norm": 2.099624562384383, + "language_loss": 0.72305423, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74646223, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.12036133, + "step": 16231, + "time_per_iteration": 4.263237237930298 + }, + { + "auxiliary_loss_clip": 0.01332625, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.22219229, + "balance_loss_mlp": 1.01491332, + "epoch": 0.9759206373064783, + "flos": 24832363173480.0, + "grad_norm": 2.531995597791315, + "language_loss": 0.77054602, + "learning_rate": 6.064149081155267e-09, + "loss": 0.79415023, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12884521, + "step": 16232, + "time_per_iteration": 2.7824583053588867 + }, + { + "auxiliary_loss_clip": 0.01143641, + "auxiliary_loss_mlp": 0.01010195, + "balance_loss_clip": 1.10078454, + "balance_loss_mlp": 1.00735795, + "epoch": 0.9759807605591463, + "flos": 68175600064920.0, + "grad_norm": 0.7421907608203567, + "language_loss": 0.53807592, + "learning_rate": 6.033881472824465e-09, + "loss": 0.5596143, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02832031, + "step": 16233, + "time_per_iteration": 3.100632905960083 + }, + { + "auxiliary_loss_clip": 0.01323883, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.2168541, + "balance_loss_mlp": 1.0164336, + "epoch": 0.9760408838118142, + "flos": 18993892446360.0, + "grad_norm": 2.117136850582965, + "language_loss": 0.71419269, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73772502, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.12908936, + "step": 16234, + "time_per_iteration": 2.8106796741485596 + }, + { + "auxiliary_loss_clip": 0.01335218, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.22344065, + "balance_loss_mlp": 1.01913548, + "epoch": 0.9761010070644822, + "flos": 17130063204720.0, + "grad_norm": 2.0855282022086894, + "language_loss": 0.79097885, + "learning_rate": 5.973573091493156e-09, + "loss": 0.81466115, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13891602, + "step": 16235, + "time_per_iteration": 2.744586229324341 + }, + { + "auxiliary_loss_clip": 0.01321862, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.21608889, + "balance_loss_mlp": 1.01359057, + "epoch": 0.9761611303171501, + "flos": 22057463213280.0, + "grad_norm": 2.3087085964420218, + "language_loss": 0.7738471, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79733574, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.13427734, + "step": 16236, + "time_per_iteration": 2.7741787433624268 + }, + { + "auxiliary_loss_clip": 0.01327645, + "auxiliary_loss_mlp": 0.01024524, + "balance_loss_clip": 1.22083724, + "balance_loss_mlp": 1.01219177, + "epoch": 0.9762212535698181, + "flos": 21761929593720.0, + "grad_norm": 1.8030912458834434, + "language_loss": 0.76354104, + "learning_rate": 5.913567164886446e-09, + "loss": 0.78706276, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.12329102, + "step": 16237, + "time_per_iteration": 2.7443578243255615 + }, + { + "auxiliary_loss_clip": 0.01328542, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.21899104, + "balance_loss_mlp": 1.01942921, + "epoch": 0.9762813768224861, + "flos": 25927183031760.0, + "grad_norm": 1.7558757469770292, + "language_loss": 0.73087358, + "learning_rate": 5.8836776249509e-09, + "loss": 0.75449288, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.13964844, + "step": 16238, + "time_per_iteration": 2.7753777503967285 + }, + { + "auxiliary_loss_clip": 0.01329336, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.22124374, + "balance_loss_mlp": 1.02015591, + "epoch": 0.9763415000751541, + "flos": 24056003676960.0, + "grad_norm": 2.1707259797208702, + "language_loss": 0.83978927, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.86341608, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13195801, + "step": 16239, + "time_per_iteration": 2.7194905281066895 + }, + { + "auxiliary_loss_clip": 0.01329225, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.22113776, + "balance_loss_mlp": 1.01800156, + "epoch": 0.976401623327822, + "flos": 17023029903000.0, + "grad_norm": 2.949574197683911, + "language_loss": 0.60362768, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62723255, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1328125, + "step": 16240, + "time_per_iteration": 2.856835126876831 + }, + { + "auxiliary_loss_clip": 0.01325351, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.2198751, + "balance_loss_mlp": 1.01715565, + "epoch": 0.97646174658049, + "flos": 16111228233960.0, + "grad_norm": 1.9661801112293587, + "language_loss": 0.8293314, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.85288477, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12835693, + "step": 16241, + "time_per_iteration": 2.835524082183838 + }, + { + "auxiliary_loss_clip": 0.01324071, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.21642685, + "balance_loss_mlp": 1.02268851, + "epoch": 0.9765218698331579, + "flos": 21257852107680.0, + "grad_norm": 3.867057873545802, + "language_loss": 0.83419728, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85778975, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.12487793, + "step": 16242, + "time_per_iteration": 2.8794236183166504 + }, + { + "auxiliary_loss_clip": 0.01323478, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.21549106, + "balance_loss_mlp": 1.0166949, + "epoch": 0.9765819930858259, + "flos": 18592665600960.0, + "grad_norm": 1.574982033361705, + "language_loss": 0.76061827, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.78414762, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12768555, + "step": 16243, + "time_per_iteration": 2.7839481830596924 + }, + { + "auxiliary_loss_clip": 0.01325602, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.2175138, + "balance_loss_mlp": 1.02059793, + "epoch": 0.976642116338494, + "flos": 20271730493880.0, + "grad_norm": 1.5094489871279881, + "language_loss": 0.70306873, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72666442, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13372803, + "step": 16244, + "time_per_iteration": 2.776095151901245 + }, + { + "auxiliary_loss_clip": 0.0133284, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.22421992, + "balance_loss_mlp": 1.01734436, + "epoch": 0.9767022395911619, + "flos": 25555177224360.0, + "grad_norm": 1.7501669796838297, + "language_loss": 0.83594489, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85958004, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.13323975, + "step": 16245, + "time_per_iteration": 2.8873980045318604 + }, + { + "auxiliary_loss_clip": 0.01320336, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.21388149, + "balance_loss_mlp": 1.01383543, + "epoch": 0.9767623628438299, + "flos": 21767736589200.0, + "grad_norm": 1.3454180725105211, + "language_loss": 0.78651941, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80998182, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12084961, + "step": 16246, + "time_per_iteration": 2.8350753784179688 + }, + { + "auxiliary_loss_clip": 0.01312055, + "auxiliary_loss_mlp": 0.01028603, + "balance_loss_clip": 1.21162391, + "balance_loss_mlp": 1.01734388, + "epoch": 0.9768224860964978, + "flos": 15855311392560.0, + "grad_norm": 1.3842975241583961, + "language_loss": 0.74096864, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76437521, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.11273193, + "step": 16247, + "time_per_iteration": 2.756579637527466 + }, + { + "auxiliary_loss_clip": 0.01327132, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.22001922, + "balance_loss_mlp": 1.01682115, + "epoch": 0.9768826093491658, + "flos": 25156062013680.0, + "grad_norm": 2.7865454847164033, + "language_loss": 0.80300546, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82657444, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.12957764, + "step": 16248, + "time_per_iteration": 2.816770315170288 + }, + { + "auxiliary_loss_clip": 0.01331018, + "auxiliary_loss_mlp": 0.01041047, + "balance_loss_clip": 1.22147262, + "balance_loss_mlp": 1.0274874, + "epoch": 0.9769427326018337, + "flos": 22972310509320.0, + "grad_norm": 1.9522488961764703, + "language_loss": 0.78796947, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81169015, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.13562012, + "step": 16249, + "time_per_iteration": 2.9107186794281006 + }, + { + "auxiliary_loss_clip": 0.01321061, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.21613359, + "balance_loss_mlp": 1.01848674, + "epoch": 0.9770028558545018, + "flos": 15268020730920.0, + "grad_norm": 79.3973250092077, + "language_loss": 0.6677258, + "learning_rate": 5.530901600093507e-09, + "loss": 0.6912514, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.13012695, + "step": 16250, + "time_per_iteration": 2.7623236179351807 + }, + { + "auxiliary_loss_clip": 0.01141685, + "auxiliary_loss_mlp": 0.01001571, + "balance_loss_clip": 1.09827256, + "balance_loss_mlp": 0.99867457, + "epoch": 0.9770629791071697, + "flos": 71465907243600.0, + "grad_norm": 0.7757672398999916, + "language_loss": 0.59881949, + "learning_rate": 5.501995169700846e-09, + "loss": 0.62025201, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.02893066, + "step": 16251, + "time_per_iteration": 3.315988063812256 + }, + { + "auxiliary_loss_clip": 0.01325408, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.21800876, + "balance_loss_mlp": 1.01925874, + "epoch": 0.9771231023598377, + "flos": 22417164687600.0, + "grad_norm": 1.8085629228697857, + "language_loss": 0.78719735, + "learning_rate": 5.473164370872307e-09, + "loss": 0.81077302, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12915039, + "step": 16252, + "time_per_iteration": 2.7760109901428223 + }, + { + "auxiliary_loss_clip": 0.01324387, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.21842837, + "balance_loss_mlp": 1.01641452, + "epoch": 0.9771832256125056, + "flos": 19030626030960.0, + "grad_norm": 2.583220277945475, + "language_loss": 0.65037382, + "learning_rate": 5.444409204701461e-09, + "loss": 0.67391223, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.13012695, + "step": 16253, + "time_per_iteration": 2.8394694328308105 + }, + { + "auxiliary_loss_clip": 0.01330628, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.22330284, + "balance_loss_mlp": 1.01851296, + "epoch": 0.9772433488651736, + "flos": 17826701844600.0, + "grad_norm": 3.2945502521664816, + "language_loss": 0.76908529, + "learning_rate": 5.415729672278324e-09, + "loss": 0.79271436, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.13763428, + "step": 16254, + "time_per_iteration": 2.9102182388305664 + }, + { + "auxiliary_loss_clip": 0.01331586, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.22202396, + "balance_loss_mlp": 1.01841438, + "epoch": 0.9773034721178415, + "flos": 37636391151000.0, + "grad_norm": 1.8515334786800395, + "language_loss": 0.64386141, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66749197, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13067627, + "step": 16255, + "time_per_iteration": 2.8988656997680664 + }, + { + "auxiliary_loss_clip": 0.01333471, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.22258103, + "balance_loss_mlp": 1.01711929, + "epoch": 0.9773635953705095, + "flos": 20307245827680.0, + "grad_norm": 1.5138841032626158, + "language_loss": 0.75817895, + "learning_rate": 5.358597513023033e-09, + "loss": 0.78182268, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13793945, + "step": 16256, + "time_per_iteration": 2.810626983642578 + }, + { + "auxiliary_loss_clip": 0.01321398, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.21724403, + "balance_loss_mlp": 1.0181241, + "epoch": 0.9774237186231776, + "flos": 22314314046960.0, + "grad_norm": 2.1341755184377886, + "language_loss": 0.77999383, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80352139, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.13238525, + "step": 16257, + "time_per_iteration": 2.759169816970825 + }, + { + "auxiliary_loss_clip": 0.01322417, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.21648872, + "balance_loss_mlp": 1.01880121, + "epoch": 0.9774838418758455, + "flos": 24210003870000.0, + "grad_norm": 1.5530782219200614, + "language_loss": 0.75110048, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77463865, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12597656, + "step": 16258, + "time_per_iteration": 2.7894461154937744 + }, + { + "auxiliary_loss_clip": 0.01141275, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.09844708, + "balance_loss_mlp": 1.00040138, + "epoch": 0.9775439651285135, + "flos": 66373965849600.0, + "grad_norm": 0.6818311535624579, + "language_loss": 0.59782493, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61926949, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02783203, + "step": 16259, + "time_per_iteration": 3.3225302696228027 + }, + { + "auxiliary_loss_clip": 0.01335314, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.22478437, + "balance_loss_mlp": 1.01516283, + "epoch": 0.9776040883811814, + "flos": 22606720822800.0, + "grad_norm": 1.5671358368120512, + "language_loss": 0.73758698, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.76123077, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.13897705, + "step": 16260, + "time_per_iteration": 2.824254274368286 + }, + { + "auxiliary_loss_clip": 0.0132461, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.21735168, + "balance_loss_mlp": 1.01926398, + "epoch": 0.9776642116338494, + "flos": 18446787079920.0, + "grad_norm": 3.298451167635147, + "language_loss": 0.7907604, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81432652, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12744141, + "step": 16261, + "time_per_iteration": 4.162928819656372 + }, + { + "auxiliary_loss_clip": 0.01330022, + "auxiliary_loss_mlp": 0.01025303, + "balance_loss_clip": 1.22008848, + "balance_loss_mlp": 1.01263154, + "epoch": 0.9777243348865173, + "flos": 22643657449200.0, + "grad_norm": 2.3716292924782665, + "language_loss": 0.74424845, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76780164, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12683105, + "step": 16262, + "time_per_iteration": 2.819159746170044 + }, + { + "auxiliary_loss_clip": 0.01328457, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.22043133, + "balance_loss_mlp": 1.01572311, + "epoch": 0.9777844581391854, + "flos": 31327652720520.0, + "grad_norm": 3.2559842597324176, + "language_loss": 0.70270848, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72629154, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.14123535, + "step": 16263, + "time_per_iteration": 2.85022234916687 + }, + { + "auxiliary_loss_clip": 0.01332151, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.22357619, + "balance_loss_mlp": 1.02159548, + "epoch": 0.9778445813918533, + "flos": 23044275169200.0, + "grad_norm": 2.3465866457596563, + "language_loss": 0.66102338, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68468702, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12615967, + "step": 16264, + "time_per_iteration": 4.254715919494629 + }, + { + "auxiliary_loss_clip": 0.01338375, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.22560978, + "balance_loss_mlp": 1.01887548, + "epoch": 0.9779047046445213, + "flos": 17570825611560.0, + "grad_norm": 1.8013858350108687, + "language_loss": 0.73036283, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75407422, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.13873291, + "step": 16265, + "time_per_iteration": 4.394821643829346 + }, + { + "auxiliary_loss_clip": 0.01319889, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.21531177, + "balance_loss_mlp": 1.01875424, + "epoch": 0.9779648278971892, + "flos": 20746099641600.0, + "grad_norm": 1.7589308231172298, + "language_loss": 0.69142991, + "learning_rate": 5.077475108526297e-09, + "loss": 0.71494043, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12402344, + "step": 16266, + "time_per_iteration": 2.83662748336792 + }, + { + "auxiliary_loss_clip": 0.01314703, + "auxiliary_loss_mlp": 0.01026608, + "balance_loss_clip": 1.21218097, + "balance_loss_mlp": 1.01537275, + "epoch": 0.9780249511498572, + "flos": 21031156304280.0, + "grad_norm": 1.629578464630274, + "language_loss": 0.87274116, + "learning_rate": 5.049778912747049e-09, + "loss": 0.89615428, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.11248779, + "step": 16267, + "time_per_iteration": 2.7979483604431152 + }, + { + "auxiliary_loss_clip": 0.01328542, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.21871758, + "balance_loss_mlp": 1.01626348, + "epoch": 0.9780850744025251, + "flos": 30780425529000.0, + "grad_norm": 1.8767549267748083, + "language_loss": 0.69979841, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72338206, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13549805, + "step": 16268, + "time_per_iteration": 2.917534828186035 + }, + { + "auxiliary_loss_clip": 0.01331123, + "auxiliary_loss_mlp": 0.01028198, + "balance_loss_clip": 1.22278214, + "balance_loss_mlp": 1.01528788, + "epoch": 0.9781451976551931, + "flos": 20307854953080.0, + "grad_norm": 1.5251828079267955, + "language_loss": 0.74148059, + "learning_rate": 4.994613468372711e-09, + "loss": 0.76507384, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.12902832, + "step": 16269, + "time_per_iteration": 4.337434530258179 + }, + { + "auxiliary_loss_clip": 0.0133151, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.22231507, + "balance_loss_mlp": 1.0198977, + "epoch": 0.9782053209078612, + "flos": 24321869566560.0, + "grad_norm": 1.8404936390140194, + "language_loss": 0.70912004, + "learning_rate": 4.967144221869501e-09, + "loss": 0.73277676, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.14245605, + "step": 16270, + "time_per_iteration": 2.8381426334381104 + }, + { + "auxiliary_loss_clip": 0.01326224, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.21837592, + "balance_loss_mlp": 1.02073312, + "epoch": 0.9782654441605291, + "flos": 32495899139640.0, + "grad_norm": 1.6574964117743796, + "language_loss": 0.6473465, + "learning_rate": 4.939750627212191e-09, + "loss": 0.67094213, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12591553, + "step": 16271, + "time_per_iteration": 2.8476614952087402 + }, + { + "auxiliary_loss_clip": 0.01319567, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.21610379, + "balance_loss_mlp": 1.01925898, + "epoch": 0.9783255674131971, + "flos": 26984700788400.0, + "grad_norm": 1.4867678670467444, + "language_loss": 0.70801032, + "learning_rate": 4.912432685439505e-09, + "loss": 0.73152578, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.12719727, + "step": 16272, + "time_per_iteration": 2.786210536956787 + }, + { + "auxiliary_loss_clip": 0.01328948, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.22008443, + "balance_loss_mlp": 1.02021194, + "epoch": 0.978385690665865, + "flos": 23117458079880.0, + "grad_norm": 1.7250135218536504, + "language_loss": 0.66815495, + "learning_rate": 4.88519039758728e-09, + "loss": 0.69177955, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13311768, + "step": 16273, + "time_per_iteration": 2.8032939434051514 + }, + { + "auxiliary_loss_clip": 0.01330604, + "auxiliary_loss_mlp": 0.0102539, + "balance_loss_clip": 1.22268248, + "balance_loss_mlp": 1.01207399, + "epoch": 0.978445813918533, + "flos": 25415024482080.0, + "grad_norm": 1.69192828561038, + "language_loss": 0.744771, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76833093, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13311768, + "step": 16274, + "time_per_iteration": 2.7660043239593506 + }, + { + "auxiliary_loss_clip": 0.01323131, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.21697974, + "balance_loss_mlp": 1.0169946, + "epoch": 0.9785059371712009, + "flos": 23555499726600.0, + "grad_norm": 1.6334318724341739, + "language_loss": 0.77782667, + "learning_rate": 4.830932787773579e-09, + "loss": 0.80134839, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.1204834, + "step": 16275, + "time_per_iteration": 2.8305137157440186 + }, + { + "auxiliary_loss_clip": 0.01330859, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.22057557, + "balance_loss_mlp": 1.01598084, + "epoch": 0.978566060423869, + "flos": 34358022830160.0, + "grad_norm": 1.6005151465148872, + "language_loss": 0.71393716, + "learning_rate": 4.803917467869567e-09, + "loss": 0.73753858, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.13299561, + "step": 16276, + "time_per_iteration": 2.9325413703918457 + }, + { + "auxiliary_loss_clip": 0.01316966, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.21246421, + "balance_loss_mlp": 1.01849866, + "epoch": 0.9786261836765369, + "flos": 11622519605880.0, + "grad_norm": 1.8629335895512964, + "language_loss": 0.86049449, + "learning_rate": 4.776977806000726e-09, + "loss": 0.88396788, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.11871338, + "step": 16277, + "time_per_iteration": 2.7800180912017822 + }, + { + "auxiliary_loss_clip": 0.01328013, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.22262239, + "balance_loss_mlp": 1.01402235, + "epoch": 0.9786863069292049, + "flos": 17425474999200.0, + "grad_norm": 1.610244897671973, + "language_loss": 0.71358496, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73713166, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.12640381, + "step": 16278, + "time_per_iteration": 2.723907232284546 + }, + { + "auxiliary_loss_clip": 0.01325931, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.21950412, + "balance_loss_mlp": 1.01725507, + "epoch": 0.9787464301818728, + "flos": 20849153324040.0, + "grad_norm": 1.8217294054217854, + "language_loss": 0.84474814, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86831933, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.13934326, + "step": 16279, + "time_per_iteration": 2.755486249923706 + }, + { + "auxiliary_loss_clip": 0.01326317, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.21827269, + "balance_loss_mlp": 1.01385903, + "epoch": 0.9788065534345408, + "flos": 18227481998040.0, + "grad_norm": 1.7080765095493409, + "language_loss": 0.7937696, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81730151, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13024902, + "step": 16280, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.01316197, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.21348202, + "balance_loss_mlp": 1.01941729, + "epoch": 0.9788666766872087, + "flos": 21583012848840.0, + "grad_norm": 1.5479395986721076, + "language_loss": 0.79381108, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81728631, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.11907959, + "step": 16281, + "time_per_iteration": 2.883284091949463 + }, + { + "auxiliary_loss_clip": 0.01328648, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.22015047, + "balance_loss_mlp": 1.01920867, + "epoch": 0.9789267999398767, + "flos": 24905992776120.0, + "grad_norm": 1.7011598199273748, + "language_loss": 0.80266333, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82627147, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.1295166, + "step": 16282, + "time_per_iteration": 2.8578226566314697 + }, + { + "auxiliary_loss_clip": 0.01323459, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.21658611, + "balance_loss_mlp": 1.02161884, + "epoch": 0.9789869231925448, + "flos": 19577853222480.0, + "grad_norm": 2.1221483540719266, + "language_loss": 0.83678854, + "learning_rate": 4.616928710538204e-09, + "loss": 0.86036801, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12866211, + "step": 16283, + "time_per_iteration": 2.7501769065856934 + }, + { + "auxiliary_loss_clip": 0.0132278, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.21592236, + "balance_loss_mlp": 1.01756716, + "epoch": 0.9790470464452127, + "flos": 16800557369040.0, + "grad_norm": 1.968511170469165, + "language_loss": 0.71912622, + "learning_rate": 4.590518683360134e-09, + "loss": 0.742661, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13140869, + "step": 16284, + "time_per_iteration": 2.7894444465637207 + }, + { + "auxiliary_loss_clip": 0.01315215, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.21193099, + "balance_loss_mlp": 1.01743567, + "epoch": 0.9791071696978807, + "flos": 18373969644480.0, + "grad_norm": 1.6790528873151163, + "language_loss": 0.64407361, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66751689, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.11682129, + "step": 16285, + "time_per_iteration": 2.772020101547241 + }, + { + "auxiliary_loss_clip": 0.01318408, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.21336627, + "balance_loss_mlp": 1.01453578, + "epoch": 0.9791672929505486, + "flos": 24175666178640.0, + "grad_norm": 1.5063297004124587, + "language_loss": 0.70941293, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73286736, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.125, + "step": 16286, + "time_per_iteration": 2.792661666870117 + }, + { + "auxiliary_loss_clip": 0.013154, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.21099007, + "balance_loss_mlp": 1.01650977, + "epoch": 0.9792274162032166, + "flos": 24359658968520.0, + "grad_norm": 1.3126310445523808, + "language_loss": 0.58902895, + "learning_rate": 4.511742602582691e-09, + "loss": 0.61247122, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12335205, + "step": 16287, + "time_per_iteration": 2.8654258251190186 + }, + { + "auxiliary_loss_clip": 0.01323491, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.21777201, + "balance_loss_mlp": 1.02109861, + "epoch": 0.9792875394558845, + "flos": 26401389746040.0, + "grad_norm": 1.643920047877283, + "language_loss": 0.82070243, + "learning_rate": 4.485635245894626e-09, + "loss": 0.84427983, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.1315918, + "step": 16288, + "time_per_iteration": 2.862114667892456 + }, + { + "auxiliary_loss_clip": 0.01327723, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.21961188, + "balance_loss_mlp": 1.01606226, + "epoch": 0.9793476627085526, + "flos": 28153962416520.0, + "grad_norm": 1.4807216750421877, + "language_loss": 0.72115445, + "learning_rate": 4.459603559311631e-09, + "loss": 0.74472702, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13470459, + "step": 16289, + "time_per_iteration": 2.950056791305542 + }, + { + "auxiliary_loss_clip": 0.01323432, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.21765101, + "balance_loss_mlp": 1.01901484, + "epoch": 0.9794077859612205, + "flos": 16768331312400.0, + "grad_norm": 2.1971901427796072, + "language_loss": 0.75720382, + "learning_rate": 4.43364754382003e-09, + "loss": 0.78075171, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12335205, + "step": 16290, + "time_per_iteration": 2.9050145149230957 + }, + { + "auxiliary_loss_clip": 0.01332744, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.22206187, + "balance_loss_mlp": 1.01836038, + "epoch": 0.9794679092138885, + "flos": 19285487055000.0, + "grad_norm": 1.5559897251368262, + "language_loss": 0.67201769, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69567347, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.14489746, + "step": 16291, + "time_per_iteration": 2.7722220420837402 + }, + { + "auxiliary_loss_clip": 0.01336587, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.22504163, + "balance_loss_mlp": 1.01853192, + "epoch": 0.9795280324665564, + "flos": 32162494901400.0, + "grad_norm": 2.052514986979704, + "language_loss": 0.62235242, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64603752, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13397217, + "step": 16292, + "time_per_iteration": 2.8664464950561523 + }, + { + "auxiliary_loss_clip": 0.01326541, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.21955705, + "balance_loss_mlp": 1.01548171, + "epoch": 0.9795881557192244, + "flos": 19065451022640.0, + "grad_norm": 1.5168061970747444, + "language_loss": 0.73402596, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75757241, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12609863, + "step": 16293, + "time_per_iteration": 2.7552244663238525 + }, + { + "auxiliary_loss_clip": 0.0133404, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.22471833, + "balance_loss_mlp": 1.01522708, + "epoch": 0.9796482789718923, + "flos": 28335843571680.0, + "grad_norm": 1.6511015944943916, + "language_loss": 0.84376776, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86739075, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13024902, + "step": 16294, + "time_per_iteration": 2.8136532306671143 + }, + { + "auxiliary_loss_clip": 0.0131169, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.20910025, + "balance_loss_mlp": 1.02170968, + "epoch": 0.9797084022245603, + "flos": 17972539757280.0, + "grad_norm": 1.8005711228917056, + "language_loss": 0.72112703, + "learning_rate": 4.305002567088767e-09, + "loss": 0.74458218, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.12115479, + "step": 16295, + "time_per_iteration": 2.7411458492279053 + }, + { + "auxiliary_loss_clip": 0.01331239, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.22079086, + "balance_loss_mlp": 1.0232898, + "epoch": 0.9797685254772284, + "flos": 20271608668800.0, + "grad_norm": 1.7285489848759743, + "language_loss": 0.80788946, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.831563, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.12817383, + "step": 16296, + "time_per_iteration": 2.7938621044158936 + }, + { + "auxiliary_loss_clip": 0.01319479, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.21419466, + "balance_loss_mlp": 1.01983953, + "epoch": 0.9798286487298963, + "flos": 26912898561960.0, + "grad_norm": 1.9210047817395832, + "language_loss": 0.75806451, + "learning_rate": 4.254074308266853e-09, + "loss": 0.78158224, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12438965, + "step": 16297, + "time_per_iteration": 2.849602460861206 + }, + { + "auxiliary_loss_clip": 0.01333967, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.22252584, + "balance_loss_mlp": 1.0228982, + "epoch": 0.9798887719825643, + "flos": 27166663160280.0, + "grad_norm": 2.3441289534329983, + "language_loss": 0.78336155, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80705905, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.12902832, + "step": 16298, + "time_per_iteration": 2.973867654800415 + }, + { + "auxiliary_loss_clip": 0.01316311, + "auxiliary_loss_mlp": 0.01026913, + "balance_loss_clip": 1.21261668, + "balance_loss_mlp": 1.01466429, + "epoch": 0.9799488952352322, + "flos": 20673404031240.0, + "grad_norm": 1.4410953206884205, + "language_loss": 0.72897732, + "learning_rate": 4.203448764984019e-09, + "loss": 0.75240958, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.12249756, + "step": 16299, + "time_per_iteration": 2.8101589679718018 + }, + { + "auxiliary_loss_clip": 0.01327672, + "auxiliary_loss_mlp": 0.01030972, + "balance_loss_clip": 1.21707809, + "balance_loss_mlp": 1.01738846, + "epoch": 0.9800090184879002, + "flos": 21986270112240.0, + "grad_norm": 1.9711391831559775, + "language_loss": 0.88908792, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91267431, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.13574219, + "step": 16300, + "time_per_iteration": 4.215381145477295 + }, + { + "auxiliary_loss_clip": 0.01331394, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.22066927, + "balance_loss_mlp": 1.01382422, + "epoch": 0.9800691417405681, + "flos": 21293448658200.0, + "grad_norm": 2.2027572635406347, + "language_loss": 0.78116786, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.804757, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.13684082, + "step": 16301, + "time_per_iteration": 2.9065451622009277 + }, + { + "auxiliary_loss_clip": 0.01325524, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.21806169, + "balance_loss_mlp": 1.02138746, + "epoch": 0.9801292649932362, + "flos": 18444188144880.0, + "grad_norm": 1.8758338238377643, + "language_loss": 0.76115996, + "learning_rate": 4.128078058480921e-09, + "loss": 0.78475797, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12878418, + "step": 16302, + "time_per_iteration": 2.9138388633728027 + }, + { + "auxiliary_loss_clip": 0.01327956, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.22143054, + "balance_loss_mlp": 1.01866794, + "epoch": 0.9801893882459041, + "flos": 25051871297160.0, + "grad_norm": 1.656772750240421, + "language_loss": 0.79779643, + "learning_rate": 4.103105855705724e-09, + "loss": 0.8213926, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12988281, + "step": 16303, + "time_per_iteration": 4.345032215118408 + }, + { + "auxiliary_loss_clip": 0.01330794, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.22113109, + "balance_loss_mlp": 1.02110541, + "epoch": 0.9802495114985721, + "flos": 18515624896080.0, + "grad_norm": 2.3813675718329748, + "language_loss": 0.83143377, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85508072, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 1.09619141, + "router_z_loss_mlp": 0.12805176, + "step": 16304, + "time_per_iteration": 4.405202150344849 + }, + { + "auxiliary_loss_clip": 0.01314317, + "auxiliary_loss_mlp": 0.01025208, + "balance_loss_clip": 1.21247482, + "balance_loss_mlp": 1.01388288, + "epoch": 0.98030963475124, + "flos": 21474761296320.0, + "grad_norm": 1.7198359954221853, + "language_loss": 0.70524377, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72863901, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.11328125, + "step": 16305, + "time_per_iteration": 2.812272787094116 + }, + { + "auxiliary_loss_clip": 0.01332619, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.22457886, + "balance_loss_mlp": 1.02102733, + "epoch": 0.980369758003908, + "flos": 20417121714600.0, + "grad_norm": 1.957257908712823, + "language_loss": 0.72384709, + "learning_rate": 4.028643358815032e-09, + "loss": 0.74751151, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12786865, + "step": 16306, + "time_per_iteration": 2.8312931060791016 + }, + { + "auxiliary_loss_clip": 0.01315891, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.21170402, + "balance_loss_mlp": 1.01741362, + "epoch": 0.9804298812565759, + "flos": 23403773601720.0, + "grad_norm": 1.5779820430669065, + "language_loss": 0.73752415, + "learning_rate": 4.00397390013385e-09, + "loss": 0.76097953, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12243652, + "step": 16307, + "time_per_iteration": 2.8633689880371094 + }, + { + "auxiliary_loss_clip": 0.01314128, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.21269655, + "balance_loss_mlp": 1.01582456, + "epoch": 0.980490004509244, + "flos": 23297633683920.0, + "grad_norm": 1.4139519722464076, + "language_loss": 0.7449612, + "learning_rate": 3.979380129822018e-09, + "loss": 0.76837003, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.10943604, + "step": 16308, + "time_per_iteration": 4.479054927825928 + }, + { + "auxiliary_loss_clip": 0.01142709, + "auxiliary_loss_mlp": 0.01003905, + "balance_loss_clip": 1.10028148, + "balance_loss_mlp": 1.00133026, + "epoch": 0.980550127761912, + "flos": 56064271716360.0, + "grad_norm": 0.7575123620202885, + "language_loss": 0.57849002, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59995615, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02575684, + "step": 16309, + "time_per_iteration": 3.141824245452881 + }, + { + "auxiliary_loss_clip": 0.01329062, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.22036195, + "balance_loss_mlp": 1.01600051, + "epoch": 0.9806102510145799, + "flos": 25338186819000.0, + "grad_norm": 1.7385283055726644, + "language_loss": 0.66874027, + "learning_rate": 3.930419658033646e-09, + "loss": 0.69232047, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.1295166, + "step": 16310, + "time_per_iteration": 2.817760467529297 + }, + { + "auxiliary_loss_clip": 0.01143349, + "auxiliary_loss_mlp": 0.01006922, + "balance_loss_clip": 1.10088444, + "balance_loss_mlp": 1.00431144, + "epoch": 0.9806703742672479, + "flos": 67292508506400.0, + "grad_norm": 0.8216397512508724, + "language_loss": 0.54600418, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56750691, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02612305, + "step": 16311, + "time_per_iteration": 3.3428804874420166 + }, + { + "auxiliary_loss_clip": 0.01325919, + "auxiliary_loss_mlp": 0.01026115, + "balance_loss_clip": 1.21892679, + "balance_loss_mlp": 1.0133183, + "epoch": 0.9807304975199158, + "flos": 25234524011160.0, + "grad_norm": 1.4724295519701809, + "language_loss": 0.79860902, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82212931, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12792969, + "step": 16312, + "time_per_iteration": 2.940077543258667 + }, + { + "auxiliary_loss_clip": 0.01325096, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.22056186, + "balance_loss_mlp": 1.01696467, + "epoch": 0.9807906207725838, + "flos": 17460787291200.0, + "grad_norm": 1.8951613285653568, + "language_loss": 0.63393641, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65747452, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.11761475, + "step": 16313, + "time_per_iteration": 2.800313711166382 + }, + { + "auxiliary_loss_clip": 0.01323372, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.21780801, + "balance_loss_mlp": 1.01870751, + "epoch": 0.9808507440252517, + "flos": 21037328775000.0, + "grad_norm": 1.9411942446851256, + "language_loss": 0.72832811, + "learning_rate": 3.833407015731316e-09, + "loss": 0.75187862, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12988281, + "step": 16314, + "time_per_iteration": 2.780426025390625 + }, + { + "auxiliary_loss_clip": 0.01142303, + "auxiliary_loss_mlp": 0.01005922, + "balance_loss_clip": 1.09999979, + "balance_loss_mlp": 1.00335908, + "epoch": 0.9809108672779198, + "flos": 64059181502760.0, + "grad_norm": 0.6854763022336364, + "language_loss": 0.51757103, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53905332, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02563477, + "step": 16315, + "time_per_iteration": 3.21059513092041 + }, + { + "auxiliary_loss_clip": 0.01327679, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.21960735, + "balance_loss_mlp": 1.01900589, + "epoch": 0.9809709905305877, + "flos": 22785353309160.0, + "grad_norm": 1.274404950167343, + "language_loss": 0.69871455, + "learning_rate": 3.785354859932033e-09, + "loss": 0.72230494, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12359619, + "step": 16316, + "time_per_iteration": 2.821955680847168 + }, + { + "auxiliary_loss_clip": 0.01326685, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.21716285, + "balance_loss_mlp": 1.01621056, + "epoch": 0.9810311137832557, + "flos": 37020447968400.0, + "grad_norm": 1.6552585489538483, + "language_loss": 0.5520795, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57563353, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.12506104, + "step": 16317, + "time_per_iteration": 2.9132938385009766 + }, + { + "auxiliary_loss_clip": 0.0131568, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.21069467, + "balance_loss_mlp": 1.017946, + "epoch": 0.9810912370359236, + "flos": 18920953185840.0, + "grad_norm": 3.6272156083588416, + "language_loss": 0.73451412, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75797319, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12280273, + "step": 16318, + "time_per_iteration": 2.7625632286071777 + }, + { + "auxiliary_loss_clip": 0.01319544, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.21628749, + "balance_loss_mlp": 1.01677001, + "epoch": 0.9811513602885916, + "flos": 18446137346160.0, + "grad_norm": 2.3778015865819895, + "language_loss": 0.81851685, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84200108, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12103271, + "step": 16319, + "time_per_iteration": 2.802011489868164 + }, + { + "auxiliary_loss_clip": 0.01142371, + "auxiliary_loss_mlp": 0.01005571, + "balance_loss_clip": 1.09946525, + "balance_loss_mlp": 1.00284147, + "epoch": 0.9812114835412595, + "flos": 68074292541960.0, + "grad_norm": 0.7197794093522546, + "language_loss": 0.5359807, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55746019, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02734375, + "step": 16320, + "time_per_iteration": 3.2121469974517822 + }, + { + "auxiliary_loss_clip": 0.01327292, + "auxiliary_loss_mlp": 0.01034426, + "balance_loss_clip": 1.21901214, + "balance_loss_mlp": 1.02181363, + "epoch": 0.9812716067939276, + "flos": 25378331505840.0, + "grad_norm": 1.6524759553801596, + "language_loss": 0.7382797, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.76189685, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1262207, + "step": 16321, + "time_per_iteration": 2.8370821475982666 + }, + { + "auxiliary_loss_clip": 0.01321871, + "auxiliary_loss_mlp": 0.01029787, + "balance_loss_clip": 1.21836996, + "balance_loss_mlp": 1.01747918, + "epoch": 0.9813317300465956, + "flos": 22861987930440.0, + "grad_norm": 1.6127395007860945, + "language_loss": 0.78714621, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.81066281, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.1229248, + "step": 16322, + "time_per_iteration": 2.9470374584198 + }, + { + "auxiliary_loss_clip": 0.01320761, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.21354055, + "balance_loss_mlp": 1.01678419, + "epoch": 0.9813918532992635, + "flos": 23592273919560.0, + "grad_norm": 1.5808691835733613, + "language_loss": 0.81056195, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83406961, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.13220215, + "step": 16323, + "time_per_iteration": 2.964035749435425 + }, + { + "auxiliary_loss_clip": 0.01332923, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.2223711, + "balance_loss_mlp": 1.02088308, + "epoch": 0.9814519765519315, + "flos": 19610647796160.0, + "grad_norm": 2.2701323640916984, + "language_loss": 0.8535921, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87725735, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.12719727, + "step": 16324, + "time_per_iteration": 2.948923110961914 + }, + { + "auxiliary_loss_clip": 0.01326236, + "auxiliary_loss_mlp": 0.0102941, + "balance_loss_clip": 1.21900523, + "balance_loss_mlp": 1.0158143, + "epoch": 0.9815120998045994, + "flos": 33952613323680.0, + "grad_norm": 1.5873550787080917, + "language_loss": 0.74263465, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76619112, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13604736, + "step": 16325, + "time_per_iteration": 2.992168664932251 + }, + { + "auxiliary_loss_clip": 0.01317363, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.21480918, + "balance_loss_mlp": 1.02093601, + "epoch": 0.9815722230572674, + "flos": 20854919711160.0, + "grad_norm": 1.550305146807041, + "language_loss": 0.76516908, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78866524, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.11328125, + "step": 16326, + "time_per_iteration": 2.8816161155700684 + }, + { + "auxiliary_loss_clip": 0.01326821, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.21894729, + "balance_loss_mlp": 1.01580095, + "epoch": 0.9816323463099353, + "flos": 22899858549120.0, + "grad_norm": 1.5626199013209798, + "language_loss": 0.67756003, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.70111799, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.13165283, + "step": 16327, + "time_per_iteration": 2.9277772903442383 + }, + { + "auxiliary_loss_clip": 0.01334942, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.22263443, + "balance_loss_mlp": 1.02109611, + "epoch": 0.9816924695626034, + "flos": 31545617726520.0, + "grad_norm": 1.387013939517664, + "language_loss": 0.73619521, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75989234, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.13690186, + "step": 16328, + "time_per_iteration": 2.940924644470215 + }, + { + "auxiliary_loss_clip": 0.01343541, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.22830808, + "balance_loss_mlp": 1.01933432, + "epoch": 0.9817525928152713, + "flos": 21511900964520.0, + "grad_norm": 2.2390523589396216, + "language_loss": 0.81544292, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83921266, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.14105225, + "step": 16329, + "time_per_iteration": 2.819255828857422 + }, + { + "auxiliary_loss_clip": 0.01330686, + "auxiliary_loss_mlp": 0.01030076, + "balance_loss_clip": 1.21893811, + "balance_loss_mlp": 1.0166049, + "epoch": 0.9818127160679393, + "flos": 25555217832720.0, + "grad_norm": 1.8615876299829701, + "language_loss": 0.75895512, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78256273, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13464355, + "step": 16330, + "time_per_iteration": 2.865243673324585 + }, + { + "auxiliary_loss_clip": 0.01348759, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.2310729, + "balance_loss_mlp": 1.02008212, + "epoch": 0.9818728393206072, + "flos": 28809562985640.0, + "grad_norm": 2.0688661621572604, + "language_loss": 0.67137897, + "learning_rate": 3.434615511252126e-09, + "loss": 0.6952197, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.15222168, + "step": 16331, + "time_per_iteration": 2.83945631980896 + }, + { + "auxiliary_loss_clip": 0.013264, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.21982443, + "balance_loss_mlp": 1.01841402, + "epoch": 0.9819329625732752, + "flos": 23227537008600.0, + "grad_norm": 1.6967863481418415, + "language_loss": 0.73692453, + "learning_rate": 3.411838534981948e-09, + "loss": 0.76049495, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12237549, + "step": 16332, + "time_per_iteration": 2.8895418643951416 + }, + { + "auxiliary_loss_clip": 0.01325073, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.21780181, + "balance_loss_mlp": 1.01643896, + "epoch": 0.9819930858259431, + "flos": 17534985410880.0, + "grad_norm": 1.738373159155492, + "language_loss": 0.76848698, + "learning_rate": 3.389137269534936e-09, + "loss": 0.79202092, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.11877441, + "step": 16333, + "time_per_iteration": 2.8265745639801025 + }, + { + "auxiliary_loss_clip": 0.01328376, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.22063684, + "balance_loss_mlp": 1.01416218, + "epoch": 0.9820532090786112, + "flos": 12533549716080.0, + "grad_norm": 2.625456867533271, + "language_loss": 0.72972822, + "learning_rate": 3.366511715771958e-09, + "loss": 0.75327933, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12567139, + "step": 16334, + "time_per_iteration": 2.775341272354126 + }, + { + "auxiliary_loss_clip": 0.01325437, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.21702361, + "balance_loss_mlp": 1.02419996, + "epoch": 0.9821133323312792, + "flos": 18844399781280.0, + "grad_norm": 1.85539493019383, + "language_loss": 0.78378344, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80740517, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12536621, + "step": 16335, + "time_per_iteration": 2.791071653366089 + }, + { + "auxiliary_loss_clip": 0.01328198, + "auxiliary_loss_mlp": 0.0103641, + "balance_loss_clip": 1.2180686, + "balance_loss_mlp": 1.0225873, + "epoch": 0.9821734555839471, + "flos": 34830727035120.0, + "grad_norm": 1.9745381516739862, + "language_loss": 0.64409304, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66773909, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.13812256, + "step": 16336, + "time_per_iteration": 3.0287368297576904 + }, + { + "auxiliary_loss_clip": 0.01333273, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.2208364, + "balance_loss_mlp": 1.02062273, + "epoch": 0.9822335788366151, + "flos": 17132540314680.0, + "grad_norm": 1.9056960453299203, + "language_loss": 0.73395139, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75763571, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.14520264, + "step": 16337, + "time_per_iteration": 2.8639044761657715 + }, + { + "auxiliary_loss_clip": 0.01325563, + "auxiliary_loss_mlp": 0.01026919, + "balance_loss_clip": 1.21677458, + "balance_loss_mlp": 1.01375794, + "epoch": 0.982293702089283, + "flos": 20818104909840.0, + "grad_norm": 1.6210608249793053, + "language_loss": 0.73448056, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75800538, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13165283, + "step": 16338, + "time_per_iteration": 4.2685019969940186 + }, + { + "auxiliary_loss_clip": 0.01326025, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.21854508, + "balance_loss_mlp": 1.02005053, + "epoch": 0.982353825341951, + "flos": 24686200393920.0, + "grad_norm": 1.7449189075437999, + "language_loss": 0.81678069, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.8403722, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.13079834, + "step": 16339, + "time_per_iteration": 2.9153800010681152 + }, + { + "auxiliary_loss_clip": 0.01319663, + "auxiliary_loss_mlp": 0.01026685, + "balance_loss_clip": 1.21516752, + "balance_loss_mlp": 1.01484179, + "epoch": 0.982413948594619, + "flos": 20855447619840.0, + "grad_norm": 1.7850564710566053, + "language_loss": 0.62845957, + "learning_rate": 3.232348386403405e-09, + "loss": 0.65192306, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.11846924, + "step": 16340, + "time_per_iteration": 2.9449422359466553 + }, + { + "auxiliary_loss_clip": 0.01331046, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.22196174, + "balance_loss_mlp": 1.01837468, + "epoch": 0.982474071847287, + "flos": 15381591978600.0, + "grad_norm": 2.0325851123153984, + "language_loss": 0.8605181, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88414448, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.13214111, + "step": 16341, + "time_per_iteration": 2.9727275371551514 + }, + { + "auxiliary_loss_clip": 0.01318366, + "auxiliary_loss_mlp": 0.01024937, + "balance_loss_clip": 1.21503937, + "balance_loss_mlp": 1.0124979, + "epoch": 0.9825341950999549, + "flos": 23780774237400.0, + "grad_norm": 1.4794529203126923, + "language_loss": 0.66952026, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69295335, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.12457275, + "step": 16342, + "time_per_iteration": 5.86909556388855 + }, + { + "auxiliary_loss_clip": 0.01330249, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.22106433, + "balance_loss_mlp": 1.01844287, + "epoch": 0.9825943183526229, + "flos": 22751299876320.0, + "grad_norm": 1.5009484060075833, + "language_loss": 0.77333325, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79694748, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12713623, + "step": 16343, + "time_per_iteration": 2.887784242630005 + }, + { + "auxiliary_loss_clip": 0.01322726, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.21763659, + "balance_loss_mlp": 1.01631188, + "epoch": 0.9826544416052908, + "flos": 27715839553080.0, + "grad_norm": 1.7430717205169397, + "language_loss": 0.75264734, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77614987, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.11212158, + "step": 16344, + "time_per_iteration": 2.8861312866210938 + }, + { + "auxiliary_loss_clip": 0.01328325, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.22024059, + "balance_loss_mlp": 1.02456903, + "epoch": 0.9827145648579588, + "flos": 26947155036600.0, + "grad_norm": 2.1365685539595978, + "language_loss": 0.66339648, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68705326, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12780762, + "step": 16345, + "time_per_iteration": 2.8522419929504395 + }, + { + "auxiliary_loss_clip": 0.01315961, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.21243954, + "balance_loss_mlp": 1.01654661, + "epoch": 0.9827746881106267, + "flos": 21870912096720.0, + "grad_norm": 1.3608080340897304, + "language_loss": 0.79649675, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81993449, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.11254883, + "step": 16346, + "time_per_iteration": 3.013359308242798 + }, + { + "auxiliary_loss_clip": 0.01340401, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.22637415, + "balance_loss_mlp": 1.02133679, + "epoch": 0.9828348113632948, + "flos": 20855772486720.0, + "grad_norm": 1.8192079396388119, + "language_loss": 0.75288975, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77664453, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 1.14111328, + "router_z_loss_mlp": 0.13757324, + "step": 16347, + "time_per_iteration": 4.463235855102539 + }, + { + "auxiliary_loss_clip": 0.01314447, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.21005821, + "balance_loss_mlp": 1.01901019, + "epoch": 0.9828949346159628, + "flos": 34575987836160.0, + "grad_norm": 1.654262565083846, + "language_loss": 0.6727336, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.69618934, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12109375, + "step": 16348, + "time_per_iteration": 2.9521355628967285 + }, + { + "auxiliary_loss_clip": 0.0132477, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.21807551, + "balance_loss_mlp": 1.01642573, + "epoch": 0.9829550578686307, + "flos": 24462225350640.0, + "grad_norm": 1.725693608303446, + "language_loss": 0.69169104, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71523333, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.13024902, + "step": 16349, + "time_per_iteration": 2.876365900039673 + }, + { + "auxiliary_loss_clip": 0.01312938, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.21093225, + "balance_loss_mlp": 1.01580358, + "epoch": 0.9830151811212987, + "flos": 16914534700320.0, + "grad_norm": 1.8253028247660446, + "language_loss": 0.75951898, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78292322, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.11682129, + "step": 16350, + "time_per_iteration": 2.8530359268188477 + }, + { + "auxiliary_loss_clip": 0.01328501, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.22029257, + "balance_loss_mlp": 1.01801264, + "epoch": 0.9830753043739666, + "flos": 21293326833120.0, + "grad_norm": 1.941745489905005, + "language_loss": 0.84665316, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.8702504, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.13208008, + "step": 16351, + "time_per_iteration": 3.060537099838257 + }, + { + "auxiliary_loss_clip": 0.01326747, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.2193017, + "balance_loss_mlp": 1.01359677, + "epoch": 0.9831354276266346, + "flos": 31729935383280.0, + "grad_norm": 1.7478658365150956, + "language_loss": 0.68742716, + "learning_rate": 2.972199410170795e-09, + "loss": 0.71095574, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12512207, + "step": 16352, + "time_per_iteration": 3.00591778755188 + }, + { + "auxiliary_loss_clip": 0.01321547, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.21610928, + "balance_loss_mlp": 1.01522171, + "epoch": 0.9831955508793025, + "flos": 21624416394840.0, + "grad_norm": 1.4504430669745583, + "language_loss": 0.66852951, + "learning_rate": 2.951012538143782e-09, + "loss": 0.69201529, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.11804199, + "step": 16353, + "time_per_iteration": 2.829925775527954 + }, + { + "auxiliary_loss_clip": 0.01315836, + "auxiliary_loss_mlp": 0.01029269, + "balance_loss_clip": 1.21200395, + "balance_loss_mlp": 1.01799154, + "epoch": 0.9832556741319706, + "flos": 22973650585200.0, + "grad_norm": 1.4388254423402123, + "language_loss": 0.74862093, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.77207196, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.11279297, + "step": 16354, + "time_per_iteration": 2.88354229927063 + }, + { + "auxiliary_loss_clip": 0.01325049, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.21850502, + "balance_loss_mlp": 1.02268863, + "epoch": 0.9833157973846385, + "flos": 21328801558560.0, + "grad_norm": 1.8367475139121856, + "language_loss": 0.77831542, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.8019188, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.1260376, + "step": 16355, + "time_per_iteration": 2.9485220909118652 + }, + { + "auxiliary_loss_clip": 0.01327174, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.22100878, + "balance_loss_mlp": 1.01635981, + "epoch": 0.9833759206373065, + "flos": 21073615667640.0, + "grad_norm": 1.8048613468066559, + "language_loss": 0.73701137, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.76056999, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12335205, + "step": 16356, + "time_per_iteration": 2.95174503326416 + }, + { + "auxiliary_loss_clip": 0.0132385, + "auxiliary_loss_mlp": 0.01028935, + "balance_loss_clip": 1.21919203, + "balance_loss_mlp": 1.01663256, + "epoch": 0.9834360438899744, + "flos": 18701932362480.0, + "grad_norm": 1.8797522162075608, + "language_loss": 0.76458383, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.78811169, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.12304688, + "step": 16357, + "time_per_iteration": 2.9284727573394775 + }, + { + "auxiliary_loss_clip": 0.01326078, + "auxiliary_loss_mlp": 0.01026557, + "balance_loss_clip": 1.22042346, + "balance_loss_mlp": 1.01320601, + "epoch": 0.9834961671426424, + "flos": 21110146210440.0, + "grad_norm": 2.011410559657842, + "language_loss": 0.80431074, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82783711, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.13354492, + "step": 16358, + "time_per_iteration": 3.034317970275879 + }, + { + "auxiliary_loss_clip": 0.01327316, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.22065914, + "balance_loss_mlp": 1.0152725, + "epoch": 0.9835562903953103, + "flos": 26693268613200.0, + "grad_norm": 2.3929087807496185, + "language_loss": 0.67895985, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.70250571, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.11993408, + "step": 16359, + "time_per_iteration": 3.104203701019287 + }, + { + "auxiliary_loss_clip": 0.01320755, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.21566021, + "balance_loss_mlp": 1.01685596, + "epoch": 0.9836164136479784, + "flos": 22095171398520.0, + "grad_norm": 1.5112664833190956, + "language_loss": 0.70006657, + "learning_rate": 2.804824870920264e-09, + "loss": 0.72355735, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.11474609, + "step": 16360, + "time_per_iteration": 3.022972822189331 + }, + { + "auxiliary_loss_clip": 0.0132825, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.22031581, + "balance_loss_mlp": 1.01774669, + "epoch": 0.9836765369006463, + "flos": 23883706094760.0, + "grad_norm": 1.899047383403038, + "language_loss": 0.84741104, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.87100291, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.13201904, + "step": 16361, + "time_per_iteration": 2.9683713912963867 + }, + { + "auxiliary_loss_clip": 0.01325237, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.21821237, + "balance_loss_mlp": 1.01693749, + "epoch": 0.9837366601533143, + "flos": 25850101718520.0, + "grad_norm": 1.607537097353174, + "language_loss": 0.75994378, + "learning_rate": 2.76373855876022e-09, + "loss": 0.7834841, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.11859131, + "step": 16362, + "time_per_iteration": 2.9427194595336914 + }, + { + "auxiliary_loss_clip": 0.01325154, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.21873558, + "balance_loss_mlp": 1.01966584, + "epoch": 0.9837967834059823, + "flos": 21362611341240.0, + "grad_norm": 1.9444830023041044, + "language_loss": 0.71698987, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.740569, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.13104248, + "step": 16363, + "time_per_iteration": 2.89927339553833 + }, + { + "auxiliary_loss_clip": 0.01317112, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.21453547, + "balance_loss_mlp": 1.01487851, + "epoch": 0.9838569066586502, + "flos": 18520497899280.0, + "grad_norm": 1.5958812041021153, + "language_loss": 0.63030434, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65374339, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.11920166, + "step": 16364, + "time_per_iteration": 2.9027552604675293 + }, + { + "auxiliary_loss_clip": 0.01327235, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.21988297, + "balance_loss_mlp": 1.02044487, + "epoch": 0.9839170299113182, + "flos": 22457106332640.0, + "grad_norm": 1.70101038316474, + "language_loss": 0.75500739, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77860349, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1192627, + "step": 16365, + "time_per_iteration": 2.988208293914795 + }, + { + "auxiliary_loss_clip": 0.01318557, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.21397483, + "balance_loss_mlp": 1.01296306, + "epoch": 0.9839771531639862, + "flos": 27898208008560.0, + "grad_norm": 1.6182250610284907, + "language_loss": 0.76409799, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78753567, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.12249756, + "step": 16366, + "time_per_iteration": 3.0702357292175293 + }, + { + "auxiliary_loss_clip": 0.01317925, + "auxiliary_loss_mlp": 0.01026709, + "balance_loss_clip": 1.21367741, + "balance_loss_mlp": 1.01491284, + "epoch": 0.9840372764166542, + "flos": 28219632780600.0, + "grad_norm": 1.6156335268638007, + "language_loss": 0.7752198, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79866612, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.11798096, + "step": 16367, + "time_per_iteration": 2.9945476055145264 + }, + { + "auxiliary_loss_clip": 0.01322648, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.21700954, + "balance_loss_mlp": 1.01952267, + "epoch": 0.9840973996693221, + "flos": 23409215121960.0, + "grad_norm": 1.6616279801855738, + "language_loss": 0.61449939, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63804805, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12695312, + "step": 16368, + "time_per_iteration": 2.844113349914551 + }, + { + "auxiliary_loss_clip": 0.01315465, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.21213925, + "balance_loss_mlp": 1.01928258, + "epoch": 0.9841575229219901, + "flos": 21400644393360.0, + "grad_norm": 1.468483807073413, + "language_loss": 0.6587472, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.68221378, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.11907959, + "step": 16369, + "time_per_iteration": 2.842682361602783 + }, + { + "auxiliary_loss_clip": 0.01324033, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.21659076, + "balance_loss_mlp": 1.01961827, + "epoch": 0.984217646174658, + "flos": 24469697288880.0, + "grad_norm": 1.5431912217704618, + "language_loss": 0.68799782, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.71155715, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12286377, + "step": 16370, + "time_per_iteration": 2.9013876914978027 + }, + { + "auxiliary_loss_clip": 0.01326886, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.2178098, + "balance_loss_mlp": 1.01530743, + "epoch": 0.984277769427326, + "flos": 16439353385400.0, + "grad_norm": 1.8387669274903942, + "language_loss": 0.73622882, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75978565, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13482666, + "step": 16371, + "time_per_iteration": 2.957113027572632 + }, + { + "auxiliary_loss_clip": 0.01141734, + "auxiliary_loss_mlp": 0.01009228, + "balance_loss_clip": 1.09929323, + "balance_loss_mlp": 1.00648594, + "epoch": 0.9843378926799939, + "flos": 64546910800920.0, + "grad_norm": 0.7946696227839251, + "language_loss": 0.6523658, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67387545, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02746582, + "step": 16372, + "time_per_iteration": 3.4600138664245605 + }, + { + "auxiliary_loss_clip": 0.01321521, + "auxiliary_loss_mlp": 0.01026189, + "balance_loss_clip": 1.21565747, + "balance_loss_mlp": 1.01398802, + "epoch": 0.984398015932662, + "flos": 17387117080200.0, + "grad_norm": 1.9204377794464982, + "language_loss": 0.71089852, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.7343756, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12207031, + "step": 16373, + "time_per_iteration": 2.8023335933685303 + }, + { + "auxiliary_loss_clip": 0.01316919, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.2128216, + "balance_loss_mlp": 1.0184083, + "epoch": 0.9844581391853299, + "flos": 23884477653600.0, + "grad_norm": 2.9582581392375347, + "language_loss": 0.81597835, + "learning_rate": 2.523582674173186e-09, + "loss": 0.8394565, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.12506104, + "step": 16374, + "time_per_iteration": 2.907846450805664 + }, + { + "auxiliary_loss_clip": 0.01325774, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.21880555, + "balance_loss_mlp": 1.01875067, + "epoch": 0.9845182624379979, + "flos": 19870178781600.0, + "grad_norm": 1.6471543607801125, + "language_loss": 0.69422472, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71779299, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12298584, + "step": 16375, + "time_per_iteration": 2.8278536796569824 + }, + { + "auxiliary_loss_clip": 0.01334947, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.22440624, + "balance_loss_mlp": 1.01747894, + "epoch": 0.9845783856906659, + "flos": 28260264767760.0, + "grad_norm": 1.6539801525659366, + "language_loss": 0.80733603, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83099979, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.13946533, + "step": 16376, + "time_per_iteration": 2.9330713748931885 + }, + { + "auxiliary_loss_clip": 0.01319051, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.21420264, + "balance_loss_mlp": 1.01663065, + "epoch": 0.9846385089433338, + "flos": 28334097412200.0, + "grad_norm": 1.6409785028814532, + "language_loss": 0.62462783, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64811051, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12591553, + "step": 16377, + "time_per_iteration": 4.352689743041992 + }, + { + "auxiliary_loss_clip": 0.01327817, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.21979725, + "balance_loss_mlp": 1.01842713, + "epoch": 0.9846986321960018, + "flos": 24322194433440.0, + "grad_norm": 1.5866556976671213, + "language_loss": 0.73328239, + "learning_rate": 2.445954472695133e-09, + "loss": 0.75688004, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.13513184, + "step": 16378, + "time_per_iteration": 2.9669225215911865 + }, + { + "auxiliary_loss_clip": 0.0132627, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.21908867, + "balance_loss_mlp": 1.01689577, + "epoch": 0.9847587554486698, + "flos": 27277797906360.0, + "grad_norm": 1.664896940707027, + "language_loss": 0.71057367, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73412991, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12457275, + "step": 16379, + "time_per_iteration": 2.9351119995117188 + }, + { + "auxiliary_loss_clip": 0.01325293, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.21710658, + "balance_loss_mlp": 1.01708543, + "epoch": 0.9848188787013378, + "flos": 16546792770720.0, + "grad_norm": 1.8644547252371502, + "language_loss": 0.6886757, + "learning_rate": 2.407594853716999e-09, + "loss": 0.71222496, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.12567139, + "step": 16380, + "time_per_iteration": 4.341290712356567 + }, + { + "auxiliary_loss_clip": 0.0133191, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.22077012, + "balance_loss_mlp": 1.02275252, + "epoch": 0.9848790019540057, + "flos": 20198506974840.0, + "grad_norm": 1.8877714925845612, + "language_loss": 0.78760493, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81128085, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.12927246, + "step": 16381, + "time_per_iteration": 4.589237928390503 + }, + { + "auxiliary_loss_clip": 0.01327604, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.21913028, + "balance_loss_mlp": 1.01830935, + "epoch": 0.9849391252066737, + "flos": 28265990546520.0, + "grad_norm": 1.338159116214644, + "language_loss": 0.82474351, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84832823, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12579346, + "step": 16382, + "time_per_iteration": 3.057446002960205 + }, + { + "auxiliary_loss_clip": 0.01338896, + "auxiliary_loss_mlp": 0.01038696, + "balance_loss_clip": 1.22672236, + "balance_loss_mlp": 1.02480221, + "epoch": 0.9849992484593416, + "flos": 22459989526200.0, + "grad_norm": 1.8321194145047328, + "language_loss": 0.74246818, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76624405, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.13903809, + "step": 16383, + "time_per_iteration": 3.101595163345337 + }, + { + "auxiliary_loss_clip": 0.0132205, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.21505666, + "balance_loss_mlp": 1.01523256, + "epoch": 0.9850593717120096, + "flos": 34504063784640.0, + "grad_norm": 1.807221700640548, + "language_loss": 0.66621929, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68971586, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.12371826, + "step": 16384, + "time_per_iteration": 3.109905242919922 + }, + { + "auxiliary_loss_clip": 0.01336267, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.22568369, + "balance_loss_mlp": 1.02055526, + "epoch": 0.9851194949646775, + "flos": 38844497998440.0, + "grad_norm": 1.7749993695049064, + "language_loss": 0.70505172, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72875702, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13690186, + "step": 16385, + "time_per_iteration": 4.55446720123291 + }, + { + "auxiliary_loss_clip": 0.01331397, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.22294402, + "balance_loss_mlp": 1.02001655, + "epoch": 0.9851796182173456, + "flos": 17716947782760.0, + "grad_norm": 1.91702337418068, + "language_loss": 0.81525069, + "learning_rate": 2.294333993509978e-09, + "loss": 0.8388949, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13006592, + "step": 16386, + "time_per_iteration": 2.911027669906616 + }, + { + "auxiliary_loss_clip": 0.01332662, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.22368574, + "balance_loss_mlp": 1.01988852, + "epoch": 0.9852397414700135, + "flos": 27460125753480.0, + "grad_norm": 2.1272866781577466, + "language_loss": 0.680794, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.70444977, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.13031006, + "step": 16387, + "time_per_iteration": 2.9222183227539062 + }, + { + "auxiliary_loss_clip": 0.01316118, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.21344042, + "balance_loss_mlp": 1.01812589, + "epoch": 0.9852998647226815, + "flos": 18301355250840.0, + "grad_norm": 2.3857524684493665, + "language_loss": 0.74396366, + "learning_rate": 2.257186391438237e-09, + "loss": 0.76742423, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.11816406, + "step": 16388, + "time_per_iteration": 2.846096992492676 + }, + { + "auxiliary_loss_clip": 0.01319824, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.21357012, + "balance_loss_mlp": 1.01811051, + "epoch": 0.9853599879753495, + "flos": 19646853472080.0, + "grad_norm": 1.5961744770340043, + "language_loss": 0.8222543, + "learning_rate": 2.238726221962528e-09, + "loss": 0.8457545, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12097168, + "step": 16389, + "time_per_iteration": 2.7944259643554688 + }, + { + "auxiliary_loss_clip": 0.01324272, + "auxiliary_loss_mlp": 0.0102468, + "balance_loss_clip": 1.21817756, + "balance_loss_mlp": 1.0121218, + "epoch": 0.9854201112280174, + "flos": 23847256768680.0, + "grad_norm": 2.0437383337990798, + "language_loss": 0.67516106, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.6986506, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.12567139, + "step": 16390, + "time_per_iteration": 2.9154834747314453 + }, + { + "auxiliary_loss_clip": 0.01333175, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.22517371, + "balance_loss_mlp": 1.02229977, + "epoch": 0.9854802344806854, + "flos": 30086629474320.0, + "grad_norm": 1.5452131997022067, + "language_loss": 0.77248365, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79616863, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13024902, + "step": 16391, + "time_per_iteration": 2.922365665435791 + }, + { + "auxiliary_loss_clip": 0.01313738, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.21241379, + "balance_loss_mlp": 1.01972795, + "epoch": 0.9855403577333534, + "flos": 21912356251080.0, + "grad_norm": 1.761688404104784, + "language_loss": 0.68556452, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70901513, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.11590576, + "step": 16392, + "time_per_iteration": 2.8167073726654053 + }, + { + "auxiliary_loss_clip": 0.01337606, + "auxiliary_loss_mlp": 0.01027268, + "balance_loss_clip": 1.22470498, + "balance_loss_mlp": 1.0132314, + "epoch": 0.9856004809860214, + "flos": 15418000696320.0, + "grad_norm": 1.7817476150222045, + "language_loss": 0.56036699, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58401573, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.14044189, + "step": 16393, + "time_per_iteration": 2.798689365386963 + }, + { + "auxiliary_loss_clip": 0.01337509, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.22382462, + "balance_loss_mlp": 1.01914418, + "epoch": 0.9856606042386893, + "flos": 13655966277960.0, + "grad_norm": 2.4846898846045047, + "language_loss": 0.79659098, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.82029599, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.1385498, + "step": 16394, + "time_per_iteration": 2.874472141265869 + }, + { + "auxiliary_loss_clip": 0.01330825, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.22075129, + "balance_loss_mlp": 1.01966238, + "epoch": 0.9857207274913573, + "flos": 23484672100800.0, + "grad_norm": 1.407472486527835, + "language_loss": 0.76251376, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78615439, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13574219, + "step": 16395, + "time_per_iteration": 2.852735757827759 + }, + { + "auxiliary_loss_clip": 0.01324997, + "auxiliary_loss_mlp": 0.01023715, + "balance_loss_clip": 1.21847403, + "balance_loss_mlp": 1.01081634, + "epoch": 0.9857808507440252, + "flos": 21069960915240.0, + "grad_norm": 1.9872370494238478, + "language_loss": 0.75532019, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.7788074, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.12902832, + "step": 16396, + "time_per_iteration": 2.902221441268921 + }, + { + "auxiliary_loss_clip": 0.01324007, + "auxiliary_loss_mlp": 0.01026645, + "balance_loss_clip": 1.21804333, + "balance_loss_mlp": 1.01387739, + "epoch": 0.9858409739966932, + "flos": 25306854146280.0, + "grad_norm": 1.4241628178245556, + "language_loss": 0.71047461, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73398113, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12768555, + "step": 16397, + "time_per_iteration": 2.852844476699829 + }, + { + "auxiliary_loss_clip": 0.01314888, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.21396327, + "balance_loss_mlp": 1.0148977, + "epoch": 0.9859010972493611, + "flos": 20563690577760.0, + "grad_norm": 1.66303483735624, + "language_loss": 0.71479273, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73820651, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.11602783, + "step": 16398, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.01322269, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.21797323, + "balance_loss_mlp": 1.02117276, + "epoch": 0.9859612205020292, + "flos": 24760520338680.0, + "grad_norm": 1.3725402080494782, + "language_loss": 0.7439425, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76750112, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.12432861, + "step": 16399, + "time_per_iteration": 2.9028165340423584 + }, + { + "auxiliary_loss_clip": 0.01324635, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.21644795, + "balance_loss_mlp": 1.01667833, + "epoch": 0.9860213437546971, + "flos": 21110836552560.0, + "grad_norm": 1.857753220506106, + "language_loss": 0.57673401, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.60027784, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.13085938, + "step": 16400, + "time_per_iteration": 2.8436453342437744 + }, + { + "auxiliary_loss_clip": 0.01342357, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.22925377, + "balance_loss_mlp": 1.01544785, + "epoch": 0.9860814670073651, + "flos": 19140867393120.0, + "grad_norm": 1.6192102469433436, + "language_loss": 0.80485523, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82856679, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 1.13134766, + "router_z_loss_mlp": 0.13348389, + "step": 16401, + "time_per_iteration": 2.8627188205718994 + }, + { + "auxiliary_loss_clip": 0.01320041, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.21556306, + "balance_loss_mlp": 1.0157311, + "epoch": 0.9861415902600331, + "flos": 17241360384240.0, + "grad_norm": 1.6641083532393326, + "language_loss": 0.78503555, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80852604, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.1328125, + "step": 16402, + "time_per_iteration": 2.8849823474884033 + }, + { + "auxiliary_loss_clip": 0.01326429, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.21844852, + "balance_loss_mlp": 1.02184153, + "epoch": 0.986201713512701, + "flos": 27788413338360.0, + "grad_norm": 1.6908902305712474, + "language_loss": 0.70533586, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72894299, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12451172, + "step": 16403, + "time_per_iteration": 2.8882603645324707 + }, + { + "auxiliary_loss_clip": 0.01323882, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.21792555, + "balance_loss_mlp": 1.01707006, + "epoch": 0.986261836765369, + "flos": 28736258249880.0, + "grad_norm": 1.7014997143833308, + "language_loss": 0.74563289, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76916641, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12420654, + "step": 16404, + "time_per_iteration": 2.8683905601501465 + }, + { + "auxiliary_loss_clip": 0.01329461, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.22092223, + "balance_loss_mlp": 1.01784587, + "epoch": 0.986321960018037, + "flos": 34320152211480.0, + "grad_norm": 1.7043759026896923, + "language_loss": 0.70450097, + "learning_rate": 1.953666699415768e-09, + "loss": 0.72809559, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1217041, + "step": 16405, + "time_per_iteration": 2.8866488933563232 + }, + { + "auxiliary_loss_clip": 0.01319557, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.21628392, + "balance_loss_mlp": 1.018327, + "epoch": 0.986382083270705, + "flos": 25194947841360.0, + "grad_norm": 1.604902256911194, + "language_loss": 0.70080233, + "learning_rate": 1.93649446302846e-09, + "loss": 0.72429681, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.11566162, + "step": 16406, + "time_per_iteration": 2.865577220916748 + }, + { + "auxiliary_loss_clip": 0.01317772, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.21335673, + "balance_loss_mlp": 1.01533341, + "epoch": 0.9864422065233729, + "flos": 11028284914680.0, + "grad_norm": 3.0887762023158034, + "language_loss": 0.75629926, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.77975225, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.12200928, + "step": 16407, + "time_per_iteration": 2.8710434436798096 + }, + { + "auxiliary_loss_clip": 0.01325465, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.21935725, + "balance_loss_mlp": 1.01860178, + "epoch": 0.9865023297760409, + "flos": 16550041439520.0, + "grad_norm": 1.8131881516702717, + "language_loss": 0.77725208, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.8008225, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.12982178, + "step": 16408, + "time_per_iteration": 2.8016130924224854 + }, + { + "auxiliary_loss_clip": 0.01334615, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.22342587, + "balance_loss_mlp": 1.01819324, + "epoch": 0.9865624530287088, + "flos": 18884869335000.0, + "grad_norm": 1.5364060631964411, + "language_loss": 0.67835611, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70201594, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.13165283, + "step": 16409, + "time_per_iteration": 2.796372175216675 + }, + { + "auxiliary_loss_clip": 0.01141493, + "auxiliary_loss_mlp": 0.01006272, + "balance_loss_clip": 1.09883308, + "balance_loss_mlp": 1.00356627, + "epoch": 0.9866225762813768, + "flos": 68901906434760.0, + "grad_norm": 0.7998223747119635, + "language_loss": 0.61111838, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63259596, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02709961, + "step": 16410, + "time_per_iteration": 3.331789493560791 + }, + { + "auxiliary_loss_clip": 0.01328682, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.22119153, + "balance_loss_mlp": 1.01693535, + "epoch": 0.9866826995340447, + "flos": 29029111717680.0, + "grad_norm": 2.2816321689099186, + "language_loss": 0.66702747, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.69060916, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.12554932, + "step": 16411, + "time_per_iteration": 2.9215033054351807 + }, + { + "auxiliary_loss_clip": 0.0114316, + "auxiliary_loss_mlp": 0.01005393, + "balance_loss_clip": 1.10072744, + "balance_loss_mlp": 1.00284183, + "epoch": 0.9867428227867128, + "flos": 65394138531600.0, + "grad_norm": 0.7307126964998448, + "language_loss": 0.56314385, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58462936, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.0255127, + "step": 16412, + "time_per_iteration": 3.3707690238952637 + }, + { + "auxiliary_loss_clip": 0.01339642, + "auxiliary_loss_mlp": 0.01026021, + "balance_loss_clip": 1.22826505, + "balance_loss_mlp": 1.01249099, + "epoch": 0.9868029460393807, + "flos": 26512077800160.0, + "grad_norm": 1.6180124375481755, + "language_loss": 0.73527908, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75893569, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.13525391, + "step": 16413, + "time_per_iteration": 2.8392770290374756 + }, + { + "auxiliary_loss_clip": 0.01327113, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.2181468, + "balance_loss_mlp": 1.01590431, + "epoch": 0.9868630692920487, + "flos": 22972594767840.0, + "grad_norm": 1.3663853589556751, + "language_loss": 0.71622598, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73978221, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12609863, + "step": 16414, + "time_per_iteration": 2.843299150466919 + }, + { + "auxiliary_loss_clip": 0.01319414, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.21679139, + "balance_loss_mlp": 1.0187906, + "epoch": 0.9869231925447167, + "flos": 19833485805360.0, + "grad_norm": 1.6857978095065216, + "language_loss": 0.70679921, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.73030037, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.11914062, + "step": 16415, + "time_per_iteration": 2.8530800342559814 + }, + { + "auxiliary_loss_clip": 0.01309643, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.20838499, + "balance_loss_mlp": 1.018641, + "epoch": 0.9869833157973846, + "flos": 20200293742680.0, + "grad_norm": 4.485946946723115, + "language_loss": 0.75413674, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77753425, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.11456299, + "step": 16416, + "time_per_iteration": 2.843205690383911 + }, + { + "auxiliary_loss_clip": 0.01320858, + "auxiliary_loss_mlp": 0.01022711, + "balance_loss_clip": 1.21586919, + "balance_loss_mlp": 1.01039135, + "epoch": 0.9870434390500527, + "flos": 16102822303440.0, + "grad_norm": 1.8114219615463798, + "language_loss": 0.70588338, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.7293191, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.12335205, + "step": 16417, + "time_per_iteration": 4.208235740661621 + }, + { + "auxiliary_loss_clip": 0.01336589, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.22553599, + "balance_loss_mlp": 1.01658893, + "epoch": 0.9871035623027206, + "flos": 21765584346120.0, + "grad_norm": 1.440945540508972, + "language_loss": 0.70702434, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.73068428, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12823486, + "step": 16418, + "time_per_iteration": 2.8533356189727783 + }, + { + "auxiliary_loss_clip": 0.01144873, + "auxiliary_loss_mlp": 0.01010224, + "balance_loss_clip": 1.10202849, + "balance_loss_mlp": 1.00787568, + "epoch": 0.9871636855553886, + "flos": 70236254338200.0, + "grad_norm": 0.7091066918757248, + "language_loss": 0.53673857, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55828953, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.0234375, + "step": 16419, + "time_per_iteration": 4.7931482791900635 + }, + { + "auxiliary_loss_clip": 0.01334363, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.22228324, + "balance_loss_mlp": 1.01938939, + "epoch": 0.9872238088080565, + "flos": 25051262171760.0, + "grad_norm": 1.6194745417461134, + "language_loss": 0.7807126, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80439055, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.140625, + "step": 16420, + "time_per_iteration": 4.315997362136841 + }, + { + "auxiliary_loss_clip": 0.01318367, + "auxiliary_loss_mlp": 0.01026347, + "balance_loss_clip": 1.21435952, + "balance_loss_mlp": 1.01350808, + "epoch": 0.9872839320607245, + "flos": 19470860529120.0, + "grad_norm": 1.56441769468525, + "language_loss": 0.7099396, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73338675, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.12841797, + "step": 16421, + "time_per_iteration": 2.85030460357666 + }, + { + "auxiliary_loss_clip": 0.01332499, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.22252226, + "balance_loss_mlp": 1.01963449, + "epoch": 0.9873440553133924, + "flos": 26948576329200.0, + "grad_norm": 2.205300521480762, + "language_loss": 0.82049614, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84415102, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.13342285, + "step": 16422, + "time_per_iteration": 2.805830478668213 + }, + { + "auxiliary_loss_clip": 0.0132386, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.21934819, + "balance_loss_mlp": 1.01567793, + "epoch": 0.9874041785660604, + "flos": 19066872315240.0, + "grad_norm": 1.620132283576041, + "language_loss": 0.8626374, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88615775, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.125, + "step": 16423, + "time_per_iteration": 2.725679874420166 + }, + { + "auxiliary_loss_clip": 0.01327874, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.22026539, + "balance_loss_mlp": 1.01637435, + "epoch": 0.9874643018187284, + "flos": 21110795944200.0, + "grad_norm": 1.8032307636149705, + "language_loss": 0.70783633, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.73140669, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12792969, + "step": 16424, + "time_per_iteration": 4.228453874588013 + }, + { + "auxiliary_loss_clip": 0.01334303, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.22536612, + "balance_loss_mlp": 1.0176487, + "epoch": 0.9875244250713964, + "flos": 24431461194960.0, + "grad_norm": 1.931189910275384, + "language_loss": 0.8060506, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82969856, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12835693, + "step": 16425, + "time_per_iteration": 2.7958972454071045 + }, + { + "auxiliary_loss_clip": 0.01329897, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.2210598, + "balance_loss_mlp": 1.01694489, + "epoch": 0.9875845483240643, + "flos": 25122901964760.0, + "grad_norm": 1.917604099024668, + "language_loss": 0.80106342, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82467198, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.14001465, + "step": 16426, + "time_per_iteration": 2.7777724266052246 + }, + { + "auxiliary_loss_clip": 0.01323888, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.21814334, + "balance_loss_mlp": 1.01742101, + "epoch": 0.9876446715767323, + "flos": 16586206507080.0, + "grad_norm": 1.7287530297889773, + "language_loss": 0.85428047, + "learning_rate": 1.593380599750338e-09, + "loss": 0.87781507, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.121521, + "step": 16427, + "time_per_iteration": 2.8668081760406494 + }, + { + "auxiliary_loss_clip": 0.01324315, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.21905708, + "balance_loss_mlp": 1.01849532, + "epoch": 0.9877047948294003, + "flos": 21621127117680.0, + "grad_norm": 1.8847110991648108, + "language_loss": 0.7072103, + "learning_rate": 1.577875377599458e-09, + "loss": 0.73076099, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12249756, + "step": 16428, + "time_per_iteration": 2.7344162464141846 + }, + { + "auxiliary_loss_clip": 0.01319374, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.2147336, + "balance_loss_mlp": 1.02091646, + "epoch": 0.9877649180820682, + "flos": 21183450946200.0, + "grad_norm": 1.9293552176753077, + "language_loss": 0.80424619, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82776976, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.1206665, + "step": 16429, + "time_per_iteration": 2.7849090099334717 + }, + { + "auxiliary_loss_clip": 0.01323667, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.21785378, + "balance_loss_mlp": 1.0169313, + "epoch": 0.9878250413347363, + "flos": 39756543317640.0, + "grad_norm": 1.4779567713908055, + "language_loss": 0.62725091, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.65077782, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12084961, + "step": 16430, + "time_per_iteration": 3.038891553878784 + }, + { + "auxiliary_loss_clip": 0.01324952, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.21715164, + "balance_loss_mlp": 1.01850581, + "epoch": 0.9878851645874042, + "flos": 29431841072400.0, + "grad_norm": 1.3579613609010361, + "language_loss": 0.73387325, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75743675, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.12890625, + "step": 16431, + "time_per_iteration": 2.98405122756958 + }, + { + "auxiliary_loss_clip": 0.01325023, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.21822691, + "balance_loss_mlp": 1.02163029, + "epoch": 0.9879452878400722, + "flos": 15809359710240.0, + "grad_norm": 2.922103960087626, + "language_loss": 0.81560183, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.83919966, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.13128662, + "step": 16432, + "time_per_iteration": 2.707697868347168 + }, + { + "auxiliary_loss_clip": 0.0132208, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.21619916, + "balance_loss_mlp": 1.01447487, + "epoch": 0.9880054110927401, + "flos": 22238654026320.0, + "grad_norm": 1.4424390555635762, + "language_loss": 0.80943394, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.83292365, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.12420654, + "step": 16433, + "time_per_iteration": 2.772278070449829 + }, + { + "auxiliary_loss_clip": 0.01322285, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.21731544, + "balance_loss_mlp": 1.01736712, + "epoch": 0.9880655343454081, + "flos": 28769296473720.0, + "grad_norm": 2.734819309739918, + "language_loss": 0.65045989, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67398304, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.12670898, + "step": 16434, + "time_per_iteration": 2.833397388458252 + }, + { + "auxiliary_loss_clip": 0.01331065, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.22144055, + "balance_loss_mlp": 1.01694059, + "epoch": 0.988125657598076, + "flos": 32859783275040.0, + "grad_norm": 1.4494267105853207, + "language_loss": 0.6922518, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.7158637, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.13195801, + "step": 16435, + "time_per_iteration": 2.8777287006378174 + }, + { + "auxiliary_loss_clip": 0.01324233, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.21860421, + "balance_loss_mlp": 1.01533818, + "epoch": 0.988185780850744, + "flos": 19395403550280.0, + "grad_norm": 1.5760306943509452, + "language_loss": 0.75914305, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.78266644, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.12768555, + "step": 16436, + "time_per_iteration": 2.821779727935791 + }, + { + "auxiliary_loss_clip": 0.01324714, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.21795559, + "balance_loss_mlp": 1.01726007, + "epoch": 0.988245904103412, + "flos": 22533537912120.0, + "grad_norm": 1.972663913237587, + "language_loss": 0.7459268, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76947987, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.13323975, + "step": 16437, + "time_per_iteration": 2.7890520095825195 + }, + { + "auxiliary_loss_clip": 0.01313069, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.20988655, + "balance_loss_mlp": 1.01957035, + "epoch": 0.98830602735608, + "flos": 28665268190640.0, + "grad_norm": 1.6066405101374748, + "language_loss": 0.60065669, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62410825, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.12524414, + "step": 16438, + "time_per_iteration": 2.852299928665161 + }, + { + "auxiliary_loss_clip": 0.01325414, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.21890628, + "balance_loss_mlp": 1.01753986, + "epoch": 0.9883661506087479, + "flos": 21000960665640.0, + "grad_norm": 1.7598366894202995, + "language_loss": 0.72000873, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74356604, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.12774658, + "step": 16439, + "time_per_iteration": 2.760312080383301 + }, + { + "auxiliary_loss_clip": 0.01323262, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.21815896, + "balance_loss_mlp": 1.02132797, + "epoch": 0.9884262738614159, + "flos": 32712321027960.0, + "grad_norm": 1.5686714269952071, + "language_loss": 0.60428488, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62785697, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12615967, + "step": 16440, + "time_per_iteration": 2.8553812503814697 + }, + { + "auxiliary_loss_clip": 0.01331401, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.22119653, + "balance_loss_mlp": 1.01654565, + "epoch": 0.9884863971140839, + "flos": 17568957627000.0, + "grad_norm": 2.4077192454705765, + "language_loss": 0.76769108, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.79129553, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.12493896, + "step": 16441, + "time_per_iteration": 2.7982239723205566 + }, + { + "auxiliary_loss_clip": 0.01329829, + "auxiliary_loss_mlp": 0.01027886, + "balance_loss_clip": 1.22181046, + "balance_loss_mlp": 1.01536298, + "epoch": 0.9885465203667518, + "flos": 40560012217440.0, + "grad_norm": 1.6702980496975628, + "language_loss": 0.67610061, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.69967771, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.12518311, + "step": 16442, + "time_per_iteration": 3.1143932342529297 + }, + { + "auxiliary_loss_clip": 0.01317473, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.21291411, + "balance_loss_mlp": 1.01783252, + "epoch": 0.9886066436194199, + "flos": 13811915672280.0, + "grad_norm": 5.962767725509349, + "language_loss": 0.75094855, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.77442807, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12658691, + "step": 16443, + "time_per_iteration": 2.7073848247528076 + }, + { + "auxiliary_loss_clip": 0.01325605, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.21748269, + "balance_loss_mlp": 1.01450896, + "epoch": 0.9886667668720878, + "flos": 23329575482040.0, + "grad_norm": 2.334501684777599, + "language_loss": 0.74232507, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.76585811, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13195801, + "step": 16444, + "time_per_iteration": 2.7731924057006836 + }, + { + "auxiliary_loss_clip": 0.01322419, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.2179507, + "balance_loss_mlp": 1.01802111, + "epoch": 0.9887268901247558, + "flos": 22710546064080.0, + "grad_norm": 1.7642648083559984, + "language_loss": 0.69203258, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71555799, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.12121582, + "step": 16445, + "time_per_iteration": 2.8148467540740967 + }, + { + "auxiliary_loss_clip": 0.0132831, + "auxiliary_loss_mlp": 0.01025688, + "balance_loss_clip": 1.22076476, + "balance_loss_mlp": 1.01205039, + "epoch": 0.9887870133774237, + "flos": 13045058532000.0, + "grad_norm": 2.7194480999352257, + "language_loss": 0.60664248, + "learning_rate": 1.311740377491155e-09, + "loss": 0.6301825, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.13647461, + "step": 16446, + "time_per_iteration": 2.7078919410705566 + }, + { + "auxiliary_loss_clip": 0.01320325, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.21457243, + "balance_loss_mlp": 1.01988494, + "epoch": 0.9888471366300917, + "flos": 15163099063920.0, + "grad_norm": 1.8904263789111808, + "language_loss": 0.71407545, + "learning_rate": 1.297675079582783e-09, + "loss": 0.73759532, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.11773682, + "step": 16447, + "time_per_iteration": 2.728607177734375 + }, + { + "auxiliary_loss_clip": 0.01324043, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.21816528, + "balance_loss_mlp": 1.01751077, + "epoch": 0.9889072598827596, + "flos": 25124242040640.0, + "grad_norm": 3.5354458874940202, + "language_loss": 0.83868468, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.86221981, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.11950684, + "step": 16448, + "time_per_iteration": 2.7924575805664062 + }, + { + "auxiliary_loss_clip": 0.01320226, + "auxiliary_loss_mlp": 0.01025752, + "balance_loss_clip": 1.215837, + "balance_loss_mlp": 1.01467776, + "epoch": 0.9889673831354276, + "flos": 16733222062200.0, + "grad_norm": 1.9599935300296136, + "language_loss": 0.70248926, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72594905, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.11071777, + "step": 16449, + "time_per_iteration": 2.7049143314361572 + }, + { + "auxiliary_loss_clip": 0.01330608, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.22075915, + "balance_loss_mlp": 1.0162673, + "epoch": 0.9890275063880956, + "flos": 35591330487960.0, + "grad_norm": 1.8635390429542706, + "language_loss": 0.74418283, + "learning_rate": 1.25593393393153e-09, + "loss": 0.76777977, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.12823486, + "step": 16450, + "time_per_iteration": 2.8645431995391846 + }, + { + "auxiliary_loss_clip": 0.01326106, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.2157681, + "balance_loss_mlp": 1.01505947, + "epoch": 0.9890876296407636, + "flos": 18956955819960.0, + "grad_norm": 1.5700846307855743, + "language_loss": 0.79231417, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81585336, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.12756348, + "step": 16451, + "time_per_iteration": 2.7821953296661377 + }, + { + "auxiliary_loss_clip": 0.01331335, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.22056985, + "balance_loss_mlp": 1.01690388, + "epoch": 0.9891477528934315, + "flos": 23774642375040.0, + "grad_norm": 2.3830147592565707, + "language_loss": 0.70466089, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72827756, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.13421631, + "step": 16452, + "time_per_iteration": 2.762179136276245 + }, + { + "auxiliary_loss_clip": 0.0131632, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.21334624, + "balance_loss_mlp": 1.01688898, + "epoch": 0.9892078761460995, + "flos": 20777269880880.0, + "grad_norm": 1.964167048633735, + "language_loss": 0.74320126, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76664782, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.11450195, + "step": 16453, + "time_per_iteration": 2.8354170322418213 + }, + { + "auxiliary_loss_clip": 0.01326582, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.21769536, + "balance_loss_mlp": 1.02125275, + "epoch": 0.9892679993987675, + "flos": 23373090662760.0, + "grad_norm": 1.7998488780711104, + "language_loss": 0.69847703, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.72207493, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.11956787, + "step": 16454, + "time_per_iteration": 2.925140380859375 + }, + { + "auxiliary_loss_clip": 0.01315332, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.21204138, + "balance_loss_mlp": 1.01619744, + "epoch": 0.9893281226514354, + "flos": 22709855721960.0, + "grad_norm": 1.9242724189851126, + "language_loss": 0.75554371, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.7789765, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.11755371, + "step": 16455, + "time_per_iteration": 4.237555265426636 + }, + { + "auxiliary_loss_clip": 0.0131367, + "auxiliary_loss_mlp": 0.01022968, + "balance_loss_clip": 1.20954013, + "balance_loss_mlp": 1.01024866, + "epoch": 0.9893882459041035, + "flos": 21801424546800.0, + "grad_norm": 1.5709838987034737, + "language_loss": 0.65691054, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.68027687, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.12695312, + "step": 16456, + "time_per_iteration": 2.844841718673706 + }, + { + "auxiliary_loss_clip": 0.01334595, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.22556758, + "balance_loss_mlp": 1.01682162, + "epoch": 0.9894483691567714, + "flos": 18118337061600.0, + "grad_norm": 1.6613098283308132, + "language_loss": 0.74457598, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76821315, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12316895, + "step": 16457, + "time_per_iteration": 2.7582602500915527 + }, + { + "auxiliary_loss_clip": 0.01329286, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.22080004, + "balance_loss_mlp": 1.01909828, + "epoch": 0.9895084924094394, + "flos": 31217817441960.0, + "grad_norm": 2.1903124345906635, + "language_loss": 0.69318277, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.71679056, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1239624, + "step": 16458, + "time_per_iteration": 5.799431800842285 + }, + { + "auxiliary_loss_clip": 0.01317969, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.21269894, + "balance_loss_mlp": 1.0178082, + "epoch": 0.9895686156621073, + "flos": 19682206372440.0, + "grad_norm": 1.6820545978030639, + "language_loss": 0.79250991, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81599391, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12615967, + "step": 16459, + "time_per_iteration": 2.843001365661621 + }, + { + "auxiliary_loss_clip": 0.01331366, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.22174728, + "balance_loss_mlp": 1.01892662, + "epoch": 0.9896287389147753, + "flos": 23586304490640.0, + "grad_norm": 1.813860463972097, + "language_loss": 0.70913857, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73277199, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.1305542, + "step": 16460, + "time_per_iteration": 2.8050267696380615 + }, + { + "auxiliary_loss_clip": 0.01327553, + "auxiliary_loss_mlp": 0.01024839, + "balance_loss_clip": 1.21878099, + "balance_loss_mlp": 1.01169038, + "epoch": 0.9896888621674432, + "flos": 29611204509240.0, + "grad_norm": 1.500517426821927, + "language_loss": 0.87468469, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89820856, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.13165283, + "step": 16461, + "time_per_iteration": 2.8759939670562744 + }, + { + "auxiliary_loss_clip": 0.01325083, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.21832526, + "balance_loss_mlp": 1.01975775, + "epoch": 0.9897489854201112, + "flos": 23700200605200.0, + "grad_norm": 2.2934855883852445, + "language_loss": 0.62963557, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65321976, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.13580322, + "step": 16462, + "time_per_iteration": 2.807891368865967 + }, + { + "auxiliary_loss_clip": 0.0132579, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.21979165, + "balance_loss_mlp": 1.01527619, + "epoch": 0.9898091086727792, + "flos": 13229132538600.0, + "grad_norm": 1.7361956971815065, + "language_loss": 0.73170173, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75523949, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12719727, + "step": 16463, + "time_per_iteration": 4.293610572814941 + }, + { + "auxiliary_loss_clip": 0.01328668, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.22065866, + "balance_loss_mlp": 1.01966774, + "epoch": 0.9898692319254472, + "flos": 22935658141440.0, + "grad_norm": 1.6870081759739062, + "language_loss": 0.6994983, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72311473, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.13317871, + "step": 16464, + "time_per_iteration": 2.8547921180725098 + }, + { + "auxiliary_loss_clip": 0.01327384, + "auxiliary_loss_mlp": 0.0102789, + "balance_loss_clip": 1.21908712, + "balance_loss_mlp": 1.01534915, + "epoch": 0.9899293551781151, + "flos": 12462153573240.0, + "grad_norm": 2.400126962910346, + "language_loss": 0.73480797, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75836068, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12524414, + "step": 16465, + "time_per_iteration": 2.8258306980133057 + }, + { + "auxiliary_loss_clip": 0.0131901, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.21417594, + "balance_loss_mlp": 1.01871765, + "epoch": 0.9899894784307831, + "flos": 26876936536200.0, + "grad_norm": 1.6704637091463745, + "language_loss": 0.86715579, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.8906486, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.11560059, + "step": 16466, + "time_per_iteration": 2.8520143032073975 + }, + { + "auxiliary_loss_clip": 0.01319906, + "auxiliary_loss_mlp": 0.01024122, + "balance_loss_clip": 1.21406019, + "balance_loss_mlp": 1.01142025, + "epoch": 0.990049601683451, + "flos": 21546928998000.0, + "grad_norm": 1.6741221769395696, + "language_loss": 0.72036499, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.74380529, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 1.05908203, + "router_z_loss_mlp": 0.12701416, + "step": 16467, + "time_per_iteration": 2.791292905807495 + }, + { + "auxiliary_loss_clip": 0.01320131, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.21423984, + "balance_loss_mlp": 1.01859403, + "epoch": 0.990109724936119, + "flos": 28778311529640.0, + "grad_norm": 1.2537992212395819, + "language_loss": 0.65048873, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67399621, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.12036133, + "step": 16468, + "time_per_iteration": 2.8529279232025146 + }, + { + "auxiliary_loss_clip": 0.01334488, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.22253752, + "balance_loss_mlp": 1.01928782, + "epoch": 0.9901698481887871, + "flos": 29279749472280.0, + "grad_norm": 2.534350847574575, + "language_loss": 0.61780977, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64147389, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.12646484, + "step": 16469, + "time_per_iteration": 2.847245931625366 + }, + { + "auxiliary_loss_clip": 0.01329563, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.22111917, + "balance_loss_mlp": 1.01574278, + "epoch": 0.990229971441455, + "flos": 15963035036400.0, + "grad_norm": 5.665104241737443, + "language_loss": 0.70973885, + "learning_rate": 9.950925847685976e-10, + "loss": 0.73331845, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12664795, + "step": 16470, + "time_per_iteration": 2.774273157119751 + }, + { + "auxiliary_loss_clip": 0.01143261, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.10054636, + "balance_loss_mlp": 0.99989474, + "epoch": 0.990290094694123, + "flos": 69796710182160.0, + "grad_norm": 0.6725463862390747, + "language_loss": 0.55578625, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57724345, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02563477, + "step": 16471, + "time_per_iteration": 3.530024528503418 + }, + { + "auxiliary_loss_clip": 0.01325686, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.21955884, + "balance_loss_mlp": 1.01998329, + "epoch": 0.9903502179467909, + "flos": 16256700671400.0, + "grad_norm": 2.0065166192149135, + "language_loss": 0.84665108, + "learning_rate": 9.706760407131032e-10, + "loss": 0.87023509, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.1272583, + "step": 16472, + "time_per_iteration": 2.8068759441375732 + }, + { + "auxiliary_loss_clip": 0.01327389, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.21917629, + "balance_loss_mlp": 1.01761627, + "epoch": 0.9904103411994589, + "flos": 21693294819360.0, + "grad_norm": 1.6824037413377053, + "language_loss": 0.85739326, + "learning_rate": 9.585814735431075e-10, + "loss": 0.88096386, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12060547, + "step": 16473, + "time_per_iteration": 2.929689884185791 + }, + { + "auxiliary_loss_clip": 0.01323275, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.21675515, + "balance_loss_mlp": 1.01835322, + "epoch": 0.9904704644521268, + "flos": 25745017618080.0, + "grad_norm": 1.7622391832112638, + "language_loss": 0.84888321, + "learning_rate": 9.465627102240859e-10, + "loss": 0.87241673, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.11730957, + "step": 16474, + "time_per_iteration": 2.8467860221862793 + }, + { + "auxiliary_loss_clip": 0.01322649, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.21625161, + "balance_loss_mlp": 1.02045298, + "epoch": 0.9905305877047949, + "flos": 21913412068440.0, + "grad_norm": 1.633541512267741, + "language_loss": 0.77098501, + "learning_rate": 9.346197512116738e-10, + "loss": 0.7945329, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.11688232, + "step": 16475, + "time_per_iteration": 2.931053638458252 + }, + { + "auxiliary_loss_clip": 0.01328416, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.22000027, + "balance_loss_mlp": 1.01528835, + "epoch": 0.9905907109574628, + "flos": 21396745990800.0, + "grad_norm": 1.4861870481283663, + "language_loss": 0.7565521, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78012085, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.13153076, + "step": 16476, + "time_per_iteration": 2.847174644470215 + }, + { + "auxiliary_loss_clip": 0.01339803, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.22662473, + "balance_loss_mlp": 1.01547456, + "epoch": 0.9906508342101308, + "flos": 20526469692840.0, + "grad_norm": 1.8683731742200749, + "language_loss": 0.6735059, + "learning_rate": 9.109612479154538e-10, + "loss": 0.6971969, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13818359, + "step": 16477, + "time_per_iteration": 2.9959287643432617 + }, + { + "auxiliary_loss_clip": 0.01336942, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.22536659, + "balance_loss_mlp": 1.01903462, + "epoch": 0.9907109574627987, + "flos": 21366225485280.0, + "grad_norm": 1.7321071276199664, + "language_loss": 0.72432774, + "learning_rate": 8.992457045289282e-10, + "loss": 0.74803352, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14587402, + "step": 16478, + "time_per_iteration": 3.0154433250427246 + }, + { + "auxiliary_loss_clip": 0.0132865, + "auxiliary_loss_mlp": 0.01033542, + "balance_loss_clip": 1.22143674, + "balance_loss_mlp": 1.02011871, + "epoch": 0.9907710807154667, + "flos": 17341896348360.0, + "grad_norm": 2.675008839409499, + "language_loss": 0.81502783, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83864975, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.13427734, + "step": 16479, + "time_per_iteration": 2.811277151107788 + }, + { + "auxiliary_loss_clip": 0.01332811, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.22337162, + "balance_loss_mlp": 1.0192523, + "epoch": 0.9908312039681346, + "flos": 28627844263920.0, + "grad_norm": 1.5666421750060873, + "language_loss": 0.66375339, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68739688, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.12280273, + "step": 16480, + "time_per_iteration": 2.984492778778076 + }, + { + "auxiliary_loss_clip": 0.01323933, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.21834302, + "balance_loss_mlp": 1.01741898, + "epoch": 0.9908913272208026, + "flos": 35777231870760.0, + "grad_norm": 1.7396424881008254, + "language_loss": 0.72872698, + "learning_rate": 8.645539127374313e-10, + "loss": 0.75226831, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.12768555, + "step": 16481, + "time_per_iteration": 3.0446596145629883 + }, + { + "auxiliary_loss_clip": 0.01318374, + "auxiliary_loss_mlp": 0.01026781, + "balance_loss_clip": 1.2133826, + "balance_loss_mlp": 1.01514602, + "epoch": 0.9909514504734707, + "flos": 19907074799640.0, + "grad_norm": 2.611533212080421, + "language_loss": 0.77903068, + "learning_rate": 8.531415963912713e-10, + "loss": 0.80248213, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.11633301, + "step": 16482, + "time_per_iteration": 2.8038249015808105 + }, + { + "auxiliary_loss_clip": 0.01332202, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.22284102, + "balance_loss_mlp": 1.01756668, + "epoch": 0.9910115737261386, + "flos": 20008788406200.0, + "grad_norm": 1.8967164256925209, + "language_loss": 0.75585657, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77947664, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.12237549, + "step": 16483, + "time_per_iteration": 2.8924901485443115 + }, + { + "auxiliary_loss_clip": 0.01144435, + "auxiliary_loss_mlp": 0.01005411, + "balance_loss_clip": 1.10143638, + "balance_loss_mlp": 1.00259805, + "epoch": 0.9910716969788066, + "flos": 70704451015200.0, + "grad_norm": 0.6852176774467097, + "language_loss": 0.53704274, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55854118, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02807617, + "step": 16484, + "time_per_iteration": 3.4250295162200928 + }, + { + "auxiliary_loss_clip": 0.01318345, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.21512651, + "balance_loss_mlp": 1.01905835, + "epoch": 0.9911318202314745, + "flos": 21439042920720.0, + "grad_norm": 1.9353613904609068, + "language_loss": 0.81975949, + "learning_rate": 8.19359496165184e-10, + "loss": 0.843252, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.11859131, + "step": 16485, + "time_per_iteration": 2.8414254188537598 + }, + { + "auxiliary_loss_clip": 0.01320625, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.21615171, + "balance_loss_mlp": 1.02484703, + "epoch": 0.9911919434841425, + "flos": 19831374170640.0, + "grad_norm": 1.5578522904427552, + "language_loss": 0.81468987, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83827883, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.13409424, + "step": 16486, + "time_per_iteration": 2.9294633865356445 + }, + { + "auxiliary_loss_clip": 0.0133045, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.22145629, + "balance_loss_mlp": 1.0158999, + "epoch": 0.9912520667368104, + "flos": 41726918560680.0, + "grad_norm": 1.4257556124404742, + "language_loss": 0.66338587, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68697673, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12750244, + "step": 16487, + "time_per_iteration": 2.9822614192962646 + }, + { + "auxiliary_loss_clip": 0.01317994, + "auxiliary_loss_mlp": 0.01026574, + "balance_loss_clip": 1.2138133, + "balance_loss_mlp": 1.01523149, + "epoch": 0.9913121899894785, + "flos": 23775048458640.0, + "grad_norm": 1.566267490659694, + "language_loss": 0.77059782, + "learning_rate": 7.862596780936481e-10, + "loss": 0.79404348, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.11340332, + "step": 16488, + "time_per_iteration": 2.916106700897217 + }, + { + "auxiliary_loss_clip": 0.01339849, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.22671592, + "balance_loss_mlp": 1.01402962, + "epoch": 0.9913723132421464, + "flos": 23775048458640.0, + "grad_norm": 2.1929693875256784, + "language_loss": 0.68584561, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70951927, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.13494873, + "step": 16489, + "time_per_iteration": 2.9618918895721436 + }, + { + "auxiliary_loss_clip": 0.01141491, + "auxiliary_loss_mlp": 0.01001813, + "balance_loss_clip": 1.098454, + "balance_loss_mlp": 0.998761, + "epoch": 0.9914324364948144, + "flos": 71282889054360.0, + "grad_norm": 0.6100386258576932, + "language_loss": 0.52594149, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54737449, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.03051758, + "step": 16490, + "time_per_iteration": 3.4844846725463867 + }, + { + "auxiliary_loss_clip": 0.01337565, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.22606564, + "balance_loss_mlp": 1.02002263, + "epoch": 0.9914925597474823, + "flos": 23700647297160.0, + "grad_norm": 1.5044435167618646, + "language_loss": 0.75677919, + "learning_rate": 7.538421534734052e-10, + "loss": 0.7804907, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.13562012, + "step": 16491, + "time_per_iteration": 2.9804904460906982 + }, + { + "auxiliary_loss_clip": 0.01333189, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.22352958, + "balance_loss_mlp": 1.01878405, + "epoch": 0.9915526830001503, + "flos": 13436498762640.0, + "grad_norm": 2.030094735329153, + "language_loss": 0.70476413, + "learning_rate": 7.431879346191383e-10, + "loss": 0.7284162, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13256836, + "step": 16492, + "time_per_iteration": 2.8855855464935303 + }, + { + "auxiliary_loss_clip": 0.01324327, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.21716547, + "balance_loss_mlp": 1.0164727, + "epoch": 0.9916128062528182, + "flos": 20745896599800.0, + "grad_norm": 1.830758377171298, + "language_loss": 0.68346703, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70700729, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.13214111, + "step": 16493, + "time_per_iteration": 3.07249116897583 + }, + { + "auxiliary_loss_clip": 0.01333344, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.22252166, + "balance_loss_mlp": 1.02241302, + "epoch": 0.9916729295054862, + "flos": 22491565849080.0, + "grad_norm": 1.7163578772915709, + "language_loss": 0.71564269, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73933172, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.13146973, + "step": 16494, + "time_per_iteration": 2.9231789112091064 + }, + { + "auxiliary_loss_clip": 0.01331136, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.22122002, + "balance_loss_mlp": 1.0201323, + "epoch": 0.9917330527581543, + "flos": 14796453560040.0, + "grad_norm": 2.059107210445667, + "language_loss": 0.67973453, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70338827, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14123535, + "step": 16495, + "time_per_iteration": 5.812985420227051 + }, + { + "auxiliary_loss_clip": 0.01142408, + "auxiliary_loss_mlp": 0.01002509, + "balance_loss_clip": 1.09950757, + "balance_loss_mlp": 0.99967211, + "epoch": 0.9917931760108222, + "flos": 59205614138640.0, + "grad_norm": 0.7236939813695421, + "language_loss": 0.5352875, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55673665, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02832031, + "step": 16496, + "time_per_iteration": 3.4603495597839355 + }, + { + "auxiliary_loss_clip": 0.01327168, + "auxiliary_loss_mlp": 0.0102898, + "balance_loss_clip": 1.21822679, + "balance_loss_mlp": 1.01534283, + "epoch": 0.9918532992634902, + "flos": 26767426124520.0, + "grad_norm": 3.769439902390012, + "language_loss": 0.71823472, + "learning_rate": 6.91054028607585e-10, + "loss": 0.7417962, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.13653564, + "step": 16497, + "time_per_iteration": 4.444952726364136 + }, + { + "auxiliary_loss_clip": 0.01335465, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.22386813, + "balance_loss_mlp": 1.01583064, + "epoch": 0.9919134225161581, + "flos": 14979837224520.0, + "grad_norm": 2.05238603518557, + "language_loss": 0.82389617, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84754324, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.13427734, + "step": 16498, + "time_per_iteration": 2.860654354095459 + }, + { + "auxiliary_loss_clip": 0.01327354, + "auxiliary_loss_mlp": 0.01039042, + "balance_loss_clip": 1.21882045, + "balance_loss_mlp": 1.02611375, + "epoch": 0.9919735457688261, + "flos": 27823928672160.0, + "grad_norm": 1.6873322313444952, + "language_loss": 0.67946041, + "learning_rate": 6.707311614246869e-10, + "loss": 0.7031244, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.12945557, + "step": 16499, + "time_per_iteration": 2.9932305812835693 + }, + { + "auxiliary_loss_clip": 0.01332554, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.22331953, + "balance_loss_mlp": 1.01887083, + "epoch": 0.992033669021494, + "flos": 22567631953320.0, + "grad_norm": 1.950483062182569, + "language_loss": 0.82151294, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84515154, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12426758, + "step": 16500, + "time_per_iteration": 2.937537431716919 + }, + { + "auxiliary_loss_clip": 0.01329093, + "auxiliary_loss_mlp": 0.01025626, + "balance_loss_clip": 1.22043073, + "balance_loss_mlp": 1.01293659, + "epoch": 0.9920937922741621, + "flos": 25380118273680.0, + "grad_norm": 1.697384393968752, + "language_loss": 0.82239383, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84594101, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.12689209, + "step": 16501, + "time_per_iteration": 2.8931710720062256 + }, + { + "auxiliary_loss_clip": 0.0133038, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.2222898, + "balance_loss_mlp": 1.01754451, + "epoch": 0.99215391552683, + "flos": 22059574848000.0, + "grad_norm": 1.8953573015374028, + "language_loss": 0.77492142, + "learning_rate": 6.408154723420711e-10, + "loss": 0.79852587, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.12512207, + "step": 16502, + "time_per_iteration": 4.419483661651611 + }, + { + "auxiliary_loss_clip": 0.01340204, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.22975421, + "balance_loss_mlp": 1.01451802, + "epoch": 0.992214038779498, + "flos": 15418163129760.0, + "grad_norm": 2.2228881537131775, + "language_loss": 0.71814489, + "learning_rate": 6.309952072811597e-10, + "loss": 0.74182451, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.13250732, + "step": 16503, + "time_per_iteration": 2.926422357559204 + }, + { + "auxiliary_loss_clip": 0.01143708, + "auxiliary_loss_mlp": 0.01008063, + "balance_loss_clip": 1.10125446, + "balance_loss_mlp": 1.0055474, + "epoch": 0.9922741620321659, + "flos": 62030648442240.0, + "grad_norm": 0.6329571984013393, + "language_loss": 0.55161941, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57313704, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02514648, + "step": 16504, + "time_per_iteration": 3.5464508533477783 + }, + { + "auxiliary_loss_clip": 0.01327936, + "auxiliary_loss_mlp": 0.01024963, + "balance_loss_clip": 1.22068536, + "balance_loss_mlp": 1.01321507, + "epoch": 0.9923342852848339, + "flos": 17169964241400.0, + "grad_norm": 1.6368002493291283, + "language_loss": 0.69863552, + "learning_rate": 6.115821263481536e-10, + "loss": 0.72216451, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.11737061, + "step": 16505, + "time_per_iteration": 2.9055657386779785 + }, + { + "auxiliary_loss_clip": 0.0133448, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.2228651, + "balance_loss_mlp": 1.01698685, + "epoch": 0.9923944085375018, + "flos": 23188163880600.0, + "grad_norm": 1.836528212734023, + "language_loss": 0.66072679, + "learning_rate": 6.019893112119146e-10, + "loss": 0.68438244, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.14099121, + "step": 16506, + "time_per_iteration": 2.8351879119873047 + }, + { + "auxiliary_loss_clip": 0.0131925, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.21178222, + "balance_loss_mlp": 1.01730061, + "epoch": 0.9924545317901698, + "flos": 20818795251960.0, + "grad_norm": 1.8755857664376128, + "language_loss": 0.63223445, + "learning_rate": 5.924723134487219e-10, + "loss": 0.65573573, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.13592529, + "step": 16507, + "time_per_iteration": 2.8644614219665527 + }, + { + "auxiliary_loss_clip": 0.01329393, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.22158694, + "balance_loss_mlp": 1.02355397, + "epoch": 0.9925146550428379, + "flos": 20088184395960.0, + "grad_norm": 2.147082015184305, + "language_loss": 0.73010337, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75376427, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.13146973, + "step": 16508, + "time_per_iteration": 2.8364508152008057 + }, + { + "auxiliary_loss_clip": 0.01331513, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.22279167, + "balance_loss_mlp": 1.01509666, + "epoch": 0.9925747782955058, + "flos": 24979256903520.0, + "grad_norm": 2.8404999171524077, + "language_loss": 0.70425081, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72784877, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13189697, + "step": 16509, + "time_per_iteration": 2.9413530826568604 + }, + { + "auxiliary_loss_clip": 0.01327226, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.21785593, + "balance_loss_mlp": 1.01962686, + "epoch": 0.9926349015481738, + "flos": 60482460604320.0, + "grad_norm": 1.828171405738807, + "language_loss": 0.68492901, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70853764, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.14013672, + "step": 16510, + "time_per_iteration": 3.257232189178467 + }, + { + "auxiliary_loss_clip": 0.01330324, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.22169328, + "balance_loss_mlp": 1.01881909, + "epoch": 0.9926950248008417, + "flos": 20746749375360.0, + "grad_norm": 2.5260300027940836, + "language_loss": 0.8213774, + "learning_rate": 5.551625032997886e-10, + "loss": 0.8449989, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.13000488, + "step": 16511, + "time_per_iteration": 3.019681453704834 + }, + { + "auxiliary_loss_clip": 0.01320072, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.21397281, + "balance_loss_mlp": 1.01845753, + "epoch": 0.9927551480535097, + "flos": 24358359501000.0, + "grad_norm": 1.7180444373991273, + "language_loss": 0.92372906, + "learning_rate": 5.460245977570998e-10, + "loss": 0.94722885, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.11444092, + "step": 16512, + "time_per_iteration": 2.9798548221588135 + }, + { + "auxiliary_loss_clip": 0.0114074, + "auxiliary_loss_mlp": 0.01006798, + "balance_loss_clip": 1.0978421, + "balance_loss_mlp": 1.00398445, + "epoch": 0.9928152713061776, + "flos": 71292107152080.0, + "grad_norm": 0.7020823785611855, + "language_loss": 0.5526526, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57412803, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.02807617, + "step": 16513, + "time_per_iteration": 3.498561382293701 + }, + { + "auxiliary_loss_clip": 0.0131925, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.21279693, + "balance_loss_mlp": 1.01634741, + "epoch": 0.9928753945588457, + "flos": 57819142082160.0, + "grad_norm": 1.3823522961535302, + "language_loss": 0.65089893, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67437971, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12475586, + "step": 16514, + "time_per_iteration": 3.222551107406616 + }, + { + "auxiliary_loss_clip": 0.01333043, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.22360623, + "balance_loss_mlp": 1.0139904, + "epoch": 0.9929355178115136, + "flos": 19573183261080.0, + "grad_norm": 1.9774247080724423, + "language_loss": 0.72973788, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75334585, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.13757324, + "step": 16515, + "time_per_iteration": 2.8902130126953125 + }, + { + "auxiliary_loss_clip": 0.01329094, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.22140777, + "balance_loss_mlp": 1.0186336, + "epoch": 0.9929956410641816, + "flos": 22969630357560.0, + "grad_norm": 1.4953059327354288, + "language_loss": 0.77052486, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79412735, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12518311, + "step": 16516, + "time_per_iteration": 2.9525091648101807 + }, + { + "auxiliary_loss_clip": 0.01322912, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.21588337, + "balance_loss_mlp": 1.01617837, + "epoch": 0.9930557643168495, + "flos": 22571733397680.0, + "grad_norm": 1.407265594929451, + "language_loss": 0.78155613, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80506849, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.12145996, + "step": 16517, + "time_per_iteration": 3.0801498889923096 + }, + { + "auxiliary_loss_clip": 0.01339309, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.22748172, + "balance_loss_mlp": 1.0180105, + "epoch": 0.9931158875695175, + "flos": 17205398358480.0, + "grad_norm": 2.1839515055878906, + "language_loss": 0.67600858, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69972396, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 1.11865234, + "router_z_loss_mlp": 0.14208984, + "step": 16518, + "time_per_iteration": 3.0271496772766113 + }, + { + "auxiliary_loss_clip": 0.01143529, + "auxiliary_loss_mlp": 0.01007261, + "balance_loss_clip": 1.10126364, + "balance_loss_mlp": 1.00475812, + "epoch": 0.9931760108221854, + "flos": 63725208747480.0, + "grad_norm": 0.7295882490583837, + "language_loss": 0.53465927, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55616719, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02502441, + "step": 16519, + "time_per_iteration": 3.2745399475097656 + }, + { + "auxiliary_loss_clip": 0.01322054, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.21669936, + "balance_loss_mlp": 1.01514578, + "epoch": 0.9932361340748534, + "flos": 15309058801680.0, + "grad_norm": 1.641307049533124, + "language_loss": 0.6012584, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62476146, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.13116455, + "step": 16520, + "time_per_iteration": 2.8118205070495605 + }, + { + "auxiliary_loss_clip": 0.01328238, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.22091532, + "balance_loss_mlp": 1.01848793, + "epoch": 0.9932962573275215, + "flos": 36071831498040.0, + "grad_norm": 2.2740929752263748, + "language_loss": 0.6248076, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64840114, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.1262207, + "step": 16521, + "time_per_iteration": 2.985146999359131 + }, + { + "auxiliary_loss_clip": 0.01333566, + "auxiliary_loss_mlp": 0.01030205, + "balance_loss_clip": 1.22389531, + "balance_loss_mlp": 1.01653206, + "epoch": 0.9933563805801894, + "flos": 21475532855160.0, + "grad_norm": 1.7540676838094027, + "language_loss": 0.748447, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.77208471, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.13684082, + "step": 16522, + "time_per_iteration": 2.966287851333618 + }, + { + "auxiliary_loss_clip": 0.013251, + "auxiliary_loss_mlp": 0.01025856, + "balance_loss_clip": 1.22012138, + "balance_loss_mlp": 1.01379812, + "epoch": 0.9934165038328574, + "flos": 23991673388760.0, + "grad_norm": 1.4932497090258667, + "language_loss": 0.73153436, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75504392, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.12060547, + "step": 16523, + "time_per_iteration": 2.9536373615264893 + }, + { + "auxiliary_loss_clip": 0.01322511, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.21569741, + "balance_loss_mlp": 1.01603103, + "epoch": 0.9934766270855253, + "flos": 21912802943040.0, + "grad_norm": 1.518326097878839, + "language_loss": 0.71416271, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73767209, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.1237793, + "step": 16524, + "time_per_iteration": 2.9261841773986816 + }, + { + "auxiliary_loss_clip": 0.01326773, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.21953988, + "balance_loss_mlp": 1.0190196, + "epoch": 0.9935367503381933, + "flos": 17133636740400.0, + "grad_norm": 2.101692953824672, + "language_loss": 0.79440081, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81798142, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1227417, + "step": 16525, + "time_per_iteration": 2.8547680377960205 + }, + { + "auxiliary_loss_clip": 0.0132727, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.22128057, + "balance_loss_mlp": 1.01618528, + "epoch": 0.9935968735908612, + "flos": 22351372498440.0, + "grad_norm": 2.151779917022753, + "language_loss": 0.74870121, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.77226317, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.12738037, + "step": 16526, + "time_per_iteration": 2.909479856491089 + }, + { + "auxiliary_loss_clip": 0.01317856, + "auxiliary_loss_mlp": 0.01027556, + "balance_loss_clip": 1.21443963, + "balance_loss_mlp": 1.0157125, + "epoch": 0.9936569968435293, + "flos": 29466544239000.0, + "grad_norm": 1.457673215218072, + "language_loss": 0.7300182, + "learning_rate": 4.180545412333369e-10, + "loss": 0.75347233, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.11859131, + "step": 16527, + "time_per_iteration": 2.9356346130371094 + }, + { + "auxiliary_loss_clip": 0.01327701, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.21837318, + "balance_loss_mlp": 1.01647699, + "epoch": 0.9937171200961972, + "flos": 16548295280040.0, + "grad_norm": 3.8555687594386043, + "language_loss": 0.76562589, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78919852, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.13098145, + "step": 16528, + "time_per_iteration": 2.9701972007751465 + }, + { + "auxiliary_loss_clip": 0.01331118, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.22145879, + "balance_loss_mlp": 1.0166626, + "epoch": 0.9937772433488652, + "flos": 24395783427720.0, + "grad_norm": 2.2613459555535753, + "language_loss": 0.684937, + "learning_rate": 4.022808578922898e-10, + "loss": 0.70855844, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.14373779, + "step": 16529, + "time_per_iteration": 2.960813522338867 + }, + { + "auxiliary_loss_clip": 0.01332287, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.22115922, + "balance_loss_mlp": 1.01784348, + "epoch": 0.9938373666015331, + "flos": 15674242404600.0, + "grad_norm": 2.797706256903791, + "language_loss": 0.65495217, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67859185, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.13830566, + "step": 16530, + "time_per_iteration": 3.0499327182769775 + }, + { + "auxiliary_loss_clip": 0.01329414, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.22113919, + "balance_loss_mlp": 1.01733851, + "epoch": 0.9938974898542011, + "flos": 19500406434000.0, + "grad_norm": 2.515151509309977, + "language_loss": 0.71129441, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73488224, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12030029, + "step": 16531, + "time_per_iteration": 2.93550181388855 + }, + { + "auxiliary_loss_clip": 0.01324788, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.2166208, + "balance_loss_mlp": 1.01861393, + "epoch": 0.993957613106869, + "flos": 26912614303440.0, + "grad_norm": 1.5396075506768454, + "language_loss": 0.74351132, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76707143, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1262207, + "step": 16532, + "time_per_iteration": 3.0954160690307617 + }, + { + "auxiliary_loss_clip": 0.01314464, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.21376526, + "balance_loss_mlp": 1.01657057, + "epoch": 0.994017736359537, + "flos": 14943753373680.0, + "grad_norm": 1.6727006921889238, + "language_loss": 0.70543861, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72886419, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.11523438, + "step": 16533, + "time_per_iteration": 4.270679950714111 + }, + { + "auxiliary_loss_clip": 0.01330926, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.2210499, + "balance_loss_mlp": 1.01878798, + "epoch": 0.9940778596122051, + "flos": 15382160495640.0, + "grad_norm": 8.303657897083296, + "language_loss": 0.84217966, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86581719, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.14025879, + "step": 16534, + "time_per_iteration": 4.441558122634888 + }, + { + "auxiliary_loss_clip": 0.01315048, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.21260512, + "balance_loss_mlp": 1.0156697, + "epoch": 0.994137982864873, + "flos": 25233549410520.0, + "grad_norm": 1.4265799702378967, + "language_loss": 0.66461885, + "learning_rate": 3.567796158934211e-10, + "loss": 0.68804789, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.12182617, + "step": 16535, + "time_per_iteration": 4.411844730377197 + }, + { + "auxiliary_loss_clip": 0.01326647, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.22155595, + "balance_loss_mlp": 1.01788652, + "epoch": 0.994198106117541, + "flos": 18446746471560.0, + "grad_norm": 3.5642176267286554, + "language_loss": 0.6515308, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.67509514, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.11889648, + "step": 16536, + "time_per_iteration": 3.0101466178894043 + }, + { + "auxiliary_loss_clip": 0.01324437, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.21921563, + "balance_loss_mlp": 1.01825666, + "epoch": 0.9942582293702089, + "flos": 16658455425480.0, + "grad_norm": 1.6529094612439559, + "language_loss": 0.78850412, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.81205857, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.12768555, + "step": 16537, + "time_per_iteration": 2.9816718101501465 + }, + { + "auxiliary_loss_clip": 0.01337442, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.22348261, + "balance_loss_mlp": 1.01738071, + "epoch": 0.9943183526228769, + "flos": 21949617744360.0, + "grad_norm": 1.5378996803576306, + "language_loss": 0.68928027, + "learning_rate": 3.35052651107004e-10, + "loss": 0.7129662, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.13763428, + "step": 16538, + "time_per_iteration": 2.834674596786499 + }, + { + "auxiliary_loss_clip": 0.01311585, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.20825934, + "balance_loss_mlp": 1.02033913, + "epoch": 0.9943784758755448, + "flos": 23847947110800.0, + "grad_norm": 1.882849909850589, + "language_loss": 0.75926733, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.78271472, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.12805176, + "step": 16539, + "time_per_iteration": 2.90869402885437 + }, + { + "auxiliary_loss_clip": 0.01331456, + "auxiliary_loss_mlp": 0.01032104, + "balance_loss_clip": 1.22267997, + "balance_loss_mlp": 1.01953316, + "epoch": 0.9944385991282129, + "flos": 21474923729760.0, + "grad_norm": 2.1142096381917996, + "language_loss": 0.70742691, + "learning_rate": 3.209471449341361e-10, + "loss": 0.73106253, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.12567139, + "step": 16540, + "time_per_iteration": 2.783923625946045 + }, + { + "auxiliary_loss_clip": 0.01319817, + "auxiliary_loss_mlp": 0.01023916, + "balance_loss_clip": 1.21428287, + "balance_loss_mlp": 1.01243639, + "epoch": 0.9944987223808808, + "flos": 22931840955600.0, + "grad_norm": 1.9922772504423427, + "language_loss": 0.7559427, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77938002, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.11474609, + "step": 16541, + "time_per_iteration": 4.3411548137664795 + }, + { + "auxiliary_loss_clip": 0.01327811, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.22041225, + "balance_loss_mlp": 1.02210915, + "epoch": 0.9945588456335488, + "flos": 22388187299760.0, + "grad_norm": 1.871020996083338, + "language_loss": 0.77280247, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.79643005, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12841797, + "step": 16542, + "time_per_iteration": 2.76019549369812 + }, + { + "auxiliary_loss_clip": 0.01329587, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.22017705, + "balance_loss_mlp": 1.01397741, + "epoch": 0.9946189688862167, + "flos": 21402512377920.0, + "grad_norm": 2.452530529619901, + "language_loss": 0.74962699, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.77319872, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 1.09521484, + "router_z_loss_mlp": 0.1361084, + "step": 16543, + "time_per_iteration": 2.744490146636963 + }, + { + "auxiliary_loss_clip": 0.01338637, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.22703338, + "balance_loss_mlp": 1.01922143, + "epoch": 0.9946790921388847, + "flos": 12419653601520.0, + "grad_norm": 2.2396107056318146, + "language_loss": 0.82025939, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84396875, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.13079834, + "step": 16544, + "time_per_iteration": 2.7526135444641113 + }, + { + "auxiliary_loss_clip": 0.01323668, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.21736836, + "balance_loss_mlp": 1.0156312, + "epoch": 0.9947392153915526, + "flos": 19062446004000.0, + "grad_norm": 1.6986630425943325, + "language_loss": 0.7873143, + "learning_rate": 2.870103745831187e-10, + "loss": 0.8108328, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.12554932, + "step": 16545, + "time_per_iteration": 2.8362534046173096 + }, + { + "auxiliary_loss_clip": 0.01327641, + "auxiliary_loss_mlp": 0.01025241, + "balance_loss_clip": 1.21805537, + "balance_loss_mlp": 1.01212168, + "epoch": 0.9947993386442207, + "flos": 27314653316040.0, + "grad_norm": 1.6947408987629198, + "language_loss": 0.72400182, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74753064, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13104248, + "step": 16546, + "time_per_iteration": 2.912932872772217 + }, + { + "auxiliary_loss_clip": 0.01321939, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.21724439, + "balance_loss_mlp": 1.02190614, + "epoch": 0.9948594618968887, + "flos": 20810470538160.0, + "grad_norm": 1.8738130689876482, + "language_loss": 0.77662325, + "learning_rate": 2.739664698798716e-10, + "loss": 0.80018389, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.12213135, + "step": 16547, + "time_per_iteration": 2.8230040073394775 + }, + { + "auxiliary_loss_clip": 0.01326959, + "auxiliary_loss_mlp": 0.01028253, + "balance_loss_clip": 1.21922553, + "balance_loss_mlp": 1.01619458, + "epoch": 0.9949195851495566, + "flos": 23297958550800.0, + "grad_norm": 2.0028011047115997, + "language_loss": 0.7064153, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72996747, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12054443, + "step": 16548, + "time_per_iteration": 2.772653818130493 + }, + { + "auxiliary_loss_clip": 0.01327203, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.22054505, + "balance_loss_mlp": 1.01662421, + "epoch": 0.9949797084022246, + "flos": 18520132424040.0, + "grad_norm": 1.739588326107785, + "language_loss": 0.75398779, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77755499, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12890625, + "step": 16549, + "time_per_iteration": 2.8035664558410645 + }, + { + "auxiliary_loss_clip": 0.01333621, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.22395885, + "balance_loss_mlp": 1.02123308, + "epoch": 0.9950398316548925, + "flos": 30413658200040.0, + "grad_norm": 1.5920820292948716, + "language_loss": 0.74871767, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.77240616, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.13977051, + "step": 16550, + "time_per_iteration": 2.8737375736236572 + }, + { + "auxiliary_loss_clip": 0.01324032, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.21717393, + "balance_loss_mlp": 1.01540005, + "epoch": 0.9950999549075605, + "flos": 19905572290320.0, + "grad_norm": 1.6894321492299205, + "language_loss": 0.78060472, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80411935, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.12023926, + "step": 16551, + "time_per_iteration": 2.8790175914764404 + }, + { + "auxiliary_loss_clip": 0.01311402, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.21080577, + "balance_loss_mlp": 1.01742649, + "epoch": 0.9951600781602284, + "flos": 17608127713200.0, + "grad_norm": 1.7145168793723076, + "language_loss": 0.66945744, + "learning_rate": 2.426837340270271e-10, + "loss": 0.69285285, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.1071167, + "step": 16552, + "time_per_iteration": 2.8871660232543945 + }, + { + "auxiliary_loss_clip": 0.01325862, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.2177285, + "balance_loss_mlp": 1.01617074, + "epoch": 0.9952202014128965, + "flos": 28956781582560.0, + "grad_norm": 1.326914088948989, + "language_loss": 0.81594586, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83949757, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.13134766, + "step": 16553, + "time_per_iteration": 2.8777732849121094 + }, + { + "auxiliary_loss_clip": 0.01141826, + "auxiliary_loss_mlp": 0.01005201, + "balance_loss_clip": 1.09934688, + "balance_loss_mlp": 1.00272179, + "epoch": 0.9952803246655644, + "flos": 70828052527800.0, + "grad_norm": 0.7194854204802127, + "language_loss": 0.5739218, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59539211, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02478027, + "step": 16554, + "time_per_iteration": 3.3801162242889404 + }, + { + "auxiliary_loss_clip": 0.01326806, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.21824467, + "balance_loss_mlp": 1.0227468, + "epoch": 0.9953404479182324, + "flos": 21804145306920.0, + "grad_norm": 1.6858562337177443, + "language_loss": 0.7731111, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79673004, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.12335205, + "step": 16555, + "time_per_iteration": 2.773038864135742 + }, + { + "auxiliary_loss_clip": 0.01319994, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.21263695, + "balance_loss_mlp": 1.01758862, + "epoch": 0.9954005711709003, + "flos": 21941496072360.0, + "grad_norm": 1.8223022110539862, + "language_loss": 0.86362946, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88713479, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.12963867, + "step": 16556, + "time_per_iteration": 2.8924596309661865 + }, + { + "auxiliary_loss_clip": 0.01324501, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.21915698, + "balance_loss_mlp": 1.01520371, + "epoch": 0.9954606944235683, + "flos": 19359319699440.0, + "grad_norm": 1.6051951905665294, + "language_loss": 0.73046911, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75399017, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12408447, + "step": 16557, + "time_per_iteration": 2.907883882522583 + }, + { + "auxiliary_loss_clip": 0.01318288, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.21373606, + "balance_loss_mlp": 1.01563966, + "epoch": 0.9955208176762362, + "flos": 30524752337760.0, + "grad_norm": 1.9332831899446759, + "language_loss": 0.76710415, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.79056889, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.12542725, + "step": 16558, + "time_per_iteration": 2.878377676010132 + }, + { + "auxiliary_loss_clip": 0.01321108, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.21226096, + "balance_loss_mlp": 1.01818085, + "epoch": 0.9955809409289043, + "flos": 30014380555920.0, + "grad_norm": 2.1996369455936593, + "language_loss": 0.63763309, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.66115421, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12835693, + "step": 16559, + "time_per_iteration": 2.8645436763763428 + }, + { + "auxiliary_loss_clip": 0.01318675, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.21348536, + "balance_loss_mlp": 1.01534009, + "epoch": 0.9956410641815723, + "flos": 21548269073880.0, + "grad_norm": 2.2810910750548556, + "language_loss": 0.74170566, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76517236, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.12677002, + "step": 16560, + "time_per_iteration": 2.8800933361053467 + }, + { + "auxiliary_loss_clip": 0.01326011, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.22015393, + "balance_loss_mlp": 1.01973498, + "epoch": 0.9957011874342402, + "flos": 21694513070160.0, + "grad_norm": 1.6243945003970404, + "language_loss": 0.79736412, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.8209424, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12091064, + "step": 16561, + "time_per_iteration": 2.7328922748565674 + }, + { + "auxiliary_loss_clip": 0.01322726, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.21837926, + "balance_loss_mlp": 1.01710963, + "epoch": 0.9957613106869082, + "flos": 17705902308840.0, + "grad_norm": 2.189649278801003, + "language_loss": 0.6657232, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.68924439, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.1227417, + "step": 16562, + "time_per_iteration": 2.7249393463134766 + }, + { + "auxiliary_loss_clip": 0.01333518, + "auxiliary_loss_mlp": 0.01040237, + "balance_loss_clip": 1.22292042, + "balance_loss_mlp": 1.02600908, + "epoch": 0.9958214339395761, + "flos": 30562257481200.0, + "grad_norm": 1.8407696864188818, + "language_loss": 0.64614201, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66987956, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.14227295, + "step": 16563, + "time_per_iteration": 2.8413007259368896 + }, + { + "auxiliary_loss_clip": 0.01326859, + "auxiliary_loss_mlp": 0.01030503, + "balance_loss_clip": 1.22010148, + "balance_loss_mlp": 1.01745582, + "epoch": 0.9958815571922441, + "flos": 24174366711120.0, + "grad_norm": 1.6716461472444537, + "language_loss": 0.65152252, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.67509615, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.13043213, + "step": 16564, + "time_per_iteration": 2.7800676822662354 + }, + { + "auxiliary_loss_clip": 0.01318135, + "auxiliary_loss_mlp": 0.01026502, + "balance_loss_clip": 1.21236658, + "balance_loss_mlp": 1.01382947, + "epoch": 0.995941680444912, + "flos": 15491264823720.0, + "grad_norm": 1.7786105866797355, + "language_loss": 0.74851, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.77195632, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.12664795, + "step": 16565, + "time_per_iteration": 2.7438957691192627 + }, + { + "auxiliary_loss_clip": 0.01318988, + "auxiliary_loss_mlp": 0.010289, + "balance_loss_clip": 1.21134627, + "balance_loss_mlp": 1.01659179, + "epoch": 0.9960018036975801, + "flos": 18625419566280.0, + "grad_norm": 1.7528518593847662, + "language_loss": 0.79646671, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81994563, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.12304688, + "step": 16566, + "time_per_iteration": 2.7219583988189697 + }, + { + "auxiliary_loss_clip": 0.01320197, + "auxiliary_loss_mlp": 0.01025789, + "balance_loss_clip": 1.21530795, + "balance_loss_mlp": 1.01409483, + "epoch": 0.996061926950248, + "flos": 20088955954800.0, + "grad_norm": 1.4885940601319458, + "language_loss": 0.71021342, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.73367327, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.11700439, + "step": 16567, + "time_per_iteration": 2.810206413269043 + }, + { + "auxiliary_loss_clip": 0.01332724, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.22271752, + "balance_loss_mlp": 1.02047896, + "epoch": 0.996122050202916, + "flos": 24352552505520.0, + "grad_norm": 2.5138131923576017, + "language_loss": 0.78489387, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.8085674, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.14160156, + "step": 16568, + "time_per_iteration": 2.862147331237793 + }, + { + "auxiliary_loss_clip": 0.01318428, + "auxiliary_loss_mlp": 0.01025091, + "balance_loss_clip": 1.21558213, + "balance_loss_mlp": 1.01373076, + "epoch": 0.9961821734555839, + "flos": 24203668965840.0, + "grad_norm": 1.7704187172864692, + "language_loss": 0.82237828, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84581345, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.1137085, + "step": 16569, + "time_per_iteration": 2.827843189239502 + }, + { + "auxiliary_loss_clip": 0.01319191, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.21577537, + "balance_loss_mlp": 1.01748693, + "epoch": 0.9962422967082519, + "flos": 22638337754040.0, + "grad_norm": 1.7154489859402062, + "language_loss": 0.70195782, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72545093, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.12646484, + "step": 16570, + "time_per_iteration": 2.914301633834839 + }, + { + "auxiliary_loss_clip": 0.01324833, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.21798468, + "balance_loss_mlp": 1.01898944, + "epoch": 0.9963024199609198, + "flos": 26401674004560.0, + "grad_norm": 1.498427745940316, + "language_loss": 0.75194752, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77550882, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.12310791, + "step": 16571, + "time_per_iteration": 2.9040462970733643 + }, + { + "auxiliary_loss_clip": 0.01322033, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.21494853, + "balance_loss_mlp": 1.02072668, + "epoch": 0.9963625432135879, + "flos": 16585922248560.0, + "grad_norm": 1.6662490382379416, + "language_loss": 0.80112708, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.82468081, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.1262207, + "step": 16572, + "time_per_iteration": 5.713165044784546 + }, + { + "auxiliary_loss_clip": 0.01322801, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.21745658, + "balance_loss_mlp": 1.01530242, + "epoch": 0.9964226664662559, + "flos": 26474125964760.0, + "grad_norm": 1.7766645068734173, + "language_loss": 0.70291388, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72641921, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.12438965, + "step": 16573, + "time_per_iteration": 2.839324951171875 + }, + { + "auxiliary_loss_clip": 0.0133602, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.22484994, + "balance_loss_mlp": 1.01710105, + "epoch": 0.9964827897189238, + "flos": 27453344157360.0, + "grad_norm": 1.7752314255724873, + "language_loss": 0.63182271, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65548372, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.12976074, + "step": 16574, + "time_per_iteration": 4.894059419631958 + }, + { + "auxiliary_loss_clip": 0.01341821, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.22996998, + "balance_loss_mlp": 1.01731515, + "epoch": 0.9965429129715918, + "flos": 11724355037520.0, + "grad_norm": 6.890822028139399, + "language_loss": 0.76295447, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78668445, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.13873291, + "step": 16575, + "time_per_iteration": 3.339663028717041 + }, + { + "auxiliary_loss_clip": 0.0132452, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.21755981, + "balance_loss_mlp": 1.02239394, + "epoch": 0.9966030362242597, + "flos": 19760830803360.0, + "grad_norm": 1.653418932289709, + "language_loss": 0.70148069, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72508317, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.13348389, + "step": 16576, + "time_per_iteration": 3.2632193565368652 + }, + { + "auxiliary_loss_clip": 0.0131625, + "auxiliary_loss_mlp": 0.01024625, + "balance_loss_clip": 1.21296859, + "balance_loss_mlp": 1.01189947, + "epoch": 0.9966631594769277, + "flos": 23920520896080.0, + "grad_norm": 1.6956170159662771, + "language_loss": 0.72483855, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74824733, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.12731934, + "step": 16577, + "time_per_iteration": 3.280630111694336 + }, + { + "auxiliary_loss_clip": 0.01324592, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.21720791, + "balance_loss_mlp": 1.01723623, + "epoch": 0.9967232827295956, + "flos": 15563798000640.0, + "grad_norm": 2.035088756376441, + "language_loss": 0.78637874, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80992484, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.12792969, + "step": 16578, + "time_per_iteration": 3.341886520385742 + }, + { + "auxiliary_loss_clip": 0.01326832, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.22034574, + "balance_loss_mlp": 1.02079844, + "epoch": 0.9967834059822637, + "flos": 20817861259680.0, + "grad_norm": 1.8832545713564863, + "language_loss": 0.76134574, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.7849443, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.12225342, + "step": 16579, + "time_per_iteration": 4.942188024520874 + }, + { + "auxiliary_loss_clip": 0.01334412, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.22511101, + "balance_loss_mlp": 1.018471, + "epoch": 0.9968435292349316, + "flos": 36725929557840.0, + "grad_norm": 1.9580387428355805, + "language_loss": 0.70005918, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.72373229, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.14428711, + "step": 16580, + "time_per_iteration": 3.416489839553833 + }, + { + "auxiliary_loss_clip": 0.01324639, + "auxiliary_loss_mlp": 0.01025316, + "balance_loss_clip": 1.21656442, + "balance_loss_mlp": 1.01326978, + "epoch": 0.9969036524875996, + "flos": 26766613957320.0, + "grad_norm": 18.32977435999457, + "language_loss": 0.79613709, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81963664, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.12042236, + "step": 16581, + "time_per_iteration": 3.2727842330932617 + }, + { + "auxiliary_loss_clip": 0.01322046, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.21723938, + "balance_loss_mlp": 1.01411378, + "epoch": 0.9969637757402675, + "flos": 24832688040360.0, + "grad_norm": 1.6838346072765273, + "language_loss": 0.80631584, + "learning_rate": 9.479950191249031e-11, + "loss": 0.8297922, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.11474609, + "step": 16582, + "time_per_iteration": 3.2627387046813965 + }, + { + "auxiliary_loss_clip": 0.01317557, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.21547234, + "balance_loss_mlp": 1.01841497, + "epoch": 0.9970238989929355, + "flos": 23043869085600.0, + "grad_norm": 1.639403403666421, + "language_loss": 0.61071879, + "learning_rate": 9.104547011951069e-11, + "loss": 0.63420188, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.12322998, + "step": 16583, + "time_per_iteration": 3.28365421295166 + }, + { + "auxiliary_loss_clip": 0.0132775, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.21994662, + "balance_loss_mlp": 1.01993334, + "epoch": 0.9970840222456034, + "flos": 25303970952720.0, + "grad_norm": 1.6842514472832364, + "language_loss": 0.781977, + "learning_rate": 8.736727507452357e-11, + "loss": 0.80557662, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12286377, + "step": 16584, + "time_per_iteration": 3.3408753871917725 + }, + { + "auxiliary_loss_clip": 0.01320037, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.21636009, + "balance_loss_mlp": 1.01962197, + "epoch": 0.9971441454982715, + "flos": 21620517992280.0, + "grad_norm": 3.1179503727999016, + "language_loss": 0.69400144, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71751523, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.11712646, + "step": 16585, + "time_per_iteration": 3.4254038333892822 + }, + { + "auxiliary_loss_clip": 0.01318946, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.21378613, + "balance_loss_mlp": 1.01905906, + "epoch": 0.9972042687509394, + "flos": 14979715399440.0, + "grad_norm": 2.202582008996186, + "language_loss": 0.81899559, + "learning_rate": 8.023839578363834e-11, + "loss": 0.842502, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.1262207, + "step": 16586, + "time_per_iteration": 3.3472864627838135 + }, + { + "auxiliary_loss_clip": 0.0132919, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.22136366, + "balance_loss_mlp": 1.02181137, + "epoch": 0.9972643920036074, + "flos": 25811500149360.0, + "grad_norm": 1.7208690421712305, + "language_loss": 0.7816745, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80530548, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.12115479, + "step": 16587, + "time_per_iteration": 3.375666379928589 + }, + { + "auxiliary_loss_clip": 0.01333221, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.22480822, + "balance_loss_mlp": 1.0243777, + "epoch": 0.9973245152562754, + "flos": 23330834341200.0, + "grad_norm": 1.6439586496702245, + "language_loss": 0.73243082, + "learning_rate": 7.341286512074773e-11, + "loss": 0.75613225, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12542725, + "step": 16588, + "time_per_iteration": 3.413696527481079 + }, + { + "auxiliary_loss_clip": 0.01336539, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.22386169, + "balance_loss_mlp": 1.02002752, + "epoch": 0.9973846385089433, + "flos": 12169949839200.0, + "grad_norm": 2.555248418152511, + "language_loss": 0.83414847, + "learning_rate": 7.011385585031781e-11, + "loss": 0.85784608, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.13183594, + "step": 16589, + "time_per_iteration": 3.2035865783691406 + }, + { + "auxiliary_loss_clip": 0.01341173, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.22827053, + "balance_loss_mlp": 1.01936674, + "epoch": 0.9974447617616113, + "flos": 20049664043520.0, + "grad_norm": 2.4259484210563427, + "language_loss": 0.7063179, + "learning_rate": 6.689068412168986e-11, + "loss": 0.73007065, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.14733887, + "step": 16590, + "time_per_iteration": 3.2913310527801514 + }, + { + "auxiliary_loss_clip": 0.01332262, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.22334135, + "balance_loss_mlp": 1.0143863, + "epoch": 0.9975048850142793, + "flos": 32020961474880.0, + "grad_norm": 2.1118164765297203, + "language_loss": 0.63392884, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65752441, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.12902832, + "step": 16591, + "time_per_iteration": 3.355698823928833 + }, + { + "auxiliary_loss_clip": 0.01325943, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.2174139, + "balance_loss_mlp": 1.01540756, + "epoch": 0.9975650082669473, + "flos": 36939874336200.0, + "grad_norm": 1.6210326538659454, + "language_loss": 0.7333467, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75688863, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.12860107, + "step": 16592, + "time_per_iteration": 3.370670795440674 + }, + { + "auxiliary_loss_clip": 0.01330807, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.22226453, + "balance_loss_mlp": 1.01783586, + "epoch": 0.9976251315196152, + "flos": 16476493053600.0, + "grad_norm": 1.419372571840881, + "language_loss": 0.85236061, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87597966, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.13262939, + "step": 16593, + "time_per_iteration": 3.2289838790893555 + }, + { + "auxiliary_loss_clip": 0.01328194, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.22258496, + "balance_loss_mlp": 1.01883125, + "epoch": 0.9976852547722832, + "flos": 19651685866920.0, + "grad_norm": 1.6773683363632887, + "language_loss": 0.70444691, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.72803164, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.11444092, + "step": 16594, + "time_per_iteration": 3.321836471557617 + }, + { + "auxiliary_loss_clip": 0.01335725, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.22551417, + "balance_loss_mlp": 1.01487446, + "epoch": 0.9977453780249511, + "flos": 20453246173800.0, + "grad_norm": 1.9118910366537059, + "language_loss": 0.72647923, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.75011438, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.12908936, + "step": 16595, + "time_per_iteration": 3.2288753986358643 + }, + { + "auxiliary_loss_clip": 0.01144048, + "auxiliary_loss_mlp": 0.01006735, + "balance_loss_clip": 1.10131109, + "balance_loss_mlp": 1.00436234, + "epoch": 0.9978055012776191, + "flos": 65468360278440.0, + "grad_norm": 0.8001027843457202, + "language_loss": 0.60409629, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62560415, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02368164, + "step": 16596, + "time_per_iteration": 3.4562504291534424 + }, + { + "auxiliary_loss_clip": 0.01328863, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.22136474, + "balance_loss_mlp": 1.01787674, + "epoch": 0.997865624530287, + "flos": 20636426796480.0, + "grad_norm": 3.0381861144708338, + "language_loss": 0.7733255, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79691494, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.12213135, + "step": 16597, + "time_per_iteration": 3.2694008350372314 + }, + { + "auxiliary_loss_clip": 0.01330302, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.22130144, + "balance_loss_mlp": 1.01913285, + "epoch": 0.9979257477829551, + "flos": 29393036461440.0, + "grad_norm": 1.6933723757914387, + "language_loss": 0.82459658, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84822071, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.12982178, + "step": 16598, + "time_per_iteration": 3.0908989906311035 + }, + { + "auxiliary_loss_clip": 0.01337662, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.22613144, + "balance_loss_mlp": 1.01814306, + "epoch": 0.997985871035623, + "flos": 22639840263360.0, + "grad_norm": 1.781605357915116, + "language_loss": 0.65286291, + "learning_rate": 4.129484715709175e-11, + "loss": 0.67656273, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.14190674, + "step": 16599, + "time_per_iteration": 3.035752773284912 + }, + { + "auxiliary_loss_clip": 0.01144587, + "auxiliary_loss_mlp": 0.01006975, + "balance_loss_clip": 1.10225689, + "balance_loss_mlp": 1.00447202, + "epoch": 0.998045994288291, + "flos": 61819813526400.0, + "grad_norm": 0.8549591052607461, + "language_loss": 0.62443978, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64595538, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.02502441, + "step": 16600, + "time_per_iteration": 3.2002511024475098 + }, + { + "auxiliary_loss_clip": 0.01320626, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.21423471, + "balance_loss_mlp": 1.01642275, + "epoch": 0.998106117540959, + "flos": 19250337196440.0, + "grad_norm": 1.6062229785105118, + "language_loss": 0.78283197, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80632353, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.12109375, + "step": 16601, + "time_per_iteration": 2.861213207244873 + }, + { + "auxiliary_loss_clip": 0.01331611, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.22065556, + "balance_loss_mlp": 1.01656699, + "epoch": 0.9981662407936269, + "flos": 21111242636160.0, + "grad_norm": 2.2008370169555156, + "language_loss": 0.82223517, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84584254, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.12554932, + "step": 16602, + "time_per_iteration": 3.056522846221924 + }, + { + "auxiliary_loss_clip": 0.01328524, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.22113931, + "balance_loss_mlp": 1.02283156, + "epoch": 0.998226364046295, + "flos": 24322275650160.0, + "grad_norm": 2.100178253444126, + "language_loss": 0.62308359, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64672196, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.12481689, + "step": 16603, + "time_per_iteration": 3.009852170944214 + }, + { + "auxiliary_loss_clip": 0.01329476, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.22186065, + "balance_loss_mlp": 1.01513243, + "epoch": 0.9982864872989629, + "flos": 23840962472880.0, + "grad_norm": 2.3283670730206927, + "language_loss": 0.71088648, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73447227, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1395874, + "step": 16604, + "time_per_iteration": 2.962480068206787 + }, + { + "auxiliary_loss_clip": 0.01326771, + "auxiliary_loss_mlp": 0.01023018, + "balance_loss_clip": 1.21897531, + "balance_loss_mlp": 1.01065648, + "epoch": 0.9983466105516309, + "flos": 18337763968560.0, + "grad_norm": 1.5133110090195696, + "language_loss": 0.64680505, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.67030299, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.12365723, + "step": 16605, + "time_per_iteration": 2.917107343673706 + }, + { + "auxiliary_loss_clip": 0.01321035, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.21505427, + "balance_loss_mlp": 1.01742196, + "epoch": 0.9984067338042988, + "flos": 17241035517360.0, + "grad_norm": 1.5307097450591065, + "language_loss": 0.71320254, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73670667, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.11962891, + "step": 16606, + "time_per_iteration": 2.9348654747009277 + }, + { + "auxiliary_loss_clip": 0.01325766, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.21916533, + "balance_loss_mlp": 1.01576686, + "epoch": 0.9984668570569668, + "flos": 20672794905840.0, + "grad_norm": 1.907980909347837, + "language_loss": 0.82282066, + "learning_rate": 2.370001590090709e-11, + "loss": 0.84635854, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.12255859, + "step": 16607, + "time_per_iteration": 2.978623151779175 + }, + { + "auxiliary_loss_clip": 0.013312, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.21995223, + "balance_loss_mlp": 1.01783407, + "epoch": 0.9985269803096347, + "flos": 30268551237840.0, + "grad_norm": 1.5590045363398617, + "language_loss": 0.67199993, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69563103, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.14074707, + "step": 16608, + "time_per_iteration": 3.023878335952759 + }, + { + "auxiliary_loss_clip": 0.01327166, + "auxiliary_loss_mlp": 0.01026769, + "balance_loss_clip": 1.22004771, + "balance_loss_mlp": 1.0140847, + "epoch": 0.9985871035623027, + "flos": 10564514548920.0, + "grad_norm": 10.911507591551427, + "language_loss": 0.80987704, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.83341628, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12683105, + "step": 16609, + "time_per_iteration": 3.0263335704803467 + }, + { + "auxiliary_loss_clip": 0.01324144, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.21577692, + "balance_loss_mlp": 1.02228045, + "epoch": 0.9986472268149706, + "flos": 16877882332440.0, + "grad_norm": 1.4027667720826622, + "language_loss": 0.62706834, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.65065753, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.12512207, + "step": 16610, + "time_per_iteration": 2.9980533123016357 + }, + { + "auxiliary_loss_clip": 0.0132014, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.21320808, + "balance_loss_mlp": 1.02072167, + "epoch": 0.9987073500676387, + "flos": 22059656064720.0, + "grad_norm": 1.9644532691290477, + "language_loss": 0.67952824, + "learning_rate": 1.672274094288717e-11, + "loss": 0.70306003, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.12335205, + "step": 16611, + "time_per_iteration": 5.849200963973999 + }, + { + "auxiliary_loss_clip": 0.01327762, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.22026491, + "balance_loss_mlp": 1.021456, + "epoch": 0.9987674733203066, + "flos": 30489115178880.0, + "grad_norm": 1.4209967947135755, + "language_loss": 0.69818926, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.72181457, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.13305664, + "step": 16612, + "time_per_iteration": 4.683733701705933 + }, + { + "auxiliary_loss_clip": 0.01316455, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.21311831, + "balance_loss_mlp": 1.01955521, + "epoch": 0.9988275965729746, + "flos": 27750948803280.0, + "grad_norm": 1.4280345632840705, + "language_loss": 0.7402631, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76373804, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.11486816, + "step": 16613, + "time_per_iteration": 3.253657341003418 + }, + { + "auxiliary_loss_clip": 0.01324185, + "auxiliary_loss_mlp": 0.01026093, + "balance_loss_clip": 1.21586227, + "balance_loss_mlp": 1.01279509, + "epoch": 0.9988877198256426, + "flos": 17527919556240.0, + "grad_norm": 1.9671331503077576, + "language_loss": 0.73787963, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.7613824, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.13299561, + "step": 16614, + "time_per_iteration": 3.0928139686584473 + }, + { + "auxiliary_loss_clip": 0.01330888, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.22370243, + "balance_loss_mlp": 1.0231446, + "epoch": 0.9989478430783105, + "flos": 21001488574320.0, + "grad_norm": 1.7328331511728614, + "language_loss": 0.72828817, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75195575, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1272583, + "step": 16615, + "time_per_iteration": 3.1511013507843018 + }, + { + "auxiliary_loss_clip": 0.01331057, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.22135127, + "balance_loss_mlp": 1.01622462, + "epoch": 0.9990079663309785, + "flos": 13374564367680.0, + "grad_norm": 2.35215079198286, + "language_loss": 0.78358889, + "learning_rate": 9.70753783247069e-12, + "loss": 0.80718696, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.12524414, + "step": 16616, + "time_per_iteration": 3.054511785507202 + }, + { + "auxiliary_loss_clip": 0.01324481, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.21721888, + "balance_loss_mlp": 1.01735473, + "epoch": 0.9990680895836465, + "flos": 17314583903280.0, + "grad_norm": 1.8556778112399017, + "language_loss": 0.82761365, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85116661, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.13476562, + "step": 16617, + "time_per_iteration": 2.9491679668426514 + }, + { + "auxiliary_loss_clip": 0.01320842, + "auxiliary_loss_mlp": 0.01026035, + "balance_loss_clip": 1.21505761, + "balance_loss_mlp": 1.01422131, + "epoch": 0.9991282128363145, + "flos": 24213496188960.0, + "grad_norm": 1.9596842922746185, + "language_loss": 0.78855103, + "learning_rate": 7.43233506206309e-12, + "loss": 0.81201982, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.11810303, + "step": 16618, + "time_per_iteration": 4.633387088775635 + }, + { + "auxiliary_loss_clip": 0.01323696, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.21752489, + "balance_loss_mlp": 1.01453686, + "epoch": 0.9991883360889824, + "flos": 21179836802160.0, + "grad_norm": 1.72645029391132, + "language_loss": 0.74910241, + "learning_rate": 6.408493534060255e-12, + "loss": 0.77261364, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.12896729, + "step": 16619, + "time_per_iteration": 2.9345779418945312 + }, + { + "auxiliary_loss_clip": 0.01318061, + "auxiliary_loss_mlp": 0.0102453, + "balance_loss_clip": 1.21319461, + "balance_loss_mlp": 1.01342595, + "epoch": 0.9992484593416504, + "flos": 19905897157200.0, + "grad_norm": 8.767997607653163, + "language_loss": 0.86532223, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88874817, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.11108398, + "step": 16620, + "time_per_iteration": 2.8790488243103027 + }, + { + "auxiliary_loss_clip": 0.01317279, + "auxiliary_loss_mlp": 0.01023505, + "balance_loss_clip": 1.21234035, + "balance_loss_mlp": 1.01157248, + "epoch": 0.9993085825943183, + "flos": 24862518203760.0, + "grad_norm": 2.3292061351538833, + "language_loss": 0.73048019, + "learning_rate": 4.58833038607942e-12, + "loss": 0.75388801, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.11938477, + "step": 16621, + "time_per_iteration": 2.9602651596069336 + }, + { + "auxiliary_loss_clip": 0.01144844, + "auxiliary_loss_mlp": 0.01008424, + "balance_loss_clip": 1.10210073, + "balance_loss_mlp": 1.00578976, + "epoch": 0.9993687058469863, + "flos": 71302178025360.0, + "grad_norm": 0.7449809894749047, + "language_loss": 0.56557405, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58710682, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.02636719, + "step": 16622, + "time_per_iteration": 3.5434067249298096 + }, + { + "auxiliary_loss_clip": 0.01329122, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.22075057, + "balance_loss_mlp": 1.01412284, + "epoch": 0.9994288290996542, + "flos": 12202906846320.0, + "grad_norm": 1.847090507767546, + "language_loss": 0.71376765, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73732972, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1295166, + "step": 16623, + "time_per_iteration": 2.917539119720459 + }, + { + "auxiliary_loss_clip": 0.01322072, + "auxiliary_loss_mlp": 0.01027294, + "balance_loss_clip": 1.2155602, + "balance_loss_mlp": 1.01399565, + "epoch": 0.9994889523523223, + "flos": 17893834109640.0, + "grad_norm": 1.949774857809783, + "language_loss": 0.75346839, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.77696204, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.13287354, + "step": 16624, + "time_per_iteration": 2.9944801330566406 + }, + { + "auxiliary_loss_clip": 0.01327019, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.21911216, + "balance_loss_mlp": 1.01405895, + "epoch": 0.9995490756049902, + "flos": 26584610977080.0, + "grad_norm": 1.5362805450339498, + "language_loss": 0.73917258, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.76271904, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.13549805, + "step": 16625, + "time_per_iteration": 3.1767218112945557 + }, + { + "auxiliary_loss_clip": 0.01313989, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.21015477, + "balance_loss_mlp": 1.01471496, + "epoch": 0.9996091988576582, + "flos": 22205128502160.0, + "grad_norm": 2.0226551880787085, + "language_loss": 0.77533543, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.79874462, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.12194824, + "step": 16626, + "time_per_iteration": 3.0671961307525635 + }, + { + "auxiliary_loss_clip": 0.01324585, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.21962321, + "balance_loss_mlp": 1.01576328, + "epoch": 0.9996693221103262, + "flos": 27377927786880.0, + "grad_norm": 2.0029690043188877, + "language_loss": 0.8235507, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84707934, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.12506104, + "step": 16627, + "time_per_iteration": 3.0622518062591553 + }, + { + "auxiliary_loss_clip": 0.01327088, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.21671081, + "balance_loss_mlp": 1.01937103, + "epoch": 0.9997294453629941, + "flos": 26876124369000.0, + "grad_norm": 2.082774330370107, + "language_loss": 0.71370828, + "learning_rate": 6.067215747584952e-13, + "loss": 0.73730373, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.13085938, + "step": 16628, + "time_per_iteration": 3.12781023979187 + }, + { + "auxiliary_loss_clip": 0.01323182, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.2149564, + "balance_loss_mlp": 1.01624978, + "epoch": 0.9997895686156621, + "flos": 23482235599200.0, + "grad_norm": 1.3130451419584417, + "language_loss": 0.75584185, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77936387, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.12786865, + "step": 16629, + "time_per_iteration": 3.024195671081543 + }, + { + "auxiliary_loss_clip": 0.01336769, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.22508585, + "balance_loss_mlp": 1.02280521, + "epoch": 0.9998496918683301, + "flos": 20229149305440.0, + "grad_norm": 1.6941375343603977, + "language_loss": 0.60364389, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62737143, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.13165283, + "step": 16630, + "time_per_iteration": 3.04084849357605 + }, + { + "auxiliary_loss_clip": 0.0132972, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.22183752, + "balance_loss_mlp": 1.01664448, + "epoch": 0.9999098151209981, + "flos": 21657941919000.0, + "grad_norm": 2.261291749359652, + "language_loss": 0.60896468, + "learning_rate": 3.792010017100722e-14, + "loss": 0.63256019, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1317749, + "step": 16631, + "time_per_iteration": 2.9723968505859375 + }, + { + "auxiliary_loss_clip": 0.01319845, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.21631336, + "balance_loss_mlp": 1.01824903, + "epoch": 0.999969938373666, + "flos": 11548077836040.0, + "grad_norm": 2.0083432744026166, + "language_loss": 0.72876966, + "learning_rate": 0.0, + "loss": 0.75226486, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.11425781, + "step": 16632, + "time_per_iteration": 2.9824728965759277 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3995539087407186e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}